In [20]:
import os

# Read the API key from the environment so the secret is never stored in the
# notebook (or its saved outputs). The blank placeholder is kept as the
# fallback so the notebook still loads when the variable is unset; only the
# embedding search actually needs a real key.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", " ")

In [14]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from openai import OpenAI
import os

# Download the NLTK resources used for tokenisation and stop-word removal.
nltk.download('punkt')
nltk.download('stopwords')

# Load the question/answer records. The encoding is given explicitly because
# JSON is UTF-8 by specification, while open() would otherwise fall back to
# the platform's locale encoding and could mis-decode non-ASCII text.
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

data

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\owner\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\owner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[{'question': 'Where can I buy a laptop for work?',
  'answer': 'You can browse our collection of business laptops under the Electronics section.'},
 {'question': 'I need a computer for office use. Any suggestions?',
  'answer': 'Check out our laptops designed for productivity in the Electronics section.'},
 {'question': 'Do you sell gaming headsets?',
  'answer': 'Yes, we have a variety of gaming headsets in our Accessories category.'},
 {'question': 'Looking for headphones for gaming.',
  'answer': 'We offer high-quality gaming headsets in the Accessories category.'},
 {'question': 'Best smartphones under $500?',
  'answer': 'Check out our affordable smartphones in the Mobile Phones section, filtered by price.'},
 {'question': 'Phones available below $500?',
  'answer': 'Explore our Mobile Phones section for budget-friendly options under $500.'},
 {'question': 'Do you sell kitchen appliances like blenders?',
  'answer': 'Yes, our Kitchen Appliances section has a range of blenders and

In [15]:
# Flatten every record into a single "Question: ... Answer: ..." string so
# that each entry can be scored as one document.
data_transformed = list(
    map(lambda d: f"Question: {d['question']} Answer: {d['answer']}", data)
)
data_transformed

['Question: Where can I buy a laptop for work? Answer: You can browse our collection of business laptops under the Electronics section.',
 'Question: I need a computer for office use. Any suggestions? Answer: Check out our laptops designed for productivity in the Electronics section.',
 'Question: Do you sell gaming headsets? Answer: Yes, we have a variety of gaming headsets in our Accessories category.',
 'Question: Looking for headphones for gaming. Answer: We offer high-quality gaming headsets in the Accessories category.',
 'Question: Best smartphones under $500? Answer: Check out our affordable smartphones in the Mobile Phones section, filtered by price.',
 'Question: Phones available below $500? Answer: Explore our Mobile Phones section for budget-friendly options under $500.',
 'Question: Do you sell kitchen appliances like blenders? Answer: Yes, our Kitchen Appliances section has a range of blenders and more.',
 'Question: I need a blender for smoothies. Answer: Check out our c

In [16]:
# Sanity check: number of flattened question/answer documents.
len(data_transformed)

40

In [17]:
# Preview the first five flattened documents.
data_transformed[:5]


['Question: Where can I buy a laptop for work? Answer: You can browse our collection of business laptops under the Electronics section.',
 'Question: I need a computer for office use. Any suggestions? Answer: Check out our laptops designed for productivity in the Electronics section.',
 'Question: Do you sell gaming headsets? Answer: Yes, we have a variety of gaming headsets in our Accessories category.',
 'Question: Looking for headphones for gaming. Answer: We offer high-quality gaming headsets in the Accessories category.',
 'Question: Best smartphones under $500? Answer: Check out our affordable smartphones in the Mobile Phones section, filtered by price.']

In [27]:
# Cache for the stop-word set: reading the NLTK corpus on every call is
# wasteful because search() preprocesses every document for every query.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a set, loading them only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_text(text):
    """Normalise ``text`` for TF-IDF matching.

    Steps: lowercase, strip every character that is not an ASCII letter or
    whitespace, tokenise with NLTK's ``word_tokenize``, drop English stop
    words, and join the remaining tokens with single spaces.

    Args:
        text: The raw string to normalise.

    Returns:
        The cleaned, space-separated token string (``""`` when nothing
        survives the filtering).
    """
    # Convert to lowercase
    text = text.lower()

    # Remove special characters and numbers. Characters are deleted, not
    # replaced by a space, so hyphenated or apostrophised words are fused
    # ("high-quality" -> "highquality").
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords (set is loaded once and reused across calls)
    stop_words = _english_stop_words()
    tokens = [t for t in tokens if t not in stop_words]

    # Join tokens back
    return ' '.join(tokens)

_EMBEDDING_MODEL = "text-embedding-ada-002"
# Number of texts sent per embeddings request; kept modest so a large corpus
# is split into several requests instead of one oversized one.
_EMBEDDING_BATCH_SIZE = 256


def _tfidf_similarities(query, documents):
    """Return the TF-IDF cosine similarity of ``query`` to each document."""
    processed_docs = [preprocess_text(doc) for doc in documents]
    processed_query = preprocess_text(query)

    # The vectorizer is fitted on the documents only, so the query is
    # projected onto the corpus vocabulary.
    vectorizer = TfidfVectorizer()
    doc_matrix = vectorizer.fit_transform(processed_docs)
    query_vector = vectorizer.transform([processed_query])

    return cosine_similarity(query_vector, doc_matrix)[0]


def _embed_texts(client, texts):
    """Embed ``texts`` in order, batching them into as few requests as possible."""
    vectors = []
    for start in range(0, len(texts), _EMBEDDING_BATCH_SIZE):
        response = client.embeddings.create(
            model=_EMBEDDING_MODEL,
            input=texts[start:start + _EMBEDDING_BATCH_SIZE],
        )
        # Every returned item carries the position of its input text; order
        # by it rather than relying on the order of the response list.
        ordered = sorted(response.data, key=lambda item: item.index)
        vectors.extend(item.embedding for item in ordered)
    return vectors


def _embedding_similarities(query, documents):
    """Return the embedding cosine similarity of ``query`` to each document."""
    client = OpenAI(api_key=OPENAI_API_KEY)

    # Embed the documents and the query together (query last) instead of
    # issuing one request per text.
    vectors = np.array(_embed_texts(client, [*documents, query]))

    # One matrix call: last row is the query, the rest are the documents.
    return cosine_similarity(vectors[-1:], vectors[:-1])[0]


def search(query, type="tf-idf", top_n=5):
    """Rank the entries of ``data_transformed`` by similarity to ``query``.

    Args:
        query: Free-text search string.
        type: Scoring method, ``"tf-idf"`` (preprocessed text, TF-IDF
            vectors) or ``"embedding"`` (OpenAI embeddings). The name shadows
            the builtin but is kept because callers pass it by keyword.
        top_n: Maximum number of results to return.

    Returns:
        A list of ``(document, similarity)`` tuples, best match first, with
        at most ``top_n`` entries. Empty when ``top_n`` is not positive or
        the corpus is empty.

    Raises:
        ValueError: If ``type`` is neither ``"tf-idf"`` nor ``"embedding"``.
    """
    if type not in ("tf-idf", "embedding"):
        raise ValueError("Type must be either 'tf-idf' or 'embedding'")

    # Guard the degenerate cases up front: a slice of [-0:] would otherwise
    # return the whole corpus, and there is nothing to score in an empty one.
    if top_n <= 0 or len(data_transformed) == 0:
        return []

    if type == "tf-idf":
        similarities = _tfidf_similarities(query, data_transformed)
    else:
        similarities = _embedding_similarities(query, data_transformed)

    # Get top N matches: ascending argsort, reversed, then truncated.
    top_indices = np.argsort(similarities)[::-1][:top_n]

    return [(data_transformed[i], similarities[i]) for i in top_indices]


In [28]:
# Lexical baseline: score the corpus against a laptop query with TF-IDF and
# show the ten best matches.
search("best laptop for office?", type="tf-idf", top_n=10)

[('Question: Where can I find office chairs? Answer: Check out our Furniture section for ergonomic office chairs.',
  0.3114371464281391),
 ('Question: Where can I buy a laptop for work? Answer: You can browse our collection of business laptops under the Electronics section.',
  0.24160377097159058),
 ('Question: Best smartphones under $500? Answer: Check out our affordable smartphones in the Mobile Phones section, filtered by price.',
  0.19275813018380047),
 ('Question: Looking for ergonomic chairs for work. Answer: We have a collection of ergonomic office chairs in the Furniture section.',
  0.14676478357586123),
 ('Question: I need a computer for office use. Any suggestions? Answer: Check out our laptops designed for productivity in the Electronics section.',
  0.1450312499105394),
 ("Question: Do you sell children's winter jackets? Answer: Yes, check out our Kids' Clothing section for warm and stylish winter jackets.",
  0.0),
 ('Question: Are there any discounts on shoes? Answer:

In [29]:
# Same laptop query, scored with OpenAI embeddings instead of TF-IDF
# (requires a valid OPENAI_API_KEY).
search("best laptop for office?", type="embedding", top_n=10)

[('Question: I need a computer for office use. Any suggestions? Answer: Check out our laptops designed for productivity in the Electronics section.',
  0.8719268259522779),
 ('Question: Where can I buy a laptop for work? Answer: You can browse our collection of business laptops under the Electronics section.',
  0.8394006109677794),
 ('Question: Where can I find office chairs? Answer: Check out our Furniture section for ergonomic office chairs.',
  0.7934396587997761),
 ('Question: Looking for ergonomic chairs for work. Answer: We have a collection of ergonomic office chairs in the Furniture section.',
  0.7891419412694121),
 ('Question: Do you sell desks for home offices? Answer: Yes, we have a collection of study desks in our Furniture section.',
  0.7805182438602386),
 ('Question: Need a coffee machine for my kitchen. Answer: Explore our Kitchen Appliances section for coffee machines and more.',
  0.7708216085042308),
 ('Question: Where can I find study desks? Answer: Browse our Fur

In [30]:
# Second test query, scored with TF-IDF; show the ten best matches.
search("gift for kids", type="tf-idf", top_n=10)

[("Question: Where can I find winter coats for kids? Answer: Browse our Kids' Clothing section for a variety of winter coats.",
  0.5007484606528442),
 ("Question: Do you sell children's winter jackets? Answer: Yes, check out our Kids' Clothing section for warm and stylish winter jackets.",
  0.2631241074642894),
 ('Question: Do you sell desks for home offices? Answer: Yes, we have a collection of study desks in our Furniture section.',
  0.0),
 ('Question: Where can I find study desks? Answer: Browse our Furniture section for study desks and tables.',
  0.0),
 ('Question: Looking for deals on sneakers. Answer: Visit our Footwear section for discounts on sneakers and other shoes.',
  0.0),
 ('Question: Are there any discounts on shoes? Answer: Check out our Footwear section for the latest discounts and deals.',
  0.0),
 ('Question: Do you have free delivery options? Answer: Yes, free delivery is available for orders exceeding $50.',
  0.0),
 ('Question: Can I get free shipping on order

In [31]:
# Second test query, scored with OpenAI embeddings
# (requires a valid OPENAI_API_KEY).
search("gift for kids", type="embedding", top_n=10)

[("Question: Do you sell children's winter jackets? Answer: Yes, check out our Kids' Clothing section for warm and stylish winter jackets.",
  0.7915693864079231),
 ("Question: Where can I find winter coats for kids? Answer: Browse our Kids' Clothing section for a variety of winter coats.",
  0.7825186226710089),
 ('Question: Looking for dog food and toys. Answer: Browse our Pet Supplies section for a variety of dog food and toys.',
  0.7757764812023916),
 ('Question: Do you sell pet supplies? Answer: Yes, check out our Pet Supplies section for food, toys, and accessories.',
  0.7655160660367665),
 ('Question: Looking for headphones for gaming. Answer: We offer high-quality gaming headsets in the Accessories category.',
  0.7537807968539718),
 ('Question: Do you sell spatulas and kitchen tools? Answer: Yes, our Kitchen section has a wide range of cooking tools and utensils.',
  0.7470222362528278),
 ('Question: Looking for backpacks for long trips. Answer: Check out our collection of t