In [1]:
import sys
import os
from openai import OpenAI

# config.py is expected one directory above the notebook's working directory,
# so put that parent directory on the import search path.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parent_dir)

# With the parent directory importable, pull the API key from config
from config import api_key

# OpenAI client authenticated with the key loaded above
client = OpenAI(api_key=api_key)

In [2]:
# Import required libraries
import os
import openai
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import chromadb
from scipy.spatial import distance
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

# Keep the key imported from config under the name the later cells use
openai_api_key = api_key

# Embedding model for the requests below ("text-embedding-ada-002" was the earlier choice)
EMBEDDING_MODEL = "text-embedding-3-small"

# Read the review CSV, draw a seeded 1000-row sample, then discard every
# sampled row that has a missing value in any column
reviews = (
    pd.read_csv("Womens Clothing E-Commerce Reviews.csv")
    .sample(n=1000, random_state=54)
    .dropna()
)
print(reviews.shape)
review_texts = reviews["Review Text"]

# Send all review texts in a single embeddings request and keep only the vectors
client = openai.OpenAI(api_key=openai_api_key)
responses = client.embeddings.create(
    model=EMBEDDING_MODEL,
    input=review_texts.tolist(),
).model_dump()
embeddings = [item["embedding"] for item in responses["data"]]

(831, 11)


In [None]:
# Reduce the embedding vectors to two dimensions with t-SNE
def apply_tsne(embeddings):
    """Fit a 2-component t-SNE (random_state=0) on ``embeddings``.

    Returns the result of ``TSNE.fit_transform`` for the given input.
    """
    reducer = TSNE(n_components=2, random_state=0)
    projected = reducer.fit_transform(embeddings)
    return projected

# t-SNE is given the review embeddings as a NumPy array
embeddings_2d = apply_tsne(np.array(embeddings))

# Plotting the results of t-SNE
def plot_tsne(tsne_results):
    """Draw a labelled scatter plot of 2-D t-SNE coordinates.

    Args:
        tsne_results: Array or sequence of points; the first two values of
            each point are used as the x and y coordinates.

    Every point is annotated with its position in ``tsne_results``. All
    points are drawn by a single scatter call, so they share one colour.
    An empty input produces an empty, titled figure instead of an error.
    """
    points = np.asarray(tsne_results)
    _, ax = plt.subplots(figsize=(12, 8))

    if points.size:
        # One vectorized call instead of creating a separate artist per point
        ax.scatter(points[:, 0], points[:, 1], alpha=0.5)
        # The text labels still have to be placed one by one
        for i, (x, y) in enumerate(points[:, :2]):
            ax.text(x, y, str(i), fontsize=8, verticalalignment='center')

    ax.set_title("t-SNE Visualization of Review Embeddings")
    ax.set_xlabel("t-SNE feature 1")
    ax.set_ylabel("t-SNE feature 2")
    plt.show()

plot_tsne(embeddings_2d)

In [None]:
# Candidate topics a review can be assigned to
categories = ["Quality", "Fit", "Style", "Comfort"]

# Embed all category names with a single request
category_responses = client.embeddings.create(
    model=EMBEDDING_MODEL,
    input=categories,
).model_dump()

# Keep only the vectors; the categorization code below indexes `categories`
# by position in this list
category_embeddings = [item["embedding"] for item in category_responses["data"]]
print(len(category_embeddings))

# Function to categorize feedback
def categorize_feedback(text_embedding, category_embeddings, category_names=None):
    """Return the name of the category whose embedding is closest to the text.

    Closeness is measured with cosine distance; the category with the
    smallest distance wins, and ties go to the earliest category.

    Args:
        text_embedding: Embedding vector of the text to classify.
        category_embeddings: Iterable of category embedding vectors, in the
            same order as the category names.
        category_names: Names matching ``category_embeddings`` by position.
            Defaults to the notebook-level ``categories`` list.

    Returns:
        The name of the closest category.

    Raises:
        ValueError: If ``category_embeddings`` is empty.
    """
    labels = categories if category_names is None else category_names

    distances = [distance.cosine(text_embedding, cat_emb)
                 for cat_emb in category_embeddings]
    if not distances:
        raise ValueError("category_embeddings must contain at least one embedding")

    # Select by distance; selecting by index would always return the first category
    closest_index = min(range(len(distances)), key=distances.__getitem__)
    return labels[closest_index]

# Label each review embedding with its category, in the same order as `embeddings`
feedback_categories = [
    categorize_feedback(review_embedding, category_embeddings)
    for review_embedding in embeddings
]

In [12]:
# Persistent Chroma client with default settings.
# Note: this rebinds `client`, which earlier cells used for the OpenAI client.
client = chromadb.PersistentClient()

# Collection that embeds documents through OpenAI; an existing collection of
# the same name is reused rather than recreated.
# NOTE(review): no model_name is passed here, so the embedding function's own
# default model is used rather than EMBEDDING_MODEL; confirm this is intended.
review_embeddings_db = client.create_collection(
    name="review_embeddings",
    embedding_function=OpenAIEmbeddingFunction(api_key=openai_api_key),
    get_or_create=True,
)

# Add the review texts as documents, keyed by their position as a string
review_embeddings_db.add(
    ids=list(map(str, range(len(review_texts)))),
    documents=review_texts.tolist(),
)

# Function for similarity search using vector db query function
def find_similar_reviews(input_text, vector_db, n=3):
    """Query a vector collection for the entries most similar to a text.

    Args:
        input_text: Text to search with; it is sent as a single query text.
        vector_db: Collection to query. Any object exposing
            ``query(query_texts=..., n_results=...)`` works. If ``None``, the
            "review_embeddings" collection is looked up on the notebook-level
            Chroma ``client`` instead.
        n: Number of results to request (default 3).

    Returns:
        Whatever the collection's ``query`` call returns, unmodified.
    """
    collection = vector_db
    if collection is None:
        # Fallback: keep the hard-coded lookup for callers that pass no collection
        collection = client.get_collection(
            name="review_embeddings",
            embedding_function=OpenAIEmbeddingFunction(api_key=openai_api_key))

    # Query the collection the caller supplied rather than a global one
    return collection.query(
        query_texts=[input_text],
        n_results=n
    )

# Look up the three stored reviews closest to a sample review and print their texts
example_review = "Absolutely wonderful - silky and sexy and comfortable"
example_query_results = find_similar_reviews(example_review, review_embeddings_db, 3)
# "documents" holds one list per query text; only one query text was sent
most_similar_reviews = example_query_results["documents"][0]
print(most_similar_reviews)

['So soft!! such beautiful fabric!! it flows so gracefully and looks lovely with black leggings as pictured. one of my favorite retailer purchases to date...', "Wasn't sure what to expect - but got this on sale, and glad i did.\nit is much prettier in person. soft and silky.\ngreat fit also - and versatile (mix and match)", 'Absolutely stunning dress. true to size and flows like a dream.']


In [4]:
# Fetch the stored collection by name and report how many records it holds
collection = client.get_collection(name="review_embeddings")
num_rows = collection.count()
print("Number of rows: {}".format(num_rows))

Number of rows: 831


In [5]:
# Print the name of every collection the Chroma client knows about
collections = client.list_collections()
for existing_collection in collections:
    print(existing_collection.name)

review_embeddings


In [11]:
# Preview up to two stored records (ids, embeddings, documents, metadata)
collection.peek(limit=2)

{'ids': ['0', '1'],
 'embeddings': array([[-0.03957852,  0.00538816, -0.01154047, ...,  0.00674664,
         -0.02011582, -0.03490224],
        [-0.02526411, -0.02035497, -0.00101609, ..., -0.0007762 ,
          0.00932603, -0.02535723]], shape=(2, 1536)),
 'documents': ["I am a 32dd, 5'4'' and 125lbs and this top was way too baggy. it looked awful. this is shapeless and oversized. i would size down or just pass.",
  "I've worn this shirt approx 3 times to work. each time someone comments on how much they like it. i'm attempting to add more color to my workwear. this shirt tends to swing so much i feel exposed. that is the biggest drawback."],
 'uris': None,
 'included': ['embeddings', 'metadatas', 'documents'],
 'data': None,
 'metadatas': [None, None]}

In [26]:
# Show the first row of the sampled reviews frame to see which columns are available
reviews.head(1)

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
10898,10898,860,26,Not if you're busty,"I am a 32dd, 5'4'' and 125lbs and this top was...",2,0,0,General,Tops,Knits


In [44]:
# Ids are row positions as strings, the same scheme used when the documents were added
ids = [str(position) for position in range(len(reviews))]

# One metadata dict per review row, in row order, so it lines up with `ids`
metadata = [
    {
        "rating": row["Rating"],
        "title": row["Title"],
        "class_name": row["Class Name"],
    }
    for _, row in reviews.iterrows()
]

In [46]:
# Attach the metadata dicts to the records with matching ids; only ids and metadatas are passed
review_embeddings_db.update(ids=ids, metadatas=metadata)

In [49]:
# Preview two records again to check that the metadata is now populated
review_embeddings_db.peek(limit=2)

{'ids': ['0', '1'],
 'embeddings': array([[-0.03957852,  0.00538816, -0.01154047, ...,  0.00674664,
         -0.02011582, -0.03490224],
        [-0.02526411, -0.02035497, -0.00101609, ..., -0.0007762 ,
          0.00932603, -0.02535723]], shape=(2, 1536)),
 'documents': ["I am a 32dd, 5'4'' and 125lbs and this top was way too baggy. it looked awful. this is shapeless and oversized. i would size down or just pass.",
  "I've worn this shirt approx 3 times to work. each time someone comments on how much they like it. i'm attempting to add more color to my workwear. this shirt tends to swing so much i feel exposed. that is the biggest drawback."],
 'uris': None,
 'included': ['embeddings', 'metadatas', 'documents'],
 'data': None,
 'metadatas': [{'class_name': 'Knits',
   'rating': 2,
   'title': "Not if you're busty"},
  {'title': 'Always receive a complement when i wear this',
   'rating': 4,
   'class_name': 'Knits'}]}