To run this notebook, you will need to install: pandas, openai, transformers, plotly, matplotlib, scikit-learn, torch (transformer dep), torchvision, and scipy.

In [8]:
# imports
import pandas as pd
import tiktoken
import openai
import time
from openai.embeddings_utils import get_embedding
from tenacity import retry, wait_random_exponential, stop_after_attempt

openai.api_key = "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"

In [9]:
# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

In [11]:
# load & inspect dataset
input_datapath = "updated_file.csv" # Load the dataset
df = pd.read_csv(input_datapath, index_col=0)
df = df[["Clothing ID", "Age", "Title", "Review Text", "Rating", "Recommended IND", "Positive Feedback Count", "Division Name", "Department Name", "Class Name", "Product Name"]]
df = df.dropna()
df["combined"] = ("Title: " + df["Title"] +". Age: " + df["Age"].astype(str) +". Review: " + df["Review Text"] + ". Rating: " + df["Rating"].astype(str) + ". Recommended: " + df["Recommended IND"].astype(str) + ". Positive Feedback Count: " + df["Positive Feedback Count"].astype(str) + ". Division Name: " + df["Division Name"] + ". Department Name: " + df["Department Name"] + ". Class Name: " + df["Class Name"] + ". Product Name: " + df["Product Name"])
df.head(2)

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Product Name,combined
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,Maxi Dress,Title: Some major design flaws. Age: 60. Revie...
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,Slacks,Title: My favorite buy!. Age: 50. Review: I lo...


In [12]:
df = df.sort_values("Rating") # sort by rating

encoding = tiktoken.get_encoding(embedding_encoding)

#omit reviews that are too long
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens]
len(df)

19662

In [13]:
# Retry up to 6 times with exponential backoff, starting at 1 second and maxing out at 20 seconds delay
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text: str, model="text-embedding-ada-002") -> list[float]:
    return openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"]


In [14]:

df["embedding"] = None
for i, row in df.iterrows():
    print(f"Processing row {i}...")
    try:
        embedding = get_embedding(row["combined"])
        df.at[i, "embedding"] = embedding
    except Exception as e:
        print(f"Failed to get embedding for row {i}. Error: {e}. Skipping.")

print("Embeddings obtained")

# save embeddings
df.to_csv("embeddings.csv")
print("Done!")


Processing row 22467...
Processing row 15609...
Processing row 10186...
Processing row 8605...
Processing row 5186...
Processing row 20681...
Processing row 19714...
Processing row 17884...
Processing row 8627...
Processing row 5199...
Processing row 14201...
Processing row 2706...
Processing row 18718...
Processing row 22065...
Processing row 12623...
Processing row 17899...
Processing row 10153...
Processing row 22982...
Processing row 14191...
Processing row 18691...
Processing row 22978...
Processing row 2684...
Processing row 2681...
Processing row 22975...
Processing row 10147...
Processing row 20673...
Processing row 19705...
Processing row 5176...
Processing row 2740...
Processing row 18740...
Processing row 2797...
Processing row 5114...
Processing row 6916...
Processing row 6911...
Processing row 889...
Processing row 10246...
Processing row 5121...
Processing row 15593...
Processing row 901...
Processing row 19685...
Processing row 2774...
Processing row 10234...
Processing 