## Create a Vector Database from the Amazon Product Dataset (pickle files)

In [48]:
import os
import re
import math 
import json 
from tqdm import tqdm
import random 
from dotenv import load_dotenv
from huggingface_hub import login 
import numpy as np 
import pickle 
from sentence_transformers import SentenceTransformer
from datasets import load_dataset 
import chromadb 
from sklearn.manifold import TSNE
import plotly.graph_objects as go 

### Internal Classes
from items import Item

In [49]:
### Environment

# load_dotenv(override=True) copies the entries of the .env file into os.environ,
# replacing values that were already set in the process.
load_dotenv(override=True)

# Fail early with a readable message when a credential is absent. Assigning
# os.getenv(...) straight back into os.environ raises an opaque
# "TypeError: str expected, not NoneType" when the variable is missing, and is a
# no-op when it is present.
missing_keys = [key for key in ("OPENAI_API_KEY", "HF_TOKEN") if os.getenv(key) is None]
if missing_keys:
    raise EnvironmentError(
        f"Missing environment variable(s): {', '.join(missing_keys)}. "
        "Define them in the .env file or in the shell before running this notebook."
    )

# Path handed to chromadb.PersistentClient for the on-disk vector store.
DB = "products_vectorstore"

In [50]:
# Authenticate against the Hugging Face Hub with the token from the environment.
# Indexing os.environ (rather than .get) raises KeyError if HF_TOKEN is unset.
hf_token = os.environ["HF_TOKEN"]
login(
    token=hf_token,
    add_to_git_credential=True,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


## Load the (previously cleaned and formatted) Amazon product dataset from pickle (.pkl) files

Resource: https://drive.google.com/drive/folders/1f_IZGybvs9o0J5sb3xmtTEQB3BXllzrW?usp=drive_link 

In [51]:
# Load the pre-processed training items from disk.
# SECURITY: pickle.load can execute arbitrary code embedded in the file. Only
# unpickle train.pkl if it came from a source you trust (it is downloaded from
# the shared drive folder linked above).
# NOTE(review): presumably a sequence of `Item` objects (see `from items import Item`);
# the class must be importable for unpickling to work - confirm.
with open("train.pkl", "rb") as f: 
    train = pickle.load(f)

In [52]:
# Sanity check: show the prompt text of the first training item.
# print() is used (not a bare expression) so the newlines render as line breaks.
print(train[0].prompt)

How much does this cost to the nearest dollar?

Main_Category: Toys & Games

CoComelon Official Musical Tractor w/Sounds & Exclusive 3-inch Farm JJ Toy, Play a Clip of “Old Macdonald” Song Plus More Sounds and Phrases
E-I-E-I WOAH! Get ready for a melodious ride through the farm with the CoComelon Musical Tractor! This brightly colored tractor comes with an exclusive 3-inch figure of JJ, a popular character from the educational CoComelon YouTube series! This musical CoComelon toy can seat any of your 3-inch CoComelon figures. Press on the grill of the tractor and listen as a clip of the “Old MacDonald” song plays for you and your child to sing along to! This exciting musical childrens toy also plays animal and tractor sounds, and even the CoComelon intro

Price is $13.00


In [53]:
# Open the on-disk Chroma store located at the path held in DB.
client = chromadb.PersistentClient(path=DB)

In [54]:
### Optional reset: uncomment the lines below to delete the collection if it exists and recreate it empty
# collection_name = "product"

# existing_collection_names = client.list_collections()

# if any(c.name == collection_name for c in existing_collection_names): 
#     client.delete_collection(collection_name)
#     print(f"Deleted existing collection: {collection_name}")

# collection = client.create_collection(collection_name)

Deleted existing collection: product


In [62]:
### Reuse the collection if it already exists; otherwise create it.
collection_name = "product"

# Depending on the chromadb release, list_collections() yields either Collection
# objects (with a .name attribute) or plain name strings. Normalise both shapes
# to names so the membership check works across versions.
existing_collection_names = [
    getattr(entry, "name", entry) for entry in client.list_collections()
]

if collection_name in existing_collection_names:
    print("Collection already exists. Using it.")
    collection = client.get_collection(collection_name)
else:
    print("Creating new collection.")
    collection = client.create_collection(collection_name)

Collection already exists. Using it.


## SentenceTransformer

In [60]:
# Load the sentence-embedding model used to vectorise text throughout this notebook.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [None]:
### Test: embed a short string and display the resulting vector.

vector = embedding_model.encode("hi there!")

# Bare last expression so the notebook renders the embedding as the cell output.
vector

In [35]:
### Quick sidebar - a good mental model of how SentenceTransformer works

import numpy as np
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as the dot product of the two vectors divided by the product of
    their Euclidean norms.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    dot_product = np.dot(a, b)
    return dot_product / norm_product

def how_similar(text1, text2):
    """Print the cosine similarity of two texts as a percentage.

    Both texts are embedded in a single ``embedding_model.encode`` call and the
    resulting pair of vectors is compared with ``cosine_similarity``.

    Args:
        text1: First text to embed.
        text2: Second text to embed.

    Returns:
        None. The result is printed.
    """
    # Use the notebook's `embedding_model`. The earlier reference to `model` is
    # never assigned in the cells above, so it raised NameError on a fresh kernel.
    vector1, vector2 = embedding_model.encode([text1, text2])
    similarity = cosine_similarity(vector1, vector2)
    print(f"Similarity between {text1} and {text2} is {similarity*100:.1f}%")

In [36]:
# Compare the ambiguous word "Java" (language vs. coffee) against a programming
# language and a mug, then add context ("Cup of Java") to see how the score moves.
how_similar("Java", "C++")
how_similar("Java", "mug")
how_similar("Cup of Java", "mug")

Similarity between Java and C++ is 50.7%
Similarity between Java and mug is 25.8%
Similarity between Cup of Java and mug is 49.3%


In [57]:
def description(item):
    """Return the product text of ``item.prompt`` without question or price.

    Every occurrence of the leading question line is removed, then everything
    from the first "Price is $" marker onwards is dropped. If the marker is
    absent, the question-stripped text is returned whole.
    """
    question = "How much does this cost to the nearest dollar?\n\n"
    price_marker = "\n\nPrice is $"
    stripped = item.prompt.replace(question, "")
    before_price, _, _ = stripped.partition(price_marker)
    return before_price

## Populate RAG Datastore 
With 150,000 items in Chroma.

In [63]:
NUMBER_OF_DOCS = len(train)
BATCH_SIZE = 1000  # number of items embedded and written to Chroma per iteration

# Embed and store the training items batch by batch, so one encode() call and one
# Chroma write handle BATCH_SIZE items at a time.
for i in tqdm(range(0, NUMBER_OF_DOCS, BATCH_SIZE)):
    batch = train[i:i + BATCH_SIZE]  # slice once and reuse for documents and metadata
    documents = [description(item) for item in batch]
    # Convert the encoder output to plain Python floats in nested lists for Chroma.
    vectors = embedding_model.encode(documents).astype(float).tolist()
    metadatas = [{"category": item.category, "price": item.price} for item in batch]
    # Ids follow each item's position in `train`, so they are stable across re-runs.
    ids = [f"doc_{j}" for j in range(i, i + len(batch))]

    # upsert (rather than add) keeps the cell idempotent: when the existing
    # collection is reused, records with the same ids are refreshed instead of
    # colliding with what is already stored.
    collection.upsert(
        ids=ids,
        documents=documents,
        embeddings=vectors,
        metadatas=metadatas,
    )

100%|████████████████████████████████████████████████████████| 150/150 [17:11<00:00,  6.87s/it]
