## Create a Vectordb from Amazon Product Dataset (pickle files)

In [1]:
import os
from tqdm import tqdm
from dotenv import load_dotenv
from huggingface_hub import login
import pickle
from sentence_transformers import SentenceTransformer
import chromadb

### Internal Classes

  from .autonotebook import tqdm as notebook_tqdm
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 094e6a7a-5b81-43da-9b3f-b491b607c309)')' thrown while requesting HEAD https://huggingface.co/meta-llama/Llama-3.1-8B/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].


In [2]:
### Environment 

load_dotenv(override=True)
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")
DB = "products_vectorstore"

In [3]:
hf_token = os.environ["HF_TOKEN"]
login(hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


## Import the (previously cleaned and formatted) amazon product dataset (as pkl files)

Resource: https://drive.google.com/drive/folders/1f_IZGybvs9o0J5sb3xmtTEQB3BXllzrW?usp=drive_link 

In [4]:
with open("train.pkl", "rb") as f:
    train = pickle.load(f)

In [5]:
print(train[0].prompt)

How much does this cost to the nearest dollar?

Main_Category: Toys & Games

CoComelon Official Musical Tractor w/Sounds & Exclusive 3-inch Farm JJ Toy, Play a Clip of “Old Macdonald” Song Plus More Sounds and Phrases
E-I-E-I WOAH! Get ready for a melodious ride through the farm with the CoComelon Musical Tractor! This brightly colored tractor comes with an exclusive 3-inch figure of JJ, a popular character from the educational CoComelon YouTube series! This musical CoComelon toy can seat any of your 3-inch CoComelon figures. Press on the grill of the tractor and listen as a clip of the “Old MacDonald” song plays for you and your child to sing along to! This exciting musical childrens toy also plays animal and tractor sounds, and even the CoComelon intro

Price is $13.00


In [6]:
client = chromadb.PersistentClient(path=DB)

In [7]:
### Check if the collection exists and delete it if it does
# collection_name = "product"

# existing_collection_names = client.list_collections()

# if any(c.name == collection_name for c in existing_collection_names): 
#     client.delete_collection(collection_name)
#     print(f"Deleted existing collection: {collection_name}")

# collection = client.create_collection(collection_name)

In [8]:
### Use the existing collection, otherwise create it if none of it.
collection_name = "products"

existing_collection_names = client.list_collections()

if any(c.name == collection_name for c in existing_collection_names): 
    print("Collection already exists. Using it.")
    collection = client.get_collection(collection_name)
else: 
    print("Creating new collection.")
    collection = client.create_collection(collection_name)

Creating new collection.


## SentenceTransfomer

In [9]:
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [None]:
### Test:

vector = embedding_model.encode("hi there!")

vector

In [11]:
### Quick sidebar - Good mental model of how SSentenceTransformer works

import numpy as np
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def how_similar(text1, text2):
    vector1, vector2 = model.encode([text1, text2])
    similarity = cosine_similarity(vector1, vector2)
    print(f"Similarity between {text1} and {text2} is {similarity*100:.1f}%")

In [12]:
how_similar("Java", "C++")
how_similar("Java", "mug")
how_similar("Cup of Java", "mug")

NameError: name 'model' is not defined

In [13]:
def description(item): 
    text = item.prompt.replace("How much does this cost to the nearest dollar?\n\n", "")
    return text.split("\n\nPrice is $")[0]

## Populate RAG Datastore 
With 150,000 items in Chroma.

In [14]:
NUMBER_OF_DOCS = len(train)

for i in tqdm(range(0, NUMBER_OF_DOCS, 1000)):
    documents = [description(item) for item in train[i:i+1000]]
    vectors = embedding_model.encode(documents).astype(float).tolist()
    metadatas = [{"category": item.category, "price": item.price} for item in train[i: i+1000]]
    ids = [f"doc_{j}" for j in range(i, i+len(documents))]

    collection.add(
        ids=ids, 
        documents=documents, 
        embeddings=vectors, 
        metadatas=metadatas
    )

100%|████████████████████████████████████████████████████████████████| 150/150 [17:43<00:00,  7.09s/it]
