In [None]:
# This is going to be the exmerimentation file for creating categories from the initial data pull

# I was going off of this website for inspiration https://towardsdev.com/mastering-data-clustering-with-embedding-models-87a228d67405

# Step 0: Clean and tokenize the data, doing lowered depunctuated title + category ids separated by a single whitespace
# Step 1: Make semantic text embeddings from the item headings probably categories,
# Step 2: Use these embeddings to do some kind of clustering algorithm and have these be properly clustered
# Step 3: Have a llm go through these clusters of data and create meaningful category suggestions by feeding in each of 
# the clustered data through and suggest a category

!pip install nltk
!pip install transformers
!pip install sentence-transformers
!pip install --upgrade torch
!pip install seaborn
!pip install torch[cuda]

In [3]:
# Importing all libraries
from tqdm import tqdm
import seaborn as sns
import os
import sqlite3
import re
from sentence_transformers import SentenceTransformer
import transformers
import ast
import sklearn
import numpy as np
import torch
import nltk
nltk.download("stopwords")


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mama\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Have to loginto huggingface to use the model
from huggingface_hub import login
import json
with open("huggingface_credentials.json", "r") as f:
    t = json.load(f)["token"]
login(token=t)

In [3]:
torch.cuda.is_available()

True

In [6]:
# Testing using a big model on cuda
tokenizer = transformers.AutoTokenizer.from_pretrained("openai/gpt-oss-20b")

model = transformers.AutoModelForCausalLM.from_pretrained("openai/gpt-oss-20b",
                                                           device_map="auto",
                                                           offload_folder="offload",
                                                           torch_dtype=torch.float16,)

llm_model = transformers.pipeline("text-generation", model=model, tokenizer=tokenizer)                    


# Test
print(llm_model("What is the capital of France?", max_new_tokens=10))

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards:   0%|          | 0/3 [00:32<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.98 GiB. GPU 0 has a total capacity of 11.99 GiB of which 0 bytes is free. Of the allocated memory 12.28 GiB is allocated by PyTorch, and 2.43 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [3]:
# Testing with db libarary

conn = sqlite3.connect("./databases/Playstation 5.db")
sql = "SELECT name FROM sqlite_master WHERE type='table'"
print(conn.execute(sql).fetchall())

[('Playstation 5',)]


In [4]:
# Step 0: Pull all of the data that is necessaary out and preprocess it into tokens for the embedding process

db_paths = []
db_path_str = "./databases"
for db in os.listdir(db_path_str):
    db_paths.append(os.path.join(db_path_str, db))

db_connections = [sqlite3.connect(conn) for conn in db_paths]
data = {} # Data in a dictnioary with {table : all items}
items = [] # All of the tables in each of the databases to look at, used as key to the data dict
for db_connection in db_connections:
    tables_sql = "SELECT name FROM sqlite_master WHERE type='table'"

    tables = db_connection.execute(tables_sql).fetchall()
    main_table = tables[0][0]
    print(main_table)
    items.append(main_table)

    select_all_sql = f"SELECT title, categories FROM '{main_table}'"
    item_data = db_connection.execute(select_all_sql).fetchall()

    for item in item_data:
        data[main_table] = item_data


Nintendo Switch 2
Playstation 5
Steam Deck
Xbox Series X


In [5]:
# Formatting and Cleaning the data


formatted_item_list = []
# Saving all of the unncecessary stopwords
stop_words = set(nltk.corpus.stopwords.words("english"))
for item in data["Playstation 5"]:
    # extracting the item category ids
    category_ids = []
    title = item[0]

    for category in ast.literal_eval(item[1]):
        category_ids.append(category["categoryId"])
        
    # Putting together the category ids and title
    formatted_item = " ".join([title, " ".join(category_ids)])

    # Joining together the lowered string 
    formatted_item = formatted_item.lower()

    # Removing all punctuation
    formatted_item = re.sub(r'[^\w\s]', '', formatted_item)

    # Sometimes the thing removes something that looks like this and leaves 2 spaces instead of 1
    # Going to make it to where there are no '' in the tokens
    formatted_item = " ".join([x for x in formatted_item.split(" ") if x != ''])

    # Removing all stop words and joining to final string
    formatted_item_list.append(" ".join([x for x in formatted_item.split(" ") if x not in stop_words]))
    print(formatted_item)

alphadia genesis playstation 5 ps5 brand new factory sealed 139973 1249
ghostrunner playstation 5 ps5 brand new factory sealed 139973 1249
sony playstation 5 slim disc ps5 video game console with extra controller 139971 1249
spirit of the north enhanced edition playstation 5 new and sealed 139973 1249
sony playstation 5 pulse elite wireless gaming headset white 171821 1249 54968
sony playstation 5 disc version with extra controller and charging dock bundle 139971 1249
sony playstation 5 dualsense wireless controller chroma teal 117042 1249 54968
ark survival ascendedasa1605 max meleecarchar colors pc xbox ps5 pvedino 139973 1249
unopened ps5 welcome to paradize sony playstation 5 3goo sealed jp wtracking 139973 1249
ark survival ascended griffin solid colors pve ps5xboxpc 139973 1249
call of duty black ops 6 crossgen bundle playstation 4 and playstation 5 139973 1249
madden nfl 22 sony playstation 5 139973 1249
godfall sony playstation 5 2020 new free shipping 139973 1249
playstation 5

In [None]:
# Tokenizing the playstation data
# Aparently the tokenizer is not necessary for this library
# the embedding model will take in raw text and do the tokenization itself
# Length of the numpy array embeddings for this model is 768

# Might have to make an embedding map so I can get the original text back out of the
# embedding model encoded output 

# Note: that embedded text is going to always be saved as a tuple into the embedding_map dictionary
# since the numpy array is a unhashable data type

model = SentenceTransformer("google/embeddinggemma-300m")
embedding_map = {}
embeddings = []
for item in formatted_item_list:
    
    encoded_text = model.encode_query(item)

    embedding_map[tuple(encoded_text)] = item
    embeddings.append(encoded_text)


NameError: name 'SentenceTransformer' is not defined

In [7]:
# Need to reformat the embeddings into a numpy array of shape (n_samples, n_features)
embeddings = np.stack(embeddings)

In [8]:
embeddings.shape

(10649, 768)

In [9]:
# Going to do the kmeans clustering on the data now
# Going to try out the elbow method and see how many clusters there are
# Will do a different method in the future though

kmeans_clusters = {} # going to be set up in {number_of_clusters : value}
for num_clusters in range(1, 100):
    cluster_model = sklearn.cluster.KMeans(n_clusters=num_clusters, random_state=0, n_init="auto")
    cluster = cluster_model.fit(embeddings)

    kmeans_clusters[num_clusters] = cluster
    print(cluster.inertia_)


4880.94921875
4250.96923828125
3970.162841796875
3817.803466796875
3723.3779296875
3612.739501953125
3541.2783203125
3490.174560546875
3430.664794921875
3364.02978515625
3320.71826171875
3293.3779296875
3267.366455078125
3202.466796875
3171.9296875
3145.52978515625
3122.16015625
3105.983154296875
3087.8525390625
3075.21044921875
3046.4677734375
3034.700439453125
3009.419921875
2988.18994140625
2975.66552734375
2963.989013671875
2950.552734375
2930.52587890625
2916.06591796875
2908.322265625
2905.3408203125
2891.251953125
2883.70166015625
2870.848876953125
2863.798828125
2844.919677734375
2838.24462890625
2828.5078125
2815.42822265625
2806.972412109375
2802.509765625
2798.161865234375
2779.686279296875
2766.876953125
2759.6748046875
2751.82080078125
2744.5107421875
2725.751953125
2720.733642578125
2713.248291015625
2710.69091796875
2697.171875
2690.914306640625
2685.2939453125
2673.335205078125
2667.059326171875
2654.16650390625
2648.3291015625
2645.282958984375
2636.346923828125
2629.6

In [10]:
kmeans_clusters[2].n_clusters

2

In [11]:
# Going to test out the clustering of 3 categories on the embeddings and see what the output gives us
num_of_clusters = 3
classification_lists = [[] for x in range(num_of_clusters)]
test_model = kmeans_clusters[num_of_clusters]

cluster_numbers_in_existance = set()
for embedding in tqdm(embeddings):
    prediction = test_model.predict(np.stack([embedding]))
    cluster_numbers_in_existance.add(prediction[0])

    # Adding the embedding into the classification list
    classification_lists[int(prediction[0])].append(embedding)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10649/10649 [00:01<00:00, 7675.43it/s]


In [None]:
for l in classification_lists:
    print(len(l))

6541
2212
1896


In [84]:
# Now going to go through and use the embedding model to get the original sentence out and convert this to a 3 column pandas dataframe so I can see a head
# Checking to see how the clustering algorithm is actually clustering things
cleaned_classification_lists = []
for l in classification_lists:
    cleaned_classification_lists.append([])
    for encoded_title in l:
        # Add the decoded title into the corresponding classification list
        cleaned_classification_lists[-1].append(embedding_map[tuple(encoded_title)])


In [90]:
cleaned_classification_lists[0]

['alphadia genesis playstation 5 ps5 brand new factory sealed 139973 1249',
 'ghostrunner playstation 5 ps5 brand new factory sealed 139973 1249',
 'sony playstation 5 slim disc ps5 video game console extra controller 139971 1249',
 'spirit north enhanced edition playstation 5 new sealed 139973 1249',
 'sony playstation 5 pulse elite wireless gaming headset white 171821 1249 54968',
 'sony playstation 5 disc version extra controller charging dock bundle 139971 1249',
 'sony playstation 5 dualsense wireless controller chroma teal 117042 1249 54968',
 'unopened ps5 welcome paradize sony playstation 5 3goo sealed jp wtracking 139973 1249',
 'call duty black ops 6 crossgen bundle playstation 4 playstation 5 139973 1249',
 'madden nfl 22 sony playstation 5 139973 1249',
 'godfall sony playstation 5 2020 new free shipping 139973 1249',
 'playstation 5 ps5 video games software sealed new furyu renatis 139973 1249',
 'american hero limited run games playstation 5 brand new 139973 1249',
 'ps5 

In [4]:
# Now feed these lists through a llm on hugging face to see what category all of these entries best fits into.
# Interesting idea also:
# Maybe prime the llm if it can be filtered down better also to gague if it should be filtered down anymore

llm_model = transformers.pipeline("text-generation", model="openai/gpt-oss-20b")

Using MXFP4 quantized models requires a GPU, we will default to dequantizing the model to bf16
Loading checkpoint shards:  67%|██████▋   | 2/3 [00:54<00:27, 27.73s/it]

: 