In [None]:
# This is the experimentation notebook for creating categories from the initial data pull

# I was going off of this website for inspiration https://towardsdev.com/mastering-data-clustering-with-embedding-models-87a228d67405

# Step 0: Clean and tokenize the data: lowercased, de-punctuated title + category ids, separated by single spaces
# Step 1: Make semantic text embeddings from the item headings (and probably the categories)
# Step 2: Run a clustering algorithm on these embeddings so that similar items end up grouped together
# Step 3: Have an LLM go through the clusters and suggest a meaningful category for each one by feeding it
# the items that belong to that cluster

!pip install nltk
!pip install transformers
!pip install sentence-transformers
!pip install --upgrade torch
!pip install seaborn

In [None]:
# Importing all libraries
import ast
import json
import os
import re
import sqlite3
from contextlib import closing

import nltk
import seaborn as sns
import sklearn
from huggingface_hub import login
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
# Fetch the NLTK "stopwords" corpus; the cleaning cell reads the English list from it
nltk.download("stopwords")


In [None]:
# Testing with the sqlite3 library: list the tables of a single database

sql = "SELECT name FROM sqlite_master WHERE type='table'"
# A sqlite3 connection used directly as a context manager only manages the transaction,
# it does not close the connection. closing() releases the file handle even if the
# query raises.
with closing(sqlite3.connect("./databases/Playstation 5.db")) as conn:
    print(conn.execute(sql).fetchall())

In [None]:
# Step 0: Pull the title and categories columns out of every database so they can be
# preprocessed into tokens for the embedding process

db_path_str = "./databases"

# Only pick up SQLite files. Pointing sqlite3.connect() at any other file in the folder
# (journals, OS metadata, ...) fails on the first query. Sorted so the load order is stable.
db_paths = sorted(
    os.path.join(db_path_str, db)
    for db in os.listdir(db_path_str)
    if db.lower().endswith((".db", ".sqlite", ".sqlite3"))
)

data = {}   # {table name: [(title, categories), ...]}
items = []  # Main table of each database, in load order; these are the keys of `data`

# Skip SQLite's internal bookkeeping tables (sqlite_sequence, ...) so that the first row
# is always a user table.
tables_sql = "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'"

for db_path in db_paths:
    # closing() guarantees the connection is released even when a query raises
    with closing(sqlite3.connect(db_path)) as db_connection:
        tables = db_connection.execute(tables_sql).fetchall()
        if not tables:
            print(f"Skipping {db_path}: no tables found")
            continue

        main_table = tables[0][0]

        # Identifiers cannot be bound as SQL parameters, so quote the table name as an
        # identifier (double quotes, with embedded quotes doubled) instead of as a string.
        quoted_table = '"' + main_table.replace('"', '""') + '"'
        select_all_sql = f"SELECT title, categories FROM {quoted_table}"
        item_data = db_connection.execute(select_all_sql).fetchall()

    print(main_table)
    items.append(main_table)
    # Assign once per table; an empty table still gets an (empty) entry
    data[main_table] = item_data


In [None]:
# Formatting and cleaning the data

# Matches every character that is neither a word character nor whitespace
PUNCTUATION_RE = re.compile(r'[^\w\s]')


def format_item(title, categories_literal, stop_words):
    """Turn one (title, categories) row into a cleaned, space-separated token string.

    Args:
        title: Item title; None is treated as an empty title.
        categories_literal: String holding a Python literal list of dicts, each with a
            "categoryId" key; None or an empty string is treated as no categories.
        stop_words: Set of tokens to drop. The entries must already be lowercased and
            de-punctuated, because they are compared against cleaned tokens.

    Returns:
        The lowercased, de-punctuated title followed by the category ids, with stop
        words removed and exactly one space between tokens.
    """
    categories = ast.literal_eval(categories_literal) if categories_literal else []
    # str() so that numeric ids do not break the join below
    category_ids = [str(category["categoryId"]) for category in categories]

    text = " ".join([title or "", *category_ids]).lower()
    text = PUNCTUATION_RE.sub('', text)

    # split() with no argument splits on any whitespace run, so stripped punctuation,
    # tabs and newlines can never leave empty or whitespace-carrying tokens behind
    return " ".join(token for token in text.split() if token not in stop_words)


# Stop words get the same punctuation stripping as the items; otherwise entries such as
# "don't" could never match the already de-punctuated token "dont".
stop_words = {
    PUNCTUATION_RE.sub('', word)
    for word in nltk.corpus.stopwords.words("english")
}

formatted_item_list = [
    format_item(title, categories, stop_words)
    for title, categories in data["Playstation 5"]
]

# Show a small sample of the final strings instead of printing every row
formatted_item_list[:5]

In [None]:
# Have to log in to Hugging Face to use the model.
# The token lives in a local JSON file of the form {"token": "..."}; keep that file out
# of version control.
HF_CREDENTIALS_PATH = "huggingface_credentials.json"

with open(HF_CREDENTIALS_PATH, "r", encoding="utf-8") as f:
    hf_token = json.load(f).get("token")

# Fail loudly here: a missing or empty token would otherwise make login() fall back to an
# interactive prompt, which stalls a Restart & Run All.
if not hf_token:
    raise ValueError(f'No "token" entry found in {HF_CREDENTIALS_PATH}')

login(token=hf_token)

In [None]:
# Embedding the Playstation data
# A separate tokenizer is not necessary for this library:
# the embedding model takes in raw text and does the tokenization itself
# Length of the numpy array embeddings for this model is 768

model = SentenceTransformer("google/embeddinggemma-300m")

# Pass the whole list in a single call so the model can batch the items internally
# instead of running one forward pass per item. The result is one row per item, in the
# same order as formatted_item_list.
# NOTE(review): encode_query applies the model's query prompt; check whether the model
# ships a clustering-specific prompt that would suit this use better - TODO confirm.
embeddings = model.encode_query(formatted_item_list)


In [None]:
# KMeans clustering on the embeddings
# Uses the elbow method to see how many clusters there are
# Will do a different method in the future though

MAX_CLUSTERS = 99      # Largest k to try
KMEANS_RANDOM_STATE = 0  # Fixed seed so that re-runs give the same clusters

# KMeans raises if n_clusters exceeds the number of samples, so cap k at the data size
largest_k = min(MAX_CLUSTERS, len(embeddings))

kmeans_clusters = {}  # {number_of_clusters : fitted KMeans model}
inertias = {}         # {number_of_clusters : inertia of that fit}
for num_clusters in range(1, largest_k + 1):
    cluster_model = KMeans(n_clusters=num_clusters, random_state=KMEANS_RANDOM_STATE, n_init="auto")
    cluster = cluster_model.fit(embeddings)

    kmeans_clusters[num_clusters] = cluster
    inertias[num_clusters] = cluster.inertia_

# The elbow is read off the curve, so plot inertia against k rather than printing the values
if inertias:
    ax = sns.lineplot(x=list(inertias.keys()), y=list(inertias.values()), marker="o")
    ax.set(xlabel="Number of clusters (k)", ylabel="Inertia", title="KMeans elbow curve");
else:
    print("No embeddings to cluster")


In [None]:
# Cluster index assigned to each item by the 2-cluster KMeans fit
two_cluster_fit = kmeans_clusters[2]
two_cluster_fit.labels_