In [81]:
from collections import defaultdict
from sklearn.decomposition import PCA # cf. 'reduce_diminsions()' in the below
import datetime
import numpy as np
import spacy
import math
import json
from sklearn.cluster import KMeans
possible = {
    "small":"en_core_web_sm",
    "mid":"en_core_web_md",
    "large":"en_core_web_lg"
}
pretrained_language_model = possible["mid"]
nlp = spacy.load(pretrained_language_model)
INPUT_FILE="data.psv" # 3000 products of something like that
# INPUT_FILE="test.psv" # 6 simple things  

msg = f"using '{pretrained_language_model}'"
print(f"{datetime.datetime.now().isoformat()} - {msg}") 



2023-05-01T08:14:33.728556 - using 'en_core_web_md'


In [83]:
def number_to_letter(num):
    x = num
    if num <= 0:
        raise ValueError("Input must be a positive integer.")
    
    letters = ""
    while num > 0:
        num -= 1
        # Map the remainder to the corresponding letter
        letters = chr(num % 26 + ord('A')) + letters  
        # Divide by 26 to move to the next digit
        num //= 26
        
    return letters

lookup = {}
sentences = []
seen = {} 
loop = 0 
with open(INPUT_FILE, 'r') as file:
    for line in file:
        
        fields = line.strip().split('|')
        if loop > 0 and loop < 10000: # 10000: # 100
            if len(fields[3]) > 3: 
                description = fields[3]
                activity = fields[2]
                if description in seen:
                    seen[description] += 1
                else:
                    seen[description] = 1
                    sentences.append(description)
                    letter = number_to_letter(len(sentences))
                    obj = {"id":letter, "group":-1, "activity": activity}
                    lookup[description] = obj
        loop += 1


In [84]:
# loop = 0
# for sentence in sentences:
#     count = len(sentence)
#     loop += 1
#     print(f"{loop} {count}")
# print("sentences number {}".format(len(sentences)))
print("lookup has {}".format( len(lookup)))
# Just show 10
loop = 0 
for key in lookup: 
    
    v = lookup[key]
    id = v["id"]
    loop += 1
    if loop < 10: 
        print("{}|{}|{}".format( loop, id, key))
print("len(lookup)={}".format(len(lookup)))
    

lookup has 2914
1|A|Feel limitless powered by our nulu fabric these butterysoft tights will have you bending and stretching with ease
2|B|Dinosaurs are from once upon a time.
3|C|feel limitless powered by our nulu fabric these butterysoft tights will have you bending and stretching with ease
4|D|this insulated tumbler has a folding straw lid and slipfree texture for easy drinking from first til last sip
5|E|a reputation a vibe a way of life designed in la for you we made these shorts with french terry fabric for softness you can take anywhere
6|F|stand out during sweaty studio workouts in these highrise tights made with fabric thats engineered to keep you feeling cool yet covered
7|G|voted most likely to be worn multiple times a week this allsport bra is a favourite for comfort and versatility
8|H|from sun salutations to restful moments in savasana this yoga tank flows with you the flexible ribbed texture with a cool smooth sensation feels great next to your skin
9|I|train hard adventu

In [87]:

try:
    vectors = []
    for sentence in sentences:
        tokens = nlp(sentence)
        word_vectors = [token.vector for token in tokens]
        mean_vector = np.mean(word_vectors, axis=0)
        lookup[sentence]["distance"] = mean_vector # step 1
        vectors.append(mean_vector)
    
    kmeans = KMeans(n_clusters=100, random_state=0).fit(vectors)
    group_member_count = {} 
    for i in range(len(sentences)):
        sentence = sentences[i]
        label = kmeans.labels_[i]
        obj = lookup[sentence]
        if label in group_member_count:
            group_member_count[label] += 1
        else:
            group_member_count[label] = 1
        lookup[sentence]["group"] = label
        # print("{} {} {}".format(obj["id"], label,  sentence)) 
        
    # Group sentence vectors by cluster label
    cluster_vectors = defaultdict(list)
    for i, sentence_vector in enumerate(vectors):
        label = kmeans.labels_[i]
        cluster_vectors[label].append(sentence_vector)
        
    # first_vector = vectors[0]
    zero_vector = np.zeros(vectors[0].shape)
    distances = [np.linalg.norm(vector - zero_vector) for vector in vectors]

    # remove noise
    # min_distance = min(distances)
    print("min_distance is {}".format(min_distance))
    distances = [distance - min_distance for distance in distances]
    
    # Normalize each distance to a scale of X ( 100 is my thinking right now)
    scale = 100
    max_distance = max(distances)
    distances = [(distance / max_distance) * scale for distance in distances]

    # Print vectors of each cluster
    loop = 0 
    cluster_file = "cluster_output.txt"
    file = open(cluster_file, "w")
    print("--- Writing to {}".format(cluster_file))
    print("loop|cluster|count|distance")
    file.write("loop|cluster|count|distance\n")
    for label, vectors in cluster_vectors.items():
        msg = f"{loop}|{label}|{group_member_count[label]}|{distances[loop]}"
        print(msg)
        file.write(msg + "\n")
        loop += 1
    file.close()        
    print("Wrote {} to {}".format(loop, cluster_file))
    
    # Print lookup information of each product
    loop = 0 
    product_file = "product_output.txt"
    file = open(product_file, "w")
    print("--- Writing to {}".format(cluster_file))
    print("loop|id|group|activity")
    file.write("loop|id|group|activity|description\n")
    for sentence in lookup:
        obj = lookup[sentence]
        msg = "{}|{}|{}|{}|{}".format(loop, obj["id"], obj["group"], obj["activity"], sentence)
        # print(msg)
        file.write(msg + "\n")
        loop += 1
    file.close()        
    print("Wrote {} to {}".format(loop, product_file))
    
    
    
    
    
except OverflowError as e:
    print("An OverflowError occurred! Likely too many clusters")
    print(e)
print("The end")

min_distance is 3.0771785549057666
--- Writing to cluster_output.txt
loop|cluster|count|distance
0|23|57|-4.65303111200587
1|67|48|36.94902535694451
2|40|17|-4.65303111200587
3|20|26|-1.786582399219177
4|66|11|22.550722159496058
5|21|38|20.348681113123096
6|73|10|8.34260395847846
7|11|31|27.679396475198693
8|94|50|1.2772647375130084
9|18|33|30.106786932703393
10|78|36|-14.181692567665822
11|57|26|33.34578349928143
12|82|39|27.10835886840055
13|30|56|29.944864630370212
14|76|78|14.334672588722205
15|43|26|24.134280929597484
16|8|27|20.868790005360538
17|86|18|20.49424187213668
18|45|35|50.32740642880733
19|6|57|10.138854227453661
20|4|72|6.477718399324424
21|31|41|29.46064739824552
22|7|56|5.837776775083257
23|84|25|27.654034093900965
24|41|39|14.200301931987294
25|19|95|34.575825086325544
26|74|30|-21.912093727065535
27|99|4|-24.803097623342758
28|97|52|10.018912401068695
29|27|39|-14.509256507980956
30|25|39|8.26451842229229
31|5|15|-9.32512584513
32|88|26|-16.144260257634357
33|12|58