In [None]:
import torch
import numpy as np
import pandas as pd

from tqdm import tqdm
from pathlib import Path
from collections import defaultdict
from sklearn.cluster import KMeans # type: ignore
from sentence_transformers import SentenceTransformer # type: ignore
from transformers import AutoTokenizer, AutoModelForCausalLM

### Model

In [2]:
# Load the sentence-embedding model that is later used to encode the titles.
model = SentenceTransformer(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Data

In [3]:
# Directory holding the input data files, resolved against the current working directory.
DATA_DIR = Path.cwd().joinpath("am_reasoning", "data_files")

In [4]:
# Read the dataset; the first CSV column is used as the DataFrame index.
df = pd.read_csv(DATA_DIR.joinpath("pe_dataset.csv"), index_col=0)

In [7]:
# Distinct values of the `title` column as a plain Python list.
titles_l = df["title"].unique().tolist()

### Title encoding

In [9]:
# Maps each title string to the value returned by `model.encode` for it
# (populated by the encoding cell that follows).
title_encodings = {}

In [10]:
# Encode all titles with a single batched call instead of one model call per
# title; `encode` accepts a list of sentences and returns one embedding row per
# input, in the same order as `titles_l`.
# NOTE(review): batching may change values at float-rounding level relative to
# encoding each title on its own.
title_embeddings = model.encode(titles_l, show_progress_bar=True)

# Pair every title with its embedding row.
title_encodings.update(zip(titles_l, title_embeddings))

Computing title encodings ...: 100%|██████████| 395/395 [00:02<00:00, 159.24it/s]


In [27]:
# Sanity check: inspect the shape of the encoding stored for one known title.
title_encodings['Should students be taught to compete or to cooperate?'].shape

(384,)

In [28]:
# Stack the per-title encodings into one 2-D array, one row per title,
# following the insertion order of `title_encodings`.
combined_array = np.stack(tuple(title_encodings.values()), axis=0)


In [29]:
# Confirm the stacked array's dimensions.
combined_array.shape

(395, 384)

### K-means Clustering

In [55]:
# Fit k-means with six clusters on the stacked title encodings; the fixed
# `random_state` keeps the clustering repeatable between runs.
kmeans = KMeans(
    n_clusters=6,
    n_init="auto",
    random_state=0,
).fit(combined_array)

In [56]:
# Display the fitted estimator.
kmeans

In [57]:
# Cluster label assigned to each row of `combined_array`.
kmeans.labels_

array([1, 3, 5, 4, 4, 5, 4, 3, 3, 3, 2, 1, 1, 2, 0, 5, 4, 1, 3, 5, 4, 2,
       1, 3, 3, 4, 5, 2, 4, 3, 3, 5, 4, 5, 3, 5, 2, 3, 4, 4, 2, 5, 3, 3,
       4, 0, 4, 2, 1, 4, 2, 0, 5, 1, 2, 5, 2, 3, 4, 2, 4, 3, 1, 3, 2, 5,
       5, 2, 3, 5, 3, 2, 5, 5, 1, 0, 3, 5, 0, 5, 5, 3, 2, 5, 3, 2, 2, 3,
       2, 5, 5, 3, 3, 5, 3, 5, 1, 4, 4, 3, 5, 4, 3, 3, 0, 3, 5, 3, 0, 2,
       3, 1, 3, 5, 3, 5, 1, 3, 3, 4, 1, 1, 4, 2, 2, 1, 3, 3, 5, 4, 2, 2,
       3, 1, 5, 4, 0, 4, 2, 3, 2, 4, 3, 1, 1, 5, 4, 5, 5, 3, 4, 3, 3, 5,
       2, 3, 2, 3, 3, 3, 3, 2, 4, 3, 2, 2, 4, 2, 1, 4, 0, 5, 5, 5, 4, 3,
       1, 2, 4, 4, 5, 3, 3, 0, 4, 4, 1, 0, 4, 2, 3, 0, 4, 1, 3, 4, 0, 5,
       2, 2, 4, 4, 3, 3, 4, 4, 4, 3, 2, 3, 5, 3, 1, 1, 2, 1, 0, 1, 1, 3,
       1, 2, 3, 5, 2, 1, 3, 1, 3, 4, 4, 1, 5, 5, 2, 2, 3, 2, 2, 5, 2, 3,
       3, 1, 1, 5, 2, 1, 1, 3, 3, 4, 2, 1, 2, 2, 1, 2, 2, 2, 2, 5, 5, 5,
       1, 2, 4, 2, 3, 2, 1, 5, 1, 5, 2, 4, 3, 3, 1, 5, 4, 2, 4, 3, 5, 2,
       4, 1, 2, 1, 1, 2, 4, 4, 4, 3, 2, 3, 2, 0, 4,

In [88]:
# Maps a cluster label to the list of row indices assigned to that cluster.
cluster_idxs_dict = defaultdict(list)

In [89]:
# Start from an empty mapping every time this cell runs. Without the reset,
# re-running the cell would append duplicate indices, or fail on a new label
# once `cluster_idxs_dict` has been rebound to a plain dict by a later cell.
cluster_idxs_dict = defaultdict(list)

# Group row positions by the cluster label k-means assigned to them.
for index, value in enumerate(kmeans.labels_):
    # Cast the NumPy integer label to a built-in int so the keys are plain ints.
    cluster_idxs_dict[int(value)].append(index)

In [90]:
# Rebuild the mapping as a plain dict whose keys appear in ascending label order.
cluster_idxs_dict = {label: cluster_idxs_dict[label] for label in sorted(cluster_idxs_dict)}

In [91]:
# Inspect the fitted cluster centres (one row per cluster).
kmeans.cluster_centers_

array([[ 0.02362393,  0.06103813,  0.04923204, ..., -0.01084874,
         0.02775521,  0.03717209],
       [ 0.01677704,  0.04695819, -0.00564692, ...,  0.0410791 ,
         0.01428942,  0.01826944],
       [-0.00257124,  0.03332817,  0.0159751 , ..., -0.00063801,
         0.00992464,  0.01187411],
       [ 0.04007247,  0.03452406,  0.02244523, ..., -0.01101388,
         0.01670768,  0.01097618],
       [ 0.02358931,  0.0352295 ,  0.00740013, ..., -0.01481627,
        -0.00562769, -0.00391086],
       [ 0.03646265,  0.02083156,  0.00508696, ..., -0.02822257,
         0.00614797, -0.0016006 ]], shape=(6, 384), dtype=float32)

In [92]:
# Confirm the dimensions of the cluster-centre array.
kmeans.cluster_centers_.shape

(6, 384)

In [93]:
# Map each cluster label (its row position in `cluster_centers_`) to its centre vector.
cluster_centres_dict = dict(enumerate(kmeans.cluster_centers_))

In [94]:
# Check that both mappings expose the same set of cluster labels.
cluster_centres_dict.keys(), cluster_idxs_dict.keys()

(dict_keys([0, 1, 2, 3, 4, 5]), dict_keys([0, 1, 2, 3, 4, 5]))

### Sort clusters by similarity

In [137]:
def sort_clusters(cluster, descending=False):
    """Return the encodings of one cluster ordered by similarity to its centre.

    Reads the notebook-level `combined_array`, `cluster_idxs_dict`,
    `cluster_centres_dict` and `model`.

    Parameters
    ----------
    cluster
        Cluster label; must be a key of both `cluster_idxs_dict` and
        `cluster_centres_dict`.
    descending : bool, default False
        When False (the original behaviour) the encoding that is *least*
        similar to the centre comes first; pass True to get the most similar
        encoding first.

    Returns
    -------
    list[list[float]]
        The cluster's encodings as nested Python lists, in sorted order.

    Raises
    ------
    KeyError
        If `cluster` is not a known cluster label.
    """
    # Rows of `combined_array` that belong to this cluster.
    cluster_encodings = combined_array[cluster_idxs_dict[cluster]]

    # Similarity of the centre against every member; the array is passed
    # directly, with no detour through nested Python lists. Row 0 of the
    # result is used below as the centre's similarity to each member.
    similarities = model.similarity(cluster_centres_dict[cluster], cluster_encodings)  # type: ignore

    # Positions of the members ordered by their similarity score.
    order = similarities.sort(descending=descending).indices[0].tolist()  # type: ignore

    # Reorder the rows and convert to nested lists to keep the original return type.
    return cluster_encodings[order].tolist()

In [139]:
# For every cluster label, the cluster's encodings as ordered by `sort_clusters`.
sorted_clusters = {label: sort_clusters(label) for label in cluster_centres_dict}

In [140]:
# Display the sorted encodings of every cluster.
# NOTE(review): this prints every vector in full, which makes the output very
# long; consider showing a summary (e.g. the number of members per cluster).
sorted_clusters

{0: [[0.061657845973968506,
   0.028822295367717743,
   0.13753673434257507,
   0.07514689117670059,
   0.09605127573013306,
   0.0004939616774208844,
   0.06483599543571472,
   0.05221014842391014,
   0.030948206782341003,
   0.0837363451719284,
   0.04439782723784447,
   -0.05152476206421852,
   -0.024218348786234856,
   -0.03436050936579704,
   -0.01897100731730461,
   -0.053459007292985916,
   -0.09964042156934738,
   -0.0460071936249733,
   -0.016199570149183273,
   0.06733886897563934,
   -0.08173871785402298,
   0.07168467342853546,
   -0.020960461348295212,
   0.05793646723031998,
   -0.09370235353708267,
   -0.013984506018459797,
   0.009696018882095814,
   0.00875947903841734,
   -0.007065433543175459,
   0.017333541065454483,
   0.0019375089323148131,
   -0.02603902667760849,
   0.10001976042985916,
   0.043770838528871536,
   -0.0421585813164711,
   0.08527138084173203,
   0.0807703286409378,
   -0.037334468215703964,
   -0.02519158646464348,
   -0.04413285106420517,
   0.0

In [None]:
# sorted_clusters = {}

# for key in cluster_centres_dict.keys():
    
#     sorted_clusters[key] = sort_clusters(key)

In [None]:
# def sort_clusters(cluster):
    
#     #centroid = cluster_centres_dict[cluster]    
#     #centroid_idxs = cluster_idxs_dict[cluster]
    
#     #cluster_encodings = combined_array[cluster_idxs_dict[cluster]].tolist()    
#     #similarities = model.similarity(cluster_centres_dict[cluster], cluster_encodings).sort() # type: ignore
    
#     cluster_encodings_sorted = [combined_array[cluster_idxs_dict[cluster]].tolist()[i] for i in model.similarity(cluster_centres_dict[cluster], combined_array[cluster_idxs_dict[cluster]].tolist()).sort().indices[0].tolist()] # type: ignore
    
#     return cluster_encodings_sorted