In [9]:
import fasttext

kbd_model = fasttext.load_model('../data/processed/embeddings/fasttext_skipgram_kbd_100.bin')



In [10]:
from sklearn.cluster import AgglomerativeClustering
from collections import defaultdict


def get_clusters(vectors, n_clusters=20):
    agg_clustering = AgglomerativeClustering(n_clusters=n_clusters, )
    labels = agg_clustering.fit_predict(vectors)
    return labels


def get_words_by_clusters(words, labels):
    words_by_clusters = defaultdict(list)
    for word_idx, cluster_label in enumerate(labels):
        words_by_clusters[cluster_label].append(words[word_idx])

    return words_by_clusters

In [None]:
import random
from tqdm import tqdm
import os

butch_size = 20000
cluster_num = 2000
cluster_factor = butch_size / cluster_num

for seed in range(100):
    # seed = random.randint(0, 100)
    export_path = f'../data/processed/embeddings_clusters/seed_{seed}/{cluster_factor}_{butch_size}_{cluster_num}'
    os.makedirs(export_path, exist_ok=True)
    
    all_words = kbd_model.get_words()
    
    random.shuffle(all_words)
    
    for offset in tqdm(range(0, len(kbd_model.get_words()), butch_size)):
        butch_words = all_words[offset:offset + butch_size]
        
        word_vectors = [kbd_model.get_word_vector(word) for word in tqdm(butch_words)]
        labels = get_clusters(word_vectors, n_clusters=cluster_num)
        words_by_clusters = get_words_by_clusters(butch_words, labels)
    
        for cluster_label, cluster_words in words_by_clusters.items():
            with open(f'{export_path}/cluster_{offset}_{offset + butch_size}_{cluster_label}.txt', 'w') as f:
                f.write('\n'.join(cluster_words))

  0%|          | 0/25 [00:00<?, ?it/s]
  0%|          | 0/20000 [00:00<?, ?it/s][A
100%|██████████| 20000/20000 [00:00<00:00, 116027.41it/s]A
  4%|▍         | 1/25 [00:13<05:18, 13.28s/it]
  0%|          | 0/20000 [00:00<?, ?it/s][A
 21%|██▏       | 4289/20000 [00:00<00:00, 42879.72it/s][A
100%|██████████| 20000/20000 [00:00<00:00, 76130.17it/s][A
  8%|▊         | 2/25 [00:25<04:52, 12.70s/it]
  0%|          | 0/20000 [00:00<?, ?it/s][A
 15%|█▌        | 3058/20000 [00:00<00:00, 30576.60it/s][A
100%|██████████| 20000/20000 [00:00<00:00, 66707.60it/s][A
 12%|█▏        | 3/25 [00:38<04:41, 12.81s/it]
  0%|          | 0/20000 [00:00<?, ?it/s][A
100%|██████████| 20000/20000 [00:00<00:00, 108103.96it/s]A
 16%|█▌        | 4/25 [00:50<04:25, 12.66s/it]
  0%|          | 0/20000 [00:00<?, ?it/s][A
100%|██████████| 20000/20000 [00:00<00:00, 120851.02it/s][A
 20%|██        | 5/25 [01:02<04:08, 12.42s/it]
  0%|          | 0/20000 [00:00<?, ?it/s][A
100%|██████████| 20000/20000 [00:00<00:0