In [1]:
import numpy as np
import pickle

def load_file(file):
    with open(file, 'rb') as f:
        return pickle.load(f)
    
idx_to_word = load_file("idx_to_word.pkl")
word_to_idx = load_file("word_to_idx.pkl")
vectors = np.load("output.npy")

def get_word_vector(word):
    idx = word_to_idx[word]
    return vectors[idx]

def get_word_at_index(index):
    return idx_to_word[index]

In [2]:
vectors

array([[-0.02985769, -0.03923954, -0.0193511 , ..., -0.04687731,
        -0.06903055, -0.05992738],
       [-0.0058338 , -0.05957004, -0.03742093, ..., -0.05318371,
        -0.02038277, -0.06202114],
       [-0.00809794,  0.02141076, -0.03063158, ..., -0.00790764,
        -0.07618432, -0.07131716],
       ...,
       [-0.00058425, -0.05786744, -0.0474753 , ..., -0.03131336,
        -0.04294971, -0.06015962],
       [ 0.01150776,  0.00129292, -0.04120871, ..., -0.03023674,
        -0.06795908, -0.07554299],
       [-0.03337324, -0.0394639 , -0.0612554 , ..., -0.04322901,
        -0.04506031, -0.07519749]], dtype=float32)

In [3]:
idx_to_word

{0: 'sharing priors',
 1: 'layer neural networks',
 2: 'solving nonconvex',
 3: 'unknown',
 4: 'largely disjoint efforts',
 5: 'targeting eeg',
 6: 'apcg method',
 7: 'entropy estimate',
 8: 'extended lasso',
 9: 'unbounded computational power',
 10: 'methods come',
 11: 'excluding logarithmic factors',
 12: 'probabilistic manner',
 13: 'image restoration',
 14: 'cost models suffice',
 15: 'transforms nuisance processes',
 16: 'consider approximate policy',
 17: 'obtain provable guarantees',
 18: 'allowed level sets',
 19: 'dbn structure learning',
 20: 'dynamics predictor learns',
 21: 'complex biological sample',
 22: 'cell consists',
 23: 'algorithm runs',
 24: 'recent models',
 25: 'linear copula projections',
 26: 'assuming agents honest',
 27: 'newton approximations',
 28: 'low rank',
 29: 'measure optimization',
 30: 'two hidden layers',
 31: 'human learning',
 32: 'first study',
 33: 'restricted matrix regression',
 34: 'sectional data',
 35: 'parsimonious statistical model',
 

In [4]:
word_to_idx

{'learning monotonic transformations': 14104,
 'flexible transfer learning': 30004,
 'convex': 25052,
 'natural situations': 36921,
 'finding metrics': 1676,
 'perception encodes': 24141,
 'matching tasks demonstrates': 29798,
 'algorithm draws': 36980,
 'visual contour integration': 34389,
 'talairach space': 28298,
 'present applications': 36088,
 'modern biomedical research': 7180,
 'count objects': 18919,
 'noisy labels problem': 9518,
 'single gamma process': 26422,
 'search oracle models': 12469,
 '25': 30040,
 'pac maximum selection': 16887,
 'numerous attempts': 3502,
 'model scales': 39609,
 'art random projection': 9827,
 'connectivity based bcis': 11189,
 'linear plants': 17493,
 'denoised': 9973,
 'scale ica': 24367,
 'neurally': 6487,
 'excellent approximation': 33198,
 'algorithm along': 37987,
 'local image representations': 28670,
 'approach requires': 25514,
 'empty convex': 37572,
 'data stream summaries': 6795,
 'nonlinear recursive structure': 15115,
 'many importan

In [5]:
from scipy.cluster.vq import kmeans2

In [6]:
km = kmeans2(vectors, 30, iter=25)

In [7]:
words = []

for i in range(len(vectors)):
    if vectors[i] in km[0]:
        words.append(get_word_at_index(i))
        
words

['given model',
 'likelihood terms need',
 'kernelized version',
 'particular distributions',
 'tractable aggregation',
 'genomic sequence analysis',
 'form tighter bounds',
 'implicit metric spaces',
 'rademacher complexities',
 'recent design',
 'empirically verifying',
 'tackle nonsmooth',
 'mode estimation']

In [8]:
km[0]

array([[-0.01120191, -0.0135988 , -0.03626807, ..., -0.04667561,
        -0.04127087, -0.0700739 ],
       [-0.00658421, -0.03381037, -0.03812243, ..., -0.01622671,
        -0.03997023, -0.03461964],
       [ 0.0064937 , -0.02490102, -0.03834748, ..., -0.02256488,
        -0.07598568, -0.04546002],
       ...,
       [-0.0166932 , -0.03061913, -0.04167364, ..., -0.02399887,
        -0.02608895, -0.05251006],
       [ 0.00027969, -0.00051671, -0.02910781, ..., -0.02948768,
        -0.02899347, -0.05866459],
       [ 0.02487311,  0.04015868, -0.04608824, ...,  0.01881651,
        -0.08632889, -0.06493115]], dtype=float32)

In [9]:
import math

def euc_distance(centroid, vec):
    val = 0
    for i in range(len(centroid)):
        val += math.pow(centroid[i]-vec[i], 2)
    return math.sqrt(val)

In [12]:
def cos_similarity(centroid, vec):
    top = 0
    for i in range(len(centroid)):
        top += centroid[i]*vec[i]
    bottom1 = 0
    for e in centroid:
        bottom1 += math.pow(e, 2)
    bottom2 = 0
    for e in vec:
        bottom2 += math.pow(e, 2)
    return top/(bottom1*bottom2)

In [15]:
clusters = []

for i in range(len(km[1])):
    clusters.append({
        'cluster': km[1][i],
        'keyword': idx_to_word[i],
        'euc_distance': euc_distance(km[0][km[1][i]], vectors[i]),
        'cos_similarity': cos_similarity(km[0][km[1][i]], vectors[i])
    })

clusters

[{'cluster': 9,
  'cos_similarity': 1.243229900955704,
  'euc_distance': 0.6799743641516682,
  'keyword': 'sharing priors'},
 {'cluster': 27,
  'cos_similarity': 1.1382539904530344,
  'euc_distance': 0.4066224491552771,
  'keyword': 'layer neural networks'},
 {'cluster': 3,
  'cos_similarity': 1.077206775653366,
  'euc_distance': 0.5424483790860379,
  'keyword': 'solving nonconvex'},
 {'cluster': 26,
  'cos_similarity': 1.0024547651959406,
  'euc_distance': 0.723367113922304,
  'keyword': 'unknown'},
 {'cluster': 8,
  'cos_similarity': 1.0828208678971727,
  'euc_distance': 0.6398383091346004,
  'keyword': 'largely disjoint efforts'},
 {'cluster': 28,
  'cos_similarity': 1.0407526915259884,
  'euc_distance': 0.6524782371500798,
  'keyword': 'targeting eeg'},
 {'cluster': 19,
  'cos_similarity': 1.172238332113973,
  'euc_distance': 0.5274207338152221,
  'keyword': 'apcg method'},
 {'cluster': 24,
  'cos_similarity': 0.8485048699969868,
  'euc_distance': 0.7423386275900694,
  'keyword': '

In [16]:
with open('cluster_kmeans2_sz30.csv', 'w', newline='') as f:
    header_present = False
    for word in clusters:
        if not header_present:
            w = csv.DictWriter(f, word.keys())
            w.writeheader()
            header_present = True
        w.writerow(word)