# Exploring representative tuples by clustering the embedding space

In [1]:
import warnings
warnings.filterwarnings('ignore')

import time
from sklearn.cluster import KMeans, Birch
from gensim.models.wrappers import FastText
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pandas as pd

import numpy as np
import h5py

## Perform clustering and return the cluster centers

In [9]:
fastTextModelPath = 'EmbeddingsFastText.w2v'

Clustering with KMeans

In [14]:
def getClusterCentersWithKMeans(model, numberOfClusters):
    # Get the word vectors of the model
    word_vectors = model.wv.syn0
    n_words = word_vectors.shape[0]
    vec_size = word_vectors.shape[1]
    print("Number of words = {0}, vector size = {1}".format(n_words, vec_size))

    # Cluster using KMeans
    start = time.time()
    print("Clustering ... ", end="", flush=True)
    kmeans = KMeans(n_clusters=numberOfClusters, n_jobs=-1, random_state=0)
    idx = kmeans.fit_predict(word_vectors)
    print("Finished clustering in {:.2f} sec.".format(time.time() - start), flush=True)

    # Return cluster centers
    return kmeans.cluster_centers_

## Get the closest vector to each of the cluster centers
We'll pass the number of cluster centers as an argument. This can be thought of as a drill down equivalent. Greater the number of cluster centers, more detailed will be the resulting results returned. 

Number of clusters chosen is 3 by default. This can be overriden, if needed.

In [15]:
def getClosestWordEmbedding(modelPath, numberOfClusters = 15):
    # Load the model
    start = time.time()
    model = KeyedVectors.load(modelPath)
    print("Finished loading model in {:.2f} sec.".format(time.time() - start), flush=True)
    
    clusterCenters = getClusterCentersWithKMeans(model, numberOfClusters)
    
    # Create an empty numpy array of size equal to cluster centers to store the closest words
    closestWords = []
    
    
    # Get the closest word for each of the cluster centers
    #for clusterCenter in clusterCenters:
    closestWords.append(model.similar_by_vector('3x640'))
            #closestWords.append(model.similar_by_vector(clusterCenter))
    
    return closestWords

In [None]:
getClosestWordEmbedding(word2VecModelPath)

In [16]:
getClosestWordEmbedding(fastTextModelPath, 10)

Finished loading model in 0.10 sec.
Number of words = 777, vector size = 100
Clustering ... Finished clustering in 0.33 sec.


[[('MORGAN', 0.9681735038757324),
  ('HARTSELLE', 0.9302594661712646),
  ('35640', 0.858690619468689),
  ('2567736511', 0.7711230516433716),
  ('DECATUR', 0.7628130316734314),
  ('201 PINE STREET NORTHWEST', 0.7309060096740723),
  ('HARTSELLE MEDICAL CENTER', 0.48642146587371826),
  ('35609', 0.47125619649887085),
  ('ETOWAH', 0.4642176032066345),
  ('1201 7TH STREET SE', 0.4625547528266907)]]