In [20]:
import warnings
import time

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt

from sklearn.preprocessing import normalize
from sklearn.preprocessing import minmax_scale

from sklearn.cluster import KMeans

from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabaz_score
from sklearn.metrics import davies_bouldin_score

pd.set_option('precision', 20)
warnings.simplefilter('ignore')

Read Bag of Words Data

In [21]:
dt = pd.read_csv('./word2vec.csv', header = None)

dt.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,0.0135948248207569,-0.068771593272686,-0.0131260938942432,0.0060244412161409,0.0379218868911266,-0.0208724178373813,-0.0103245936334133,0.0080185951665043,0.008063780143857,0.0113811697810888,...,-0.0209649987518787,0.062472891062498,-5.98852820985e-05,-0.0626214668154716,-0.0195863768458366,0.0460067316889762,0.0038109917659312,0.011133030988276,-0.0402083806693553,-0.0419677905738353
1,0.0250091981142759,-0.0255860704928636,0.0122374715283513,0.0205671042203903,0.0145229594781994,-0.0187047235667705,-0.0583946853876113,0.0280539095401763,-0.0323618501424789,0.0205821022391319,...,-0.0349767245352268,0.0304748844355344,0.0388990752398967,0.0169257745146751,-0.0006659875507466,0.0073109208606183,0.001348104327917,0.0084143327549099,-0.0139268636703491,0.0023993947543203
2,0.0315438583493232,-0.0258821956813335,-0.0606167055666446,0.0359916426241397,0.037742294371128,-0.0211320202797651,-0.0410231985151767,0.011076271533966,0.0032471425365656,0.0144266700372099,...,-0.0001459375052945,0.0002925574663095,0.0293335225433111,-0.005734703503549,-0.0256829299032688,0.0286939274519681,0.0286918338388204,0.0490430071949958,-0.0112496055662632,-0.0294231493026018
3,0.0587140209972858,-0.0802512541413307,-0.0261009987443685,0.0604606568813324,0.0167144946753978,-0.0052052671089768,-0.0374407581984996,0.0514717474579811,-0.0635047405958175,0.0032471478916704,...,-0.0218793526291847,0.0171033106744289,0.0164721198379993,-0.0049530612304806,-0.0015206087846308,-0.0188216306269168,0.0202157106250524,0.0257079694420099,-0.0165431164205074,0.0129728140309452
4,0.0233661979436874,-0.0656670257449149,-0.0156006179749965,0.0226019974797964,0.030365414917469,-0.0605902820825576,-0.071055382490158,-0.0086413025856018,-0.0187280010432004,0.0428804829716682,...,0.014219170436263,0.0516133457422256,0.0273156724870204,-0.030006006360054,-0.0282086934894323,0.0642558857798576,-0.019291017204523,0.0037249843589961,-0.0499372296035289,-0.0159748680889606


In [22]:
data = normalize(dt.values, norm = 'l2')

In [23]:
start = time.time()

max_k = 13000
step = 250
init = 'k-means++' # k-means++ / random

calinski_data = []
davies_data = []

for k in range(0, max_k + 1, step):
    
    if (k == 0):
        current_k = 2
    else:
        current_k = k
    
    kmeans = KMeans(n_clusters = current_k,
                    n_init = 1,
                    max_iter = 1000,
                    init = init,
                    algorithm = 'auto',
                    precompute_distances = 'auto',
                    random_state = 0,
                    verbose = False)

    kmeans.fit(data)
    
    labels = kmeans.labels_
    
    calinski = calinski_harabaz_score(dt.values, labels)
    
    calinski_data.append([k, calinski])
    
    davies = davies_bouldin_score(dt.values, labels)
    
    davies_data.append([k, davies])
    

finish = time.time()

calculated_time = (finish - start) / 60

print("Time in Minutes: " + str(calculated_time))

Time in Minutes: 57.92094096342723


In [26]:
calinski_data

[[0, 403.8634176578489],
 [250, 19.325077580940313],
 [500, 12.032376580223273],
 [750, 9.238280126264312],
 [1000, 7.716802681478168],
 [1250, 6.875181434288943],
 [1500, 6.273224866572058],
 [1750, 5.808546334660351],
 [2000, 5.487073292475251],
 [2250, 5.240549750344757],
 [2500, 5.032067929896188],
 [2750, 4.877337556197983],
 [3000, 4.768680665872016],
 [3250, 4.656441274949057],
 [3500, 4.571313294632823],
 [3750, 4.493120511902193],
 [4000, 4.434849071091201],
 [4250, 4.397574268674866],
 [4500, 4.363123079674342],
 [4750, 4.32786961731445],
 [5000, 4.314766232423701],
 [5250, 4.298898519419127],
 [5500, 4.2879588140916605],
 [5750, 4.293920858387226],
 [6000, 4.304213434536595],
 [6250, 4.31388844241494],
 [6500, 4.34619073380572],
 [6750, 4.385906193183208],
 [7000, 4.430326588405799],
 [7250, 4.4800849849382995],
 [7500, 4.543399134533104],
 [7750, 4.622683857290843],
 [8000, 4.7079551541419145],
 [8250, 4.801397484372366],
 [8500, 4.921841154142741],
 [8750, 5.07027377450829

In [27]:
davies_data

[[0, 5.580444553613749],
 [250, 3.633921271018352],
 [500, 3.206666599121096],
 [750, 2.917095331317504],
 [1000, 2.66399080066141],
 [1250, 2.450643454340322],
 [1500, 2.288577126783996],
 [1750, 2.142791432454932],
 [2000, 2.020961693146879],
 [2250, 1.902741729102962],
 [2500, 1.8063783515368956],
 [2750, 1.7088963651411646],
 [3000, 1.6413214546570531],
 [3250, 1.5549701934420417],
 [3500, 1.4871246320318512],
 [3750, 1.4294007743543313],
 [4000, 1.3717606530427977],
 [4250, 1.31142948206546],
 [4500, 1.2618445710934465],
 [4750, 1.2164071850819906],
 [5000, 1.1772786272271547],
 [5250, 1.1384407474386276],
 [5500, 1.1021287269092142],
 [5750, 1.0679822412563695],
 [6000, 1.0412843987058373],
 [6250, 1.0129322548609432],
 [6500, 0.9863140318912464],
 [6750, 0.9587507243656732],
 [7000, 0.9338982201207473],
 [7250, 0.9110736294791196],
 [7500, 0.8899312733329956],
 [7750, 0.8571002980611069],
 [8000, 0.8382560423651051],
 [8250, 0.8264386436488779],
 [8500, 0.8061822424154459],
 [87