In [6]:
import warnings
import time

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt

from sklearn.preprocessing import normalize
from sklearn.preprocessing import minmax_scale

from sklearn.cluster import KMeans

from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabaz_score
from sklearn.metrics import davies_bouldin_score

pd.set_option('precision', 20)
warnings.simplefilter('ignore')

Read Bag of Words Data

In [7]:
dt = pd.read_csv('./bags.csv', header = None)

dt.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1193,1194,1195,1196,1197,1198,1199,1200,1201,1202
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5522006058587953,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
data = normalize(dt.values, norm = 'l2')

In [9]:
start = time.time()

max_k = 13000
step = 250
init = 'k-means++' # k-means++ / random

calinski_data = []
davies_data = []

for k in range(0, max_k + 1, step):
    
    if (k == 0):
        current_k = 2
    else:
        current_k = k
    
    kmeans = KMeans(n_clusters = current_k,
                    n_init = 1,
                    max_iter = 1000,
                    init = init,
                    algorithm = 'auto',
                    precompute_distances = 'auto',
                    random_state = 0,
                    verbose = False)

    kmeans.fit(data)
    
    labels = kmeans.labels_
    
    calinski = calinski_harabaz_score(dt.values, labels)
    
    calinski_data.append([k, calinski])
    
    davies = davies_bouldin_score(dt.values, labels)
    
    davies_data.append([k, davies])
    

finish = time.time()

calculated_time = (finish - start) / 60

print("Time in Minutes: " + str(calculated_time))

Time in Minutes: 356.79833902517953


In [10]:
calinski_data

[[0, 86.51548305903975],
 [250, 18.86281421272417],
 [500, 14.423729221434874],
 [750, 12.116101764848956],
 [1000, 10.596406074891002],
 [1250, 9.46001791615663],
 [1500, 8.68266807706511],
 [1750, 8.039338868515266],
 [2000, 7.521729335524091],
 [2250, 7.089440786808177],
 [2500, 6.715659536046627],
 [2750, 6.42648215177695],
 [3000, 6.182499751346564],
 [3250, 5.959654343618081],
 [3500, 5.784867317170164],
 [3750, 5.652353698601457],
 [4000, 5.539689753013521],
 [4250, 5.4481467725642725],
 [4500, 5.391481385830535],
 [4750, 5.355197398256021],
 [5000, 5.33674349603401],
 [5250, 5.340152178546885],
 [5500, 5.350345294472261],
 [5750, 5.380748035317732],
 [6000, 5.4187310448609605],
 [6250, 5.468614789822055],
 [6500, 5.542983636825431],
 [6750, 5.625008609382618],
 [7000, 5.740999839033861],
 [7250, 5.858592077342056],
 [7500, 6.024062421470066],
 [7750, 6.222237176142101],
 [8000, 6.46496799181014],
 [8250, 6.776138803439068],
 [8500, 7.150579566415019],
 [8750, 7.5796062398710395

In [11]:
davies_data

[[0, 7.7755374188832596],
 [250, 2.768631903089347],
 [500, 2.369489851436256],
 [750, 2.0963358202823397],
 [1000, 1.909429430943829],
 [1250, 1.8376198998029905],
 [1500, 1.7376398374188193],
 [1750, 1.682813110341777],
 [2000, 1.5868576363375186],
 [2250, 1.5816221050509298],
 [2500, 1.4857205770384194],
 [2750, 1.4806479913987436],
 [3000, 1.385567755329316],
 [3250, 1.329948091679406],
 [3500, 1.278876759686928],
 [3750, 1.2391606921022358],
 [4000, 1.198191981004079],
 [4250, 1.1552110359257763],
 [4500, 1.1121715643010233],
 [4750, 1.0607551520466316],
 [5000, 1.0229343766696137],
 [5250, 0.9970621360282017],
 [5500, 0.9551388508120253],
 [5750, 0.9226649696411887],
 [6000, 0.893287366612835],
 [6250, 0.8666132812499064],
 [6500, 0.8418781213248338],
 [6750, 0.8172223244959613],
 [7000, 0.7936774461563832],
 [7250, 0.7733089803702501],
 [7500, 0.750550391665179],
 [7750, 0.7261803800497815],
 [8000, 0.7081681280557368],
 [8250, 0.6809285279549584],
 [8500, 0.6523199251606577],
 