In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd

# Load the input table from the CSV next to the notebook; later cells read
# its 'clean_text' column.
df = pd.read_csv(filepath_or_buffer='./clean_text.csv')

In [3]:
from gensim.models import KeyedVectors

# Text-format (binary=False) word2vec file; the file name suggests GloVe
# Twitter 27B vectors with 100 dimensions -- confirm against the download.
glove_output_file = 'glove.twitter.27B.100d.word2vec'
model = KeyedVectors.load_word2vec_format(
    fname=glove_output_file,
    binary=False,
)

In [12]:
import numpy as np

def try_map_to_vector(word, embeddings=None, dim=None):
    """Look up the embedding for ``word``, falling back to a zero vector.

    Args:
        word: Token to look up.
        embeddings: Mapping-like object indexed by word that raises
            ``KeyError`` for unknown words. Defaults to the notebook-level
            ``model``.
        dim: Length of the zero vector returned for unknown words. When
            omitted, ``embeddings.vector_size`` is used if that attribute
            exists, otherwise 100 (the previously hard-coded width).

    Returns:
        Whatever ``embeddings[word]`` yields for a known word, or a list of
        ``dim`` zeros (floats) for an unknown word.
    """
    lookup = model if embeddings is None else embeddings
    try:
        return lookup[word]
    except KeyError:
        if dim is None:
            # Keep the fallback width in step with the loaded embeddings
            # instead of silently assuming 100 dimensions.
            dim = getattr(lookup, 'vector_size', 100)
        return [0.0] * dim

def create_vector(text, dim=100):
    """Average the word vectors of ``text`` into one fixed-length list.

    Args:
        text: Whitespace-separated string of tokens. A non-string value
            (for example the NaN/None that stands in for a missing cell) is
            treated as empty text.
        dim: Length of the zero vector returned when ``text`` has no tokens;
            it should match the width of the word embeddings in use.

    Returns:
        A list of floats: the element-wise mean of the per-word vectors, or
        ``dim`` zeros when there are no tokens to average.
    """
    tokens = text.split() if isinstance(text, str) else []
    if not tokens:
        # np.mean over an empty list yields a scalar NaN, which would break
        # the fixed-width feature matrix built from these vectors.
        return [0.0] * dim

    word_vectors = [try_map_to_vector(word) for word in tokens]
    return np.mean(word_vectors, axis=0).tolist()

# One averaged embedding per row of the 'clean_text' column, in row order.
vectors = list(map(create_vector, df['clean_text'].values))

In [13]:
from sklearn.cluster import MiniBatchKMeans

# Fix the seed so centroid initialisation -- and therefore the cluster labels
# and the scores printed below -- are reproducible across kernel restarts.
# NOTE(review): the cells below feed 250-row slices through partial_fit, so
# batch_size=750 presumably only matters for a plain fit() call -- confirm.
kmeans = MiniBatchKMeans(n_clusters=8, batch_size=750, random_state=42)

In [14]:
from sklearn import metrics

# First mini-batch of the incremental fit: rows [0, size) of `vectors`.
size = 250
start, end = 0, size
v = vectors[start:end]

# Update the shared `kmeans` estimator with this batch, then read back the
# labels it exposes so they can be scored against the same batch `v`.
clustering = kmeans.partial_fit(v)
labels = clustering.labels_

# The misspelled `calinski_harabaz_score` is absent from newer scikit-learn
# releases; prefer the corrected name and fall back to the old one if needed.
calinski_harabasz = (getattr(metrics, 'calinski_harabasz_score', None)
                     or metrics.calinski_harabaz_score)

print('Silhouette Coefficient: {}'.format(metrics.silhouette_score(v, labels)))
print('Calinski-Harabaz Index: {}'.format(calinski_harabasz(v, labels)))
print('Davies-Bouldin Index: {}'.format(metrics.davies_bouldin_score(v, labels)))

Silhouette Coefficient: 0.047054357479257305
Calinski-Harabaz Index: 13.231204483023264
Davies-Bouldin Index: 3.213406742825041


In [15]:
# Second mini-batch: rows [size, 2 * size) of `vectors`. The window is set
# explicitly (rather than with `+=`) so re-running this cell does not shift it.
start, end = 1 * size, 2 * size
v = vectors[start:end]

# Update the shared `kmeans` estimator with this batch, then read back the
# labels it exposes so they can be scored against the same batch `v`.
clustering = kmeans.partial_fit(v)
labels = clustering.labels_

# The misspelled `calinski_harabaz_score` is absent from newer scikit-learn
# releases; prefer the corrected name and fall back to the old one if needed.
calinski_harabasz = (getattr(metrics, 'calinski_harabasz_score', None)
                     or metrics.calinski_harabaz_score)

print('Silhouette Coefficient: {}'.format(metrics.silhouette_score(v, labels)))
print('Calinski-Harabaz Index: {}'.format(calinski_harabasz(v, labels)))
print('Davies-Bouldin Index: {}'.format(metrics.davies_bouldin_score(v, labels)))

Silhouette Coefficient: 0.04926730356267753
Calinski-Harabaz Index: 15.34905088622966
Davies-Bouldin Index: 3.0845637380476703


In [16]:
# Third mini-batch: rows [2 * size, 3 * size) of `vectors`. The window is set
# explicitly (rather than with `+=`) so re-running this cell does not shift it.
start, end = 2 * size, 3 * size
v = vectors[start:end]

# Update the shared `kmeans` estimator with this batch, then read back the
# labels it exposes so they can be scored against the same batch `v`.
clustering = kmeans.partial_fit(v)
labels = clustering.labels_

# The misspelled `calinski_harabaz_score` is absent from newer scikit-learn
# releases; prefer the corrected name and fall back to the old one if needed.
calinski_harabasz = (getattr(metrics, 'calinski_harabasz_score', None)
                     or metrics.calinski_harabaz_score)

print('Silhouette Coefficient: {}'.format(metrics.silhouette_score(v, labels)))
print('Calinski-Harabaz Index: {}'.format(calinski_harabasz(v, labels)))
print('Davies-Bouldin Index: {}'.format(metrics.davies_bouldin_score(v, labels)))

Silhouette Coefficient: 0.04961588568537917
Calinski-Harabaz Index: 14.579268664162775
Davies-Bouldin Index: 3.20127680738101


In [17]:
# Fourth mini-batch: rows [3 * size, 4 * size) of `vectors`. The window is set
# explicitly (rather than with `+=`) so re-running this cell does not shift it.
start, end = 3 * size, 4 * size
v = vectors[start:end]

# Update the shared `kmeans` estimator with this batch, then read back the
# labels it exposes so they can be scored against the same batch `v`.
clustering = kmeans.partial_fit(v)
labels = clustering.labels_

# The misspelled `calinski_harabaz_score` is absent from newer scikit-learn
# releases; prefer the corrected name and fall back to the old one if needed.
calinski_harabasz = (getattr(metrics, 'calinski_harabasz_score', None)
                     or metrics.calinski_harabaz_score)

print('Silhouette Coefficient: {}'.format(metrics.silhouette_score(v, labels)))
print('Calinski-Harabaz Index: {}'.format(calinski_harabasz(v, labels)))
print('Davies-Bouldin Index: {}'.format(metrics.davies_bouldin_score(v, labels)))

Silhouette Coefficient: 0.03845198454509695
Calinski-Harabaz Index: 12.650780098757973
Davies-Bouldin Index: 3.3726673029462466


In [18]:
# Fifth mini-batch: rows [4 * size, 5 * size) of `vectors`. The window is set
# explicitly (rather than with `+=`) so re-running this cell does not shift it.
start, end = 4 * size, 5 * size
v = vectors[start:end]

# Update the shared `kmeans` estimator with this batch, then read back the
# labels it exposes so they can be scored against the same batch `v`.
clustering = kmeans.partial_fit(v)
labels = clustering.labels_

# The misspelled `calinski_harabaz_score` is absent from newer scikit-learn
# releases; prefer the corrected name and fall back to the old one if needed.
calinski_harabasz = (getattr(metrics, 'calinski_harabasz_score', None)
                     or metrics.calinski_harabaz_score)

print('Silhouette Coefficient: {}'.format(metrics.silhouette_score(v, labels)))
print('Calinski-Harabaz Index: {}'.format(calinski_harabasz(v, labels)))
print('Davies-Bouldin Index: {}'.format(metrics.davies_bouldin_score(v, labels)))

Silhouette Coefficient: 0.052113138080286205
Calinski-Harabaz Index: 16.27298944075369
Davies-Bouldin Index: 3.0977727611536947
