Clustering speeches with the BIRCH algorithm

In [3]:
from sklearn.cluster import Birch
from LSA import LSA
import numpy as np
import pandas as pd

In [3]:
# Toy data set: three points on the line y = 1 followed by three on y = -1
X = [[x, y] for y in (1, -1) for x in (0, 0.3, -0.3)]
# n_clusters=None: no final global clustering step, the raw subclusters are used as labels
brc = Birch(n_clusters=None)
brc.fit(X)

In [4]:
# Label each toy point using the BIRCH model fitted on the same points
brc.predict(X)

array([0, 0, 0, 1, 1, 1])

In [4]:
# Restore the previously saved LSA object from disk.
# NOTE(review): the '.pkl' extension suggests this unpickles the file - only load it from a trusted source.
lsi_obj = LSA.load_object('lsa.pkl')

In [5]:
# Project every speech into the latent space: U @ diag(S).
# Scaling the columns of U by broadcasting yields the same matrix as
# np.dot(U, np.diag(S)) without materialising the dense k x k diagonal matrix
# or paying for a full (n x k) @ (k x k) matrix product.
# Assumes lsi_obj.S is the 1-D vector of singular values, one per column of U.
reduced_tf_idf = np.asarray(lsi_obj.U) * np.asarray(lsi_obj.S)

In [6]:
# Sanity check: (number of speeches, number of dimensions kept after SVD)
reduced_tf_idf.shape

(295813, 779)

After SVD we have 295,813 speeches, each represented by a 779-dimensional vector.

In [6]:
# Measure the in-memory size of reduced_tf_idf.
# ndarray.nbytes counts the bytes of the array elements themselves, whereas
# sys.getsizeof only reports the small Python object header when the array is
# a view that does not own its buffer, so it can badly under-report.
import sys  # no longer needed by this cell; kept in case other cells rely on the name
print(f'{reduced_tf_idf.nbytes / 1024 / 1024} MB')


1758.1050338745117 MB


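Constructing a full pairwise distance matrix is not possible due to memory constraints: for 295,813 speeches it would need about 295,813² × 8 bytes ≈ 700 GB as 64-bit floats. Hence BIRCH is used, since it clusters the data incrementally without building that matrix.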

In [5]:
# Cluster the LSA-reduced speeches with BIRCH
brc = Birch(
    threshold=0.5,        # radius limit for merging a sample into an existing subcluster
    branching_factor=50,  # maximum number of subclusters per tree node
    n_clusters=10,        # number of clusters produced by the final clustering step
)
brc.fit(reduced_tf_idf)

Analyse the resulting clusters to check whether the clustering is actually good; if it is not, tune the BIRCH parameters.

In [None]:
# Assign every speech to a cluster using the fitted BIRCH model
results = brc.predict(reduced_tf_idf)

In [7]:
# Count how many speeches were assigned to each cluster label
from collections import Counter
Counter(results)

Counter({np.int64(8): 108413,
         np.int64(2): 73350,
         np.int64(4): 67506,
         np.int64(0): 27226,
         np.int64(1): 9252,
         np.int64(9): 4786,
         np.int64(5): 1607,
         np.int64(3): 1341,
         np.int64(7): 1221,
         np.int64(6): 1111})

In [None]:
# Persist only the cluster labels (not the fitted model) to 'labels.npy'
np.save('labels.npy', results)

In [3]:
# Read the labels back from 'labels.npy'.
# NOTE(review): the cells below still use `results`, not `labels` - confirm which one is intended.
labels = np.load('labels.npy')

In [None]:
# Overall silhouette score: the mean silhouette coefficient over all speeches
# (a single number for the whole clustering, not one value per cluster)
from sklearn.metrics import silhouette_score
score = silhouette_score(reduced_tf_idf, results)
print(score)

np.float64(0.01931933169206312)

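The silhouette score (≈ 0.019) is not negative, but it is very close to 0, which indicates heavily overlapping clusters rather than well-separated ones.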

In [None]:
# Compute Hopkins statistic - Code from https://github.com/prathmachowksey/Hopkins-Statistic-Clustering-Tendency/blob/master/Hopkins-Statistic-Clustering-Tendency.ipynb
from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
import numpy as np
from math import isnan
def hopkins_statistic(X, sample_fraction=0.05, random_state=None):
    """Estimate the Hopkins statistic, a measure of clustering tendency.

    A random subset of the rows of ``X`` and an equally sized set of points
    drawn uniformly from the bounding box of ``X`` are each matched to their
    nearest neighbour in ``X``.  With ``u`` the summed distances of the uniform
    points and ``w`` the summed distances of the real points, the statistic is
    ``H = u / (u + w)``.  Values near 0.5 suggest uniformly spread data, values
    close to 1 suggest clustered data.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to analyse; needs at least two rows.
    sample_fraction : float, default=0.05
        Fraction of the rows to sample (5% following Lawson and Jurs).  At
        least one row is always sampled, so small inputs still work.
    random_state : None, int or numpy.random.Generator, default=None
        Seed or generator for the two random draws.  Pass an int to get a
        reproducible value; ``None`` draws fresh randomness on every call.

    Returns
    -------
    numpy.float64
        The Hopkins statistic in [0, 1], or NaN when every measured distance
        is zero (for example when all rows are identical).

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, has fewer than two rows, or ``sample_fraction``
        lies outside (0, 1].
    """
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    n_rows, n_dims = X.shape
    if n_rows < 2:
        raise ValueError("X must contain at least two rows")
    if not 0 < sample_fraction <= 1:
        raise ValueError("sample_fraction must be in the interval (0, 1]")

    rng = np.random.default_rng(random_state)

    # int() truncates to 0 for small inputs, so always keep at least one sample
    sample_size = max(1, int(n_rows * sample_fraction))

    # a uniform random sample in the bounding box of the original data space
    X_uniform_random_sample = rng.uniform(
        X.min(axis=0), X.max(axis=0), (sample_size, n_dims)
    )

    # a random sample (without replacement) of rows from the original data X
    random_indices = rng.choice(n_rows, size=sample_size, replace=False)
    X_sample = X[random_indices]

    # unsupervised learner used for both neighbour searches
    nbrs = NearestNeighbors(n_neighbors=2).fit(X)

    # u_distances: distance from each uniform point to its nearest data point;
    # only the first neighbour is used, so only one is requested
    u_distances, _ = nbrs.kneighbors(X_uniform_random_sample, n_neighbors=1)
    u_distances = u_distances[:, 0]

    # w_distances: distance from each sampled data point to its nearest *other*
    # data point - the first neighbour is the point itself (distance 0), so the
    # second one is taken
    w_distances, _ = nbrs.kneighbors(X_sample, n_neighbors=2)
    w_distances = w_distances[:, 1]

    u_sum = np.sum(u_distances)
    w_sum = np.sum(w_distances)
    total = u_sum + w_sum

    # 0 / 0 would emit a RuntimeWarning; report the undefined statistic explicitly
    if total == 0:
        return np.float64(np.nan)

    return u_sum / total
    

# Hopkins statistic of the reduced speech matrix; the sampling is unseeded here, so the value varies slightly between runs
hopkins_statistic(reduced_tf_idf)

np.float64(0.957866984297328)