In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, AffinityPropagation, MeanShift, SpectralClustering, AgglomerativeClustering, DBSCAN, Birch
from sklearn.mixture import GaussianMixture
from sklearn.metrics import normalized_mutual_info_score

In [3]:
# read data
with open('./Tweets.txt') as f:
    data = [json.loads(l[:-1]) for l in f.readlines()]
data

[{'text': 'brain fluid buildup delay giffords rehab', 'cluster': 37},
 {'text': 'trailer talk week movie rite mechanic week opportunity',
  'cluster': 14},
 {'text': 'rnc appoints chairman tampa convention effort visit tampa republican nati tampa fl',
  'cluster': 100},
 {'text': 'gbagbo camp futile cut ivory coast economy', 'cluster': 110},
 {'text': 'chinese president lost translation powerful leader meet expect tran',
  'cluster': 61},
 {'text': 'england fishing community current management system broken edf',
  'cluster': 60},
 {'text': 'protest reform start yemen hundred anti government protester gathered sanaa',
  'cluster': 79},
 {'text': 'stuxnet lead chernobyl russian', 'cluster': 83},
 {'text': 'iphone share smartphones phone', 'cluster': 81},
 {'text': 'uploaded youtube video nba final lakers celtic game memory highlig',
  'cluster': 67},
 {'text': 'feed epic sci fi car ad kia optimum super bowl commercial video trendhunter supe',
  'cluster': 99},
 {'text': 'naughty faceboo

In [4]:
# construct tf-idf matrix
corpus = [d['text'] for d in data]
vectorizer = TfidfVectorizer(min_df=2, sublinear_tf=True)
X = vectorizer.fit_transform(corpus).todense()
tfidf_matrix = np.array(X)

In [5]:
# get labels of each tweet
labels = [d['cluster'] for d in data]

In [6]:
# K-Means clustering test
k_means = KMeans(n_clusters=max(labels), n_jobs=-1).fit(tfidf_matrix)
nmi = normalized_mutual_info_score(labels, k_means.labels_, average_method='arithmetic')
print('The NMI score of K-Means is:', nmi)

The NMI score of K-Means is: 0.8083287921393979


In [7]:
# Affinity propagation clustering test
ap = AffinityPropagation().fit(tfidf_matrix)
nmi = normalized_mutual_info_score(labels, ap.labels_, average_method='arithmetic')
print('The NMI score of Affinity propagation is:', nmi)

The NMI score of Affinity propagation is: 0.7733356331577484


In [8]:
# MeanShift clustering test
ms = MeanShift(bandwidth=0.44, bin_seeding=True, min_bin_freq=2, cluster_all=True, n_jobs=-1).fit(tfidf_matrix)
nmi = normalized_mutual_info_score(labels, ms.labels_, average_method='arithmetic')
print('The NMI score of MeanShift is:', nmi)
print('Its shape is:', np.unique(ms.labels_).shape)

The NMI score of MeanShift is: 0.6901183198703617
Its shape is: (85,)


In [9]:
# SpectralClustering test
sc = SpectralClustering(n_clusters=max(labels)).fit(tfidf_matrix)
nmi = normalized_mutual_info_score(labels, sc.labels_, average_method='arithmetic')
print('The NMI score of Spectral Clustering is:', nmi)

The NMI score of SpectralClustering is: 0.698345651992622


In [10]:
# Ward hierarchical clustering test
whc = AgglomerativeClustering(n_clusters=max(labels), linkage='ward').fit(tfidf_matrix)
nmi = normalized_mutual_info_score(labels, whc.labels_, average_method='arithmetic')
print('The NMI score of Ward hierarchical clustering is:', nmi)

The NMI score of Ward hierarchical clustering is: 0.7864984641747731


In [11]:
# AgglomerativeClustering test
ac = AgglomerativeClustering(n_clusters=max(labels), affinity='cosine', linkage='single').fit(tfidf_matrix)
nmi = normalized_mutual_info_score(labels, ac.labels_, average_method='arithmetic')
print('The NMI score of Agglomerative Clustering with single linkage is:', nmi)
ac = AgglomerativeClustering(n_clusters=max(labels), affinity='cosine', linkage='average').fit(tfidf_matrix)
nmi = normalized_mutual_info_score(labels, ac.labels_, average_method='arithmetic')
print('The NMI score of Agglomerative Clustering with average linkage is:', nmi)

The NMI score of Agglomerative Clustering with single linkage is: 0.22496764238754718
The NMI score of Agglomerative Clustering with average linkage is: 0.9003064131754916


In [12]:
# DBSCAN test
dbscan = DBSCAN(eps=0.1, n_jobs=-1).fit(tfidf_matrix)
nmi = normalized_mutual_info_score(labels, dbscan.labels_, average_method='arithmetic')
print('The NMI score of DBSCAN with eps=0.1 is:', nmi)
print('Its shape is:', np.unique(dbscan.labels_).shape)
dbscan = DBSCAN(eps=1, n_jobs=-1).fit(tfidf_matrix)
nmi = normalized_mutual_info_score(labels, dbscan.labels_, average_method='arithmetic')
print('The NMI score of DBSCAN with eps=1 is:', nmi)
print('Its shape is:', np.unique(dbscan.labels_).shape)
dbscan = DBSCAN(eps=2, n_jobs=-1).fit(tfidf_matrix)
nmi = normalized_mutual_info_score(labels, dbscan.labels_, average_method='arithmetic')
print('The NMI score of DBSCAN with eps=2 is:', nmi)
print('Its shape is:', np.unique(dbscan.labels_).shape)

The NMI score of DBSCAN with eps=0.1 is: 0.027537116536474605
Its shape is: (5,)
The NMI score of DBSCAN with eps=1 is: 0.568925646557483
Its shape is: (64,)
The NMI score of DBSCAN with eps=2 is: -8.557811214631876e-17
Its shape is: (1,)


In [13]:
# Birch test
brc = Birch(n_clusters=max(labels)).fit(tfidf_matrix)
nmi = normalized_mutual_info_score(labels, brc.labels_, average_method='arithmetic')
print('The NMI score of Birch is:', nmi)

The NMI score of Birch is: 0.8050399281197088


In [14]:
# Gaussian mixtures test
gm_labels = GaussianMixture(n_components=max(labels), covariance_type='tied').fit_predict(tfidf_matrix)
nmi = normalized_mutual_info_score(labels, gm_labels, average_method='arithmetic')
print('The NMI score of Gaussian mixtures is:', nmi)

The NMI score of Gaussian mixtures is: 0.7987454617917258
