In [2]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, SpectralClustering
from sklearn import metrics
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist, squareform
import sys
import matplotlib.pyplot as plt
sys.path.append("../../")
import diffusion_map
import plotly.express as px

In [3]:
data = pd.read_csv("../../data/politifact/politifact_all.csv")
data

Unnamed: 0.1,Unnamed: 0,id,news_url,title,tweet_ids,label
0,0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,0
1,1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,0
2,2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...,0
3,3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...,0
4,4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...,0
...,...,...,...,...,...,...
1051,619,politifact14731,https://www.flake.senate.gov/public/index.cfm/...,Flake: “Religious tests should have no place i...,,1
1052,620,politifact329,https://web.archive.org/web/20080131000131/htt...,Change We Can Believe In,634287923135909888\t946743411100536832\t946816...,1
1053,621,politifact1576,http://www.youtube.com/watch?v=4O8CxZ1OD58,deputy director of national health statistics ...,,1
1054,622,politifact4720,http://www.youtube.com/watch?v=EhyMplwY6HY,Romneys ProLife Conversion Myth or Reality Jun...,188871706637647874,1


In [4]:
vectors = np.load("../../data/politifact/politifact_count_vectorizer_1.npy")
X = vectors.reshape(1056, 3375)
X.shape

(1056, 3375)

In [5]:
ground_truths = data.label

# Clustering

## Metrics

In [6]:
def intrinsic_metrics(data, predictions):
    #sillhouette score ranges from -1 to 1, where 1 is best and 0 indicates cluster overlap
    ss = metrics.silhouette_score(data, predictions, metric='euclidean')
    print("Sillhouette score:", ss)
    # variance ratio criterion-- how tightly clustered (higher is better)
    chs = metrics.calinski_harabasz_score(data, predictions)
    print("Calinski-Harabasz Index:", chs)
    # similarity between clusters (lower is better)
    dbs = metrics.davies_bouldin_score(data, predictions)   
    print("Davies-Bouldin Index:", dbs)
    return [ss, chs, dbs]

In [7]:
def extrinsic_metrics(ground_truths, predictions):
    # rand index score (higher is better, max 1)
    rs = metrics.rand_score(ground_truths, predictions)
    print("Random Index:", rs)
    # homogeneity - closer to 1 is better
    homo = metrics.homogeneity_score(ground_truths, predictions)
    print("Homogeneity:", homo)
    
    return [rs, homo]

## Using Euclidean Distance Only

### KMeans Clustering

#### Finding the optimal k

In [36]:
def run_kMeans(X, k):
    best_model = 0
    best_rand_score = 0
    for i in range(10):
        clusterer = KMeans(n_clusters=k, init='k-means++')
        sk_labels = clusterer.fit_predict(X)
        rand_score = metrics.rand_score(ground_truths, sk_labels)
        if rand_score > best_rand_score:
            best_rand_score = rand_score
            best_model = sk_labels
        return sk_labels        

In [40]:
k = []
ch_idx = []
rand_idx = []
homogeneity = []
for i in range(2, 30):
    labels = run_kMeans(X, i)
    k.append(i)
    ch_idx.append(metrics.calinski_harabasz_score(X, labels))
    rand_idx.append(metrics.rand_score(ground_truths, labels))
    homogeneity.append(metrics.homogeneity_score(ground_truths, labels))

In [41]:
df = pd.DataFrame({'k': k, 'Calinski-Harabasz Score': ch_idx, "Rand Index": rand_idx, "Homogeneity": homogeneity})
fig = px.line(df, x='k', y='Rand Index')
fig.show()

In [42]:
fig = px.line(df, x='k', y='Homogeneity')
fig.show()

In [43]:
fig = px.line(df, x='k', y='Calinski-Harabasz Score')
fig.show()

In [46]:
predictions = run_kMeans(X, 5)

In [47]:
intrinsic_metrics(X, predictions)

Sillhouette score: -0.011878307033985045
Calinski-Harabasz Index: 14.984937757199868
Davies-Bouldin Index: 4.202313025625445


[-0.011878307033985045, 14.984937757199868, 4.202313025625445]

In [49]:
extrinsic_metrics(ground_truths, predictions)

Random Index: 0.5102847192302169
Homogeneity: 0.047412252187544794


[0.5102847192302169, 0.047412252187544794]

### PCA

In [25]:
pca = PCA()
components = pca.fit_transform(X)

df = pd.DataFrame({"cumulative explained variance": np.cumsum(pca.explained_variance_ratio_)})
fig = px.line(df, x=df.index, y="cumulative explained variance")
fig.show()

In [50]:
pca = PCA(n_components=5)
pca_data = pca.fit_transform(X)

In [52]:
# finding the best number of clusters for PCA-- it's still 3
k = []
ch_idx = []
rand_idx = []
homogeneity = []
for i in range(2, 21):
    labels = run_kMeans(pca_data, i)
    k.append(i)
    ch_idx.append(metrics.calinski_harabasz_score(pca_data, labels))
    rand_idx.append(metrics.rand_score(ground_truths, labels))
    homogeneity.append(metrics.homogeneity_score(ground_truths, labels))

df = pd.DataFrame({'k': k, 'Calinski-Harabasz Score': ch_idx, "Rand Index": rand_idx, "Homogeneity": homogeneity})
fig = px.line(df, x='k', y='Rand Index')
fig.show()

In [53]:
labels = run_kMeans(pca_data, 5)

In [54]:
intrinsic_metrics(pca_data, labels)

Sillhouette score: 0.41715124592242325
Calinski-Harabasz Index: 294.93219155615816
Davies-Bouldin Index: 0.8790616157795617


[0.41715124592242325, 294.93219155615816, 0.8790616157795617]

In [55]:
extrinsic_metrics(ground_truths, labels)

Random Index: 0.521993034611518
Homogeneity: 0.05613486465195775


[0.521993034611518, 0.05613486465195775]

### Euclidean Diffusion Map

In [56]:
mapper = diffusion_map.DiffusionMap()
diff_map = mapper.map(X)

In [63]:
# finding the best number of clusters for diffusion map -- it's k=4
k = []
ch_idx = []
rand_idx = []
homogeneity = []
for i in range(2, 20):
    labels = run_kMeans(diff_map, i)
    k.append(i)
    ch_idx.append(metrics.calinski_harabasz_score(diff_map, labels))
    rand_idx.append(metrics.rand_score(ground_truths, labels))
    homogeneity.append(metrics.homogeneity_score(ground_truths, labels))

df = pd.DataFrame({'k': k, 'Calinski-Harabasz Score': ch_idx, "Rand Index": rand_idx, "Homogeneity": homogeneity})
fig = px.line(df, x='k', y='Rand Index')
fig.show()

In [64]:
labels = run_kMeans(diff_map, 14)

In [65]:
intrinsic_metrics(diff_map, labels)

Sillhouette score: 0.9335379636305458
Calinski-Harabasz Index: 136.3968322221274
Davies-Bouldin Index: 0.8984099477320334


[0.9335379636305458, 136.3968322221274, 0.8984099477320334]

In [66]:
extrinsic_metrics(ground_truths, labels)

Random Index: 0.5214778112882378
Homogeneity: 0.01867179952804918


[0.5214778112882378, 0.01867179952804918]

### Spectral Clustering

In [67]:
def run_SpectralClustering(X, k):
    best_model = 0
    best_rand_score = 0
    for i in range(10):
        clusterer = SpectralClustering(n_clusters=k)
        sk_labels = clusterer.fit_predict(X)
        rand_score = metrics.rand_score(ground_truths, sk_labels)
        if rand_score > best_rand_score:
            best_rand_score = rand_score
            best_model = sk_labels
        return sk_labels        

In [68]:
# finding the best number of clusters for spectral clustering -- it's k=5
k = []
ch_idx = []
rand_idx = []
homogeneity = []
for i in range(2, 21):
    labels = run_SpectralClustering(X, i)
    k.append(i)
    ch_idx.append(metrics.calinski_harabasz_score(X, labels))
    rand_idx.append(metrics.rand_score(ground_truths, labels))
    homogeneity.append(metrics.homogeneity_score(ground_truths, labels))

df = pd.DataFrame({'k': k, 'Calinski-Harabasz Score': ch_idx, "Rand Index": rand_idx, "Homogeneity": homogeneity})
fig = px.line(df, x='k', y='Rand Index')
fig.show()


Number of distinct clusters (11) found smaller than n_clusters (14). Possibly due to duplicate points in X.


Number of distinct clusters (14) found smaller than n_clusters (16). Possibly due to duplicate points in X.


Number of distinct clusters (15) found smaller than n_clusters (17). Possibly due to duplicate points in X.


Number of distinct clusters (17) found smaller than n_clusters (18). Possibly due to duplicate points in X.


Number of distinct clusters (17) found smaller than n_clusters (19). Possibly due to duplicate points in X.


Number of distinct clusters (19) found smaller than n_clusters (20). Possibly due to duplicate points in X.



In [69]:
labels = run_SpectralClustering(X, 2)

In [70]:
intrinsic_metrics(X, labels)

Sillhouette score: 0.6760677152679737
Calinski-Harabasz Index: 17.37158347896344
Davies-Bouldin Index: 0.23068697302970662


[0.6760677152679737, 17.37158347896344, 0.23068697302970662]

In [71]:
extrinsic_metrics(ground_truths, labels)

Random Index: 0.5157277753841735
Homogeneity: 0.0007368599681159


[0.5157277753841735, 0.0007368599681159]

## Using Cosine Distances

### KMeans Cosine

In [72]:
# import sys
# sys.path.append("../../")
import kMeans as cos_kMeans

In [73]:
# finding the best number of clusters cosine kMeans (2)
k = []
ch_idx = []
rand_idx = []
homogeneity = []
for i in range(2, 10):
    clusterer = cos_kMeans.kMeans(k=i)
    clusterer.train(X, ground_truths)
    predict_labels = clusterer.labels
    k.append(i)
    ch_idx.append(metrics.calinski_harabasz_score(X, predict_labels))
    rand_idx.append(metrics.rand_score(ground_truths, predict_labels))
    homogeneity.append(metrics.homogeneity_score(ground_truths, predict_labels))

In [74]:
df = pd.DataFrame({'k': k, 'Calinski-Harabasz Score': ch_idx, "Rand Index": rand_idx, "Homogeneity": homogeneity})
fig = px.line(df, x='k', y='Rand Index')
fig.show()

In [75]:
model = cos_kMeans.kMeans(k=2)
model.train(X, ground_truths)

In [76]:
intrinsic_metrics(X, model.labels)

Sillhouette score: 0.0627209670362962
Calinski-Harabasz Index: 17.135283067081918
Davies-Bouldin Index: 6.805787989004544


[0.0627209670362962, 17.135283067081918, 6.805787989004544]

In [77]:
extrinsic_metrics(ground_truths, model.labels)

Random Index: 0.5643329024845612
Homogeneity: 0.07332680335410253


[0.5643329024845612, 0.07332680335410253]

### PCA with cosine KMeans

In [78]:
# finding the best number of clusters cosine kMeans
k = []
ch_idx = []
rand_idx = []
homogeneity = []
for i in range(2, 10):
    clusterer = cos_kMeans.kMeans(k=i)
    clusterer.train(pca_data, ground_truths)
    predict_labels = clusterer.labels
    k.append(i)
    ch_idx.append(metrics.calinski_harabasz_score(pca_data, predict_labels))
    rand_idx.append(metrics.rand_score(ground_truths, predict_labels))
    homogeneity.append(metrics.homogeneity_score(ground_truths, predict_labels))

In [79]:
df = pd.DataFrame({'k': k, 'Calinski-Harabasz Score': ch_idx, "Rand Index": rand_idx, "Homogeneity": homogeneity})
fig = px.line(df, x='k', y='Rand Index')
fig.show()

In [83]:
clusterer = cos_kMeans.kMeans(k=4)
clusterer.train(pca_data, ground_truths)

In [84]:
intrinsic_metrics(pca_data, clusterer.labels)

Sillhouette score: 0.3816008884386169
Calinski-Harabasz Index: 236.3342324165176
Davies-Bouldin Index: 1.3060880763181153


[0.3816008884386169, 236.3342324165176, 1.3060880763181153]

In [85]:
extrinsic_metrics(ground_truths, clusterer.labels)

Random Index: 0.5240862415625449
Homogeneity: 0.06645287807080028


[0.5240862415625449, 0.06645287807080028]

### Cosine Diffusion Map and Euclidean KMeans

In [97]:
mapper = diffusion_map.DiffusionMap(distance='cosine')
diff_map = mapper.map(X)

In [98]:
# finding the best number of clusters cosine kMeans
k = []
ch_idx = []
rand_idx = []
homogeneity = []
for i in range(2, 10):
    labels = run_kMeans(diff_map, i)
    k.append(i)
    ch_idx.append(metrics.calinski_harabasz_score(pca_data, labels))
    rand_idx.append(metrics.rand_score(ground_truths, labels))
    homogeneity.append(metrics.homogeneity_score(ground_truths, labels))

In [99]:
df = pd.DataFrame({'k': k, 'Calinski-Harabasz Score': ch_idx, "Rand Index": rand_idx, "Homogeneity": homogeneity})
fig = px.line(df, x='k', y='Rand Index')
fig.show()

In [103]:
labels = run_kMeans(diff_map, 3)

In [104]:
intrinsic_metrics(diff_map, labels)

Sillhouette score: 0.6346321485006231
Calinski-Harabasz Index: 144.89943697550058
Davies-Bouldin Index: 0.2552064044532332


[0.6346321485006231, 144.89943697550058, 0.2552064044532332]

In [105]:
extrinsic_metrics(ground_truths, labels)

Random Index: 0.5088593278759156
Homogeneity: 0.01718479466067655


[0.5088593278759156, 0.01718479466067655]

### Cosine Diffusion Map and Cosine KMeans

In [87]:
# finding the best number of clusters cosine kMeans
k = []
ch_idx = []
rand_idx = []
homogeneity = []
for i in range(2, 10):
    clusterer = KMeans(n_clusters=i, init='k-means++')
    sk_labels = clusterer.fit_predict(diff_map)
    k.append(i)
    ch_idx.append(metrics.calinski_harabasz_score(pca_data, sk_labels))
    rand_idx.append(metrics.rand_score(ground_truths, sk_labels))
    homogeneity.append(metrics.homogeneity_score(ground_truths, sk_labels))

In [88]:
df = pd.DataFrame({'k': k, 'Calinski-Harabasz Score': ch_idx, "Rand Index": rand_idx, "Homogeneity": homogeneity})
fig = px.line(df, x='k', y='Rand Index')
fig.show()

In [89]:
model = cos_kMeans.kMeans(k=4)
model.train(pca_data, ground_truths)

In [90]:
intrinsic_metrics(diff_map, model.labels)

Sillhouette score: 0.21848074748455268
Calinski-Harabasz Index: 108.10498786917998
Davies-Bouldin Index: 1.8416524525056888


[0.21848074748455268, 108.10498786917998, 1.8416524525056888]

In [91]:
extrinsic_metrics(ground_truths, model.labels)

Random Index: 0.5238923596151084
Homogeneity: 0.06581804446766969


[0.5238923596151084, 0.06581804446766969]

### Cosine Spectral Clustering with Euclidean KMeans

In [92]:
affinity = squareform(pdist(X, 'cosine'))

In [93]:
# finding the best number of clusters for spectral clustering -- it's k=5
k = []
ch_idx = []
rand_idx = []
homogeneity = []
for i in range(2, 10):
    clusterer = SpectralClustering(n_clusters=i, affinity='precomputed')
    sk_labels = clusterer.fit_predict(affinity)
    k.append(i)
    ch_idx.append(metrics.calinski_harabasz_score(X, sk_labels))
    rand_idx.append(metrics.rand_score(ground_truths, sk_labels))
    homogeneity.append(metrics.homogeneity_score(ground_truths, sk_labels))

df = pd.DataFrame({'k': k, 'Calinski-Harabasz Score': ch_idx, "Rand Index": rand_idx, "Homogeneity": homogeneity})
fig = px.line(df, x='k', y='Rand Index')
fig.show()

In [94]:
clusterer = SpectralClustering(n_clusters=2, affinity='precomputed')
sk_labels = clusterer.fit_predict(affinity)

In [95]:
intrinsic_metrics(X, sk_labels)

Sillhouette score: -0.11960823058632404
Calinski-Harabasz Index: 1.0231687619559289
Davies-Bouldin Index: 8.97941131179575


[-0.11960823058632404, 1.0231687619559289, 8.97941131179575]

In [96]:
extrinsic_metrics(ground_truths, sk_labels)

Random Index: 0.5075847335918426
Homogeneity: 0.017543877121388617


[0.5075847335918426, 0.017543877121388617]