In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, SpectralClustering
from sklearn import metrics
from sklearn.decomposition import PCA
import plotly.express as px
import diffusion_map

In [2]:
# Load the PolitiFact article table. The `label` column is used further down as
# the ground-truth class for each row.
# NOTE(review): `Unnamed: 0.1` / `Unnamed: 0` in the output look like index
# columns left over from earlier CSV round-trips — confirm before relying on them.
data = pd.read_csv("data/politifact/politifact_all.csv")
data

Unnamed: 0.1,Unnamed: 0,id,news_url,title,tweet_ids,label
0,0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,0
1,1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,0
2,2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...,0
3,3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...,0
4,4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...,0
...,...,...,...,...,...,...
1051,619,politifact14731,https://www.flake.senate.gov/public/index.cfm/...,Flake: “Religious tests should have no place i...,,1
1052,620,politifact329,https://web.archive.org/web/20080131000131/htt...,Change We Can Believe In,634287923135909888\t946743411100536832\t946816...,1
1053,621,politifact1576,http://www.youtube.com/watch?v=4O8CxZ1OD58,deputy director of national health statistics ...,,1
1054,622,politifact4720,http://www.youtube.com/watch?v=EhyMplwY6HY,Romneys ProLife Conversion Myth or Reality Jun...,188871706637647874,1


In [3]:
# The .npy file holds the embeddings as one flat array; rebuild a 2-D matrix
# with one row per row of `data`.
# NOTE(review): assumes the flat array is row-major and in the same order as
# `data` — TODO confirm against the script that wrote the file.
flat_embeddings = np.load("data/politifact/politifact_embeddings.npy")
num_rows = len(data)
# Fail early with a clear message if the file and the table are out of sync,
# instead of relying on a truncated float division and a later reshape error.
if num_rows == 0 or flat_embeddings.size % num_rows != 0:
    raise ValueError(
        f"Cannot split {flat_embeddings.size} embedding values evenly across {num_rows} rows"
    )
num_cols = flat_embeddings.size // num_rows
sentence_embeddings = flat_embeddings.reshape(num_rows, num_cols)
sentence_embeddings.shape

(1056, 384)

# Clustering

## Metrics

In [4]:
def get_scores(data, labels):
    """Print and return three internal clustering-quality scores.

    Args:
        data: the samples that were clustered (passed straight to sklearn.metrics).
        labels: the cluster assignment for each sample.

    Returns:
        A list ``[silhouette, calinski_harabasz, davies_bouldin]``.
    """
    scorers = (
        # Silhouette: ranges from -1 to 1; 1 is best, 0 indicates overlapping clusters.
        ("Sillhouette score:",
         lambda: metrics.silhouette_score(data, labels, metric='euclidean')),
        # Variance ratio criterion: how tightly clustered (higher is better).
        ("Calinski-Harabasz Index:",
         lambda: metrics.calinski_harabasz_score(data, labels)),
        # Similarity between clusters (lower is better).
        ("Davies-Bouldin Index:",
         lambda: metrics.davies_bouldin_score(data, labels)),
    )
    results = []
    # Compute and report one score at a time, in the order they are returned.
    for caption, compute in scorers:
        value = compute()
        print(caption, value)
        results.append(value)
    return results

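As a baseline, I want to see what scores I would get if the clusters grouped the data perfectly by ground-truth label (the `label` column); the clustering results below can then be compared against these numbers.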

In [5]:
# Baseline: treat the ground-truth labels as if they were cluster assignments
# and score them on the raw sentence embeddings.
get_scores(sentence_embeddings, data.label)

Sillhouette score: 0.01902008
Calinski-Harabasz Index: 19.110797606270467
Davies-Bouldin Index: 7.2986822921237025


[0.01902008, 19.110797606270467, 7.2986822921237025]

### KMeans Clustering

In [6]:
# Sanity check before clustering: select any non-finite entries (NaN or +/-inf).
# A float array can never contain None, so comparing against None always
# selects nothing; an empty result here means every value is usable.
sentence_embeddings[~np.isfinite(sentence_embeddings)]

array([], dtype=float32)

In [7]:
# Select any NaN entries; an empty array means the embeddings contain none.
sentence_embeddings[np.isnan(sentence_embeddings)]

array([], dtype=float32)

In [8]:
# Fixed seed so the cluster assignments (and every score derived from them) are
# reproducible across kernel restarts. n_init is pinned explicitly because its
# default differs between scikit-learn releases.
clusterer = KMeans(n_clusters=5, n_init=10, random_state=42)
sk_labels = clusterer.fit_predict(sentence_embeddings)



In [9]:
# Score the KMeans assignments on the raw sentence embeddings.
get_scores(sentence_embeddings, sk_labels)

Sillhouette score: 0.022097401
Calinski-Harabasz Index: 20.73442106856794
Davies-Bouldin Index: 4.824819034476252


[0.022097401, 20.73442106856794, 4.824819034476252]

### PCA

In [10]:
# Project the embeddings onto their first principal component only.
# random_state keeps the projection repeatable in case scikit-learn picks its
# randomized SVD solver for this input size.
pca = PCA(n_components=1, random_state=42)
pca_data = pca.fit_transform(sentence_embeddings)

In [11]:
# Refit the same KMeans instance on the 1-D PCA projection; this overwrites the
# sk_labels computed on the raw embeddings.
sk_labels = clusterer.fit_predict(pca_data)



In [12]:
# NOTE(review): these scores are computed in the 1-D PCA space (pca_data), not on
# the original embeddings, so they are not directly comparable with the scores
# reported for sentence_embeddings — confirm this is the intended comparison.
get_scores(pca_data, sk_labels)

Sillhouette score: 0.52955246
Calinski-Harabasz Index: 4037.806257685246
Davies-Bouldin Index: 0.5482121205924531


[0.52955246, 4037.806257685246, 0.5482121205924531]

### Diffusion Map

In [13]:
# Embed the sentence vectors with the project-local diffusion_map module, using
# its default constructor arguments.
# NOTE(review): the "matrix gotten" line in the output appears to be printed by
# map() itself — confirm in diffusion_map.
mapper = diffusion_map.DiffusionMap()
diff_map = mapper.map(sentence_embeddings)

matrix gotten


In [14]:
# Refit the same clusterer on the diffusion-map coordinates; sk_labels is
# overwritten again.
sk_labels = clusterer.fit_predict(diff_map)



In [15]:
# NOTE(review): scored in the diffusion-map space (diff_map), not on the original
# embeddings, so these values are not directly comparable with scores computed
# on sentence_embeddings — confirm this is the intended comparison.
get_scores(diff_map, sk_labels)

Sillhouette score: 0.5900118849050666
Calinski-Harabasz Index: 122.80260453103922
Davies-Bouldin Index: 0.30781870914149495


[0.5900118849050666, 122.80260453103922, 0.30781870914149495]

### Spectral Clustering

In [16]:
# Fixed seed so the spectral clustering result is repeatable between runs.
# NOTE(review): n_clusters is left at scikit-learn's default (8), whereas the
# KMeans runs above use 5 — confirm whether the two should match before
# comparing their scores.
clusterer = SpectralClustering(random_state=42)
sk_labels = clusterer.fit_predict(sentence_embeddings)



In [17]:
# Score the spectral-clustering assignments on the raw sentence embeddings.
get_scores(sentence_embeddings, sk_labels)

Sillhouette score: 0.018998915
Calinski-Harabasz Index: 16.758661895976736
Davies-Bouldin Index: 3.8944061428793413


[0.018998915, 16.758661895976736, 3.8944061428793413]

### KMeans using cosine distance

In [18]:
import kMeans as cos_kMeans

In [19]:
# Ground-truth labels as a plain NumPy array.
labels = np.array(data.label)

# Fit the project-local cosine-distance k-means with two clusters.
# NOTE(review): the true labels are passed to train(); presumably they are only
# kept for evaluation (e.g. rand_score) and not used for fitting — TODO confirm
# in the kMeans module.
model = cos_kMeans.kMeans(k=2)
model.train(sentence_embeddings, labels)

In [20]:
# NOTE(review): the displayed values start with 0s and end with 1s, the same
# pattern as data.label — confirm whether this attribute holds the ground-truth
# labels passed to train() or the cluster assignments.
model.labels

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [21]:
# FIXME: this call currently fails with the IndexError shown below. rand_score
# presumably lives in the project-local kMeans module (not visible in this
# notebook), so the cause has to be investigated there.
model.rand_score()

IndexError: invalid index to scalar variable.

In [None]:
# Display whatever get_cluster_labels() returns for the trained model (the
# return format is not visible from this notebook).
model.get_cluster_labels()

In [None]:
# Display the model's `clusters` attribute (its structure is defined in the
# kMeans module, not visible here).
model.clusters