In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.decomposition import PCA
import plotly.express as px
import diffusion_map

In [2]:
# Load the statements table. The first CSV column is a row index that was
# written out when the file was saved (it comes back as "Unnamed: 0" if read
# as data), so restore it as the index instead of keeping it as a column.
data = pd.read_csv("data/liar_embedded.csv", index_col=0)
data.head()

Unnamed: 0,label,statement,subjects,title,state,affiliation,barely_true,false,half_true,mostly_true,pants_on_fire,context,embedding
0,false,Says the Annies List political group supports ...,abortion,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,[-2.36570239e-02 -8.62008557e-02 -4.44836281e-...
1,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,[ 1.40513331e-02 3.18020508e-02 5.52115180e-...
2,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,[-1.65689383e-02 1.25167137e-02 -3.50067168e-...
3,false,Health care reform legislation is likely to ma...,health-care,,,none,7.0,19.0,3.0,5.0,44.0,a news release,[-7.41785616e-02 6.39113188e-02 3.35710347e-...
4,half-true,The economic turnaround started at the end of ...,"economy,jobs",,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,[ 9.05880891e-03 -9.23928991e-03 5.64881079e-...


In [3]:
# The embeddings are stored as a single flat array; rebuild a 2-D matrix with
# one row per row of `data`.
flat_embeddings = np.load("data/flattened_embeddings.npy")
# Fail with a readable message if the array and the table are out of sync,
# rather than with a ZeroDivisionError or an opaque reshape error.
if len(data) == 0 or flat_embeddings.size % len(data) != 0:
    raise ValueError(
        f"Cannot split {flat_embeddings.size} embedding values evenly "
        f"across {len(data)} rows"
    )
# Integer division keeps the column count exact (no float round-trip).
num_cols = flat_embeddings.size // len(data)
sentence_embeddings = flat_embeddings.reshape(len(data), num_cols)
sentence_embeddings.shape

(11507, 384)

# Clustering

## Metrics

In [4]:
def get_scores(data, labels, metric='euclidean'):
    """Print and return three internal cluster-quality scores.

    Parameters
    ----------
    data : array-like
        Feature matrix handed to the scikit-learn scoring functions.
    labels : array-like
        Cluster assignment for each sample in ``data``.
    metric : str or callable, default 'euclidean'
        Distance metric forwarded to ``metrics.silhouette_score`` (for
        example ``'cosine'``). The other two scores are called without a
        metric argument, so this setting does not affect them.

    Returns
    -------
    list
        ``[silhouette, calinski_harabasz, davies_bouldin]`` in that order.
    """
    # Silhouette score ranges from -1 to 1, where 1 is best and 0 indicates
    # cluster overlap. (Spelling of the printed label is kept as-is so that
    # previously recorded outputs still match.)
    ss = metrics.silhouette_score(data, labels, metric=metric)
    print("Sillhouette score:", ss)
    # Variance ratio criterion -- how tightly clustered (higher is better).
    chs = metrics.calinski_harabasz_score(data, labels)
    print("Calinski-Harabasz Index:", chs)
    # Similarity between clusters (lower is better).
    dbs = metrics.davies_bouldin_score(data, labels)
    print("Davies-Bouldin Index:", dbs)
    return [ss, chs, dbs]

### Direct Clustering

In [5]:
# Distinct values of the `label` column; the size of this set is used further
# down as the number of clusters.
labels = set(data["label"])
labels

{'barely-true', 'false', 'half-true', 'mostly-true', 'pants-fire', 'true'}

In [6]:
# One cluster per distinct label. KMeans starts from randomly chosen centroids,
# so pin the seed to make the assignments (and the scores computed from them)
# repeatable across runs.
clusterer = KMeans(n_clusters=len(labels), random_state=0)
sk_labels = clusterer.fit_predict(sentence_embeddings)

In [7]:
# Baseline: score the KMeans assignments in the original embedding space.
get_scores(sentence_embeddings, sk_labels)

Sillhouette score: 0.02414232
Calinski-Harabasz Index: 220.08171753461343
Davies-Bouldin Index: 4.718484780704455


[0.02414232, 220.08171753461343, 4.718484780704455]

### PCA

In [8]:
# Project the embeddings onto their first principal component only, so
# `pca_data` has a single column. The seed only matters when scikit-learn picks
# a randomized SVD solver, but pinning it keeps the projection repeatable.
pca = PCA(n_components=1, random_state=0)
pca_data = pca.fit_transform(sentence_embeddings)

In [9]:
# Build a fresh, seeded KMeans for the PCA projection instead of re-fitting the
# estimator created for the raw embeddings, so this cell does not depend on
# state left behind by an earlier section and its result is repeatable.
clusterer = KMeans(n_clusters=len(labels), random_state=0)
sk_labels = clusterer.fit_predict(pca_data)

In [10]:
# Scores are measured in the 1-component PCA space, not in the original
# embedding space, so they are not directly comparable to the baseline scores.
get_scores(pca_data, sk_labels)

Sillhouette score: 0.532594
Calinski-Harabasz Index: 58772.8541872647
Davies-Bouldin Index: 0.5230355897233093


[0.532594, 58772.8541872647, 0.5230355897233093]

Thoughts:
- try clustering with cosine similarity instead of Euclidean distance
- test out diffusion maps
- test out spectral clustering
- test out other datasets (PolitiFact?)
- go to OH

### Diffusion Map

In [11]:
# Transform the sentence embeddings with the imported `diffusion_map` module,
# using a DiffusionMap constructed with its default arguments.
# NOTE(review): the shape of what `mapper.map` returns is not visible here;
# the cells below assume it is a 2-D array with one row per sentence — confirm.
mapper = diffusion_map.DiffusionMap()
diff_map = mapper.map(sentence_embeddings)

In [12]:
# Fresh KMeans for the diffusion-map coordinates, one cluster per distinct
# label. The seed is pinned because KMeans initialisation is random and the
# scores would otherwise change from run to run.
clusterer = KMeans(n_clusters=len(labels), random_state=0)
sk_labels = clusterer.fit_predict(diff_map)

In [13]:
# Scores are measured on the diffusion-map output, i.e. in the transformed
# space rather than on the original embeddings.
get_scores(diff_map, sk_labels)

Sillhouette score: 0.12865253259743215
Calinski-Harabasz Index: 1060.1953563834677
Davies-Bouldin Index: 1.9711557836889835


[0.12865253259743215, 1060.1953563834677, 1.9711557836889835]