Thoughts:
- try with cosine similarity instead of euclidean distance
- test out diffusion maps
- test out spectral clustering
- test out other datasets (Politifact?)
- go to OH

In [14]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, SpectralClustering
from sklearn import metrics
from sklearn.decomposition import PCA
import plotly.express as px
import diffusion_map

In [2]:
# Load the LIAR statements table from a path relative to the notebook's working directory.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a saved
# row index — consider index_col=0 if nothing downstream relies on that column.
data = pd.read_csv("data/liar_all.csv")
# Preview the first rows to confirm the columns were parsed as expected.
data.head()

Unnamed: 0,label,statement,subjects,title,state,affiliation,barely_true,false,half_true,mostly_true,pants_on_fire,context,embedding
0,false,Says the Annies List political group supports ...,abortion,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,[-2.36570239e-02 -8.62008557e-02 -4.44836281e-...
1,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,[ 1.40513331e-02 3.18020508e-02 5.52115180e-...
2,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,[-1.65689383e-02 1.25167137e-02 -3.50067168e-...
3,false,Health care reform legislation is likely to ma...,health-care,,,none,7.0,19.0,3.0,5.0,44.0,a news release,[-7.41785616e-02 6.39113188e-02 3.35710347e-...
4,half-true,The economic turnaround started at the end of ...,"economy,jobs",,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,[ 9.05880891e-03 -9.23928991e-03 5.64881079e-...


In [3]:
# Load the precomputed sentence embeddings and arrange them as one row per statement.
flat_embeddings = np.load("data/liar_embeddings.npy")

# Fail early, with a readable message, if the embedding file and the table are out of
# sync. The previous float division + int() truncated the width and deferred the
# failure to a less informative reshape error.
assert len(data) > 0 and flat_embeddings.size % len(data) == 0, (
    f"{flat_embeddings.size} embedding values cannot be split evenly "
    f"across {len(data)} statements"
)

# Integer division keeps the embedding width an exact int (no float round-trip).
num_cols = flat_embeddings.size // len(data)
sentence_embeddings = flat_embeddings.reshape(len(data), num_cols)
sentence_embeddings.shape

(11507, 384)

# Clustering

## Metrics

In [4]:
def get_scores(data, labels, metric='euclidean'):
    """Print and return three internal clustering-quality scores.

    Parameters
    ----------
    data : array-like
        Feature matrix handed to the scikit-learn metric functions
        (one row per sample).
    labels : array-like
        Cluster assignment for each row of ``data``.
    metric : str, default 'euclidean'
        Distance metric forwarded to ``silhouette_score`` (e.g. ``'cosine'``).
        Only the silhouette score takes a metric argument; the
        Calinski-Harabasz and Davies-Bouldin calls below do not, so they are
        unaffected by this parameter.

    Returns
    -------
    list
        ``[silhouette, calinski_harabasz, davies_bouldin]``.
    """
    # silhouette score ranges from -1 to 1, where 1 is best and 0 indicates cluster overlap
    ss = metrics.silhouette_score(data, labels, metric=metric)
    print("Sillhouette score:", ss)
    # variance ratio criterion -- how tightly clustered (higher is better)
    chs = metrics.calinski_harabasz_score(data, labels)
    print("Calinski-Harabasz Index:", chs)
    # similarity between clusters (lower is better)
    dbs = metrics.davies_bouldin_score(data, labels)
    print("Davies-Bouldin Index:", dbs)
    return [ss, chs, dbs]

As a baseline, I want to see what scores I would get if the clusters perfectly grouped the data by truth level, i.e. if the ground-truth labels themselves were used as the cluster assignments.

In [46]:
def simple_label(label):
    """Collapse a truth rating into a boolean.

    Returns False for "false", "barely-true" and "pants-fire"; every other
    value (including anything unrecognised) returns True.
    """
    return label not in ("false", "barely-true", "pants-fire")

def label_to_idx(label):
    """Map a truth rating to its ordinal position, 0 ("pants-fire") to 5 ("true").

    Returns None when the label is not one of the six known ratings.
    """
    # Ordered from least to most truthful; the position in the tuple is the index.
    ordered_ratings = (
        "pants-fire",
        "false",
        "barely-true",
        "half-true",
        "mostly-true",
        "true",
    )
    for position, rating in enumerate(ordered_ratings):
        if label == rating:
            return position
    return None

In [54]:
# Baseline: score the ground-truth binary split (simple_label) as if it were a clustering.
# Materialise the labels as an array instead of a one-shot `map` iterator, so
# `simple_labels` is still usable after this cell has run.
simple_labels = np.array([simple_label(label) for label in data.label])
get_scores(sentence_embeddings, simple_labels)

Sillhouette score: 0.0019048563
Calinski-Harabasz Index: 21.620586318039358
Davies-Bouldin Index: 22.866642737754894


[0.0019048563, 21.620586318039358, 22.866642737754894]

In [55]:
# Baseline: score the ground-truth six-way rating (label_to_idx) as if it were a clustering.
# Materialise the labels as an array instead of a one-shot `map` iterator, so
# `labels_as_nums` is still usable after this cell has run.
labels_as_nums = np.array([label_to_idx(label) for label in data.label])
get_scores(sentence_embeddings, labels_as_nums)

Sillhouette score: -0.004734566
Calinski-Harabasz Index: 7.719753509924825
Davies-Bouldin Index: 35.501960516475805


[-0.004734566, 7.719753509924825, 35.501960516475805]

### KMeans Clustering

In [5]:
# Distinct truth ratings present in the dataset.
# NOTE(review): `labels` is rebound to a per-row array in the cosine-kMeans section
# further down, so cells that use len(labels) depend on execution order.
labels = set(data.label)
labels

{'barely-true', 'false', 'half-true', 'mostly-true', 'pants-fire', 'true'}

In [56]:
# Two clusters — presumably to mirror the binary split produced by simple_label.
# Pin the seed so Restart & Run All reproduces the same assignment, and pin n_init
# because its default differs between scikit-learn releases.
clusterer = KMeans(n_clusters=2, n_init=10, random_state=0)
sk_labels = clusterer.fit_predict(sentence_embeddings)

In [57]:
# Score the most recent KMeans assignment (`sk_labels`) in the full embedding space.
get_scores(sentence_embeddings, sk_labels)

Sillhouette score: 0.02963337
Calinski-Harabasz Index: 367.7585495837938
Davies-Bouldin Index: 5.560759864821069


[0.02963337, 367.7585495837938, 5.560759864821069]

### PCA

In [8]:
# Project the embeddings onto their first principal component only.
# random_state only matters when scikit-learn picks a randomized SVD solver
# (svd_solver='auto' may do so for data of this size); pinning it keeps the
# projection reproducible either way.
pca = PCA(n_components=1, random_state=0)
pca_data = pca.fit_transform(sentence_embeddings)

In [9]:
# Build the clusterer here instead of reusing whatever `clusterer` was last bound:
# later sections rebind that name to KMeans(n_clusters=len(labels)) and to
# SpectralClustering, so the result used to depend on cell execution order.
# n_clusters=2 matches the KMeans section above in top-to-bottom order.
clusterer = KMeans(n_clusters=2, n_init=10, random_state=0)
sk_labels = clusterer.fit_predict(pca_data)

In [10]:
# Scores are computed in the PCA-reduced space (pca_data), not on the original
# embeddings, so they are not directly comparable with the scores from the
# sections that pass sentence_embeddings.
get_scores(pca_data, sk_labels)

Sillhouette score: 0.532594
Calinski-Harabasz Index: 58772.8541872647
Davies-Bouldin Index: 0.5230355897233093


[0.532594, 58772.8541872647, 0.5230355897233093]

### Diffusion Map

In [11]:
# Embed the sentence vectors with the project-local diffusion_map module, using its
# default constructor arguments.
# NOTE(review): the shape/dimensionality of `diff_map` is not visible here — confirm
# in diffusion_map.DiffusionMap.map.
mapper = diffusion_map.DiffusionMap()
diff_map = mapper.map(sentence_embeddings)

In [12]:
# One cluster per distinct truth rating. Count the ratings from the DataFrame rather
# than via len(labels): `labels` is rebound to a per-row array in the cosine-kMeans
# section, which would make this cell ask for one cluster per row if re-run afterwards.
n_truth_levels = data.label.nunique()
# Pin the seed (and n_init, whose default differs between scikit-learn releases)
# so the clustering is reproducible.
clusterer = KMeans(n_clusters=n_truth_levels, n_init=10, random_state=0)
sk_labels = clusterer.fit_predict(diff_map)

In [13]:
# Scores are computed in diffusion-map space (diff_map), not on the original
# embeddings, so compare them with the other sections with care.
get_scores(diff_map, sk_labels)

Sillhouette score: 0.12865253259743215
Calinski-Harabasz Index: 1060.1953563834677
Davies-Bouldin Index: 1.9711557836889835


[0.12865253259743215, 1060.1953563834677, 1.9711557836889835]

### Spectral Clustering

In [26]:
# n_clusters=8 is scikit-learn's default, made explicit here so the cluster count is
# visible. NOTE(review): the other sections use 2 or 6 clusters — confirm 8 is intended.
# random_state pins the stochastic steps so the assignment is reproducible.
clusterer = SpectralClustering(n_clusters=8, random_state=0)
sk_labels = clusterer.fit_predict(sentence_embeddings)

In [27]:
# Score the spectral-clustering assignment in the full embedding space.
get_scores(sentence_embeddings, sk_labels)

Sillhouette score: 0.022871412
Calinski-Harabasz Index: 174.48618249622183
Davies-Bouldin Index: 4.852943222011046


[0.022871412, 174.48618249622183, 4.852943222011046]

### KMeans using cosine distance

In [28]:
import kMeans as cos_kMeans

In [36]:
# get data labels
labels = np.array(data.label)

# run kMeans
model = cos_kMeans.kMeans(k=6)
model.train(sentence_embeddings, labels)

Updating. 3729 points changed.
Updating. 1763 points changed.
Updating. 1084 points changed.
Updating. 826 points changed.
Updating. 574 points changed.
Updating. 393 points changed.
Updating. 298 points changed.
Updating. 274 points changed.
Updating. 251 points changed.
Updating. 251 points changed.
Updating. 290 points changed.
Updating. 286 points changed.
Updating. 294 points changed.
Updating. 327 points changed.
Updating. 269 points changed.
Updating. 246 points changed.
Updating. 184 points changed.
Updating. 150 points changed.
Updating. 139 points changed.
Updating. 128 points changed.
Updating. 140 points changed.
Updating. 138 points changed.
Updating. 140 points changed.
Updating. 161 points changed.
Updating. 187 points changed.
Updating. 247 points changed.
Updating. 264 points changed.
Updating. 292 points changed.
Updating. 279 points changed.
Updating. 267 points changed.
Updating. 207 points changed.
Updating. 171 points changed.
Updating. 144 points changed.
Updatin

In [37]:
# Presumably the Rand index between the fitted clusters and the labels passed to
# train() — TODO confirm against the kMeans module.
model.rand_score()

0.5901663617537287

In [38]:
# Number of points assigned to each of the six clusters (the counts shown sum to
# the 11507 rows in the dataset).
model.cluster_counts()

[1744, 2373, 1610, 1379, 1801, 2600]

In [40]:
# One entry per cluster; presumably the first letter of each cluster's dominant
# truth rating — TODO confirm against the kMeans module.
model.get_cluster_labels()

['m', 'f', 'f', 'f', 'm', 'h']


In [41]:
# TODO: write a function to get the labels for the entire dataset