In [31]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, SpectralClustering
from sklearn import metrics
from sklearn.decomposition import PCA
import plotly.express as px
import diffusion_map

In [2]:
# Load the PolitiFact news-item table (id, news_url, title, tweet_ids, label).
data = pd.read_csv("data/politifact/politifact_all.csv")
# Bare last expression so the notebook renders the frame.
data

Unnamed: 0.1,Unnamed: 0,id,news_url,title,tweet_ids,label
0,0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,0
1,1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,0
2,2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...,0
3,3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...,0
4,4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...,0
...,...,...,...,...,...,...
1051,619,politifact14731,https://www.flake.senate.gov/public/index.cfm/...,Flake: “Religious tests should have no place i...,,1
1052,620,politifact329,https://web.archive.org/web/20080131000131/htt...,Change We Can Believe In,634287923135909888\t946743411100536832\t946816...,1
1053,621,politifact1576,http://www.youtube.com/watch?v=4O8CxZ1OD58,deputy director of national health statistics ...,,1
1054,622,politifact4720,http://www.youtube.com/watch?v=EhyMplwY6HY,Romneys ProLife Conversion Myth or Reality Jun...,188871706637647874,1


In [3]:
flat_embeddings = np.load("data/politifact/politifact_embeddings.npy")
# The loaded values are reshaped to one embedding row per row of `data`.
# Integer arithmetic (rather than float division followed by int()) keeps the
# column count exact, and the remainder check reports a row/embedding mismatch
# clearly instead of leaving it to an opaque reshape error.
num_cols, leftover = divmod(flat_embeddings.size, len(data))
if leftover:
    raise ValueError(
        f"embedding file holds {flat_embeddings.size} values, which is not a "
        f"multiple of the {len(data)} rows in `data`"
    )
sentence_embeddings = flat_embeddings.reshape(len(data), num_cols)
sentence_embeddings.shape

(1056, 384)

# Clustering

## Metrics

In [4]:
def get_scores(data, labels):
    """Print and return three internal cluster-quality scores for a labelling.

    Parameters
    ----------
    data : array-like
        Samples, handed unchanged to the sklearn metric functions.
    labels : array-like
        Cluster assignment for each sample in ``data``.

    Returns
    -------
    list
        ``[silhouette, calinski_harabasz, davies_bouldin]`` in that order.
    """
    # Each entry pairs the caption that is printed with a deferred metric call.
    # The captions are runtime output and are kept exactly as they were.
    scorers = (
        # Silhouette ranges from -1 to 1: 1 is best, 0 indicates cluster overlap.
        ("Sillhouette score:",
         lambda: metrics.silhouette_score(data, labels, metric='euclidean')),
        # Variance ratio criterion: how tightly clustered (higher is better).
        ("Calinski-Harabasz Index:",
         lambda: metrics.calinski_harabasz_score(data, labels)),
        # Similarity between clusters (lower is better).
        ("Davies-Bouldin Index:",
         lambda: metrics.davies_bouldin_score(data, labels)),
    )
    results = []
    # Compute and report one metric at a time, in the order listed above.
    for caption, compute in scorers:
        value = compute()
        print(caption, value)
        results.append(value)
    return results

As a baseline, I want to see what scores I would get if the clusters perfectly grouped the data by truth label (the `label` column).

In [5]:
# Baseline: score the grouping given by the dataset's own `label` column.
get_scores(sentence_embeddings, data["label"])

Sillhouette score: 0.01902008
Calinski-Harabasz Index: 19.110797606270467
Davies-Bouldin Index: 7.2986822921237025


[0.01902008, 19.110797606270467, 7.2986822921237025]

### KMeans Clustering

In [6]:
# Sanity check for missing values: pull out any entries that compare equal to None.
# NOTE(review): the array is float32 (see the displayed dtype), which cannot store
# None, so this is expected to come back empty.
sentence_embeddings[sentence_embeddings == None]

array([], dtype=float32)

In [18]:
# Select every NaN entry; an empty result means the embeddings contain no NaNs.
sentence_embeddings[np.isnan(sentence_embeddings)]

array([], dtype=float32)

In [7]:
# NaN never compares equal to anything (not even itself), so `== np.nan` always
# yields an all-False mask and cannot detect missing values. np.isnan is the
# element-wise test that actually finds them.
sentence_embeddings[np.isnan(sentence_embeddings)]

array([], dtype=float32)

In [19]:
# Toy array with two known NaNs, used to check how np.isnan behaves.
test = np.asarray([np.nan, np.nan, 1.0, 2.0])

In [20]:
# Element-wise NaN mask for the toy array.
np.isnan(test)

array([ True,  True, False, False])

In [10]:
# Display the embedding matrix for a visual check of its values and dtype.
sentence_embeddings

array([[-0.05337802,  0.04711616, -0.04323821, ..., -0.02248419,
         0.07464816, -0.01281774],
       [ 0.02060846,  0.11946598,  0.02535219, ..., -0.01384378,
         0.0385513 , -0.09493323],
       [-0.08164155, -0.02521669, -0.0270285 , ..., -0.04338511,
         0.0106045 , -0.03550814],
       ...,
       [-0.06057667,  0.00395338, -0.09825242, ..., -0.05685387,
        -0.01005526,  0.08571234],
       [-0.03491964, -0.03432665, -0.04987245, ..., -0.01057882,
        -0.00316605,  0.04316002],
       [ 0.01210653, -0.04727036, -0.08902845, ..., -0.05093165,
        -0.12302828,  0.03865448]], dtype=float32)

In [None]:
# Use the %pip magic (not a `!` shell escape) so the pinned install targets the
# environment of the running kernel.
%pip install numpy==1.21.4 --user

In [None]:
# -y answers pip's confirmation prompt, which cannot be answered from a notebook cell.
# NOTE(review): this cell sits after the pinned install; confirm the intended order
# (or drop the cell) before running the notebook top to bottom.
%pip uninstall -y numpy

In [None]:
# Show which numpy version the kernel is using.
# NOTE(review): numpy is already imported in the first cell, so this import
# presumably returns the already-loaded module; a kernel restart is likely needed
# before a reinstalled version shows up here. Confirm.
import numpy as np
np.__version__

In [23]:
# Two clusters, matching the two values (0/1) seen in the `label` column.
# random_state pins the centroid initialisation so repeated runs give the same
# clustering and therefore the same scores.
# NOTE(review): the AttributeError recorded under this cell ("'NoneType' object has
# no attribute 'split'") is commonly reported with an outdated threadpoolctl;
# check that package's version before re-pinning numpy. Confirm.
clusterer = KMeans(n_clusters=2, random_state=0)
clusterer.fit(sentence_embeddings)

AttributeError: 'NoneType' object has no attribute 'split'

In [None]:
# `sk_labels` is not assigned until the PCA section, so on a fresh kernel this cell
# raised NameError. Take the assignments from the KMeans model fitted on
# `sentence_embeddings` in the preceding cell.
sk_labels = clusterer.labels_
get_scores(sentence_embeddings, sk_labels)

### PCA

In [None]:
# Reduce the embeddings to a single principal component (n_components=1).
pca = PCA(n_components=1)
pca_data = pca.fit_transform(sentence_embeddings)

In [None]:
# Re-fit the `clusterer` defined earlier on the PCA-reduced data and keep its
# cluster assignments.
sk_labels = clusterer.fit_predict(pca_data)

In [None]:
# Score the clustering in the PCA-reduced space.
get_scores(pca_data, sk_labels)

### Diffusion Map

In [None]:
# Transform the sentence embeddings with DiffusionMap from the imported
# diffusion_map module, constructed with its default arguments.
# NOTE(review): the shape/meaning of `diff_map` is not visible here; check the
# diffusion_map module.
mapper = diffusion_map.DiffusionMap()
diff_map = mapper.map(sentence_embeddings)

In [None]:
# Fresh two-cluster KMeans on the diffusion-map output. random_state pins the
# centroid initialisation so the labels and scores are reproducible across runs.
clusterer = KMeans(n_clusters=2, random_state=0)
sk_labels = clusterer.fit_predict(diff_map)

In [None]:
# Score the clustering in diffusion-map space.
get_scores(diff_map, sk_labels)

### Spectral Clustering

In [None]:
# random_state makes the spectral clustering repeatable from run to run.
# NOTE(review): no n_clusters is given, so scikit-learn's default applies, while
# every other section clusters into 2 groups. Confirm whether n_clusters=2 was
# intended so the scores are comparable.
clusterer = SpectralClustering(random_state=0)
sk_labels = clusterer.fit_predict(sentence_embeddings)

In [None]:
# Score the spectral-clustering assignments against the original embeddings.
get_scores(sentence_embeddings, sk_labels)

### KMeans using cosine distance

In [None]:
import kMeans as cos_kMeans

In [None]:
# Ground-truth labels from the DataFrame's `label` column, as a NumPy array.
labels = np.array(data.label)

# Build the k=2 model from the kMeans module (imported as cos_kMeans) and train it.
# NOTE(review): the labels are passed to train(); presumably they are used only for
# evaluation (e.g. rand_score). Confirm in the kMeans module.
model = cos_kMeans.kMeans(k=2)
model.train(sentence_embeddings, labels)

In [None]:
# Display the model's `labels` attribute.
model.labels

In [None]:
# Display the result of the model's rand_score().
# NOTE(review): presumably this compares the clusters with the labels passed to
# train(). Confirm in the kMeans module.
model.rand_score()

In [None]:
# Display what the model's get_cluster_labels() returns.
model.get_cluster_labels()

In [None]:
# Display the model's `clusters` attribute.
model.clusters