Thoughts:
- use a different vectorizer, like doc2vec

In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, SpectralClustering
from sklearn import metrics
from sklearn.decomposition import PCA
import plotly.express as px
import diffusion_map

In [28]:
data = pd.read_csv("data/politifact_all.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1056 entries, 0 to 1055
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1056 non-null   int64 
 1   id          1056 non-null   object
 2   news_url    995 non-null    object
 3   title       1056 non-null   object
 4   tweet_ids   801 non-null    object
 5   label       1056 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 49.6+ KB


In [27]:
flat_embeddings = np.load("data/politifact_embeddings.npy")
num_cols = int(flat_embeddings.size / len(data))
sentence_embeddings = flat_embeddings.reshape(len(data), int(num_cols))
sentence_embeddings.shape

(1056, 384)

# Clustering

## Metrics

In [6]:
def get_scores(data, labels):
    #sillhouette score ranges from -1 to 1, where 1 is best and 0 indicates cluster overlap
    ss = metrics.silhouette_score(data, labels, metric='euclidean')
    print("Sillhouette score:", ss)
    # variance ratio criterion-- how tightly clustered (higher is better)
    chs = metrics.calinski_harabasz_score(data, labels)
    print("Calinski-Harabasz Index:", chs)
    # similarity between clusters (lower is better)
    dbs = metrics.davies_bouldin_score(data, labels)   
    print("Davies-Bouldin Index:", dbs)
    return [ss, chs, dbs]

I want to look at what kind of scores I would get if the clusters perfectly grouped the data by truth-level.

In [30]:
get_scores(sentence_embeddings, data.label)

Sillhouette score: 0.01902008
Calinski-Harabasz Index: 19.110797606270467
Davies-Bouldin Index: 7.2986822921237025


[0.01902008, 19.110797606270467, 7.2986822921237025]

### KMeans Clustering

In [38]:
clusterer = KMeans(n_clusters=2)
sk_labels = clusterer.fit_predict(sentence_embeddings)

AttributeError: 'NoneType' object has no attribute 'split'

In [57]:
get_scores(sentence_embeddings, sk_labels)

Sillhouette score: 0.02963337
Calinski-Harabasz Index: 367.7585495837938
Davies-Bouldin Index: 5.560759864821069


[0.02963337, 367.7585495837938, 5.560759864821069]

### PCA

In [39]:
pca = PCA(n_components=1)
pca_data = pca.fit_transform(sentence_embeddings)

In [40]:
sk_labels = clusterer.fit_predict(pca_data)

AttributeError: 'NoneType' object has no attribute 'split'

In [10]:
get_scores(pca_data, sk_labels)

Sillhouette score: 0.532594
Calinski-Harabasz Index: 58772.8541872647
Davies-Bouldin Index: 0.5230355897233093


[0.532594, 58772.8541872647, 0.5230355897233093]

### Diffusion Map

In [41]:
mapper = diffusion_map.DiffusionMap()
diff_map = mapper.map(sentence_embeddings)

matrix gotten


In [42]:
clusterer = KMeans(n_clusters=2)
sk_labels = clusterer.fit_predict(diff_map)

AttributeError: 'NoneType' object has no attribute 'split'

In [13]:
get_scores(diff_map, sk_labels)

Sillhouette score: 0.12865253259743215
Calinski-Harabasz Index: 1060.1953563834677
Davies-Bouldin Index: 1.9711557836889835


[0.12865253259743215, 1060.1953563834677, 1.9711557836889835]

### Spectral Clustering

In [43]:
clusterer = SpectralClustering()
sk_labels = clusterer.fit_predict(sentence_embeddings)

AttributeError: 'NoneType' object has no attribute 'split'

In [27]:
get_scores(sentence_embeddings, sk_labels)

Sillhouette score: 0.022871412
Calinski-Harabasz Index: 174.48618249622183
Davies-Bouldin Index: 4.852943222011046


[0.022871412, 174.48618249622183, 4.852943222011046]

### KMeans using cosine distance

In [44]:
import kMeans as cos_kMeans

In [45]:
# get data labels
labels = np.array(data.label)

# run kMeans
model = cos_kMeans.kMeans(k=2)
model.train(sentence_embeddings, labels)

In [51]:
model.labels

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [46]:
model.rand_score()

IndexError: invalid index to scalar variable.

In [49]:
model.get_cluster_labels()

IndexError: invalid index to scalar variable.

In [52]:
model.clusters

array([array([   1,    2,    3,    4,    5,    6,    7,    8,   10,   11,   12,
                13,   14,   15,   16,   17,   19,   20,   21,   22,   23,   24,
                25,   26,   27,   28,   29,   30,   31,   32,   33,   34,   35,
                36,   37,   39,   40,   41,   42,   43,   44,   45,   48,   50,
                51,   52,   53,   55,   56,   57,   58,   61,   62,   64,   65,
                66,   67,   68,   69,   70,   72,   74,   75,   76,   77,   78,
                79,   80,   81,   82,   83,   84,   86,   88,   89,   91,   93,
                94,   95,   96,   97,   99,  100,  102,  103,  106,  107,  109,
               110,  111,  112,  114,  115,  116,  117,  119,  121,  122,  123,
               125,  127,  128,  129,  131,  132,  134,  135,  136,  139,  140,
               141,  143,  144,  147,  148,  149,  150,  153,  155,  156,  157,
               159,  160,  161,  162,  163,  164,  165,  166,  167,  168,  169,
               170,  171,  172,  173,  1