Thoughts:
- use a different vectorizer, like doc2vec

In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, SpectralClustering
from sklearn import metrics
from sklearn.decomposition import PCA
import plotly.express as px
import diffusion_map

In [2]:
data = pd.read_csv("data/kaggle.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3984 entries, 0 to 3983
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  3984 non-null   int64  
 1   Id          3983 non-null   float64
 2   Headline    3983 non-null   object 
 3   Body        3983 non-null   object 
 4   Label       3984 non-null   int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 155.8+ KB


In [4]:
flat_embeddings = np.load("data/kaggle_headline_embeddings.npy")
num_cols = int(flat_embeddings.size / len(data))
sentence_embeddings = flat_embeddings.reshape(len(data), int(num_cols))
sentence_embeddings.shape

(3984, 384)

# Clustering

## Metrics

In [5]:
def get_scores(data, labels):
    #sillhouette score ranges from -1 to 1, where 1 is best and 0 indicates cluster overlap
    ss = metrics.silhouette_score(data, labels, metric='euclidean')
    print("Sillhouette score:", ss)
    # variance ratio criterion-- how tightly clustered (higher is better)
    chs = metrics.calinski_harabasz_score(data, labels)
    print("Calinski-Harabasz Index:", chs)
    # similarity between clusters (lower is better)
    dbs = metrics.davies_bouldin_score(data, labels)   
    print("Davies-Bouldin Index:", dbs)
    return [ss, chs, dbs]

I want to look at what kind of scores I would get if the clusters perfectly grouped the data by truth-level.

In [6]:
get_scores(sentence_embeddings, data.Label)

Sillhouette score: 0.00951841
Calinski-Harabasz Index: 34.665775724056886
Davies-Bouldin Index: 10.694472821082597


[0.00951841, 34.665775724056886, 10.694472821082597]

### KMeans Clustering

In [38]:
clusterer = KMeans(n_clusters=2)
sk_labels = clusterer.fit_predict(sentence_embeddings)

AttributeError: 'NoneType' object has no attribute 'split'

In [57]:
get_scores(sentence_embeddings, sk_labels)

Sillhouette score: 0.02963337
Calinski-Harabasz Index: 367.7585495837938
Davies-Bouldin Index: 5.560759864821069


[0.02963337, 367.7585495837938, 5.560759864821069]

### PCA

In [39]:
pca = PCA(n_components=1)
pca_data = pca.fit_transform(sentence_embeddings)

In [40]:
sk_labels = clusterer.fit_predict(pca_data)

AttributeError: 'NoneType' object has no attribute 'split'

In [10]:
get_scores(pca_data, sk_labels)

Sillhouette score: 0.532594
Calinski-Harabasz Index: 58772.8541872647
Davies-Bouldin Index: 0.5230355897233093


[0.532594, 58772.8541872647, 0.5230355897233093]

### Diffusion Map

In [41]:
mapper = diffusion_map.DiffusionMap()
diff_map = mapper.map(sentence_embeddings)

matrix gotten


In [42]:
clusterer = KMeans(n_clusters=2)
sk_labels = clusterer.fit_predict(diff_map)

AttributeError: 'NoneType' object has no attribute 'split'

In [13]:
get_scores(diff_map, sk_labels)

Sillhouette score: 0.12865253259743215
Calinski-Harabasz Index: 1060.1953563834677
Davies-Bouldin Index: 1.9711557836889835


[0.12865253259743215, 1060.1953563834677, 1.9711557836889835]

### Spectral Clustering

In [43]:
clusterer = SpectralClustering()
sk_labels = clusterer.fit_predict(sentence_embeddings)

AttributeError: 'NoneType' object has no attribute 'split'

In [27]:
get_scores(sentence_embeddings, sk_labels)

Sillhouette score: 0.022871412
Calinski-Harabasz Index: 174.48618249622183
Davies-Bouldin Index: 4.852943222011046


[0.022871412, 174.48618249622183, 4.852943222011046]

### KMeans using cosine distance

In [7]:
import kMeans as cos_kMeans

In [8]:
# get data labels
labels = np.array(data.Label)

# run kMeans
model = cos_kMeans.kMeans(k=2)
model.train(sentence_embeddings, labels)

In [51]:
model.labels

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [46]:
model.rand_score()

IndexError: invalid index to scalar variable.

In [49]:
model.get_cluster_labels()

IndexError: invalid index to scalar variable.