In [1]:
import os
import numpy as np
from collections import defaultdict
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics.cluster import normalized_mutual_info_score
from scipy.sparse import csr_matrix

In [2]:
def load_data(data_path, vocab_path=None):
    """Load TF-IDF document vectors and their integer labels from a text file.

    Every non-blank line of ``data_path`` is parsed as
    ``<label><fff><doc_id><fff><index>:<tfidf> <index>:<tfidf> ...``.
    The document id field is ignored. The length of each returned vector is
    the number of lines in the vocabulary file.

    Args:
        data_path: Path to the file holding one sparse TF-IDF document per line.
        vocab_path: Path to the file whose line count defines the vocabulary
            size. Defaults to
            ``<cwd>/20news-bydate/20news-full-words-idfs.txt``.

    Returns:
        A ``(data, labels)`` tuple: ``data`` is a list of 1-D float NumPy
        arrays (one dense vector per document) and ``labels`` is the list of
        integer labels in the same order.

    Raises:
        ValueError: If a label, index or TF-IDF value cannot be parsed.
        IndexError: If a line has fewer than three ``<fff>`` fields, a token
            has no ``:`` separator, or an index is outside the vocabulary.
    """

    def sparse_to_dense(sparse_r_d, vocab_size):
        """Expand a space-separated ``index:tfidf`` string into a dense vector."""
        r_d = np.zeros(vocab_size)
        for index_tfidf in sparse_r_d.split():
            parts = index_tfidf.split(":")
            index = int(parts[0])
            tfidf = float(parts[1])
            r_d[index] = tfidf
        return r_d

    if vocab_path is None:
        vocab_path = os.getcwd() + "/20news-bydate/20news-full-words-idfs.txt"

    with open(data_path) as f:
        d_lines = f.read().splitlines()

    with open(vocab_path) as f:
        vocab_size = len(f.read().splitlines())

    data = []
    labels = []
    # A set keeps the "first time we see this label" check O(1) per document.
    seen_labels = set()
    for d in d_lines:
        # Blank lines carry no document; skipping them avoids int('') failing.
        if not d.strip():
            continue
        features = d.split("<fff>")
        label = int(features[0])
        if label not in seen_labels:
            seen_labels.add(label)
            print(f"Loading cluster {label}")
        data.append(sparse_to_dense(sparse_r_d=features[2], vocab_size=vocab_size))
        labels.append(label)
    return data, labels

In [3]:
def purity_score(y_true, y_pred):
    """Return the purity of a clustering against reference labels.

    Builds the contingency table of ``y_true`` versus ``y_pred``, takes the
    largest count in each column, and divides the sum of those maxima by the
    total number of counted samples.
    """
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    # Largest entry of every column of the table.
    column_maxima = table.max(axis=0)
    return column_maxima.sum() / table.sum()

In [4]:
def clustering_with_KMeans(data_path=None, n_clusters=20, random_state=2018):
    """Cluster the TF-IDF documents with K-Means and print quality metrics.

    Loads the documents via ``load_data``, converts them to a CSR sparse
    matrix, fits ``KMeans`` and prints the data shape, the number of labels,
    the purity, and the normalized mutual information (on a bare line).

    Args:
        data_path: Path to the TF-IDF file. Defaults to
            ``<cwd>/20news-bydate/20news-full-tf-idf.txt``.
        n_clusters: Number of clusters passed to ``KMeans``.
        random_state: Seed passed to ``KMeans`` so runs are repeatable.

    Returns:
        None. Results are only printed.
    """
    if data_path is None:
        data_path = os.getcwd() + "/20news-bydate/20news-full-tf-idf.txt"

    data, labels = load_data(data_path)
    X = csr_matrix(data)
    print(f"Data shape: {X.shape}")
    print(f"Label shape: {len(labels)}")

    kmeans = KMeans(
        n_clusters=n_clusters,
        init='random',
        n_init=5,
        tol=1e-3,
        random_state=random_state,
    ).fit(X)

    print(f"Purity: {purity_score(labels, kmeans.labels_)}")
    # The NMI value is printed without a label; the output format is kept as is.
    print(f"{normalized_mutual_info_score(labels, kmeans.labels_)}")

In [5]:
# Run the clustering experiment end to end; the metrics are printed below.
clustering_with_KMeans()

Loading cluster 0
Loading cluster 1
Loading cluster 2
Loading cluster 3
Loading cluster 4
Loading cluster 5
Loading cluster 6
Loading cluster 7
Loading cluster 8
Loading cluster 9
Loading cluster 10
Loading cluster 11
Loading cluster 12
Loading cluster 13
Loading cluster 14
Loading cluster 15
Loading cluster 16
Loading cluster 17
Loading cluster 18
Loading cluster 19
Data shape: (18846, 14234)
Label shape: 18846
Purity: 0.4092115037673777
0.44204061784146925
