In [None]:
#%reload_ext autoreload
#%autoreload 2
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans
from sklearn import metrics
import nmslib
# Load the complete 20-newsgroups corpus (train and test subsets combined),
# downloading it on first use.
dataset = fetch_20newsgroups(subset='all', shuffle=True, download_if_missing=True)

# Seed NumPy's global random number generator for the stochastic steps that follow.
np.random.seed(123)
texts = dataset.data      # raw text of every post
target = dataset.target   # newsgroup label of every post

# Only the last expression of a cell is rendered, so the former bare
# `texts[0:10]` statement displayed nothing and has been dropped.
# Preview the first ten labels.
target[0:10]

array([10,  3, 17,  3,  4, 12,  4, 10, 10, 19])

Since we want to cluster newsgroup posts, we are most interested in words that are characteristic of particular groups rather than common to all of them. Setting `max_df = 0.3` discards terms that are too common, i.e. terms that occur in more than 30% of all posts. By contrast, words that are rare overall but tied to the specific topics discussed in individual groups are the most informative for our purpose. Hence we impose no lower limit on document frequency and leave `min_df` at its default.

In [None]:
# TF-IDF settings: remove English stop words and ignore every term that
# occurs in more than 30% of the posts.
tfidf_options = {'stop_words': 'english', 'max_df': 0.3}
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and build the sparse document-term matrix in one pass.
X = vectorizer.fit_transform(texts)

In [None]:
# Report the size of the TF-IDF matrix as "<rows>, <columns>".
print(f'{X.shape[0]}, {X.shape[1]}')
# Densify only the top-left 10x10 corner for inspection. `toarray()` yields a
# regular ndarray, whereas `todense()` returns the legacy `numpy.matrix` type
# that NumPy no longer recommends.
X[0:10, 0:10].toarray()

18846, 1890


matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.27196019, 0.    

In [None]:
# Spectral clustering into 20 clusters on a 10-nearest-neighbour affinity graph;
# the final labels are assigned by k-means on the spectral embedding.
# An explicit `random_state` makes the result independent of the global RNG
# state, so re-running this cell (or running cells out of order) reproduces
# the same labels.
solution = SpectralClustering(
    n_clusters=20,
    affinity='nearest_neighbors',
    n_neighbors=10,
    eigen_solver='amg',  # NOTE: per the scikit-learn docs this solver needs the optional `pyamg` package
    assign_labels='kmeans',
    random_state=123,
).fit(X)
type(solution)

sklearn.cluster._spectral.SpectralClustering

In [None]:
# Show the first 20 cluster assignments above the first 20 true labels.
# Cluster ids are arbitrary, so equal numbers in the two rows carry no meaning;
# the score below is what actually measures agreement.
print(f'{solution.labels_[0:20]}\n{target[0:20]}')
# Adjusted Rand index, see https://en.wikipedia.org/wiki/Rand_index
spectral_ari = metrics.adjusted_rand_score(solution.labels_, target)
spectral_ari

[12  1  7 19  1  1  1 12 12  0  0 10  1  1  1  8  1  1  1  1]
[10  3 17  3  4 12  4 10 10 19 19 11 19 13  0 17 12 12 11  8]


0.04663688800450031

In [None]:
# Baseline: plain k-means with 20 clusters on the same TF-IDF matrix.
# With a single initialisation (`n_init=1`) the outcome depends entirely on
# the random start, so the seed is pinned explicitly instead of relying on
# whatever global RNG state earlier cells left behind.
solutionKMeans = KMeans(
    n_clusters=20,
    init='k-means++',
    max_iter=100,
    n_init=1,
    random_state=123,
).fit(X)
# First 20 cluster assignments above the first 20 true labels.
print(f'{solutionKMeans.labels_[0:20]}\n{target[0:20]}')
# Adjusted Rand index of the k-means partition against the true labels.
metrics.adjusted_rand_score(solutionKMeans.labels_, target)
      

[ 0 10 13  3 19 18 19  0  0 19  2  7 18 19 19 19  5 17  4  5]
[10  3 17  3  4 12  4 10 10 19 19 11 19 13  0 17 12 12 11  8]


0.12851546664087976