In [11]:
import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MeanShift
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn import mixture

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',]

dataset = fetch_20newsgroups(subset='all', categories=categories,shuffle=True, random_state=42)
true = dataset.target
true_k = np.unique(true).shape[0]

print('init\t\tNMI\thomo\tcompl\t')


def bench(name,true,pre):
    print('%-9s\t%.3f\t%.3f\t%.3f\t'
          % (name,
             metrics.normalized_mutual_info_score(true, pre,average_method='arithmetic'),
             metrics.homogeneity_score(true, pre),
             metrics.completeness_score(true, pre),
             ))

vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,min_df=2, stop_words='english',use_idf=True)
data = vectorizer.fit_transform(dataset.data)

# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.
svd = TruncatedSVD(2)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
data = lsa.fit_transform(data)


km = KMeans().fit(data)
pre= km.labels_
bench('k-means',true, pre)

af = AffinityPropagation().fit(data)
pre= af.labels_
bench('Affinity',true, pre)

ms = MeanShift().fit(data)
pre= ms.labels_
bench('MeanShift',true, pre)

sc = SpectralClustering(n_clusters=true_k).fit(data)
pre= ms.labels_
bench('Spectral',true, pre)

wh = AgglomerativeClustering(linkage="ward").fit(data)
pre= wh.labels_
bench('WH',true,pre)

ag = AgglomerativeClustering().fit(data)
pre= ag.labels_
bench('Agglomerative',true, pre)

db = DBSCAN(eps=0.008, min_samples=8).fit(data)
pre= db.labels_
bench('DBSCAN',true, pre)

pre = mixture.GaussianMixture(n_components=4).fit_predict(data)
bench('GaussianMixture',true, pre)

init		NMI	homo	compl	
k-means  	0.377	0.467	0.315	
Affinity 	0.287	0.976	0.168	
MeanShift	0.531	0.397	0.804	
Spectral 	0.531	0.397	0.804	
WH       	0.525	0.391	0.796	
Agglomerative	0.525	0.391	0.796	
DBSCAN   	0.505	0.394	0.703	
GaussianMixture	0.427	0.427	0.427	
