In [16]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_mutual_info_score, silhouette_score

In [21]:

train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(train.data)
X_test = vectorizer.fit_transform(test.data)

In [24]:
k = 20
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=1000, n_init='auto', verbose=0)
kmeans.fit(X_train)
nmi = normalized_mutual_info_score(train.target, kmeans.labels_)
ami = adjusted_mutual_info_score(train.target, kmeans.labels_)
silhouette = silhouette_score(X_train, kmeans.labels_)

print(nmi)
print(ami)
print(silhouette)

0.007855628225413892
0.0025738165408000346
0.879938732997808


In [32]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_mutual_info_score, silhouette_score

newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Vectorize CountVectorizer
count_vectorizer = CountVectorizer(stop_words='english')
X_count = count_vectorizer.fit_transform(newsgroups.data)

# Vectorize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(newsgroups.data)

# Fit KMeans CountVectorizer
kmeans_count = KMeans(n_clusters=20, verbose=1)
kmeans_count.fit(X_count)

# Fit KMeans TfidfVectorizer
kmeans_tfidf = KMeans(n_clusters=20, verbose=1)
kmeans_tfidf.fit(X_tfidf)

# NMI, AMI, Silhouette CountVectorizer
nmi_count = normalized_mutual_info_score(newsgroups.target, kmeans_count.labels_)
ami_count = adjusted_mutual_info_score(newsgroups.target, kmeans_count.labels_)
silhouette_count = silhouette_score(X_count, kmeans_count.labels_)

# NMI, AMI, Silhouette TfidfVectorizer
nmi_tfidf = normalized_mutual_info_score(newsgroups.target, kmeans_tfidf.labels_)
ami_tfidf = adjusted_mutual_info_score(newsgroups.target, kmeans_tfidf.labels_)
silhouette_tfidf = silhouette_score(X_tfidf, kmeans_tfidf.labels_)


  super()._check_params_vs_input(X, default_n_init=10)


Initialization complete
Iteration 0, inertia 8171965.0.
Iteration 1, inertia 7057891.616101957.
Iteration 2, inertia 7046021.121276022.
Converged at iteration 2: strict convergence.
Initialization complete
Iteration 0, inertia 7676138.0.
Iteration 1, inertia 6899630.201559503.
Iteration 2, inertia 6809910.855784581.
Iteration 3, inertia 6781480.658186523.
Iteration 4, inertia 6759566.687189982.
Iteration 5, inertia 6731012.754436237.
Iteration 6, inertia 6701982.239289278.
Iteration 7, inertia 6695884.907269796.
Iteration 8, inertia 6690785.877195099.
Iteration 9, inertia 6687684.121436886.
Iteration 10, inertia 6687019.128926629.
Iteration 11, inertia 6686635.7462210115.
Iteration 12, inertia 6686417.409218384.
Iteration 13, inertia 6686313.216546389.
Iteration 14, inertia 6686235.086433494.
Converged at iteration 14: strict convergence.
Initialization complete
Iteration 0, inertia 7620518.0.
Iteration 1, inertia 6848809.321567322.
Iteration 2, inertia 6775747.223245833.
Iteration 3, 

In [33]:
print("nmi count :", nmi_count)
print("ami count :", ami_count)
print("silhouette count :", silhouette_count)
print("nmi tfidf :", nmi_tfidf)
print("ami tfidf :", ami_tfidf)
print("silhouette tfidf :", silhouette_tfidf)

nmi count : 0.005821118437284576
ami count : 0.002994365035439919
silhouette count : 0.8685433358211204
nmi tfidf : 0.30772318791411823
ami tfidf : 0.3052378429762645
silhouette tfidf : 0.0027139634214150238
