In [16]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_mutual_info_score, silhouette_score

In [21]:

# Download the train and test splits of 20 Newsgroups with headers, footers
# and quoted replies stripped, so only the message bodies are vectorized.
train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

# Bag-of-words term counts with English stop words removed.
vectorizer = CountVectorizer(stop_words='english')

# Learn the vocabulary from the training documents only ...
X_train = vectorizer.fit_transform(train.data)
# ... and reuse that vocabulary for the test documents. Calling fit_transform
# here would re-fit the vectorizer on the test split, so X_test would get a
# different column layout than X_train and the fitted vocabulary would no
# longer correspond to X_train.
X_test = vectorizer.transform(test.data)

In [24]:
# Number of clusters requested from KMeans.
k = 20

# random_state pins the k-means++ initialisation so that the scores printed
# below are reproducible after a kernel restart; without it every run can
# yield a different clustering.
kmeans = KMeans(
    n_clusters=k,
    init='k-means++',
    max_iter=1000,
    n_init='auto',
    random_state=42,
    verbose=0,
)
kmeans.fit(X_train)

# External metrics: compare the cluster assignments with the newsgroup labels.
nmi = normalized_mutual_info_score(train.target, kmeans.labels_)
ami = adjusted_mutual_info_score(train.target, kmeans.labels_)
# Internal metric: computed from the feature matrix and the cluster
# assignments only, without the newsgroup labels.
silhouette = silhouette_score(X_train, kmeans.labels_)

print(nmi)
print(ami)
print(silhouette)

0.007855628225413892
0.0025738165408000346
0.879938732997808


In [25]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_mutual_info_score, silhouette_score

# Load the full dataset (train + test) with headers, footers and quoted
# replies removed, so only the message bodies are used.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Vectorize the documents with CountVectorizer (raw term counts)
count_vectorizer = CountVectorizer(stop_words='english')
X_count = count_vectorizer.fit_transform(newsgroups.data)

# Vectorize the documents with TfidfVectorizer (TF-IDF weighted terms)
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(newsgroups.data)

# Both models share the same settings so the comparison only reflects the
# choice of vectorizer:
#   * n_init is set explicitly; relying on the implicit default emits a
#     FutureWarning because that default is changing. 10 is the value the
#     implicit default resolved to, so behaviour is unchanged.
#   * random_state fixes the centroid initialisation, making the reported
#     scores reproducible across kernel restarts.

# Cluster the documents with KMeans using CountVectorizer
kmeans_count = KMeans(n_clusters=20, n_init=10, random_state=42)
kmeans_count.fit(X_count)

# Cluster the documents with KMeans using TfidfVectorizer
kmeans_tfidf = KMeans(n_clusters=20, n_init=10, random_state=42)
kmeans_tfidf.fit(X_tfidf)

# Compute the NMI, AMI, and Silhouette Coefficient for CountVectorizer
nmi_count = normalized_mutual_info_score(newsgroups.target, kmeans_count.labels_)
ami_count = adjusted_mutual_info_score(newsgroups.target, kmeans_count.labels_)
silhouette_count = silhouette_score(X_count, kmeans_count.labels_)

# Compute the NMI, AMI, and Silhouette Coefficient for TfidfVectorizer
nmi_tfidf = normalized_mutual_info_score(newsgroups.target, kmeans_tfidf.labels_)
ami_tfidf = adjusted_mutual_info_score(newsgroups.target, kmeans_tfidf.labels_)
silhouette_tfidf = silhouette_score(X_tfidf, kmeans_tfidf.labels_)


  super()._check_params_vs_input(X, default_n_init=10)


In [26]:
# Report each clustering score, CountVectorizer results first and then the
# TfidfVectorizer results, one "label value" pair per line.
for caption, score in (
    ("nmi count :", nmi_count),
    ("ami count :", ami_count),
    ("silhouette count :", silhouette_count),
    ("nmi tfidf :", nmi_tfidf),
    ("ami tfidf :", ami_tfidf),
    ("silhouette tfidf :", silhouette_tfidf),
):
    print(caption, score)

nmi count : 0.004286212499423054
ami count : 0.0016052885404538998
silhouette count : 0.9108335947605655
nmi tfidf : 0.3547000032588736
ami tfidf : 0.35235976108466505
silhouette tfidf : 0.0051505107824209315
