In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics, mixture
from sklearn.cluster import KMeans
from sklearn.metrics import f1_score
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

vectorizer = TfidfVectorizer(max_features=1000)
vectors = vectorizer.fit_transform(newsgroups_train.data)

vectors_test = vectorizer.transform(newsgroups_test.data)

labels = newsgroups_train.target
true_k = np.unique(labels).shape[0]

# K-mean
print("K-Mean")
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
km.fit(vectors)

pre_labels = km.predict(vectors_test)

# Decide label with majority rule for k clusters
lbls = [[] for i in range(true_k)]
i = 0
all_labels =len(pre_labels)

while i < all_labels:
    cluster = pre_labels[i]
    label = newsgroups_train.target[i]
    lbls[cluster].append(label)
    i += 1

label_dict = {}
for cluster in range(true_k):
    label, count = Counter(lbls[cluster]).most_common()[0]
    label_dict[cluster] = label    
print (label_dict)

print("Weighted F-1 score", f1_score(newsgroups_test.target, pre_labels, average='weighted'))

# Gaussian Mixture Models
print("GMM")
gmix = mixture.GMM(n_components=true_k, covariance_type='full')
gmix.fit(vectors.todense())
pre_labels = gmix.predict(vectors_test.todense())

lbls = [[] for i in range(true_k)]
i = 0
all_labels =len(pre_labels)

while i < all_labels:
    cluster = pre_labels[i]
    label = newsgroups_train.target[i]
    lbls[cluster].append(label)
    i += 1

label_dict = {}
for cluster in range(true_k):
    label, count = Counter(lbls[cluster]).most_common()[0]
    label_dict[cluster] = label    
print (label_dict)

print("Weighted F-1 score", f1_score(newsgroups_test.target, pre_labels, average='weighted'))

K-Mean
{0: 4, 1: 2, 2: 14, 3: 2, 4: 6, 5: 13, 6: 15, 7: 2, 8: 11, 9: 1, 10: 17, 11: 7, 12: 9, 13: 6, 14: 8, 15: 8, 16: 15, 17: 3, 18: 9, 19: 7}
Weighted F-1 score 0.045980395293
GMM
{0: 14, 1: 13, 2: 7, 3: 6, 4: 5, 5: 10, 6: 6, 7: 3, 8: 2, 9: 8, 10: 9, 11: 3, 12: 2, 13: 1, 14: 5, 15: 6, 16: 12, 17: 2, 18: 0, 19: 4}
Weighted F-1 score 0.0282796500416
