In [64]:
from sklearn.datasets import fetch_20newsgroups
from matplotlib import pyplot as plt
import numpy as np
from numpy import array

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn import metrics
from sklearn.cluster import KMeans

import time

import pickle

In [65]:
computer_technology_subclasses = ['comp.graphics',
                                  'comp.os.ms-windows.misc',
                                  'comp.sys.ibm.pc.hardware',
                                  'comp.sys.mac.hardware']

recreational_activity_subclasses = ['rec.autos',
                                    'rec.motorcycles',
                                    'rec.sport.baseball',
                                    'rec.sport.hockey']

categories = computer_technology_subclasses + recreational_activity_subclasses

In [66]:
dataset = fetch_20newsgroups(subset='all',categories=categories, shuffle=True, 
                             random_state=42, remove=('headers', 'footers'))

dataset_binary_target = [0 if t<4 else 1 for t in dataset.target]

In [67]:
# Generate TF-IDF Data, with min_df=3, removed stop words, no stemming
vectorizer = CountVectorizer(min_df=3, stop_words='english')
X_counts = vectorizer.fit_transform(dataset.data)

tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

# Print the shape of the datasets
print("QUESTION 1: Report the dimensions of the TF-IDF matrix you get\n")
print('X_tfidf:',X_tfidf.shape)


QUESTION 1: Report the dimensions of the TF-IDF matrix you get

X_tfidf: (7882, 23522)


In [68]:
# Apply K-means clustering with k=2 using the TF-IDF data. Use random_state=0, max_iter >= 1000, n_init>=30.
N_CLUSTERS = 2
RANDOM_STATE = 0
MAX_ITER = 1000
N_INIT = 1

km = KMeans(n_clusters=N_CLUSTERS,random_state=RANDOM_STATE,max_iter=MAX_ITER,n_init=N_INIT)
start = time.time()
km.fit(X_tfidf)
print("Finished clustering with %i seeds in %f seconds" % (time.time()-start))


Finished clustering in 6.312490 seconds


In [62]:
print("QUESTION 2: Show Contingency Matrix:")
contin_matrix = metrics.cluster.contingency_matrix(dataset_binary_target, km.labels_)
print(contin_matrix)

QUESTION 2: Show Contingency Matrix:
[[3214  689]
 [  64 3915]]


In [63]:
# Homogeneity, completeness, V-measure, adjusted Rand Index, adjusted mutual information score

homogeneity = metrics.homogeneity_score(dataset_binary_target, km.labels_)
completeness = metrics.completeness_score(dataset_binary_target, km.labels_)
v_measure = metrics.v_measure_score(dataset_binary_target, km.labels_)
adjusted_rand_index = metrics.adjusted_rand_score(dataset_binary_target, km.labels_)
adjusted_mutual_info = metrics.adjusted_mutual_info_score(dataset_binary_target, km.labels_)

print("QUESTION 3: Report 5 Measures for K-Means Clustering\n ")
print("Homogeneity Score: %f" % homogeneity)
print("Completeness Score: %f" % completeness)
print("V-Measure Score: %f" % v_measure)
print("Adjusted Rand Index: %f" % adjusted_rand_index)
print("Adjusted Mutual Information: %f" % adjusted_mutual_info)

QUESTION 3: Report 5 Measures for K-Means Clustering
 
Homogeneity Score: 0.586555
Completeness Score: 0.598799
V-Measure Score: 0.592614
Adjusted Rand Index: 0.654327
Adjusted Mutual Information: 0.586517
