## Lab 4.3. 

#### Setup your imports

In [38]:
import pandas as pd 
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn import metrics
import matplotlib.pyplot as plt
from pprint import pprint

#### 1. Pull the training set from the newsgroup data
The data has 20 different categories. Try to shrink down to smaller number of groups according to the definition here:
http://scikit-learn.org/stable/datasets/twenty_newsgroups.html

In [52]:
categories = [
    #'alt.atheism',
    'talk.religion.misc',
     'rec.sport.baseball',
     'comp.sys.mac.hardware',
    #'comp.graphics',
    'sci.space',
]
newsgroups_train = fetch_20newsgroups(subset='all', shuffle=True, random_state=42, categories=categories)

In [53]:
labels = newsgroups_train.target
true_k = np.unique(labels).shape[0]
# or true_k = len(list(newsgroups_train.target_names))
list(newsgroups_train.target_names)

['comp.sys.mac.hardware',
 'rec.sport.baseball',
 'sci.space',
 'talk.religion.misc']

In [54]:
# mappy = {
#     0: [0],
#     1: [1,2,3,4,5],
#     2: [6],
#     3: [7,8,9,10],
#     4: [11,12,13,14],
#     5: [15],
#     6: [16,17,18,19]
# }
# def getkey(num):
#     for x, y in mappy.items():
#         if num in y:
#             return x
# y_new = [getkey(num) for num in y]
# y = y_new
# labels = ['alt','comp','misc','rec','sci','soc','talk']
# print labels

#### 2. Create the vectorizer 

In [69]:
vectorizer = TfidfVectorizer(max_df=0.5, 
                             max_features=1000,
                             min_df=2,
                             ngram_range=(1,1),
                             stop_words='english',
                             use_idf=True,
                             sublinear_tf=False)

X = vectorizer.fit_transform(newsgroups_train.data)

In [70]:
print("n_samples: %d, n_features: %d" % X.shape)

n_samples: 3572, n_features: 1000


In [71]:
# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.

In [97]:
svd = TruncatedSVD(350)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
x_lsa = lsa.fit_transform(X) # this creates an imput for kmeans


explained_variance = svd.explained_variance_ratio_.sum()
print explained_variance

0.693524430266


In [101]:
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=30, verbose=0)
km.fit(x_lsa)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=4, n_init=30, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [107]:
#Score the model
print 'Homogeneity :', metrics.homogeneity_score(labels, km.labels_)
print 'Completeness :', metrics.completeness_score(labels, km.labels_)
print 'V-measure :', metrics.v_measure_score(labels, km.labels_)
print 'Adjusted Rand Score :', metrics.adjusted_rand_score(labels, km.labels_)
print 'Silhouette', metrics.silhouette_score(x_lsa, km.labels_, sample_size=1000)

 Homogeneity : 0.65225662218
Completeness : 0.682193886835
V-measure : 0.666889445944
Adjusted Rand Score : 0.606539558276
Silhouette 0.0249197142678


#### 8. Check the performance of our kmeans

#### Classification Report

#### Confusion Matrix. Hint: create a map to translate the label between k-means clustering and the original target (newsgroups_train.target). 

#### Note: Repeat the lab with:
- varying values of "k" 
- trying a different way to pick starting centroids ('k-means++' is the default method for centroids). For example, pick one point from each newsgroup. 