# Textual data: unsupervised algorithms (clustering)


### Data: The 20 newsgroup dataset from sklearn.datasets
The dataset contains (labeled) news with different topic

In [1]:
#Set up 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import Pipeline

from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import NMF, LatentDirichletAllocation

import numpy as np

### Load data

In [2]:
dataset = fetch_20newsgroups(subset='all', shuffle=True, random_state=100, remove=('headers', 'footers', 'quotes'))

In [9]:
#Exploring data structure
print("There are %d documents" % len(dataset.data))
print("With %d categories" % len(dataset.target_names))
print("First five label names %s" % dataset.target_names[:5])
labels = dataset.target #our Y (our outcome)
true_k = np.unique(labels).shape[0] #true number of different labels
print("The true k is %d" % true_k)
#dataset["data"][0] to print first news

There are 18846 documents
With 20 categories
First five label names ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
The true k is 20


In [11]:
#There are 1000 features
n_features=1000

In [12]:
#Let's start by 
vectorizer = TfidfVectorizer(max_df=0.5, #document must appear in max 50% of the documents
                             max_features=n_features, #max features = vocabulary size = 1000
                             min_df=2, #must appear in at least 2 doc
                             stop_words='english', #remove stopwords
                             use_idf=True)

In [13]:
#fitting and transforming the data with the vectorizer we just specified
X = vectorizer.fit_transform(dataset.data)
print("n_samples: %d, n_features: %d" % X.shape)

n_samples: 18846, n_features: 1000


### K means (MiniBatch Kmeans version)

It is a more efficient version of the kmeans algorithm

In [14]:
km = MiniBatchKMeans(n_clusters=true_k)
km.fit(X)

MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
        init_size=None, max_iter=100, max_no_improvement=10, n_clusters=20,
        n_init=3, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=0)

In [15]:
#Let's look at what the clusters look like
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Cluster 0: use using windows need does server don know like problem
Cluster 1: god jesus christ christians believe faith bible sin people christian
Cluster 2: known note non long used time end edu effect email
Cluster 3: card video drivers cards monitor driver windows vga know does
Cluster 4: book books read edu good time people source reading called
Cluster 5: 00 sale 50 15 10 20 11 30 shipping list
Cluster 6: government disease medical patients market people health federal cancer insurance
Cluster 7: read bible church does note people know believe don greek
Cluster 8: key chip encryption clipper keys government algorithm public des security
Cluster 9: thanks advance does know hi windows mail anybody program help
Cluster 10: armenian armenians turkish turkey armenia genocide people russian muslim soviet
Cluster 11: windows dos scsi mac modem port pc bus serial file
Cluster 12: ve bike know seen like don just good got new
Cluster 13: com away said ibm mail sun hp offer does edu
Cluster

### Topic extraction clustering (NMF and LSA algorithms)

In [16]:
#Reloading dataset
dataset = fetch_20newsgroups(shuffle=True, random_state=1, 
                             remove=('headers', 'footers', 'quotes'))
documents = dataset.data

n_features = 1000
n_topics = 20

#### LDA

LDA is based on frequency of terms, so we will use the CountVectorizer

In [17]:
tf_vectorizer = CountVectorizer(max_df=0.5, min_df=2, max_features=n_features, 
                                stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

In [18]:
#Applying model
lda = LatentDirichletAllocation(n_components=n_topics, 
                                learning_method="batch").fit(tf)
lda_W = lda.transform(tf)
lda_H = lda.components_

In [23]:
#Function to display the 'top10 words' in each cluster, ie those who represent the cluster the most.

def display_topics_full(H, feature_names, documents, no_top_words):
    for topic_idx, topic in enumerate(H):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
      

In [25]:
n_top_words = 10
n_top_documents = 10

display_topics_full(lda_H,  tf_feature_names, documents, 
                    n_top_words)

Topic 0:
key encryption chip keys clipper government security law use des
Topic 1:
car just bike like right cars engine speed dod good
Topic 2:
drive card scsi disk video bit hard drives pc bus
Topic 3:
00 1993 use number 10 health title 50 control page
Topic 4:
people mr gun think government don right state president know
Topic 5:
people said armenian armenians turkish did didn went children know
Topic 6:
price new good sale offer mail sell interested buy like
Topic 7:
cx ah w7 chz 34u lk pl air jews 17
Topic 8:
game team year games play good season think players win
Topic 9:
thanks know does help problem like hi just advance use
Topic 10:
available window edu server version graphics use motif application software
Topic 11:
edu com mail internet send list information email anonymous address
Topic 12:
god jesus people does believe say bible christian think life
Topic 13:
don just like think time know good use people ve
Topic 14:
space nasa data launch science earth research program sat

### NMF

With NMF we will use the TD IDF vectorizer

In [26]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, max_features=n_features, 
                                   stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [27]:
#Applying model
nmf = NMF(n_components=n_topics).fit(tfidf)
nmf_W = nmf.transform(tfidf)
nmf_H = nmf.components_

In [28]:
#Displaying top 10 words for each cluster, with same function as previously
n_top_words = 10
n_top_documents = 10
display_topics_full(nmf_H,  tfidf_feature_names, documents, 
                    n_top_words)

Topic 0:
people government right armenian gun armenians law said turkish rights
Topic 1:
card video monitor cards drivers bus vga driver color memory
Topic 2:
god jesus bible believe christ faith christian christians church life
Topic 3:
game team year games season players play hockey win league
Topic 4:
new 00 sale 10 price offer shipping condition 20 50
Topic 5:
thanks mail advance hi looking info information address help email
Topic 6:
windows file files use dos window program using problem running
Topic 7:
edu soon university cs ftp email article internet david pub
Topic 8:
key chip encryption clipper keys use government escrow public algorithm
Topic 9:
drive scsi hard drives disk ide floppy controller cd mac
Topic 10:
just thought tell ll mean oh little wanted work maybe
Topic 11:
does anybody mean say work exist actually make doesn help
Topic 12:
think don say mean try want better really believe case
Topic 13:
like sounds looks look things lot sound use make doesn
Topic 14:
know 