# Clustering – Finding Related Posts


## Preprocessing – similarity measured as a similar number of common words

### 1. Converting raw text into a bag of words


In [1]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)
print(vectorizer)

CountVectorizer()


In [2]:
content = ["How to format my hard disk", " Hard disk format problems "]
X = vectorizer.fit_transform(content)
vectorizer.get_feature_names()

['disk', 'format', 'hard', 'how', 'my', 'problems', 'to']

In [3]:
print(X.toarray().transpose())

[[1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 0]]


### 2. Counting words


In [4]:
import os
posts = [open(os.path.join(r'C:\Users\neha\Documents\ML-PA\ML-pa3\data', f)).read() for f in os.listdir(r'C:\Users\neha\Documents\ML-PA\ML-pa3\data')]
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)
X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))

#samples: 5, #features: 24


In [5]:
print(vectorizer.get_feature_names())

['about', 'actually', 'can', 'contains', 'databases', 'get', 'huge', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'save', 'store', 'stuff', 'this', 'toy']


In [6]:
new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])
print(new_post_vec)

  (0, 4)	1
  (0, 8)	1


In [7]:
print(new_post_vec.toarray())

[[0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [8]:
import scipy as sp
def dist_raw(v1, v2):
    delta = v1-v2
    return sp.linalg.norm(delta.toarray())

In [9]:
import sys
best_doc = None
best_dist = sys.maxsize
best_i = None
for i in range(num_samples):
    if posts[i] == new_post:
        continue   
    post_vec = X_train.getrow(i)    
    d = dist_raw(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s"%(i, d, posts[i]))
    if d<best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f"%(best_i, best_dist))

=== Post 0 with dist=4.00: This is a toy post about machine learning. Actually, it contains not much 
interesting stuff.
=== Post 1 with dist=1.73: Imaging databases can get huge.
=== Post 2 with dist=2.00: Most imaging databases save images permanently.
=== Post 3 with dist=1.41: Imaging databases store images.
=== Post 4 with dist=5.10: Imaging databases store images. Imaging databases store images. Imaging databases store images.
Best post is 3 with dist=1.41


In [10]:
print(X_train.getrow(3).toarray())

[[0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]]


In [11]:
print(X_train.getrow(4).toarray())

[[0 0 0 0 3 0 0 3 3 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0]]


### 3. Normalizing word count vectors

In [12]:
def dist_norm(v1, v2):
    v1_normalized = v1/sp.linalg.norm(v1.toarray())
    v2_normalized = v2/sp.linalg.norm(v2.toarray())
    delta = v1_normalized - v2_normalized
    return sp.linalg.norm(delta.toarray())

In [13]:
best_doc = None
best_dist = sys.maxsize
best_i = None
for i in range(num_samples):
    if posts[i] == new_post:
        continue   
    post_vec = X_train.getrow(i)    
    d = dist_norm(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s"%(i, d, posts[i]))
    if d<best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f"%(best_i, best_dist))

=== Post 0 with dist=1.41: This is a toy post about machine learning. Actually, it contains not much 
interesting stuff.
=== Post 1 with dist=0.86: Imaging databases can get huge.
=== Post 2 with dist=0.92: Most imaging databases save images permanently.
=== Post 3 with dist=0.77: Imaging databases store images.
=== Post 4 with dist=0.77: Imaging databases store images. Imaging databases store images. Imaging databases store images.
Best post is 3 with dist=0.77


### 4. Removing less important words


In [14]:
vectorizer = CountVectorizer(min_df=1, stop_words='english')
sorted(vectorizer.get_stop_words())[0:20]
X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape

print("#samples: %d, #features: %d" % (num_samples,  num_features))
print(vectorizer.get_feature_names())

new_post_vec = vectorizer.transform([new_post])

#samples: 5, #features: 15
['actually', 'contains', 'databases', 'huge', 'images', 'imaging', 'interesting', 'learning', 'machine', 'permanently', 'post', 'save', 'store', 'stuff', 'toy']


In [15]:
best_doc = None
best_dist = sys.maxsize
best_i = None
for i in range(num_samples):
    if posts[i] == new_post:
        continue   
    post_vec = X_train.getrow(i)    
    d = dist_norm(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s"%(i, d, posts[i]))
    if d<best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f"%(best_i, best_dist))

=== Post 0 with dist=1.41: This is a toy post about machine learning. Actually, it contains not much 
interesting stuff.
=== Post 1 with dist=0.61: Imaging databases can get huge.
=== Post 2 with dist=0.86: Most imaging databases save images permanently.
=== Post 3 with dist=0.77: Imaging databases store images.
=== Post 4 with dist=0.77: Imaging databases store images. Imaging databases store images. Imaging databases store images.
Best post is 1 with dist=0.61


## Stemming

### Installing and using NLTK


In [16]:
import nltk

In [17]:
import nltk.stem
s = nltk.stem.SnowballStemmer('english')
s.stem("graphics")

'graphic'

In [18]:
s.stem("imaging")

'imag'

In [19]:
s.stem("image")

'imag'

In [20]:
s.stem("imagination")

'imagin'

In [21]:
s.stem("imagine")

'imagin'

In [22]:
s.stem("buys")

'buy'

In [23]:
s.stem("buying")

'buy'

In [24]:
s.stem("bought")

'bought'

### Extending the vectorizer with NLTK's stemmer


In [25]:
import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')

In [26]:
X_train = vectorizer.fit_transform(posts)
new_post_vec = vectorizer.transform([new_post])

best_doc = None
best_dist = sys.maxsize
best_i = None
for i in range(num_samples):
    if posts[i] == new_post:
        continue   
    post_vec = X_train.getrow(i)    
    d = dist_norm(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s"%(i, d, posts[i]))
    if d<best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f"%(best_i, best_dist))

=== Post 0 with dist=1.41: This is a toy post about machine learning. Actually, it contains not much 
interesting stuff.
=== Post 1 with dist=0.61: Imaging databases can get huge.
=== Post 2 with dist=0.63: Most imaging databases save images permanently.
=== Post 3 with dist=0.52: Imaging databases store images.
=== Post 4 with dist=0.52: Imaging databases store images. Imaging databases store images. Imaging databases store images.
Best post is 3 with dist=0.52


### Stop words on steroids


In [27]:
import numpy as np
def tfidf(term, doc, corpus):
    tf = doc.count(term) / len(doc)
    num_docs_with_term = len([d for d in corpus if term in d])
    idf = np.log(len(corpus) / num_docs_with_term)
    return tf * idf

In [28]:
a, abb, abc = ["a"], ["a", "b", "b"], ["a", "b", "c"]
D = [a, abb, abc]
print(tfidf("a", a, D))

0.0


In [29]:
print(tfidf("a", abb, D))

0.0


In [30]:
print(tfidf("a", abc, D))

0.0


In [31]:
print(tfidf("b", abb, D))

0.27031007207210955


In [32]:
print(tfidf("a", abc, D))

0.0


In [33]:
print(tfidf("b", abc, D))

0.13515503603605478


In [34]:
print(tfidf("c", abc, D))

0.3662040962227032


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english', decode_error='ignore')

X_train = vectorizer.fit_transform(posts)
new_post_vec = vectorizer.transform([new_post])

best_doc = None
best_dist = sys.maxsize
best_i = None
for i in range(num_samples):
    if posts[i] == new_post:
        continue   
    post_vec = X_train.getrow(i)    
    d = dist_norm(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s"%(i, d, posts[i]))
    if d<best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f"%(best_i, best_dist))

=== Post 0 with dist=1.41: This is a toy post about machine learning. Actually, it contains not much 
interesting stuff.
=== Post 1 with dist=0.87: Imaging databases can get huge.
=== Post 2 with dist=0.86: Most imaging databases save images permanently.
=== Post 3 with dist=0.63: Imaging databases store images.
=== Post 4 with dist=0.63: Imaging databases store images. Imaging databases store images. Imaging databases store images.
Best post is 3 with dist=0.63


# Clustering: K-means

In [36]:
import sklearn.datasets
all_data = sklearn.datasets.fetch_20newsgroups(subset='all')
print(len(all_data.filenames))

18846


In [37]:
print(all_data.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [38]:
train_data = sklearn.datasets.fetch_20newsgroups(subset='train')
print(len(train_data.filenames))

11314


In [39]:
test_data = sklearn.datasets.fetch_20newsgroups(subset='test')
print(len(test_data.filenames))

7532


In [40]:
groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
train_data = sklearn.datasets.fetch_20newsgroups(subset='train', categories=groups)
print(len(train_data.filenames))

3529


In [41]:
test_data = sklearn.datasets.fetch_20newsgroups(subset='test', categories=groups)
print(len(test_data.filenames))

2349


## Clustering posts

In [42]:
vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5, stop_words='english', decode_error='ignore')
vectorized = vectorizer.fit_transform(train_data.data)
num_samples, num_features = vectorized.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))

#samples: 3529, #features: 4712


In [43]:
num_clusters = 50
from sklearn.cluster import KMeans
km = KMeans(n_clusters=num_clusters, init='random', n_init=1, verbose=1, random_state=3)
km.fit(vectorized)

Initialization complete
Iteration 0, inertia 5899.5595831471655
Iteration 1, inertia 3218.297747726279
Iteration 2, inertia 3184.3328334733214
Iteration 3, inertia 3164.867358130041
Iteration 4, inertia 3152.003949571175
Iteration 5, inertia 3143.1109963529184
Iteration 6, inertia 3136.2559774422048
Iteration 7, inertia 3129.3248717684405
Iteration 8, inertia 3124.5674798201394
Iteration 9, inertia 3121.9001105797406
Iteration 10, inertia 3120.209894571872
Iteration 11, inertia 3118.62745619288
Iteration 12, inertia 3117.362525978361
Iteration 13, inertia 3116.8112664390364
Iteration 14, inertia 3116.587892365764
Iteration 15, inertia 3116.417048753848
Iteration 16, inertia 3115.760414808626
Iteration 17, inertia 3115.3736535034473
Iteration 18, inertia 3115.155454436256
Iteration 19, inertia 3114.9491175607545
Iteration 20, inertia 3114.5149932662175
Iteration 21, inertia 3113.9369169464094
Iteration 22, inertia 3113.719999300366
Iteration 23, inertia 3113.547519005385
Iteration 24, i

KMeans(init='random', n_clusters=50, n_init=1, random_state=3, verbose=1)

In [44]:
print(km.labels_)

[38 17 47 ... 41 14 16]


In [45]:
print(km.labels_.shape)

(3529,)


## Solving our initial challenge


In [46]:
new_post = "Disk drive problems. Hi, I have a problem with my hard disk. After 1 year it is working only sporadically now. I tried to format it, but now it doesn't boot any more. Any ideas? Thanks."
new_post_vec = vectorizer.transform([new_post])
new_post_label = km.predict(new_post_vec)[0]

In [47]:
similar_indices = (km.labels_==new_post_label).nonzero()[0]

In [48]:
similar = []
for i in similar_indices:
    dist = sp.linalg.norm((new_post_vec - vectorized[i]).toarray())
    similar.append((dist, all_data.data[i]))
similar = sorted(similar)
print(len(similar))

166


In [53]:
show_at_1 = similar[0]
show_at_2 = similar[int(len(similar)/10)]
show_at_3 = similar[int(len(similar)/2)]

print(show_at_1, "\n \n")
print(show_at_2, "\n \n")
print(show_at_3)

(1.0378441731334074, 'From: gtd597a@prism.gatech.EDU (Hrivnak)\nSubject: Goalie mask poll update 4/22/93\nSummary: *** KEEP SENDING IN THOSE VOTES!!! ***\nOrganization: Georgia Institute of Technology\nLines: 38\n\n\n\tCurtis Joseph and Ray LeBlanc have made some big moves in the\npoll recently. Hextall has shown some strong movement as well. Kirk\nMcLean and Tom Barrasso (I can\'t see why) have been added to the list\nrecently. Keep sending in those votes.\n\nCurrent votes for favorite goalie masks (3pts - 1st, 2pts - 2nd, 1pt - 3rd)\n\nPlayer                    Team                 Pts       Votes\n--------------------------------------------------------------\n1. Ed Belfour             Chicago              32         15\n   Curtis Joseph          St. Louis            32         13 \n3. Andy Moog              Boston               30         13\n4. Brian Hayward          San Jose             26         10 \n5. Ron Hextall            Quebec               16          8\n6. Grant Fuhr   

## Another look at noise

In [54]:
post_group = zip(train_data.data, train_data.target)
all = [(len(post[0]), post[0], train_data.target_names[post[1]]) for post in post_group]
graphics = sorted([post for post in all if post[2]=='comp.graphics'])
print(graphics[5])



In [55]:
noise_post = graphics[5][1]
analyzer = vectorizer.build_analyzer()
print(list(analyzer(noise_post)))

['situnaya', 'ibm3090', 'bham', 'ac', 'uk', 'subject', 'test', 'sorri', 'organ', 'univers', 'birmingham', 'unit', 'kingdom', 'line', 'nntp', 'post', 'host', 'ibm3090', 'bham', 'ac', 'uk']


In [56]:
useful = set(analyzer(noise_post)).intersection(vectorizer.get_feature_names())
print(sorted(useful))

['ac', 'birmingham', 'host', 'kingdom', 'nntp', 'sorri', 'test', 'uk', 'unit', 'univers']


In [57]:
for term in sorted(useful):
    print('IDF(%s)=%.2f'%(term, vectorizer._tfidf.idf_[vectorizer.vocabulary_[term]]))

IDF(ac)=3.51
IDF(birmingham)=6.77
IDF(host)=1.74
IDF(kingdom)=6.68
IDF(nntp)=1.77
IDF(sorri)=4.14
IDF(test)=3.83
IDF(uk)=3.70
IDF(unit)=4.42
IDF(univers)=1.91
