In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import string
from nltk.corpus import words

K-means clustering of movie reviews dataset.

Pre-process the data by tokenizing, removing stop words and punctuation.

Convert each review from a list of words to a tf-idf weighted vector. 

Define the similarity of documents as the cosine similarity of the document vectors.

Apply K-means to this set of vectors.

Attempt to interpret the strange clusters obtained o_o

In [41]:
# Handle to NLTK's movie_reviews corpus reader; later cells read tokens,
# paragraphs and file ids through it.
data = nltk.corpus.movie_reviews

In [42]:
# Every token of the corpus as one flat sequence, in original casing
# (lower-casing happens later, when the frequency counts are built).
text_words = data.words()

In [43]:
# Total number of tokens in the corpus, before any filtering.
len(text_words)

1583820

In [44]:
# Number of paragraphs reported by the corpus reader.
# NOTE(review): presumably one paragraph per review -- compare with len(data.fileids()).
len(data.paras())

2000

In [45]:
# Build a corpus-wide frequency table of lower-cased tokens, then drop every
# token that is a stop word, is pure punctuation, or is not a dictionary word.
# The stopwords fileid is lower-case ('english'); 'English' only resolves on
# case-insensitive filesystems.
stop_words = set(stopwords.words('english'))
valid_words = set(words.words())
punctuation_chars = set(string.punctuation)
counter = FreqDist(w.lower() for w in text_words)
# Iterate over a snapshot of the keys because entries are deleted in the loop.
for token in list(counter):
    # A token is punctuation when *all* of its characters are punctuation
    # (the previous substring test against string.punctuation missed e.g. '--').
    is_punctuation = all(ch in punctuation_chars for ch in token)
    # Single combined test so each key is deleted at most once.
    if token in stop_words or is_punctuation or token not in valid_words:
        del counter[token]

In [7]:
# Number of distinct tokens that survived the filtering step.
print(len(counter))

18308


In [133]:
# Keep the 500 most frequent surviving tokens as the vocabulary; each entry
# is a (word, corpus_count) pair ordered from most to least frequent.
vocab = counter.most_common()[:500]
# Column index of every vocabulary word in the feature vectors.
word_to_idx = {term: column for column, (term, _count) in enumerate(vocab)}

In [134]:
def doc_to_vec(words, vocab, word_to_idx, n_docs, doc_freq):
    """Turn one tokenised document into a tf-idf weighted vector.

    Parameters
    ----------
    words : iterable of str
        Tokens of the document.
    vocab : sequence of (word, count) pairs
        Vocabulary; only the word at position 0 of each pair is used.
    word_to_idx : dict
        Maps a vocabulary word to its position in the output vector.
    n_docs : int
        Number of documents in the corpus.
    doc_freq : dict
        Maps a vocabulary word to the number of documents containing it.

    Returns
    -------
    numpy.ndarray of shape (len(vocab),)
        Entry i is tf * log(n_docs / df) for vocabulary word i, or 0 when
        the word does not occur in the document.
    """
    weights = np.zeros(len(vocab))
    # Raw term counts; tokens outside the vocabulary are ignored.
    for token in words:
        slot = word_to_idx.get(token)
        if slot is not None:
            weights[slot] += 1
    # Scale every non-zero count by the inverse document frequency of its term.
    for slot, count in enumerate(weights):
        if count > 0:
            term = vocab[slot][0]
            weights[slot] = count * np.log(n_docs / doc_freq[term])
    return weights

In [135]:
# Build the (n_docs, len(vocab)) tf-idf feature matrix.
# Each review is read and lower-cased exactly once and reused for both the
# document-frequency pass and the term-frequency pass (previously the corpus
# was tokenised twice).
fileids = data.fileids()
n_docs = len(fileids)
docs_tokens = [[w.lower() for w in data.words(fileid)] for fileid in fileids]

# Pass 1: document frequencies -- number of reviews containing each vocabulary word.
doc_freq = {word: 0 for word in word_to_idx}
for tokens in docs_tokens:
    for word in set(tokens):
        if word in doc_freq:
            doc_freq[word] += 1

# Pass 2: one tf-idf vector per review, in the same order as data.fileids().
features = np.array([
    doc_to_vec(tokens, vocab, word_to_idx, n_docs, doc_freq)
    for tokens in docs_tokens
])

In [136]:
# Shape of the strictly positive entries in the first review's tf-idf vector,
# i.e. how many of its components are non-zero.
features[0][features[0] > 0].shape

(101,)

In [138]:
# (number of reviews, vocabulary size)
features.shape

(2000, 500)

In [139]:
class Cluster:
    """One k-means cluster: a centroid plus the points currently assigned to it."""

    def __init__(self, mean):
        """Create a cluster with initial centroid `mean` and no assigned points."""
        self.mean = mean
        # Points assigned during the current iteration; filled by the caller.
        self.pts = []

    def update_mean(self):
        """Set the centroid to the arithmetic mean of the assigned points.

        A cluster with no assigned points keeps its previous centroid;
        dividing by len(self.pts) == 0 would otherwise turn it into NaNs.
        """
        if not self.pts:
            return
        self.mean = np.mean(self.pts, axis=0)

    def clear_pts(self):
        """Forget all assigned points (the centroid is left untouched)."""
        self.pts = []

In [140]:
def cosine_sim(a, b):
    """Cosine similarity of two vectors: dot(a, b) / (|a| * |b|).

    Returns 0.0 when either vector has zero norm. The plain formula would
    evaluate 0/0 = NaN there, and NaN compares False against everything,
    which silently distorts nearest-centroid selection.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

In [141]:
def eucl_dist(a, b):
    """Negated Euclidean distance between `a` and `b`.

    The sign is flipped so that, like a similarity score, a larger return
    value means the two vectors are closer together.
    """
    separation = np.linalg.norm(a - b)
    return -separation

In [142]:
# K-means with cosine similarity as the closeness measure.
n, d = features.shape
k = 2
# Pick k *distinct* reviews as initial centroids. Sampling with replacement
# (the np.random.choice default) can start both clusters on the same review.
# Call np.random.seed(...) beforehand if a reproducible run is needed.
means = features[np.random.choice(n, k, replace=False)]
clust = [Cluster(m) for m in means]
n_iters = 15
for e in range(n_iters):
    # Assignment step: attach each review to the most similar centroid.
    for i in range(n):
        max_sim = -np.inf
        arg_max = 0
        for j in range(k):
            sim = cosine_sim(features[i], clust[j].mean)
            if sim > max_sim:
                max_sim = sim
                arg_max = j
        clust[arg_max].pts.append(features[i])

    # Update step: move each centroid to the mean of its assigned reviews.
    for j in range(k):
        clust[j].update_mean()

    # Objective: total similarity of every review to its (updated) centroid.
    sim = 0
    for j in range(k):
        for vec in clust[j].pts:
            sim += cosine_sim(vec, clust[j].mean)
    print('iter = {}, objective = {:.4f}'.format(e + 1, sim))

    # Assignments are rebuilt from scratch on the next iteration.
    for j in range(k):
        clust[j].clear_pts()

iter = 1, objective = 715.2283
iter = 2, objective = 718.9043
iter = 3, objective = 721.4973
iter = 4, objective = 722.9342
iter = 5, objective = 723.8884
iter = 6, objective = 724.5757
iter = 7, objective = 725.0663
iter = 8, objective = 725.7106
iter = 9, objective = 726.5087
iter = 10, objective = 727.8137
iter = 11, objective = 728.8014
iter = 12, objective = 729.4785
iter = 13, objective = 729.8417
iter = 14, objective = 729.9561
iter = 15, objective = 729.9619


In [143]:
# For every cluster, list the vocabulary entries -- (word, corpus_count) pairs --
# whose centroid weight is above 0.9.
avg_vecs = [
    [vocab[i] for i, weight in enumerate(clust[j].mean) if weight > 0.9]
    for j in range(k)
]

In [144]:
# Dominant words of cluster 0: the (word, corpus_count) vocabulary entries whose
# weight in the cluster's mean vector exceeds 0.9 (not the mean vector itself).
avg_vecs[0]

[]

In [145]:
# Dominant words of cluster 1: the (word, corpus_count) vocabulary entries whose
# weight in the cluster's mean vector exceeds 0.9 (not the mean vector itself).
avg_vecs[1]

[('action', 1172),
 ('star', 761),
 ('original', 712),
 ('effects', 649),
 ('special', 574),
 ('series', 548),
 ('horror', 473),
 ('human', 432),
 ('alien', 378),
 ('summer', 334),
 ('earth', 317),
 ('computer', 273),
 ('space', 267),
 ('ship', 264),
 ('scream', 262),
 ('fiction', 258),
 ('planet', 243),
 ('smith', 236),
 ('science', 235),
 ('crew', 214),
 ('mission', 213)]

In [146]:
# Components of cluster 0's centroid that exceed 0.7.
clust[0].mean[clust[0].mean > 0.7]

array([0.75642478])

In [147]:
# Components of cluster 1's centroid that exceed 0.7.
clust[1].mean[clust[1].mean > 0.7]

array([0.89347136, 0.81788881, 0.70788938, 0.71622312, 0.79392251,
       1.04170727, 0.71005731, 0.70251227, 0.76585342, 1.59047126,
       1.0459618 , 1.95331565, 1.46327477, 1.18413765, 1.55040296,
       0.97317938, 3.31374067, 0.90694969, 1.59336817, 0.70155978,
       0.79717898, 1.08250741, 1.51140766, 2.3904302 , 2.15879713,
       1.24189822, 1.92197936, 0.84775047, 1.55108568, 1.55273209,
       1.21857142, 1.01829405, 0.73852813])