In [None]:
import pandas as pd
import nltk
import re

In [None]:
# English stop-word list from NLTK's `stopwords` corpus.
# NOTE(review): `stopwords` is not referenced by any other visible cell (the TF-IDF
# vectorizer below is configured with stop_words='english' instead) — confirm it is still needed.
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
from nltk.stem.snowball import EnglishStemmer
# Single shared English stemmer instance; tokenize_and_stem() calls stemmer.stem on every token.
stemmer = EnglishStemmer()

In [None]:
from nltk.tokenize import SpaceTokenizer
# Single shared tokenizer instance; tokenize_only() calls tokenizer.tokenize on every text.
# NOTE(review): SpaceTokenizer presumably splits on space characters only, which would leave
# punctuation attached to the neighbouring word — confirm this is the intended tokenization.
tokenizer = SpaceTokenizer()

In [None]:
def tokenize_only(text):
    """Split ``text`` with the shared ``tokenizer`` and keep only word-like tokens.

    A token is kept when it contains at least one ASCII letter, which drops
    purely numeric tokens and raw punctuation.

    Returns a list of the surviving tokens in their original order.
    """
    has_letter = re.compile('[a-zA-Z]').search
    return [candidate for candidate in tokenizer.tokenize(text) if has_letter(candidate)]

def tokenize_and_stem(text):
    """Tokenize ``text`` with ``tokenize_only`` and stem every resulting token.

    Returns a list with one stem per token, in token order. A concrete list is
    returned (instead of a lazy ``map`` object) so that callers can take its
    length, index it, or iterate over it more than once on Python 3.
    """
    return [stemmer.stem(token) for token in tokenize_only(text)]

Below I use the tokenize-and-stem and tokenize-only functions to iterate over the document texts and build two vocabularies: one stemmed and one only tokenized.

In [None]:
# Load the gzip-compressed corpus CSV (path is relative to the notebook's working directory).
# Later cells read the `text` column of this frame.
df = pd.read_csv('../data/wiki/wiki.csv.gz', encoding='utf8', index_col=None)

In [None]:
# Keep only the first 3000 characters of every document text.
df['text'] = df['text'].str.slice(stop=3000)

In [None]:
# Build two parallel, corpus-wide vocabularies: plain tokens and their stems.
totalvocab_stemmed, totalvocab_tokenized = [], []
for doc_text in df['text']:
    # plain tokens of this document
    totalvocab_tokenized.extend(tokenize_only(doc_text))
    # stems of the same tokens, appended in the same order
    totalvocab_stemmed.extend(tokenize_and_stem(doc_text))

In [None]:
# Lookup table from stem (the index) to original token (the 'words' column).
# The two lists are filled in lock-step by the loop above, one entry per token occurrence,
# so the index is not unique: looking up a stem returns every occurrence recorded for it.
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

## Tf-idf and document similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over stemmed uni-, bi- and tri-grams (ngram_range=(1,3)), tokenized by tokenize_and_stem.
# max_df=0.5 / min_df=0.01 are document-frequency cut-offs given as proportions of the corpus.
# NOTE(review): stop_words='english' is combined with a stemming tokenizer; the built-in stop
# list is presumably unstemmed, so some stemmed stop words may slip through — confirm.
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=0.01, 
                                   stop_words='english',
                                   tokenizer=tokenize_and_stem, ngram_range=(1,3))

# Fit the vocabulary and build the document-term matrix in one pass (timed).
%time tfidf_matrix = tfidf_vectorizer.fit_transform(df.text)

print(tfidf_matrix.shape)

In [None]:
# Feature names (stemmed n-grams) in column order of tfidf_matrix.
# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); fall back to the old method on versions that lack the new one.
try:
    terms = list(tfidf_vectorizer.get_feature_names_out())
except AttributeError:
    terms = tfidf_vectorizer.get_feature_names()

In [None]:
terms[100]

In [None]:
len(vocab_frame)

In [None]:
# Label-based lookup of the original tokens recorded for one sample term.
# `.ix` has been removed from pandas; `.loc` is the label-based replacement.
vocab_frame.loc[terms[100]]

## Let's cluster!

In [None]:
from sklearn.cluster import KMeans

num_clusters = 15

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

In [None]:
len(clusters)

## But what did we get?

In [None]:
# Attach each document's cluster id as a new 'cluster' column (aligned on the row index).
clustered = df.join(pd.Series(clusters, name='cluster'))

In [None]:
clustered.head()

In [None]:
km.cluster_centers_

In [None]:
# For every cluster centre, the term indices sorted by ascending centroid weight,
# so the highest-weighted terms of a cluster sit at the END of its row.
order_centroids = km.cluster_centers_.argsort()

In [None]:
order_centroids

In [None]:
# Peek at the original tokens recorded for one sample term.
# `.ix` has been removed from pandas; `.loc` is the label-based replacement.
vocab_frame.loc[terms[3396]].head()

In [None]:
# All original tokens recorded for the sample term.
# `.ix` has been removed from pandas; `.loc` is the label-based replacement.
term_words = vocab_frame.loc[terms[3396]]

In [None]:
term_words['words'].value_counts().keys()[0]

In [None]:
# Build a readable name for every cluster from its six highest-weighted terms.
cluster_names = []

for cluster_centeroids in order_centroids:
    words = []
    # argsort is ascending, so the last six indices are the six strongest terms
    for ind in cluster_centeroids[-6:]:
        term = terms[ind]
        # A term can be an n-gram of several stems. Keep only stems that exist in vocab_frame:
        # `.loc` raises KeyError for unknown labels, whereas the removed `.ix` tolerated them.
        known_stems = [stem for stem in term.split(' ') if stem in vocab_frame.index]

        if known_stems:
            term_words = vocab_frame.loc[known_stems]
            # value_counts() sorts by descending frequency, so the first label is the most common word
            best_word = term_words['words'].value_counts().index[0]
        else:
            # no original word recorded for this term: fall back to the stemmed term itself
            best_word = term

        words.append(best_word)

    cluster_names.append(', '.join(words))

In [None]:
cluster_names

In [None]:
# Translate every numeric cluster id into its readable cluster name.
clustered['cluster_name'] = [cluster_names[cluster_id] for cluster_id in clustered['cluster']]

In [None]:
clustered.head(60)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine distance between documents (1 - similarity); used as the precomputed
# dissimilarity matrix for MDS below.
# NOTE(review): the result is presumably a dense n_documents x n_documents array — check memory for large corpora.
dist = 1 - cosine_similarity(tfidf_matrix)

In [None]:
import os  # for os.path.basename

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import MDS

# two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

# first column holds the x coordinates, second column the y coordinates
xs, ys = pos[:, 0], pos[:, 1]

In [None]:
import seaborn as sns

In [None]:
# One colour per cluster, indexable by cluster id.
# The palette is sized by the number of clusters (not by the number of documents), and the
# evenly spaced 'husl' palette is used so that all clusters get distinct colours; the default
# palette cycles and would repeat colours when there are more clusters than palette entries.
cluster_colors = sns.color_palette('husl', n_colors=num_clusters)


In [None]:
%matplotlib inline

In [None]:
# Create a data frame that has the result of the MDS plus the cluster numbers and titles.
# The titles come from `clustered`; the previous code referenced an undefined `frame`.
# NOTE(review): assumes the loaded CSV has a 'name' column holding the document titles — confirm.
# NOTE(review): this rebinds `df` (previously the corpus frame), as the original cell did.
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=clustered['name'].values))

# group by cluster
groups = df.groupby('label')

# set up plot
fig, ax = plt.subplots(figsize=(30, 30))  # set size
ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling

# iterate through groups to layer the plot: one marker series per cluster,
# labelled and coloured through cluster_names / cluster_colors indexed by the cluster id
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
            label=cluster_names[name], color=cluster_colors[name], mec='none')

# Axis styling does not depend on the group, so it is applied once.
# Booleans are required here: the strings 'off' are truthy and do not hide ticks on current matplotlib.
ax.set_aspect('auto')
ax.tick_params(
    axis='x',           # changes apply to the x-axis
    which='both',       # both major and minor ticks are affected
    bottom=False,       # ticks along the bottom edge are off
    top=False,          # ticks along the top edge are off
    labelbottom=False)  # no tick labels along the bottom edge
ax.tick_params(
    axis='y',           # changes apply to the y-axis
    which='both',       # both major and minor ticks are affected
    left=False,         # ticks along the left edge are off
    labelleft=False)    # no tick labels along the left edge

ax.legend(numpoints=1)  # show legend with only 1 point

# add a text label with the document title at every point
for x, y, title in zip(df['x'], df['y'], df['title']):
    ax.text(x, y, title, size=8)

#plt.show() #show the plot

# save the plot next to the notebook
plt.savefig('clusters_small_noaxes2.png', dpi=300)