In [1]:
import pandas as pd
import numpy as np
import nltk
import re

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
full_df = pd.read_csv('data/JEOPARDY_CSV.csv', encoding='utf-8')
print full_df.shape
full_df.head()

(216930, 7)


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [6]:
df = full_df.sample(frac=0.01)
df.reset_index(drop=True, inplace=True)

In [7]:
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,5505,2008-07-11,Double Jeopardy!,YOUR STATE IS PARKED,"$2,500",Cherokee Landing State Park,Oklahoma
1,4515,2004-04-02,Double Jeopardy!,PICTURE ME!,$1600,He's been called the final victim of the reign...,Robespierre
2,5445,2008-04-18,Jeopardy!,LAMPLIGHTERS,$600,In the evenings she carried a lamp while walki...,Florence Nightingale
3,5515,2008-07-25,Jeopardy!,THE NEW YORK TIMES 2008 NEWS,$800,A tentative deal with producers reported on Fe...,the Writers Guild strike
4,4103,2002-06-05,Double Jeopardy!,THERE OUGHTA BE A LAW,$400,"Consumers use chapter 7 or 13 of this code, a ...",bankruptcy


In [8]:
# Remove the dumb spaces
df.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

# Convert to Datetime
df['Air Date'] = pd.to_datetime(df['Air Date'])

# Clean out Value column
df['Value'] = df['Value'].str.replace('$','')
df['Value'] = df['Value'].str.replace(',','')
df['Value'] = df['Value'].apply(lambda x: None if x == 'None' else int(x))
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,5505,2008-07-11,Double Jeopardy!,YOUR STATE IS PARKED,2500.0,Cherokee Landing State Park,Oklahoma
1,4515,2004-04-02,Double Jeopardy!,PICTURE ME!,1600.0,He's been called the final victim of the reign...,Robespierre
2,5445,2008-04-18,Jeopardy!,LAMPLIGHTERS,600.0,In the evenings she carried a lamp while walki...,Florence Nightingale
3,5515,2008-07-25,Jeopardy!,THE NEW YORK TIMES 2008 NEWS,800.0,A tentative deal with producers reported on Fe...,the Writers Guild strike
4,4103,2002-06-05,Double Jeopardy!,THERE OUGHTA BE A LAW,400.0,"Consumers use chapter 7 or 13 of this code, a ...",bankruptcy


In [9]:
# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')
print len(stopwords)
print stopwords[:10]

153
[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your']


In [10]:
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [11]:
# here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [12]:
questions = df['Question'].values

In [13]:
#not super pythonic, no, not at all.
#use extend so it's a big flat list of vocab
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in questions:
    allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)


In [14]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

In [15]:
vocab_frame.head()

Unnamed: 0,words
cheroke,cherokee
land,landing
state,state
park,park
he,he


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(questions)

print(tfidf_matrix.shape)

CPU times: user 1.28 s, sys: 32 ms, total: 1.32 s
Wall time: 1.28 s
(2169, 1)


In [17]:
terms = tfidf_vectorizer.get_feature_names()

In [28]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

MemoryError: 

In [30]:
from sklearn.cluster import KMeans

num_clusters = 20

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 9.46 s, sys: 4 ms, total: 9.46 s
Wall time: 9.46 s


In [31]:
from sklearn.externals import joblib

#joblib.dump(km,  'doc_cluster.pkl')
km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

IOError: [Errno 2] No such file or directory: 'doc_cluster.pkl'