In [3]:
import pandas as pd
import numpy as np
import nltk
import re

import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Load the complete Jeopardy! question archive. The relative path assumes the
# notebook is run from the directory that contains `data/`.
full_df = pd.read_csv('data/JEOPARDY_CSV.csv', encoding='utf-8')
# Parenthesised so the line is valid on Python 3 as well as Python 2; with a
# single argument the printed text is identical on both.
print(full_df.shape)
full_df.head()

(216930, 7)


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [5]:
# Work on a 10% random sample of the archive and renumber its rows 0..n-1.
# The fixed random_state makes the sample, and every result derived from it,
# identical on each Restart & Run All; without it every run drew different
# rows, so saved outputs further down could not be reproduced.
df = full_df.sample(frac=0.1, random_state=42).reset_index(drop=True)

In [6]:
# Preview the first rows of the sampled frame.
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,6098,2011-03-02,Double Jeopardy!,ART -ISMS,$800,Abstract expressionism has been called the fir...,the United States
1,3690,2000-09-22,Jeopardy!,"""PH""OOEY!",$400,Fashionable critter seen here,Pheasant
2,2910,1997-04-04,Double Jeopardy!,MEDICINE,$1000,"Streptococci cause this childhood ""fever"" char...",Scarlet fever
3,3928,2001-10-03,Jeopardy!,SECOND-LARGEST CITIES,$400,"In Spain, it's second to Madrid",Barcelona
4,5836,2010-01-18,Jeopardy!,SIDEKICKS,$600,"Smee, a sidekick of this handicaptain, had a c...",Captain Hook


In [7]:
# Replace the raw header names (which, per the original note, carry stray
# spaces) with clean ones.
df.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

# Convert to Datetime
df['Air Date'] = pd.to_datetime(df['Air Date'])

# Clean out Value column: "$2,000" -> 2000, missing -> NaN.
# * regex=False treats '$' as a literal character rather than relying on the
#   pandas-version-dependent default for `regex`.
# * astype(str) lets the cell be re-run after Value is already numeric
#   (the .str accessor would otherwise raise on a float column).
# * to_numeric(errors='coerce') maps the 'None' placeholder to NaN whether
#   read_csv delivered it as the string 'None' or already as NaN. Any other
#   unparseable text also becomes NaN instead of raising.
value_text = (
    df['Value']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)
df['Value'] = pd.to_numeric(value_text, errors='coerce')
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,6098,2011-03-02,Double Jeopardy!,ART -ISMS,800.0,Abstract expressionism has been called the fir...,the United States
1,3690,2000-09-22,Jeopardy!,"""PH""OOEY!",400.0,Fashionable critter seen here,Pheasant
2,2910,1997-04-04,Double Jeopardy!,MEDICINE,1000.0,"Streptococci cause this childhood ""fever"" char...",Scarlet fever
3,3928,2001-10-03,Jeopardy!,SECOND-LARGEST CITIES,400.0,"In Spain, it's second to Madrid",Barcelona
4,5836,2010-01-18,Jeopardy!,SIDEKICKS,600.0,"Smee, a sidekick of this handicaptain, had a c...",Captain Hook


In [8]:
# Load NLTK's English stop-word list into `stopwords`.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded beforehand — confirm in a fresh environment.
stopwords = nltk.corpus.stopwords.words('english')
# print(...) with one argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(len(stopwords))
print(stopwords[:10])

153
[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your']


In [9]:
# Build NLTK's English Snowball stemmer and keep it in the notebook-wide
# name `stemmer`, which tokenize_and_stem uses.
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer(language="english")

In [10]:
# Tokenizer + stemmer: turns a piece of text into the list of stems of its word tokens

def tokenize_and_stem(text):
    """Return the stems of the word-like tokens in ``text``, in token order.

    The text is split into sentences and each sentence into word tokens, so
    punctuation ends up as separate tokens. Tokens that contain no ASCII
    letter (pure numbers, bare punctuation) are discarded, and the remaining
    tokens are passed through the notebook-level ``stemmer``.

    Returns a list (duplicates are kept).
    """
    words = []
    for sentence in nltk.sent_tokenize(text):
        words.extend(nltk.word_tokenize(sentence))
    return [stemmer.stem(word) for word in words if re.search('[a-zA-Z]', word)]


def tokenize_only(text):
    """Return the lower-cased word-like tokens in ``text``, in token order.

    The text is split into sentences and each sentence into word tokens, so
    punctuation ends up as separate tokens. Every token is lower-cased, and
    tokens that contain no ASCII letter (pure numbers, bare punctuation) are
    discarded. No stemming is applied.
    """
    lowered = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            lowered.append(word.lower())
    return [token for token in lowered if re.search('[a-zA-Z]', token)]

In [11]:
# Question text of the sampled rows as an array; this is the corpus that the
# tokenizers and the TF-IDF vectorizer in the following cells consume.
questions = df['Question'].values

In [None]:
# Build two flat vocabulary lists over all questions: the stems and the
# lower-cased tokens. Lists are concatenated (not nested) so each ends up as
# one long sequence of words.
totalvocab_stemmed = []
totalvocab_tokenized = []
for question in questions:
    # stems of this question's tokens
    totalvocab_stemmed += tokenize_and_stem(question)
    # the same question, tokenized and lower-cased but not stemmed
    totalvocab_tokenized += tokenize_only(question)


In [None]:
# Lookup table from stem (the index) to a token it was derived from (`words`).
# NOTE(review): this relies on tokenize_and_stem and tokenize_only emitting the
# same number of tokens, in the same order, for every question so the two flat
# lists line up element by element — confirm.
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

In [None]:
# Spot-check the first stem -> word rows.
vocab_frame.head()

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Earlier configuration with document-frequency cut-offs and a feature cap,
# kept for reference only; it is not executed.
# tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
#                                  min_df=0.2, stop_words='english',
#                                  use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

# TF-IDF over 1- to 3-grams of the stems produced by tokenize_and_stem, using
# scikit-learn's built-in English stop-word list.
# NOTE(review): the stop list contains unstemmed words while the tokens are
# stemmed, and fragments such as "'s" and "n't" appear among the top cluster
# terms printed further down — consider a stop list that matches the tokenizer.
tfidf_vectorizer = TfidfVectorizer(stop_words='english',
                                 tokenizer=tokenize_and_stem, ngram_range=(1,3))

# %time is an IPython magic that reports how long the fit takes.
%time tfidf_matrix = tfidf_vectorizer.fit_transform(questions)

# (number of questions, number of n-gram features)
print(tfidf_matrix.shape)

CPU times: user 1.15 s, sys: 40 ms, total: 1.19 s
Wall time: 1.14 s
(2169, 34278)


In [36]:
# Feature names (the stemmed n-grams), in the column order of tfidf_matrix.
# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(), so use the new method when it exists and fall back to
# the old one on older installs. .tolist() keeps `terms` a plain list of
# strings, as before.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()

In [37]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine distance (1 - cosine similarity) between every pair of questions.
# NOTE(review): presumably a dense n_questions x n_questions array, so memory
# grows quadratically with the sample size — confirm it fits before raising
# the sampling fraction.
dist = 1 - cosine_similarity(tfidf_matrix)

In [44]:
from sklearn.cluster import KMeans

num_clusters = 20

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 2.37 s, sys: 16 ms, total: 2.38 s
Wall time: 1.22 s


In [45]:
# joblib is a standalone package; the copy vendored as sklearn.externals.joblib
# was deprecated in scikit-learn 0.21 and removed in 0.23. Prefer the
# standalone package and fall back to the vendored one on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted model, then reload it so later cells work from the saved
# artefact rather than the in-memory fit.
joblib.dump(km,  'doc_cluster.pkl')
# NOTE: joblib.load unpickles the file and can execute arbitrary code — only
# load files produced by this notebook or another trusted source.
km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [46]:
# Sanity check: number of cluster labels (expected to equal the number of sampled questions).
len(clusters)

2169

In [47]:
# Attach each question's cluster id to its row.
# NOTE(review): assumes `clusters` is in the same row order as df, i.e. df has
# not been re-sampled or reordered since `questions` was taken from it — confirm.
df['Cluster'] = clusters

In [48]:
# Preview the frame with the new Cluster column.
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,Cluster
0,5505,2008-07-11,Double Jeopardy!,YOUR STATE IS PARKED,2500.0,Cherokee Landing State Park,Oklahoma,7
1,4515,2004-04-02,Double Jeopardy!,PICTURE ME!,1600.0,He's been called the final victim of the reign...,Robespierre,3
2,5445,2008-04-18,Jeopardy!,LAMPLIGHTERS,600.0,In the evenings she carried a lamp while walki...,Florence Nightingale,3
3,5515,2008-07-25,Jeopardy!,THE NEW YORK TIMES 2008 NEWS,800.0,A tentative deal with producers reported on Fe...,the Writers Guild strike,1
4,4103,2002-06-05,Double Jeopardy!,THERE OUGHTA BE A LAW,400.0,"Consumers use chapter 7 or 13 of this code, a ...",bankruptcy,1


In [49]:
# Number of questions in each cluster, largest cluster first.
df['Cluster'].value_counts()

1     852
3     253
6     113
7     111
10     74
17     72
15     68
8      68
19     67
11     61
12     56
2      51
18     50
16     48
4      46
14     43
9      43
5      37
0      29
13     27
Name: Cluster, dtype: int64

In [62]:
from __future__ import print_function

print("Top terms per cluster:")
print()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    for ind in order_centroids[i, :6]:
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print()
    print()
    print("Cluster %d categories:" % i, end='')
    for cat in df.ix[i]['Category']:#.values.tolist():
        print(' %s,' % cat, end='')
    print()
    print()

Top terms per cluster:

Cluster 0 words: people, sequel, saved, willie, bruce, speaking,

Cluster 0 categories: Y, O, U, R,  , S, T, A, T, E,  , I, S,  , P, A, R, K, E, D,

Cluster 1 words: use, type, years, films, century, like,

Cluster 1 categories: P, I, C, T, U, R, E,  , M, E, !,

Cluster 2 words: target=, _blank, href=, http, href=, /a,

Cluster 2 categories: L, A, M, P, L, I, G, H, T, E, R, S,

Cluster 3 words: 's, city, state, home, seen, type,

Cluster 3 categories: T, H, E,  , N, E, W,  , Y, O, R, K,  , T, I, M, E, S,  , 2, 0, 0, 8,  , N, E, W, S,

Cluster 4 words: once, chief, die, arizona, territory, napoleon,

Cluster 4 categories: T, H, E, R, E,  , O, U, G, H, T, A,  , B, E,  , A,  , L, A, W,

Cluster 5 words: river, north, continent, lake, flows, river,

Cluster 5 categories: B, R, O, A, D, W, A, Y,  , D, E, B, U, T, S,

Cluster 6 words: countries, n't, american, south, 's, countries,

Cluster 6 categories: L, E, T, ', S,  , S, P, E, A, K,  , I, T, A, L, I, A, N,

Cluste