In [1]:
import pandas as pd
import numpy as np
import nltk
import re

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the Jeopardy! clue archive once. The working frame `df` is derived
# from `full_df` by sampling in the following cell, so parsing the same CSV a
# second time into `df` here only cost time and memory.
full_df = pd.read_csv('data/JEOPARDY_CSV.csv', encoding='utf-8')
# Parenthesised single-argument print behaves the same on Python 2 and 3
print(full_df.shape)
full_df.head()

(216930, 7)


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
# Work on a 10% random sample of the full archive. The fixed seed makes the
# sample identical on every run, so results computed from it can be
# reproduced after a kernel restart. The index is renumbered from 0 and the
# old row labels are discarded.
df = full_df.sample(frac=0.1, random_state=42).reset_index(drop=True)

In [4]:
# Preview the first rows of the sampled, re-indexed frame
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,3965,2001-11-23,Jeopardy!,FILMS OF THE '70s,$100,"Cleavon Little plays Black Bart, the sheriff o...",Blazing Saddles
1,3279,1998-12-03,Jeopardy!,FABULOUS BAKER BOYS,$400,Trumpet-playing Baker seen here,Chet Baker
2,1871,1992-10-26,Double Jeopardy!,OLD MOVIES,$400,"Charley Grapewin who played Grandpa Joad in ""T...",The Wizard of Oz
3,3077,1998-01-06,Double Jeopardy!,COLORFUL WORDS & PHRASES,$600,"This colorful Jimi Hendrix classic ""experience...","""Purple Haze"""
4,4720,2005-02-25,Jeopardy!,FRUITS & VEGETABLES,$800,It's the leading agricultural product of the i...,bananas


In [5]:
# Replace the header with clean labels (the raw column names apparently carry
# stray spaces - confirm against the CSV).
df.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

# Convert to Datetime
df['Air Date'] = pd.to_datetime(df['Air Date'])

# Clean out Value column: strip the dollar sign and thousands separators.
# Plain Python str.replace is used so '$' is always a literal character and
# never a regex end-of-string anchor, whatever the pandas version.
# astype(str) makes the cell safe to re-run after Value is already numeric.
value_text = df['Value'].astype(str).map(lambda s: s.replace('$', '').replace(',', ''))
# 'None' (no dollar value), missing entries and any other non-numeric text
# become NaN instead of raising from int().
df['Value'] = pd.to_numeric(value_text, errors='coerce')
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,3965,2001-11-23,Jeopardy!,FILMS OF THE '70s,100.0,"Cleavon Little plays Black Bart, the sheriff o...",Blazing Saddles
1,3279,1998-12-03,Jeopardy!,FABULOUS BAKER BOYS,400.0,Trumpet-playing Baker seen here,Chet Baker
2,1871,1992-10-26,Double Jeopardy!,OLD MOVIES,400.0,"Charley Grapewin who played Grandpa Joad in ""T...",The Wizard of Oz
3,3077,1998-01-06,Double Jeopardy!,COLORFUL WORDS & PHRASES,600.0,"This colorful Jimi Hendrix classic ""experience...","""Purple Haze"""
4,4720,2005-02-25,Jeopardy!,FRUITS & VEGETABLES,800.0,It's the leading agricultural product of the i...,bananas


In [6]:
# Five most frequent question texts - a quick way to spot repeated or placeholder clues
df['Question'].value_counts()[0:5]

[video clue]                                                 3
Charlie Parker                                               2
May 29, 1917 in Brookline, Massachusetts                     2
This language spoken in Reykjavik is also called Islenska    2
Bjork                                                        2
Name: Question, dtype: int64

In [7]:
# (rows, columns) before the placeholder questions are dropped
df.shape

(21693, 7)

In [8]:
# Discard rows whose question text is nothing but a media/filler placeholder
placeholder_clues = ['[audio clue]', '[video clue]', '[filler]', '(audio clue)']
df = df[~df['Question'].isin(placeholder_clues)]

In [9]:
# (rows, columns) after the placeholder questions were dropped
df.shape

(21689, 7)

In [10]:
# Fetch NLTK's English stop-word list into 'stopwords', then show how many
# entries it has and what the first ten look like
stopwords = nltk.corpus.stopwords.words('english')
print(len(stopwords))
print(stopwords[:10])

153
[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your']


In [11]:
# Load NLTK's SnowballStemmer as the variable 'stemmer'
from nltk.stem.snowball import SnowballStemmer
# English stemmer instance; tokenize_and_stem() calls stemmer.stem() on each token
stemmer = SnowballStemmer("english")

In [12]:
# Define two tokenizers: tokenize_and_stem returns the list of stems in the text it is passed; tokenize_only returns the lower-cased tokens without stemming

def tokenize_and_stem(text):
    """Tokenize ``text`` and return the stems of its word tokens as a list.

    The text is split into sentences first and each sentence into words, so
    punctuation comes out as separate tokens. Tokens that contain no ASCII
    letter (numeric tokens, raw punctuation) are dropped; the rest are passed
    through the module-level ``stemmer`` in their original order.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # keep only tokens with at least one letter
            if re.search('[a-zA-Z]', word):
                stems.append(stemmer.stem(word))
    return stems


def tokenize_only(text):
    """Tokenize ``text`` and return its lower-cased word tokens as a list.

    Uses the same sentence-then-word split as ``tokenize_and_stem`` but does
    no stemming. Tokens that contain no ASCII letter after lower-casing
    (numeric tokens, raw punctuation) are dropped.
    """
    lowered = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [token for token in lowered if re.search('[a-zA-Z]', token)]

In [13]:
# Question texts as an array; this is the corpus passed to the tokenizers and the TF-IDF vectorizer
questions = df['Question'].values

In [14]:
# Build two flat vocabularies over every question: the stemmed tokens and the
# lower-cased, unstemmed tokens. extend() keeps each one a single flat list
# instead of a list of per-question lists.
totalvocab_stemmed = []
totalvocab_tokenized = []
for question_text in questions:
    totalvocab_stemmed.extend(tokenize_and_stem(question_text))
    totalvocab_tokenized.extend(tokenize_only(question_text))


In [15]:
# Stem -> original token lookup: the index holds the stems, the 'words' column the
# unstemmed tokens. Rows pair up by position, so both lists must have equal length.
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

In [16]:
# Spot-check the stem (index) to token ('words') pairing
vocab_frame.head()

Unnamed: 0,words
cleavon,cleavon
littl,little
play,plays
black,black
bart,bart


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib

# TF-IDF over stemmed 1- to 3-grams with the built-in English stop-word list.
# (An earlier variant also passed max_df=0.8, min_df=0.2, max_features=200000
# and use_idf=True; it is not used here.)
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize_and_stem,
    stop_words='english',
    ngram_range=(1, 3),
)

# Only fit here, then persist the fitted vectorizer; the document-term matrix
# is produced by a separate transform() call.
tfidf_vectorizer.fit(questions)
joblib.dump(tfidf_vectorizer, 'tfidf_test.pkl')

['tfidf_test.pkl', 'tfidf_test.pkl_01.npy', 'tfidf_test.pkl_02.npy']

In [20]:
# Reload the fitted vectorizer from disk and convert the questions into a TF-IDF matrix
tfidf_vectorizer = joblib.load('tfidf_test.pkl')
tfidf_matrix = tfidf_vectorizer.transform(questions)

In [21]:
# Feature names of the vectorizer; the cluster report indexes this list with centroid column indices
terms = tfidf_vectorizer.get_feature_names()

In [22]:
# from sklearn.metrics.pairwise import cosine_similarity
# dist = 1 - cosine_similarity(tfidf_matrix)

In [24]:
from sklearn.cluster import KMeans

num_clusters = 50

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 2min 48s, sys: 8.88 s, total: 2min 57s
Wall time: 1min 33s


In [25]:
from sklearn.externals import joblib

# Save the fitted model, then read it straight back so the following cells
# work from the on-disk copy
joblib.dump(km,  'doc_cluster.pkl')
km = joblib.load('doc_cluster.pkl')
# Cluster ids of the reloaded model as a plain Python list
clusters = km.labels_.tolist()

In [26]:
# Number of cluster labels; should equal the number of rows in df before they are attached
len(clusters)

21689

In [27]:
# Attach the cluster id to each question; the list is assigned by position, not by index label
df['Cluster'] = clusters

In [28]:
# Preview the frame with the new 'Cluster' column
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,Cluster
0,3965,2001-11-23,Jeopardy!,FILMS OF THE '70s,100.0,"Cleavon Little plays Black Bart, the sheriff o...",Blazing Saddles,21
1,3279,1998-12-03,Jeopardy!,FABULOUS BAKER BOYS,400.0,Trumpet-playing Baker seen here,Chet Baker,32
2,1871,1992-10-26,Double Jeopardy!,OLD MOVIES,400.0,"Charley Grapewin who played Grandpa Joad in ""T...",The Wizard of Oz,21
3,3077,1998-01-06,Double Jeopardy!,COLORFUL WORDS & PHRASES,600.0,"This colorful Jimi Hendrix classic ""experience...","""Purple Haze""",32
4,4720,2005-02-25,Jeopardy!,FRUITS & VEGETABLES,800.0,It's the leading agricultural product of the i...,bananas,22


In [29]:
# Number of questions per cluster, largest cluster first
df['Cluster'].value_counts()

32    6329
36    1833
0     1197
7      564
21     536
1      477
2      433
12     431
10     420
16     417
45     386
18     383
30     347
27     344
38     344
22     341
49     332
35     315
6      311
39     296
20     271
44     258
42     258
46     256
41     244
9      241
4      240
8      240
31     236
28     233
47     225
33     220
26     220
11     211
19     211
3      211
14     204
23     202
5      200
43     163
25     137
29     131
48     120
37     119
40     118
17     109
24     106
13      98
15      88
34      83
Name: Cluster, dtype: int64

In [30]:
from __future__ import print_function

print("Top terms per cluster:")
print()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    for ind in order_centroids[i, :6]:
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print()
    print()
    print("Cluster %d categories:" % i, end='')
    for cat in df.ix[i]['Category']:#.values.tolist():
        print(' %s,' % cat, end='')
    print()
    print()

Top terms per cluster:

Cluster 0 words: games, originally, created, gods, host, produced,

Cluster 0 categories: F, I, L, M, S,  , O, F,  , T, H, E,  , ', 7, 0, s,

Cluster 1 words: including, work, authors, 's, authors, novel,

Cluster 1 categories: F, A, B, U, L, O, U, S,  , B, A, K, E, R,  , B, O, Y, S,

Cluster 2 words: became, house, white, marry, march, daughter,

Cluster 2 categories: O, L, D,  , M, O, V, I, E, S,

Cluster 3 words: song, general, hit, 's, hit, sings,

Cluster 3 categories: C, O, L, O, R, F, U, L,  , W, O, R, D, S,  , &,  , P, H, R, A, S, E, S,

Cluster 4 words: world, world, 's, world, war, world,

Cluster 4 categories: F, R, U, I, T, S,  , &,  , V, E, G, E, T, A, B, L, E, S,

Cluster 5 words: mark, middle, real, twain, 's, mark,

Cluster 5 categories: S, H, O, R, T,  , S, T, O, R, I, E, S,

Cluster 6 words: like, sound, sound, 's, type, 's,

Cluster 6 categories: P, R, E, S, I, D, E, N, T, S,  , &,  , B, A, S, E, B, A, L, L,

Cluster 7 words: href=, http, href