In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import string

import matplotlib.pyplot as plt
%matplotlib inline

### For some reason, there are broken table tags in a lot of the tables below when they're rendered on Github. They don't actually look like that

In [2]:
# df = pd.read_csv('data/JEOPARDY_CSV.csv', encoding='utf-8')
full_df = pd.read_csv('../data/JEOPARDY_CSV.csv', encoding='utf-8')
print full_df.shape
full_df.head()

(216930, 7)


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
df = full_df.sample(frac=0.1)
df.reset_index(drop=True, inplace=True)

In [4]:
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,3913,2001-09-12,Jeopardy!,DICK & JANE & SPOT HODGEPODGE,$200,"See Spot. See Spot sub for Eddie, played by Mo...",Frasier
1,3198,1998-06-24,Double Jeopardy!,LEAP DAY IN SPORTS,$1000,"On Feb. 29, 1980 this 51-year-old ex-Red Wing ...",Gordie Howe
2,3518,1999-12-15,Jeopardy!,WORLD PRESS,$300,Le Figaro,France
3,3590,2000-03-24,Jeopardy!,GOING POSTAL,$100,On the list of the Top 10 most popular commemo...,Bugs Bunny
4,5820,2009-12-25,Jeopardy!,TERMS OF ENGINEERMENT,$600,The Khurais Project in this country's desert i...,Saudi Arabia


In [5]:
# Remove the dumb spaces
df.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

# Convert to Datetime
df['Air Date'] = pd.to_datetime(df['Air Date'])

# Clean out Value column
df['Value'] = df['Value'].str.replace('$','')
df['Value'] = df['Value'].str.replace(',','')
df['Value'] = df['Value'].apply(lambda x: None if x == 'None' else int(x))
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,3913,2001-09-12,Jeopardy!,DICK & JANE & SPOT HODGEPODGE,200.0,"See Spot. See Spot sub for Eddie, played by Mo...",Frasier
1,3198,1998-06-24,Double Jeopardy!,LEAP DAY IN SPORTS,1000.0,"On Feb. 29, 1980 this 51-year-old ex-Red Wing ...",Gordie Howe
2,3518,1999-12-15,Jeopardy!,WORLD PRESS,300.0,Le Figaro,France
3,3590,2000-03-24,Jeopardy!,GOING POSTAL,100.0,On the list of the Top 10 most popular commemo...,Bugs Bunny
4,5820,2009-12-25,Jeopardy!,TERMS OF ENGINEERMENT,600.0,The Khurais Project in this country's desert i...,Saudi Arabia


In [6]:
df['Question'].value_counts()[0:5]

[audio clue]           3
[video clue]           2
Elizabeth Taylor       2
The largest in area    2
Helium, aluminum       2
Name: Question, dtype: int64

In [7]:
df.shape

(21693, 7)

In [8]:
# Drop some useless questions
df = df[df['Question'] != '[audio clue]']
df = df[df['Question'] != '[video clue]']
df = df[df['Question'] != '[filler]']
df = df[df['Question'] != '(audio clue)']

In [9]:
df.shape

(21687, 7)

In [10]:
# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')
print len(stopwords)
print stopwords[:10]

153
[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your']


In [11]:
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [12]:
# here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [13]:
def create_clean_columns(df):    
    df['clean_question'] = df['Question'].apply(cleanhtml)
    df['clean_answer'] = df['Answer'].apply(cleanhtml)
    df['clean_category'] = df['Category'].apply(cleanhtml)
    df['everything'] = df['clean_question']+' '+df['clean_answer']+' '+df['clean_category']
    return df

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    cleantext = cleantext.replace('\n', '')
#     cleantext = cleantext.translate(None, string.punctuation)
#     cleantext = cleantext.replace('\'', '')
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    cleantext = regex.sub('', cleantext)
    cleantext = cleantext.lower()
    return cleantext

In [14]:
create_clean_columns(df)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_category,everything
0,3913,2001-09-12,Jeopardy!,DICK & JANE & SPOT HODGEPODGE,200.0,"See Spot. See Spot sub for Eddie, played by Mo...",Frasier,see spot see spot sub for eddie played by moos...,frasier,dick jane spot hodgepodge,see spot see spot sub for eddie played by moos...
1,3198,1998-06-24,Double Jeopardy!,LEAP DAY IN SPORTS,1000.0,"On Feb. 29, 1980 this 51-year-old ex-Red Wing ...",Gordie Howe,on feb 29 1980 this 51yearold exred wing score...,gordie howe,leap day in sports,on feb 29 1980 this 51yearold exred wing score...
2,3518,1999-12-15,Jeopardy!,WORLD PRESS,300.0,Le Figaro,France,le figaro,france,world press,le figaro france world press
3,3590,2000-03-24,Jeopardy!,GOING POSTAL,100.0,On the list of the Top 10 most popular commemo...,Bugs Bunny,on the list of the top 10 most popular commemo...,bugs bunny,going postal,on the list of the top 10 most popular commemo...
4,5820,2009-12-25,Jeopardy!,TERMS OF ENGINEERMENT,600.0,The Khurais Project in this country's desert i...,Saudi Arabia,the khurais project in this countrys desert in...,saudi arabia,terms of engineerment,the khurais project in this countrys desert in...
5,5539,2008-10-09,Double Jeopardy!,MAKING MONEY,1600.0,"(<a href=""http://www.j-archive.com/media/2008-...",offset,sarah of the clue crew shows a press at the bu...,offset,making money,sarah of the clue crew shows a press at the bu...
6,5130,2006-12-22,Double Jeopardy!,"COTTON, THE ACT",1200.0,"The purpose of spinning is to turn sliver, a l...",yarn,the purpose of spinning is to turn sliver a lo...,yarn,cotton the act,the purpose of spinning is to turn sliver a lo...
7,4852,2005-10-18,Double Jeopardy!,ASIA,800.0,Legend says an empress in 2640 B.C. discovered...,silk,legend says an empress in 2640 bc discovered h...,silk,asia,legend says an empress in 2640 bc discovered h...
8,3688,2000-09-20,Double Jeopardy!,SHORT STORIES,400.0,An old woman asks this Washington Irving chara...,Rip Van Winkle,an old woman asks this washington irving chara...,rip van winkle,short stories,an old woman asks this washington irving chara...
9,5170,2007-02-16,Double Jeopardy!,_____ & _____,400.0,The only railroad on the traditional Monopoly ...,B&O,the only railroad on the traditional monopoly ...,bo,,the only railroad on the traditional monopoly ...


In [41]:
# questions = df['clean_question'].values
questions = df['everything'].values
# categories = df['clean_category'].values
# answers = df['clean_answer'].values

In [42]:
#not super pythonic, no, not at all.
#use extend so it's a big flat list of vocab
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in questions:
    allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)


In [43]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

In [44]:
vocab_frame.head(20)

Unnamed: 0,words
see,see
spot,spot
see,see
spot,spot
sub,sub
for,for
eddi,eddie
play,played
by,by
moos,moose


In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib

# tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
#                                  min_df=0.2, stop_words='english',
#                                  use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

tfidf_vectorizer = TfidfVectorizer(stop_words='english',
                                 tokenizer=tokenize_and_stem, ngram_range=(1,3))

# tfidf_vectorizer = TfidfVectorizer(stop_words='english',
#                                  tokenizer=tokenize_only, ngram_range=(1,3))

# %time tfidf_matrix = tfidf_vectorizer.fit_transform(questions)

tfidf_vectorizer.fit(questions)
joblib.dump(tfidf_vectorizer, 'tfidf_test.pkl')

# print(tfidf_matrix.shape)

['tfidf_test.pkl', 'tfidf_test.pkl_01.npy', 'tfidf_test.pkl_02.npy']

In [46]:
tfidf_vectorizer = joblib.load('tfidf_test.pkl')
tfidf_matrix = tfidf_vectorizer.transform(questions)

In [47]:
terms = tfidf_vectorizer.get_feature_names()

In [48]:
# from sklearn.metrics.pairwise import cosine_similarity
# dist = 1 - cosine_similarity(tfidf_matrix)

In [49]:
from sklearn.cluster import KMeans

num_clusters = 50

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 4min 54s, sys: 1min 16s, total: 6min 10s
Wall time: 4min 5s


In [50]:
from sklearn.externals import joblib

joblib.dump(km,  'doc_cluster.pkl')
km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [51]:
len(clusters)

21687

In [52]:
df['Cluster'] = clusters

In [53]:
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_category,everything,Cluster
0,3913,2001-09-12,Jeopardy!,DICK & JANE & SPOT HODGEPODGE,200.0,"See Spot. See Spot sub for Eddie, played by Mo...",Frasier,see spot see spot sub for eddie played by moos...,frasier,dick jane spot hodgepodge,see spot see spot sub for eddie played by moos...,15
1,3198,1998-06-24,Double Jeopardy!,LEAP DAY IN SPORTS,1000.0,"On Feb. 29, 1980 this 51-year-old ex-Red Wing ...",Gordie Howe,on feb 29 1980 this 51yearold exred wing score...,gordie howe,leap day in sports,on feb 29 1980 this 51yearold exred wing score...,43
2,3518,1999-12-15,Jeopardy!,WORLD PRESS,300.0,Le Figaro,France,le figaro,france,world press,le figaro france world press,29
3,3590,2000-03-24,Jeopardy!,GOING POSTAL,100.0,On the list of the Top 10 most popular commemo...,Bugs Bunny,on the list of the top 10 most popular commemo...,bugs bunny,going postal,on the list of the top 10 most popular commemo...,8
4,5820,2009-12-25,Jeopardy!,TERMS OF ENGINEERMENT,600.0,The Khurais Project in this country's desert i...,Saudi Arabia,the khurais project in this countrys desert in...,saudi arabia,terms of engineerment,the khurais project in this countrys desert in...,32


In [54]:
df['Cluster'].value_counts()

8     7581
20     652
1      573
33     528
3      511
12     469
19     424
32     414
46     409
7      398
47     388
29     386
40     381
43     376
42     365
13     364
49     346
25     341
16     326
41     324
24     317
27     312
4      304
22     289
38     275
44     273
2      263
6      248
28     242
21     239
35     230
36     227
23     209
0      208
37     195
14     191
11     189
48     181
26     178
39     174
30     172
10     157
34     152
9      152
18     147
17     128
5      128
15     122
31     118
45     111
Name: Cluster, dtype: int64

In [55]:
print "Top terms per cluster:"
print ''
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(num_clusters):
    print "Cluster %d words:" % i, end=''
    for ind in order_centroids[i, :20]:
        print ' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=','
    print ''
    print ''
#     print("Cluster %d categories:" % i, end='')
#     for cat in df.ix[i]['Category']:#.values.tolist():
#         print(' %s,' % cat, end='')
    print ''
    print ''

SyntaxError: invalid syntax (<ipython-input-55-6a47a949b5f4>, line 5)

### LDA Attempts - work in progress

In [44]:
#strip any proper names from a text...unfortunately right now this is yanking the first word from a sentence too.
def strip_proppers(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent) if word.islower()]
    return "".join([" "+i if not i.startswith("'") and i not in string.punctuation else i for i in tokens]).strip()

In [45]:
#strip any proper nouns (NNP) or plural proper nouns (NNPS) from a text
from nltk.tag import pos_tag

def strip_proppers_POS(text):
    tagged = pos_tag(text.split()) #use NLTK's part of speech tagger
    non_propernouns = [word for word,pos in tagged if pos != 'NNP' and pos != 'NNPS']
    return non_propernouns

In [47]:
from gensim import corpora, models, similarities 

#remove proper names
%time preprocess = [strip_proppers(doc) for doc in questions]

#tokenize
%time tokenized_text = [tokenize_and_stem(text) for text in preprocess]

#remove stop words
%time texts = [[word for word in text if word not in stopwords] for text in tokenized_text]

CPU times: user 4.21 s, sys: 20 ms, total: 4.23 s
Wall time: 4.21 s
CPU times: user 9.58 s, sys: 76 ms, total: 9.65 s
Wall time: 9.57 s
CPU times: user 1.34 s, sys: 28 ms, total: 1.37 s
Wall time: 1.32 s


In [48]:
#create a Gensim dictionary from the texts
dictionary = corpora.Dictionary(texts)

#remove extremes (similar to the min/max df step used when creating the tf-idf matrix)
dictionary.filter_extremes(no_below=1, no_above=0.8)

#convert the dictionary to a bag of words corpus for reference
corpus = [dictionary.doc2bow(text) for text in texts]

In [50]:
%time lda = models.LdaModel(corpus, num_topics=5, id2word=dictionary, update_every=5, chunksize=10000, passes=100)

CPU times: user 28min 56s, sys: 1.22 s, total: 28min 57s
Wall time: 28min 57s


In [51]:
lda.show_topics()

[(0,
  u'0.011*citi + 0.011*state + 0.007*countri + 0.007*new + 0.007*name + 0.007*nation + 0.006*one + 0.006*us + 0.006*first + 0.005*world'),
 (1,
  u'0.009*film + 0.009*play + 0.006*titl + 0.005*one + 0.005*show + 0.004*star + 0.004*day + 0.004*song + 0.004*music + 0.004*man'),
 (2,
  u'0.009*one + 0.008*name + 0.008*war + 0.005*first + 0.005*presid + 0.004*henri + 0.003*king + 0.003*seen + 0.003*american + 0.003*year'),
 (3,
  u'0.014*name + 0.006*mean + 0.005*call + 0.005*island + 0.005*king + 0.004*like + 0.004*may + 0.004*one + 0.004*first + 0.003*use'),
 (4,
  u'0.007*type + 0.006*sea + 0.005*name + 0.005*use + 0.004*one + 0.004*clue + 0.004*first + 0.003*anim + 0.003*seen + 0.003*crew')]

In [67]:
topics_matrix = lda.show_topics(formatted=False, num_words=20)
topics_matrix = np.array(topics_matrix)

topic_words = topics_matrix[:,:,1]
for i in topic_words:
    print [str(word) for word in i]
    print ''

ValueError: setting an array element with a sequence

In [79]:
for i in range(len(topics_matrix)):
    print i

0
1
2
3
4
