In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import string

import matplotlib.pyplot as plt
%matplotlib inline

In [87]:
# df = pd.read_csv('data/JEOPARDY_CSV.csv', encoding='utf-8')
full_df = pd.read_csv('../data/JEOPARDY_CSV.csv', encoding='utf-8')
print(full_df.shape)
full_df.head()

(216930, 7)


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
df = full_df.sample(frac=0.1)
df.reset_index(drop=True, inplace=True)

In [4]:
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,1376,1990-09-03,Jeopardy!,FOOD,$400,They're immature pigeons sometimes raised for ...,squabs
1,4345,2003-06-20,Jeopardy!,BIBLICAL QUOTES,$800,"Paul wrote to Timothy, ""Drink no longer water,...",wine
2,312,1985-11-19,Jeopardy!,WORD ORIGINS,$100,"Though an insect larva, its name comes from Fr...",caterpillar
3,309,1985-11-14,Jeopardy!,INSECTS,$400,The actual title of “Jimmy Crack Corn”,The Blue Tail Fly
4,5435,2008-04-04,Jeopardy!,MAKE IT STICK,$200,This line of auto body products includes 4 Min...,Bondo


In [5]:
# Remove the dumb spaces
df.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

# Convert to Datetime
df['Air Date'] = pd.to_datetime(df['Air Date'])

# Clean out Value column
df['Value'] = df['Value'].str.replace('$','')
df['Value'] = df['Value'].str.replace(',','')
df['Value'] = df['Value'].apply(lambda x: None if x == 'None' else int(x))
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,1376,1990-09-03,Jeopardy!,FOOD,400.0,They're immature pigeons sometimes raised for ...,squabs
1,4345,2003-06-20,Jeopardy!,BIBLICAL QUOTES,800.0,"Paul wrote to Timothy, ""Drink no longer water,...",wine
2,312,1985-11-19,Jeopardy!,WORD ORIGINS,100.0,"Though an insect larva, its name comes from Fr...",caterpillar
3,309,1985-11-14,Jeopardy!,INSECTS,400.0,The actual title of “Jimmy Crack Corn”,The Blue Tail Fly
4,5435,2008-04-04,Jeopardy!,MAKE IT STICK,200.0,This line of auto body products includes 4 Min...,Bondo


In [6]:
df['Question'].value_counts()[0:5]

[audio clue]                                                                                                      3
[video clue]                                                                                                      3
"When the one great scorer comes to write against your name, he marks -- not that you won or lost -- but" this    2
"A Time to Heal"                                                                                                  2
Daikon                                                                                                            2
Name: Question, dtype: int64

In [7]:
df.shape

(21693, 7)

In [8]:
# Drop some useless questions
df = df[df['Question'] != '[audio clue]']
df = df[df['Question'] != '[video clue]']
df = df[df['Question'] != '[filler]']
df = df[df['Question'] != '(audio clue)']

In [9]:
df.shape

(21687, 7)

In [11]:
# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')
print(len(stopwords))
print(stopwords[:10])

153
[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your']


In [12]:
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [13]:
# here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [22]:
def create_clean_columns(df):    
    df['clean_question'] = df['Question'].apply(cleanhtml)
    df['clean_answer'] = df['Answer'].apply(cleanhtml)
#     df['clean_category'] = df['Category'].apply(cleanhtml)
    df['everything'] = df['clean_question']+' '+df['clean_answer']#+' '+df['clean_category']
    return df

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    cleantext = cleantext.replace('\n', '')
#     cleantext = cleantext.translate(None, string.punctuation)
#     cleantext = cleantext.replace('\'', '')
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    cleantext = regex.sub('', cleantext)
    cleantext = cleantext.lower()
    return cleantext

In [23]:
create_clean_columns(df)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,everything
0,1376,1990-09-03,Jeopardy!,FOOD,400.0,They're immature pigeons sometimes raised for ...,squabs,theyre immature pigeons sometimes raised for food,squabs,theyre immature pigeons sometimes raised for f...
1,4345,2003-06-20,Jeopardy!,BIBLICAL QUOTES,800.0,"Paul wrote to Timothy, ""Drink no longer water,...",wine,paul wrote to timothy drink no longer water bu...,wine,paul wrote to timothy drink no longer water bu...
2,312,1985-11-19,Jeopardy!,WORD ORIGINS,100.0,"Though an insect larva, its name comes from Fr...",caterpillar,though an insect larva its name comes from fre...,caterpillar,though an insect larva its name comes from fre...
3,309,1985-11-14,Jeopardy!,INSECTS,400.0,The actual title of “Jimmy Crack Corn”,The Blue Tail Fly,the actual title of “jimmy crack corn”,the blue tail fly,the actual title of “jimmy crack corn” the blu...
4,5435,2008-04-04,Jeopardy!,MAKE IT STICK,200.0,This line of auto body products includes 4 Min...,Bondo,this line of auto body products includes 4 min...,bondo,this line of auto body products includes 4 min...
5,4747,2005-04-05,Double Jeopardy!,YOU SLEIGH ME,400.0,J. Pierpont Morgan's uncle James Pierpont wrot...,"""Jingle Bells""",j pierpont morgans uncle james pierpont wrote ...,jingle bells,j pierpont morgans uncle james pierpont wrote ...
6,3118,1998-03-04,Double Jeopardy!,MAGAZINES,200.0,Ellery Queen & now Mary Higgins Clark have the...,Mystery,ellery queen now mary higgins clark have thei...,mystery,ellery queen now mary higgins clark have thei...
7,6149,2011-05-12,Double Jeopardy!,COLORFUL 19th CENTURY LIT,1600.0,The first chapter of this 1887 work is titled ...,A Study in Scarlet,the first chapter of this 1887 work is titled ...,a study in scarlet,the first chapter of this 1887 work is titled ...
8,1331,1990-05-21,Double Jeopardy!,EARLY AMERICA,800.0,"1797 ""affair"" in which 3 agents for France dem...","""XYZ"" Affair",1797 affair in which 3 agents for france deman...,xyz affair,1797 affair in which 3 agents for france deman...
9,3160,1998-05-01,Jeopardy!,LITERATURE,200.0,"His ""Jungle Book"" prose begins, ""It was seven ...",Rudyard Kipling,his jungle book prose begins it was seven oclo...,rudyard kipling,his jungle book prose begins it was seven oclo...


In [24]:
questions = df['everything'].values

In [25]:
#not super pythonic, no, not at all.
#use extend so it's a big flat list of vocab
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in questions:
    allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)


In [26]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

In [27]:
vocab_frame.head()

Unnamed: 0,words
theyr,theyre
immatur,immature
pigeon,pigeons
sometim,sometimes
rais,raised


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib

# tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
#                                  min_df=0.2, stop_words='english',
#                                  use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

tfidf_vectorizer = TfidfVectorizer(stop_words='english',
                                 tokenizer=tokenize_and_stem, ngram_range=(1,3))

# tfidf_vectorizer = TfidfVectorizer(stop_words='english',
#                                  tokenizer=tokenize_only, ngram_range=(1,3))

# %time tfidf_matrix = tfidf_vectorizer.fit_transform(questions)

tfidf_vectorizer.fit(questions)
joblib.dump(tfidf_vectorizer, 'tfidf_test.pkl')

# print(tfidf_matrix.shape)

['tfidf_test.pkl', 'tfidf_test.pkl_01.npy', 'tfidf_test.pkl_02.npy']

In [29]:
tfidf_vectorizer = joblib.load('tfidf_test.pkl')
tfidf_matrix = tfidf_vectorizer.transform(questions)

In [30]:
terms = tfidf_vectorizer.get_feature_names()

In [31]:
# from sklearn.metrics.pairwise import cosine_similarity
# dist = 1 - cosine_similarity(tfidf_matrix)

In [32]:
from sklearn.cluster import KMeans

num_clusters = 50

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 3min 42s, sys: 12 s, total: 3min 54s
Wall time: 2min 11s


In [34]:
from sklearn.externals import joblib

joblib.dump(km,  'doc_cluster.pkl')
km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [35]:
len(clusters)

21687

In [36]:
df['Cluster'] = clusters

In [37]:
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,everything,Cluster
0,1376,1990-09-03,Jeopardy!,FOOD,400.0,They're immature pigeons sometimes raised for ...,squabs,theyre immature pigeons sometimes raised for food,squabs,theyre immature pigeons sometimes raised for f...,12
1,4345,2003-06-20,Jeopardy!,BIBLICAL QUOTES,800.0,"Paul wrote to Timothy, ""Drink no longer water,...",wine,paul wrote to timothy drink no longer water bu...,wine,paul wrote to timothy drink no longer water bu...,26
2,312,1985-11-19,Jeopardy!,WORD ORIGINS,100.0,"Though an insect larva, its name comes from Fr...",caterpillar,though an insect larva its name comes from fre...,caterpillar,though an insect larva its name comes from fre...,32
3,309,1985-11-14,Jeopardy!,INSECTS,400.0,The actual title of “Jimmy Crack Corn”,The Blue Tail Fly,the actual title of “jimmy crack corn”,the blue tail fly,the actual title of “jimmy crack corn” the blu...,2
4,5435,2008-04-04,Jeopardy!,MAKE IT STICK,200.0,This line of auto body products includes 4 Min...,Bondo,this line of auto body products includes 4 min...,bondo,this line of auto body products includes 4 min...,47


In [38]:
df['Cluster'].value_counts()

12    6779
3     1427
16     555
15     536
20     511
44     488
23     487
2      468
19     436
41     394
1      356
4      346
46     342
25     335
47     326
13     325
14     322
39     312
6      303
26     301
36     300
37     277
0      271
28     269
22     266
5      261
34     260
10     253
32     247
18     247
7      237
27     236
42     223
29     222
38     222
21     222
9      220
31     219
40     199
24     199
43     177
11     172
35     170
45     165
30     161
33     160
17     132
49     128
8      127
48      96
Name: Cluster, dtype: int64

In [39]:
from __future__ import print_function

print("Top terms per cluster:")
print()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    for ind in order_centroids[i, :20]:
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print()
    print()
#     print("Cluster %d categories:" % i, end='')
#     for cat in df.ix[i]['Category']:#.values.tolist():
#         print(' %s,' % cat, end='')
    print()
    print()

Top terms per cluster:

Cluster 0 words: john, kennedy, john, f, john, f, john, paul, adam, playing, john, paul, pope, st, pope, pope, wrote, john, st, john,



Cluster 1 words: movie, tv, dog, playing, star, fame, series, films, hall, tv, hall, tv, jack, hot, actor, title, roles, hot, animal, produced,



Cluster 2 words: title, films, novel, character, title, playing, musical, robert, playing, story, star, hit, featured, classic, girl, king, wrote, published, based, song,



Cluster 3 words: type, islands, sea, because, ocean, greek, bay, red, run, body, return, color, feet, person, blooded, mile, trees, chambers, pressure, species,



Cluster 4 words: known, capital, best, islands, better, best, country, better, country, king, once, new, south, type, once, known, henry, dynasty, artist, artist,



Cluster 5 words: clue, clue, crew, sarah, sarah, sarah, clue, crew, reports, cheryl, cheryl, cheryl, kelly, kelly, kelly, jon, jon, jon, monitor, sofia,



Cluster 6 words: book, god, gree

### LDA Attempts - work in progress

In [44]:
#strip any proper names from a text...unfortunately right now this is yanking the first word from a sentence too.
def strip_proppers(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent) if word.islower()]
    return "".join([" "+i if not i.startswith("'") and i not in string.punctuation else i for i in tokens]).strip()

In [45]:
#strip any proper nouns (NNP) or plural proper nouns (NNPS) from a text
from nltk.tag import pos_tag

def strip_proppers_POS(text):
    tagged = pos_tag(text.split()) #use NLTK's part of speech tagger
    non_propernouns = [word for word,pos in tagged if pos != 'NNP' and pos != 'NNPS']
    return non_propernouns

In [47]:
from gensim import corpora, models, similarities 

#remove proper names
%time preprocess = [strip_proppers(doc) for doc in questions]

#tokenize
%time tokenized_text = [tokenize_and_stem(text) for text in preprocess]

#remove stop words
%time texts = [[word for word in text if word not in stopwords] for text in tokenized_text]

CPU times: user 4.21 s, sys: 20 ms, total: 4.23 s
Wall time: 4.21 s
CPU times: user 9.58 s, sys: 76 ms, total: 9.65 s
Wall time: 9.57 s
CPU times: user 1.34 s, sys: 28 ms, total: 1.37 s
Wall time: 1.32 s


In [48]:
#create a Gensim dictionary from the texts
dictionary = corpora.Dictionary(texts)

#remove extremes (similar to the min/max df step used when creating the tf-idf matrix)
dictionary.filter_extremes(no_below=1, no_above=0.8)

#convert the dictionary to a bag of words corpus for reference
corpus = [dictionary.doc2bow(text) for text in texts]

In [50]:
%time lda = models.LdaModel(corpus, num_topics=5, id2word=dictionary, update_every=5, chunksize=10000, passes=100)

CPU times: user 28min 56s, sys: 1.22 s, total: 28min 57s
Wall time: 28min 57s


In [51]:
lda.show_topics()

[(0,
  u'0.011*citi + 0.011*state + 0.007*countri + 0.007*new + 0.007*name + 0.007*nation + 0.006*one + 0.006*us + 0.006*first + 0.005*world'),
 (1,
  u'0.009*film + 0.009*play + 0.006*titl + 0.005*one + 0.005*show + 0.004*star + 0.004*day + 0.004*song + 0.004*music + 0.004*man'),
 (2,
  u'0.009*one + 0.008*name + 0.008*war + 0.005*first + 0.005*presid + 0.004*henri + 0.003*king + 0.003*seen + 0.003*american + 0.003*year'),
 (3,
  u'0.014*name + 0.006*mean + 0.005*call + 0.005*island + 0.005*king + 0.004*like + 0.004*may + 0.004*one + 0.004*first + 0.003*use'),
 (4,
  u'0.007*type + 0.006*sea + 0.005*name + 0.005*use + 0.004*one + 0.004*clue + 0.004*first + 0.003*anim + 0.003*seen + 0.003*crew')]

In [67]:
topics_matrix = lda.show_topics(formatted=False, num_words=20)
topics_matrix = np.array(topics_matrix)

topic_words = topics_matrix[:,:,1]
for i in topic_words:
    print([str(word) for word in i])
    print()

ValueError: setting an array element with a sequence

In [79]:
for i in range(len(topics_matrix)):
    print(i)

0
1
2
3
4
