# Jeopardy! Topic Modeling

Source of Data:
* https://drive.google.com/file/d/0BwT5wj_P7BKXUl9tOUJWYzVvUjA/view?resourcekey=0-uFrn8bQkUfSCvJlmtKGCdQ
* https://www.reddit.com/r/datasets/comments/1uyd0t/200000_jeopardy_questions_in_a_json_file/

In [33]:
# import necessary packages
#!pip install fuzzywuzzy
#!pip install python-Levenshtein

# data handling
import pandas as pd
import pickle

# string manipulation
import re
import string

# text processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.decomposition import NMF
# nltk.download('stopwords') #run this once

#suppress warnings
import warnings
warnings.filterwarnings("ignore")

## 1. Construct Dataset of Jeopardy Questions

In [34]:
# Load the raw clue export; the path is relative to the notebook's working directory.
df = pd.read_csv('./JEOPARDY_CSV.csv')
# Summarise column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216930 entries, 0 to 216929
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Show Number  216930 non-null  int64 
 1    Air Date    216930 non-null  object
 2    Round       216930 non-null  object
 3    Category    216930 non-null  object
 4    Value       216930 non-null  object
 5    Question    216930 non-null  object
 6    Answer      216928 non-null  object
dtypes: int64(1), object(6)
memory usage: 11.6+ MB


In [35]:
# Normalise headers: trim padding, use underscores, lowercase (' Air Date' -> 'air_date').
normalised_columns = df.columns.str.strip()
normalised_columns = normalised_columns.str.replace(' ', '_')
df.columns = normalised_columns.str.lower()

# Parse the air-date strings into datetimes.
df['air_date'] = pd.to_datetime(df['air_date'])

# Drop clues whose question matches the media/placeholder pattern, then any row
# with a missing value, and renumber the index from 0.
is_media_clue = df['question'].str.contains('seen here|[Cc]lue|href|filler')
df = df.loc[~is_media_clue].dropna(axis=0).reset_index(drop=True)

# Combine category, question and answer into one text field for modelling;
# the original columns are kept as they are.
answer_text = df['answer'].astype(str)
df['text'] = df['category'] + ' ' + df['question'] + ' ' + answer_text
df.head()

Unnamed: 0,show_number,air_date,round,category,value,question,answer,text
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,"HISTORY For the last 8 years of his life, Gali..."
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,ESPN's TOP 10 ALL-TIME ATHLETES No. 2: 1912 Ol...
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,EVERYBODY TALKS ABOUT IT... The city of Yuma i...
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,"THE COMPANY LINE In 1963, live on ""The Art Lin..."
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,EPITAPHS & TRIBUTES Signer of the Dec. of Inde...


In [36]:
# Convert value to ints: strip the literal '$' and thousands separators, and treat
# 'None' entries as 200. regex=False makes '$' a plain character on every pandas
# version (as a regex it is the end-of-string anchor), and astype(str) first lets
# this cell be re-run after the column has already been converted.
df.value = (
    df.value.astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace('None', '200', regex=False)
    .astype(int)
)

def replace_values(n):
    '''Snap a clue value onto the grid 200, 400, ..., 1800.

    Values already on the grid are returned unchanged; anything else is rounded
    up to the next grid value, and values beyond the grid are capped at 1800.
    '''
    grid = range(200, 2000, 200)
    if n in grid:
        return n
    # First grid value strictly above n, or the cap when n exceeds the whole grid.
    return next((step for step in grid if n < step), 1800)

# Snap every clue value onto the 200-step grid, then list the distinct results as a check.
df.value = df.value.map(replace_values)
sorted(df.value.unique())

[200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800]

In [37]:
# Rows and columns remaining after filtering and adding the text column.
df.shape

(203485, 8)

## 2. Text-Cleaning (URLs, HTML tags, digits, etc.)

In [38]:
# Apply a first round of text cleaning techniques
def clean_text(text):
    '''Lowercase ``text`` and strip markup, URLs, digit-bearing words and noise characters.

    Steps, in order:
      1. lowercase;
      2. drop HTML tags, then URLs (done first, while their delimiters are intact);
      3. replace every word containing a digit with a single space;
      4. turn line breaks into spaces so neighbouring words stay separated;
      5. delete ellipses, backslashes, single/double quotes and the characters & : ; $ !

    Other punctuation (commas, full stops, hyphens, ...) is left in place.
    '''
    text = text.lower()
    # Tags go before URLs: in '<a href="http://...">label</a>' the URL pattern
    # would otherwise run on into the label and swallow it.
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\w*\d\w*', ' ', text)
    text = re.sub(r'\n', ' ', text)
    # Raw string on purpose: without it the backslash alternative collapses into
    # an escaped '|' and neither backslashes nor line breaks are matched.
    text = re.sub(r'\.\.\.|\\|["\'&:;$!]', '', text)
    return text

# Overwrite the combined text column with its cleaned form; the original
# category, question and answer columns are not modified.
df.text = df.text.map(clean_text)

In [39]:
# View the first five cleaned documents (category + question + answer, lowercased and stripped).
df.text.to_list()[:5]

['history for the last   years of his life, galileo was under house arrest for espousing this mans theory copernicus',
 'espns top   all-time athletes no.     olympian football star at carlisle indian school   mlb seasons with the reds, giants  braves jim thorpe',
 'everybody talks about it the city of yuma in this state has a record average of  ,  hours of sunshine each year arizona',
 'the company line in  , live on the art linkletter show, this company served its billionth burger mcdonalds',
 'epitaphs  tributes signer of the dec. of indep., framer of the constitution of mass., second president of the united states john adams']

## 3. Tokenization, Removal of Digits, Stop Words and Punctuation
Further preprocessing of the new feature ‘text’.
NLTK (Natural Language Toolkit) is one of the best libraries for preprocessing text data.

In [40]:
# POS tags kept as nouns; add 'JJ', 'JJR', 'JJS' here to keep adjectives as well.
_NOUN_TAGS = frozenset({'NN', 'NNP', 'NNS', 'NNPS'})

# Extra stop words added on top of NLTK's English list.
_EXTRA_STOPWORDS = ('jpg', 'blank', 'wmv', 'also', 'used', 'made', 'like', 'one', 'wa', 'ha', 'st', 'name')

# Matches any single ASCII punctuation character.
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))

# Filled on first use so the NLTK resources are only touched when text is processed.
_nlp_resources = None


def _load_nlp_resources():
    '''Return ``(lemmatiser, stop_words)``, building them on the first call only.'''
    global _nlp_resources
    if _nlp_resources is None:
        stop_words = frozenset(stopwords.words('english')).union(_EXTRA_STOPWORDS)
        _nlp_resources = (WordNetLemmatizer(), stop_words)
    return _nlp_resources


def preprocess_text(text):
    '''Tokenize ``text`` and return its lemmatised nouns as a list of strings.

    Each token has punctuation characters stripped and is lemmatised; the
    lemmas are POS-tagged and only nouns that are not stop words and are at
    least two characters long are kept, in their original order.

    The lemmatiser and stop-word set are created once and reused: this function
    is called once per document, so rebuilding them every time is wasted work.
    '''
    lemmatiser, stop_words = _load_nlp_resources()

    # tokenize and lemmatize; stripped tokens stay in the sequence (possibly as
    # empty strings) so the tagger sees the same input positions as before
    tokens = [_PUNCTUATION_RE.sub('', token) for token in word_tokenize(text)]
    lemmas = [lemmatiser.lemmatize(token) for token in tokens]

    # filter for nouns, minimum 2 letters long
    return [
        word
        for word, pos in nltk.pos_tag(lemmas)
        if pos in _NOUN_TAGS and word not in stop_words and len(word) > 1
    ]

In [41]:
# Shape is unchanged from the earlier check: preprocess_text has only been defined, not applied.
df.shape

(203485, 8)

## 4. TF-IDF Vectorizer to generate Document-Term Matrix

In [42]:
# Fit TF-IDF with preprocess_text as the analyzer. Terms appearing in more than
# 15% of documents (max_df) or fewer than 0.1% of documents (min_df) are dropped.
cv_tfidf = TfidfVectorizer(analyzer=preprocess_text, max_df = 0.15, min_df = .001)
tfidf_matrix = cv_tfidf.fit_transform(df.text)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back for older installs.
if hasattr(cv_tfidf, 'get_feature_names_out'):
    feature_names = cv_tfidf.get_feature_names_out()
else:
    feature_names = cv_tfidf.get_feature_names()

# Dense document-term matrix (one row per clue, one column per term), saved for reuse.
doc_word = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
doc_word.to_pickle("dtm.pkl")
doc_word.shape

(203485, 1094)

In [43]:
# Peek at the document-term matrix: one row per clue, one column per retained term.
doc_word.head()

Unnamed: 0,abbrev,abbreviation,abraham,academy,act,action,activity,actor,actress,ad,...,writer,wwii,yankee,year,york,youll,youre,youth,youve,zealand
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.341982,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.320805,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 5. Matrix Factorization with NMF

In [44]:
# Factorise the TF-IDF matrix into 13 topics; random_state fixes the seed so reruns match.
# fit_transform returns the document-topic weights, which this notebook calls H.
# NOTE(review): scikit-learn's documentation names this factor W and components_ H,
# i.e. the letters used in this notebook are swapped relative to the library docs.
nmf_model = NMF(n_components=13,random_state=1)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(203485, 13)

In [45]:
# Topic-term weights learned by NMF: one row per topic, one column per vocabulary term.
topic_word = nmf_model.components_
topic_word.shape

(13, 1094)

In [46]:
# Looking at the words that make up each topic.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back for older installs.
if hasattr(cv_tfidf, 'get_feature_names_out'):
    words = cv_tfidf.get_feature_names_out()
else:
    words = cv_tfidf.get_feature_names()

# argsort is ascending, so the reversed slice -1:-10:-1 picks the indices of the
# 9 highest-weighted terms of each topic, strongest first.
top_word_idx = nmf_model.components_.argsort(axis=1)[:, -1:-10:-1]
topic_words = [[words[term_idx] for term_idx in topic_row] for topic_row in top_word_idx]

# Transpose so each topic is a column and each row is a rank.
topic_words_df = pd.DataFrame(topic_words).T

In [47]:
# Hand-picked labels for the 13 NMF components, listed in component order
# (nouns-only vocabulary). The list length must equal the number of components.
# NOTE(review): the labels appear to have been chosen by reading the top words of
# each component, so they need re-checking whenever the model is refit.
topics = ['Language', 'Cities', 'Countries', 'Film & TV', 'U.S. Hodgepodge', 'Magazines', 'History', 'Science', 'Capitals', 'Words', 'Literature', 'People', 'Culture']

# alternative labels for a nouns + adjectives vocabulary
#topics = ['Language', 'Cities', 'Potpourri', 'Countries', 'Potpourri', 'Film', 'Magazines', 'History', 'Literature', 'Science', 'Words', 'Television', 'Culture']

# Label the top-words table with the topic names and display it.
topic_words_df.columns = topics
topic_words_df

Unnamed: 0,Language,Cities,Countries,Film & TV,U.S. Hodgepodge,Magazines,History,Science,Capitals,Words,Literature,People,Culture
0,word,city,country,film,state,time,world,type,capital,letter,book,man,year
1,phrase,york,island,movie,island,rhyme,war,food,world,term,author,john,day
2,term,river,music,tv,university,york,history,science,birthplace,number,woman,president,president
3,meaning,home,geography,title,college,day,island,term,river,alphabet,title,people,century
4,origin,museum,france,star,california,celebrity,leader,bird,geography,men,novel,george,history
5,latin,place,language,character,president,song,geography,fruit,lie,end,character,history,sport
6,come,port,president,role,river,magazine,ii,water,museum,symbol,century,song,number
7,language,university,people,song,park,sport,nation,body,idea,form,literature,music,war
8,something,geography,china,actor,texas,life,battle,life,province,synonym,john,century,life


### Topic-Terms Matrix (W)

In [48]:
# Create Topic-terms matrix (W) with .components_: one row per topic label, one
# column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back for older installs.
if hasattr(cv_tfidf, 'get_feature_names_out'):
    term_names = cv_tfidf.get_feature_names_out()
else:
    term_names = cv_tfidf.get_feature_names()

components_df = pd.DataFrame(nmf_model.components_, columns=term_names, index=topics)
components_df.head()

Unnamed: 0,abbrev,abbreviation,abraham,academy,act,action,activity,actor,actress,ad,...,writer,wwii,yankee,year,york,youll,youre,youth,youve,zealand
Language,0.0,0.0,0.000745,0.0,0.042264,0.022097,0.012397,0.000492,0.0,0.001055,...,0.018608,0.002838,0.00174,0.0,0.0,0.017187,0.061293,0.010727,0.013412,0.000432
Cities,0.0,0.0,0.008201,0.014509,0.002382,0.003746,0.00162,0.0,0.001225,0.025534,...,0.006236,0.007652,0.004101,0.0,0.528522,0.067113,0.03414,0.003825,0.00022,0.014873
Countries,0.001459,0.000983,0.0,0.001138,0.007804,0.00355,0.001383,0.000283,0.004417,0.014485,...,0.004768,0.013003,0.0,0.0,0.0,0.047434,0.085465,0.003385,0.006406,0.111469
Film & TV,0.000799,0.002307,0.004894,0.01152,0.009789,0.06016,0.002267,0.43788,0.2311,0.029976,...,0.034488,0.011885,0.00939,0.0,0.043235,0.004467,0.042062,0.005355,0.007952,0.00209
U.S. Hodgepodge,0.009563,0.057734,0.006728,0.00619,0.009372,0.002567,0.001599,0.0,0.001671,0.0,...,0.002667,0.001596,0.002478,0.0,0.16856,0.038255,0.035985,0.002667,0.002982,0.0


In [49]:
# option to print top words per category with scores
# for topic in components_df.index:
#     tmp = components_df.loc[topic]
#     print(f'Topic: {topic}')
#     print('The words with the highest value are:')
#     print(tmp.nlargest(10))
#     print('\n')

### Document-Topic Matrix (H)

In [50]:
# Document-topic weights as a DataFrame: one row per clue, one column per topic
# label, rounded to 5 decimals. Rows use the default 0..n-1 index.
H = pd.DataFrame(doc_topic.round(5),
             #index = df.text,
             columns = topics)

# Assign each document the topic with the largest weight.
# NOTE(review): idxmax returns the first column on ties, so a row whose weights
# are all 0 is labelled with the first topic ('Language') — confirm this is acceptable.
H['Category'] = H.idxmax(axis=1)
H.head()

Unnamed: 0,Language,Cities,Countries,Film & TV,U.S. Hodgepodge,Magazines,History,Science,Capitals,Words,Literature,People,Culture,Category
0,0.0,7e-05,0.0,0.0,0.0,0.0,0.00756,0.00118,0.0,0.0,0.00165,0.0657,0.05828,People
1,0.00021,0.00037,0.00014,0.00814,0.0017,0.00112,0.00059,0.00104,0.0,0.00028,0.00061,0.00186,0.00309,Film & TV
2,0.0,0.03436,0.0,0.0,0.05183,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05018,U.S. Hodgepodge
3,0.00042,0.00119,0.0,0.00675,0.00044,0.00319,0.00298,0.00348,0.0,0.00127,0.00182,0.0078,0.0032,People
4,0.0,0.0,0.00057,0.00174,0.05396,0.0,0.0,0.0,1e-05,0.00013,0.00305,0.01656,0.00407,U.S. Hodgepodge


In [51]:
# Number of clues assigned to each meta-category, most frequent first.
H.Category.value_counts()

Literature         34364
Film & TV          28373
Science            23702
People             23378
History            19167
U.S. Hodgepodge    15968
Culture            13351
Language           12681
Magazines           8198
Cities              7538
Words               7406
Countries           7037
Capitals            2322
Name: Category, dtype: int64

In [52]:
# Attach each clue's meta-category. df (index reset after filtering) and H (default
# index, one row per document in df.text) share the same 0..n-1 index, so an
# index-aligned assignment gives the same result as merging and dropping the topic
# columns. Unlike the merge, re-running this cell does not create duplicate
# Category_x / Category_y columns.
df = df.assign(Category=H['Category'])
df.head()

Unnamed: 0,show_number,air_date,round,category,value,question,answer,text,Category
0,4680,2004-12-31,Jeopardy!,HISTORY,200,"For the last 8 years of his life, Galileo was ...",Copernicus,"history for the last years of his life, gali...",People
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,espns top all-time athletes no. olympian...,Film & TV
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,200,The city of Yuma in this state has a record av...,Arizona,everybody talks about it the city of yuma in t...,U.S. Hodgepodge
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,"the company line in , live on the art linklet...",People
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,epitaphs tributes signer of the dec. of indep...,U.S. Hodgepodge


In [53]:
# Persist the labelled clue table to the current working directory.
df.to_pickle("df.pkl")

## 6. Random Question Generator

In [54]:
import time
# Generate random questions given meta-category or category, value:

def generate_q(category, value, mc=True, df=df, sleep=2, random_state=None):
    '''Print one randomly chosen clue, pause, then print its answer.

    Parameters
    ----------
    category : str
        When ``mc`` is True, a meta-category label matched against the
        ``Category`` column. Otherwise an original show category, upper-cased
        before being matched against the ``category`` column.
    value : int
        Clue value, matched exactly against the ``value`` column.
    mc : bool, default True
        Whether ``category`` names a meta-category (True) or a show category (False).
    df : pandas.DataFrame
        Clue table to draw from. The default is bound when this cell runs, so
        re-run the cell after rebuilding ``df``.
    sleep : float, default 2
        Seconds to wait between printing the question and printing the answer.
    random_state : optional
        Passed to ``DataFrame.sample``; set it to get a repeatable pick.

    Raises
    ------
    ValueError
        If no clue matches the requested category and value.
    '''
    #filter dataset
    if mc:
        in_category = df['Category'] == category
    else:
        in_category = df['category'] == category.upper()
    matches = df[in_category & (df['value'] == value)]

    # Fail with a readable message instead of the opaque error sample() raises
    # when asked to draw from an empty selection.
    if matches.empty:
        kind = 'meta-category' if mc else 'category'
        raise ValueError(f'No questions found for {kind} {category!r} with value {value}')

    row = matches.sample(1, random_state=random_state).iloc[0]
    print(f"Question from the category {row['category']} for {row['value']}:")
    print(row['question'])
    time.sleep(sleep)
    print(f"Answer: {row['answer']}")
    
# Smoke test: look up an original show category (mc=False) with a 3-second pause before the answer.
generate_q('BEFORE & AFTER', 200, mc=False, sleep=3)

Question from the category BEFORE & AFTER for 200:
A vodka & tomato juice drink turns into a Texas-based cosmetics giant
Answer: Bloody Mary Kay


In [55]:
# 'Literature' looked up as an original show category (J-category): with mc=False
# the name is upper-cased and matched against the category column.
generate_q('Literature', 200, mc=False)

Question from the category LITERATURE for 200:
This Nobel Prize winner dedicated "The Waste Land" to fellow poet Ezra Pound
Answer: T.S. Eliot


In [56]:
# 'Literature' looked up as a meta-category (the NMF-assigned Category column),
# so the printed show category can be something else entirely.
generate_q('Literature', 200, sleep=0.5)

Question from the category BOGIE MEN for 200:
Rick Blaine
Answer: Casablanca


## Next Steps

Add a "View More Options" tab with options to filter by:
* years (slider)
* J-category (entry box that auto-populates anticipated terms)