In [1]:
import pandas as pd
import numpy as np
jeopardy=pd.read_csv('jeopardy.csv')
jeopardy.columns=jeopardy.columns.str.replace(' ','')
print(jeopardy.columns)
jeopardy.head(5)

Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')


Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [15]:
import re
def normalize_text(string):
    string=string.lower()
    string=re.sub(r"[;:\'\"\[\].,!?]", '',string)
    return string
def normalize_numbers(text):
    text=re.sub(r"\W", '',text)
    try:
        text=int(text)
    except:
        text=0
    return text

In [16]:
jeopardy['clean_question']=jeopardy['Question'].apply(normalize_text)
jeopardy['clean_answer']=jeopardy['Answer'].apply(normalize_text)
jeopardy['clean_value']=jeopardy['Value'].apply(normalize_numbers)
jeopardy['AirDate']=pd.to_datetime(jeopardy['AirDate'])

In [17]:
jeopardy

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value,high_value
19325,10,1984-09-21,Final Jeopardy!,U.S. PRESIDENTS,,"Adventurous 26th president, he was 1st to ride...",Theodore Roosevelt,adventurous 26th president he was 1st to ride ...,theodore roosevelt,0,0
19274,10,1984-09-21,Jeopardy!,GEOGRAPHY,$100,Formerly Formosa,Taiwan,formerly formosa,taiwan,100,0
19275,10,1984-09-21,Jeopardy!,DOUBLE TALK,$100,"Not a Hawaiian cow, but a dress worn by Hawaii...",a muumuu,not a hawaiian cow but a dress worn by hawaiia...,a muumuu,100,0
19276,10,1984-09-21,Jeopardy!,"""JACKS"" OF ALL TRADES",$100,He celebrated his 39th birthday 41 times,Jack Benny,he celebrated his 39th birthday 41 times,jack benny,100,0
19277,10,1984-09-21,Jeopardy!,SHIPS,$100,"""Unsinkable"" for most of its maiden voyage in ...",the Titanic,unsinkable for most of its maiden voyage in 1912,the titanic,100,0
19278,10,1984-09-21,Jeopardy!,"""B"" MOVIES",$100,"In '61 movie, Audrey Hepburn's alternative to ...",Breakfast at Tiffany's,in 61 movie audrey hepburns alternative to bru...,breakfast at tiffanys,100,0
19279,10,1984-09-21,Jeopardy!,SPORTS,$100,What Gary Player plays professionailly,golf,what gary player plays professionailly,golf,100,0
19280,10,1984-09-21,Jeopardy!,GEOGRAPHY,$200,Dutch is still an official language in what is...,Dutch Guiana,dutch is still an official language in what is...,dutch guiana,200,0
19281,10,1984-09-21,Jeopardy!,DOUBLE TALK,$200,Affirmative reply to an admiral's command,aye-aye,affirmative reply to an admirals command,aye-aye,200,0
19282,10,1984-09-21,Jeopardy!,"""JACKS"" OF ALL TRADES",$200,"Between him & his wife, they licked the platte...",Jack Spratt,between him & his wife they licked the platter...,jack spratt,200,0


In [18]:
def count_matches(row):
    split_answer=row['clean_answer'].split(' ')
    split_question=row['clean_question'].split(' ')
    match_count=0
    if "the" in split_answer:
        split_answer.remove('the')
    if len(split_answer) == 0:
        return 0
    for word in split_answer:
        if word in split_question:
            match_count+=1
    return match_count/len(split_answer)

In [19]:
answer_in_question= jeopardy.apply(count_matches, axis=1)
answer_in_question_mean=answer_in_question.mean()
print('Mean number of answers that occur in the question: '+ str(answer_in_question_mean))

Mean number of answers that occur in the question: 0.05608258393272658


# Answer terms in the question

This mean value indicates that we can only probably deduce the answer from the question 5.6% of the time which isn't very good. If we want to find a winning strategy, we need to try something else. 

In [7]:
jeopardy=jeopardy.sort_values('AirDate')
terms_used=set()
question_overlap=[]
for i,row in jeopardy.iterrows():
    split_question= row['clean_question'].split(" ")
    split_question=[x for x in split_question if len(x)>5]
    match_count=0
    for word in split_question:
        if word in terms_used:
            match_count+=1
        terms_used.add(word)
    if len(split_question)>0:
        match_count= match_count/len(split_question)
    question_overlap.append(match_count)
question_overlap_mean=np.mean(question_overlap)
print('Mean question overap: '+ str(question_overlap_mean))

Mean question overap: 0.681901388482


# Question Overlap
While this metric only looks at reappearance of words in questions, 68% of the words (length > 5) in new questions already appeared in previous questions. This metric is not perfect but it means that there may be something there that can still be investigated.

We can  try to improve our results. One way is to improve the filter for important words instead of just choosing words longer than 5 characters. We will do this by defining a list of "filler" words and filtering based on this list.

In [21]:
jeopardy=jeopardy.sort_values('AirDate')
terms_used=set()
question_overlap=[]
meaningless=['the','a','that','then','there','when', ''] #find online source for this list
for i,row in jeopardy.iterrows():
    split_question= row['clean_question'].split(" ")
    split_question=[x for x in split_question if x not in meaningless]
    match_count=0
    for word in split_question:
        if word in terms_used:
            match_count+=1
        terms_used.add(word)
    if len(split_question)>0:
        match_count= match_count/len(split_question)
    question_overlap.append(match_count)
question_overlap_mean=np.mean(question_overlap)
print('Mean question overap: '+ str(question_overlap_mean))

Mean question overap: 0.8547865557


In [22]:
def high_or_low(row):
    if row['clean_value']>800:
        return 1
    else:
        return 0
jeopardy['high_value']=jeopardy.apply(high_or_low,axis=1)

In [None]:
def high_low_count(word):
    low_count=0
    high_count=0
    for i, row in jeopardy.iterrows():
        split_question=row['clean_question'].split(' ')
        if word in split_question:
            if row['high_value']==1:
                high_count+=1
            else:
                low_count+=1
    return high_count, low_count
observed_expected=[]
terms_used=list(terms_used) #lets try and look only at terms that appear frequently
comparison_terms=terms_used
frequent_terms=[]
for term in comparison_terms:
    count=high_low_count(term)
    if sum(count)>50:
        observed_expected.append(count)
        frequent_terms.append(term)
print(observed_expected,frequent_terms)

In [None]:
from scipy.stats import chisquare

high_value_count=sum(jeopardy['high_value']==1)
low_value_count=sum(jeopardy['high_value']==0)
chi_squared=[]
for vals in observed_expected:
    total=sum(vals)
    total_prop=total/len(jeopardy)
    high_count_exp=total_prop*high_value_count
    low_count_exp=total_prop*low_value_count
    observed = np.array([vals[0], vals[1]])
    expected = np.array([high_count_exp, low_count_exp])
    chi_squared.append(chisquare(observed,expected))
chi_squared

None of the results are statistically significant. This means there is no statistical difference between the usage of the words tested and the value of the question. The terms tested also only appear less than 7 times which makes the chisquared test less valid. This could be repeated for terms that only have high frequencies.