In [2]:
import pandas as pd

jeopardy = pd.read_csv("jeopardy.csv")

jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
print(jeopardy.columns)

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')


In [4]:
jeopardy.columns=['Show Number', 'Air Date', 'Round', 'Category', 'Value',
       'Question', 'Answer']

In [5]:
jeopardy.dtypes

Show Number     int64
Air Date       object
Round          object
Category       object
Value          object
Question       object
Answer         object
dtype: object

In [6]:
import string
def normalize(st):
    st=st.lower()
    st =''.join([i for i in st if i not in string.punctuation])
    return st

In [7]:
jeopardy['clean_question']=jeopardy['Question'].apply(normalize)
jeopardy['clean_answer']=jeopardy['Answer'].apply(normalize)


In [8]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams


In [9]:
def normalize_dollar(st):
    st=normalize(st)
    st=st.replace('$',"")
    try:
        st = int(st)
    except:
        st=0
    return st    

In [10]:
jeopardy['clean_value']=jeopardy['Value'].apply(normalize_dollar)


In [11]:
jeopardy['Air Date']=pd.to_datetime(jeopardy['Air Date'])

In [12]:
jeopardy[['clean_value','Air Date']].head()

Unnamed: 0,clean_value,Air Date
0,200,2004-12-31
1,200,2004-12-31
2,200,2004-12-31
3,200,2004-12-31
4,200,2004-12-31


In [13]:
def answer_in_question_count(row):
    split_answer=row['clean_answer'].split(' ')
    split_question=row['clean_question'].split(' ')
    
    match_count=0
    
    if 'the' in split_answer:
        split_answer.remove('the')
    
    if len(split_answer)==0:
        return 0
    
    for i in split_answer:
        if i in split_question:
            match_count+=1
    
    return match_count/float(len(split_answer))

In [14]:
jeopardy['answer_in_question']=jeopardy.apply(answer_in_question_count,axis=1)

In [15]:
jeopardy['answer_in_question'].max()

1.0

In [16]:
jeopardy['answer_in_question'].mean()

0.060352773854698942

The answer comes in only 6% of the question. That means we cannot put our confidence on the deduction from questions.

In [33]:
jeopardy = jeopardy.sort_values(by="Air Date",ascending=True)

In [60]:
question_overlap=[]
terms_used=set()

for row in jeopardy.itertuples():
    split_question=row.clean_question.split(' ')
    match_count=0
    split_question=[word for word in split_question if len(word) > 5]
    for word in split_question:
        if word in terms_used:
            match_count+=1
    for word in split_question:
        terms_used.add(word)
        
    if len(split_question) > 0:
        match_count=match_count/len(split_question)
    question_overlap.append(match_count)
        

In [61]:
jeopardy['question_overlap']=question_overlap

In [62]:
print(jeopardy['question_overlap'].mean())

0.687124288097


There is nearly 70% overlap between the new questions and old ones. We have considered just the single terms, not phrases . We are just looking at the sample set , so this is not that significant .So we may need to look more in the recycling of questions.

In [63]:
def clean_value(row):
    if row['clean_value'] > 800:
        return 1
    else:
        return 0
    
jeopardy['high_value']=jeopardy.apply(clean_value,axis=1)


    

In [68]:
def high_low_count(word):
    
    low_count=0
    high_count=0
    
    for i,row in jeopardy.iterrows():
        split_question=row['clean_question'].split(' ')
        
        if word in split_question:
            if row['high_value']==1:
                high_count+=1
            else:
                low_count+=1
                
    return high_count,low_count



In [69]:
observed_expected=[]

term_used=list(terms_used)

comparision_terms=term_used[0:5]

for term in comparision_terms:
    result=high_low_count(term)
    observed_expected.append(result)

In [71]:
observed_expected[0:5]

[(0, 1), (0, 3), (0, 1), (1, 0), (1, 2)]

In [73]:
high_value_count=sum(jeopardy['high_value'])
high_value_count

5734

In [75]:
low_value_count=len(jeopardy[jeopardy['high_value']==0])
low_value_count

14265

In [81]:
chi_squared=[]
from scipy.stats import chisquare
import numpy as np
for val in observed_expected:
    total=val[0]+val[1]
    
    total_prop=total / len(jeopardy)
    expected_high=total_prop * high_value_count
    expected_low=total_prop*low_value_count
    observed=np.array([val[0],val[1]])
    expected=np.array([expected_high,expected_low])
    chisquare_value,p_value=chisquare(observed,expected)
    chi_squared.append([chisquare_value,p_value])
    
chi_squared    

[[0.40196284612688399, 0.52607729857054686],
 [1.2058885383806519, 0.27214791766902047],
 [0.40196284612688399, 0.52607729857054686],
 [2.4877921171956752, 0.11473257634454047],
 [0.031881167234403623, 0.85828871632352932]]

No significant differance in the usage of terms between high value and low value counts.
It would be better to run this test for only those terms that have high frequencies.

Here are some potential next steps:

Find a better way to eliminate non-informative words than just removing words that are less than 6 characters long. Some ideas:
Manually create a list of words to remove, like the, than, etc.
Find a list of stopwords to remove.
Remove words that occur in more than a certain percentage (like 5%) of questions.
Perform the chi-squared test across more terms to see what terms have larger differences. This is hard to do currently because the code is slow, but here are some ideas:
Use the apply method to make the code that calculates frequencies more efficient.
Only select terms that have high frequencies across the dataset, and ignore the others.
Look more into the Category column and see if any interesting analysis can be done with it. Some ideas:
See which categories appear the most often.
Find the probability of each category appearing in each round.
Use the whole Jeopardy dataset (available here) instead of the subset we used in this mission.
Use phrases instead of single words when seeing if there's overlap between questions. Single words don't capture the whole context of the question well