In [196]:
import pandas as pd
from scipy.stats import chisquare
import numpy as np

jeopardy = pd.read_csv("jeopardy.csv")
print(jeopardy.head(5))

print(jeopardy.columns)



   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  
Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype

In [197]:
jeopardy.columns=['Show Number', 'Air Date', 'Round', 'Category', 'Value',
       'Question', 'Answer']

In [198]:
import re

def normString(inp):
    inp = inp.lower()
    inp = re.sub("[^A-Za-z0-9\s]", "", inp)
    return inp

def normValues(inp):
    inp = re.sub("[^A-Za-z0-9\s]", "", inp)
    try:
        inp = int(inp)
    except Exception:
        inp =0
    return inp

In [199]:
jeopardy["clean_question"] = jeopardy["Question"].apply(normString)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normString)
jeopardy["clean_value"] = jeopardy["Value"].apply(normValues)

In [200]:
jeopardy["Air Date"]=pd.to_datetime(jeopardy["Air Date"])

In [201]:
def count(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    match_count =0
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    
    for i in split_answer:
        if i in split_question:
            match_count += 1
        return match_count/len(split_answer)

In [202]:
jeopardy["answer_in_question"] = jeopardy.apply(count,axis=1)
print(jeopardy["answer_in_question"])

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
5        0.0
6        0.0
7        0.0
8        0.0
9        0.0
10       0.0
11       0.0
12       0.0
13       0.0
14       0.0
15       0.0
16       0.0
17       0.0
18       0.0
19       0.0
20       0.0
21       0.0
22       0.0
23       0.0
24       0.0
25       0.0
26       0.0
27       0.0
28       0.0
29       0.0
        ... 
19969    0.0
19970    0.0
19971    0.0
19972    0.0
19973    0.0
19974    0.0
19975    0.0
19976    0.0
19977    0.0
19978    0.0
19979    0.0
19980    0.5
19981    0.0
19982    0.0
19983    0.0
19984    0.0
19985    0.0
19986    0.0
19987    0.0
19988    0.0
19989    0.0
19990    0.0
19991    0.0
19992    0.0
19993    0.0
19994    1.0
19995    0.0
19996    0.0
19997    0.0
19998    0.0
Name: answer_in_question, dtype: float64


In [203]:
print(jeopardy["answer_in_question"].mean())

0.0296856893849


In [204]:
question_overlap=[]
terms_used = []

for i, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(' ')
    split_question = [c for c in split_question if len(c) > 5]
    match_count = 0
    for c in split_question:
        if c in terms_used:
            match_count += 1
        else: 
            terms_used.append(c)
    if len(split_question) > 0:
        question_overlap.append(match_count / len(split_question))
    else:
        question_overlap.append(0)
jeopardy['question_overlap'] = question_overlap
jeopardy['question_overlap'].head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: question_overlap, dtype: float64

In [205]:
print(jeopardy["question_overlap"].mean())

0.692596005734


In [206]:
def cleanValue(row):
    value = 0
    if row['clean_value'] > 800:
        value = 1
    else:
        value = 0
    return(value)

In [207]:
jeopardy["high_value"]= jeopardy.apply(cleanValue,axis=1)

In [208]:
def countValue(term):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if term in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

comparison_terms = list(terms_used)[:5]
observed_expected = []
for term in comparison_terms:
    observed_expected.append(countValue(term))

observed_expected

[(1, 6), (3, 2), (0, 1), (11, 14), (0, 2)]

In [209]:
high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []
for i in observed_expected:
    total = sum(i)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([i[0], i[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

print(chi_squared)

[Power_divergenceResult(statistic=0.70835065396621411, pvalue=0.39999189913636146), Power_divergenceResult(statistic=2.3995960878537224, pvalue=0.12136658322360773), Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686), Power_divergenceResult(statistic=2.8723025608618364, pvalue=0.09011585768849395), Power_divergenceResult(statistic=0.80392569225376798, pvalue=0.36992223780795708)]
