In [19]:
import pandas as pd 
import numpy as np 
from scipy.stats import chisquare

jeopardy = pd.read_csv('jeopardy.csv')

print(jeopardy.head(5))

print(jeopardy.columns)

   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  
Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype

In [20]:
jeopardy.columns = ['ShowNumber', 'AirDate', 'Round', 'Category', 'Value',
       'Question', 'Answer']

In [21]:
print (jeopardy.columns)

Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')


In [22]:
import re
def clean_text(txt):
    txt = txt.lower()
    txt = re.sub("[^a-z0-9\s]","",txt)
    return txt

In [23]:
jeopardy['clean_question'] = jeopardy['Question'].apply(clean_text)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(clean_text)

In [24]:
jeopardy.head()

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams


In [25]:
jeopardy.Value

0          $200
1          $200
2          $200
3          $200
4          $200
5          $200
6          $400
7          $400
8          $400
9          $400
10         $400
11         $400
12         $600
13         $600
14         $600
15         $600
16         $600
17         $600
18         $800
19         $800
20         $800
21         $800
22       $2,000
23         $800
24        $1000
25        $1000
26        $1000
27        $1000
28        $1000
29         $400
          ...  
19969     $1200
19970     $1200
19971    $1,500
19972     $1200
19973     $1200
19974     $1200
19975     $1600
19976     $1600
19977     $1600
19978     $1600
19979     $1600
19980     $1600
19981    $1,200
19982     $2000
19983     $2000
19984     $2000
19985     $2000
19986     $2000
19987      None
19988      $100
19989      $100
19990      $100
19991      $100
19992      $100
19993      $100
19994      $200
19995      $200
19996      $200
19997      $200
19998      $200
Name: Value, dtype: obje

In [26]:
def clean_value(value):
    value = re.sub("[^0-9\s]","",value)
    try:
        value = int(value)
    except:
        value = 0
    return value

In [27]:
jeopardy['clean_value'] = jeopardy['Value'].apply(clean_value)

In [28]:
jeopardy.drop(['Value','Question','Answer'],axis=1,inplace=True)

In [29]:
jeopardy

Unnamed: 0,ShowNumber,AirDate,Round,Category,clean_question,clean_answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,for the last 8 years of his life galileo was u...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,the city of yuma in this state has a record av...,arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,in 1963 live on the art linkletter show this c...,mcdonalds,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,signer of the dec of indep framer of the const...,john adams,200
5,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,in the title of an aesop fable this insect sha...,the ant,200
6,4680,2004-12-31,Jeopardy!,HISTORY,built in 312 bc to link rome the south of ita...,the appian way,400
7,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,no 8 30 steals for the birmingham barons 2306 ...,michael jordan,400
8,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,in the winter of 197172 a record 1122 inches o...,washington,400
9,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,this housewares store was named for the packag...,crate barrel,400


In [30]:
jeopardy["AirDate"] = pd.to_datetime(jeopardy['AirDate'])

In [31]:
jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 7 columns):
ShowNumber        19999 non-null int64
AirDate           19999 non-null datetime64[ns]
Round             19999 non-null object
Category          19999 non-null object
clean_question    19999 non-null object
clean_answer      19999 non-null object
clean_value       19999 non-null int64
dtypes: datetime64[ns](1), int64(2), object(4)
memory usage: 1.1+ MB


In [32]:
def ans_in_ques(row):
    split_answer = row.clean_answer.split(" ")
    split_question = row.clean_question.split(" ")
    match_count = 0
    if "the" in split_answer:
        split_answer.remove('the')
    if len(split_answer) == 0:
        return 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
    div = match_count/len(split_answer)
    return div

jeopardy['answer_in_question'] = jeopardy.apply(ans_in_ques, axis=1)
jeopardy['answer_in_question'].mean()

0.060493257069335872

In [33]:
jeopardy.sort('AirDate', ascending=True)
question_overlap = []
terms_used = set()

for index, row in jeopardy.iterrows():
    split_question = row.clean_question.split(" ")
    split_question = [q for q in split_question if len(q) > 5]
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
    for word in split_question:
        terms_used.add(word)
    if len(split_question) > 0:
        match_count/=len(split_question)
    question_overlap.append(match_count)    
jeopardy['question_overlap'] = question_overlap
jeopardy['question_overlap'].mean()



  if __name__ == '__main__':


0.69087373156719623

In [38]:
def count_usage(term):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if term in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

comparison_terms = list(terms_used)[:5]
observed_expected = []
for term in comparison_terms:
    observed_expected.append(count_usage(term))

observed_expected

[(0, 2), (0, 1), (0, 1), (0, 1), (0, 1)]

In [39]:
high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []
for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

chi_squared

[Power_divergenceResult(statistic=0.80392569225376798, pvalue=0.36992223780795708),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686)]