In [1]:
import pandas as pd

jeopardy = pd.read_csv("jeopardy.csv")
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [2]:
for col in jeopardy.columns:
    jeopardy.rename(columns={col: col.strip()}, inplace=True)

In [3]:
import string

def normalize(input_string):
    return "".join([char.lower() for char in input_string if char not in string.punctuation])

def dollars(input_string):
    try:
        new_string = int(normalize(input_string))
    except: 
        new_string = 0
    return new_string

In [4]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize)
jeopardy['clean_value'] = jeopardy['Value'].apply(dollars)
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200


In [5]:
def answer_in_question(row):
    
    match_count = 0
    split_answer = [word for word in row['clean_answer'].split(" ") if word != 'the']
    split_question = row['clean_question'].split(" ")
    
    for answer in split_answer:
        if answer in split_question:
            match_count += 1
    
    if len(split_answer) == 0:
        return 0
    else:
        return match_count / len(split_answer)

In [6]:
jeopardy['answer_in_question'] = jeopardy.apply(answer_in_question, axis=1)
jeopardy['answer_in_question'].mean()

0.05973712438535679

In [7]:
import numpy as np

question_overlap = []
terms_used = set()

for _, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(' ')
    split_question = list(filter(lambda x: len(x) > 5, split_question))

    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
        terms_used.add(word)

    if len(split_question) > 0:
        match_count /= len(split_question)
    
    question_overlap.append(match_count)

print(np.mean(question_overlap))

0.6919577992203644


In [8]:
def determine_value(row):
    if row['clean_value'] > 800:
        value = 1
    else:
        value = 0
    
    return value

def count_usage(word):
    low_count = 0
    high_count = 0
    
    for _, row in jeopardy.iterrows():
        split_question = row['clean_question'].split(' ')
        if word in split_question:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    
    return high_count, low_count

In [9]:
jeopardy['high_value'] = jeopardy.apply(determine_value, axis=1)

In [10]:
observed_expected = []
comparison_terms = list(terms_used)[:5]

for term in comparison_terms:
    observed_expected.append(count_usage(term))

In [11]:
observed_expected

[(1, 0), (31, 54), (0, 1), (0, 1), (1, 2)]

In [12]:
from scipy.stats import chisquare

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []
for item in observed_expected:
    total = sum(item)
    total_prop = total / jeopardy.shape[0]
    exp_high = total_prop * high_value_count
    exp_low = total_prop * low_value_count
    
    observed = [item[0], item[1]]
    expected = [exp_high, exp_low]
    
    chi_squared.append(chisquare(observed, expected))
    
chi_squared

[Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=2.5281398109533804, pvalue=0.1118319831524019),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.03188116723440362, pvalue=0.8582887163235293)]