In [42]:
import pandas as pd
jeopardy = pd.read_csv("jeopardy.csv")
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [43]:
jeopardy.columns = jeopardy.columns.str.replace(" ","")
jeopardy.columns

Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [44]:
import string
def normalize(s):
    s_low = s.lower()
    translator = str.maketrans("", "", string.punctuation)
    s_punc = s_low.translate(translator)
    return s_punc

jeopardy["clean_question"] = jeopardy["Question"].apply(normalize)
jeopardy["clean_question"].head()

0    for the last 8 years of his life galileo was u...
1    no 2 1912 olympian football star at carlisle i...
2    the city of yuma in this state has a record av...
3    in 1963 live on the art linkletter show this c...
4    signer of the dec of indep framer of the const...
Name: clean_question, dtype: object

In [45]:
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize)
jeopardy["clean_answer"].head()

0    copernicus
1    jim thorpe
2       arizona
3     mcdonalds
4    john adams
Name: clean_answer, dtype: object

In [46]:
def dollar(s):
    translator = str.maketrans("","", string.punctuation)
    s = s.translate(translator)
    try:
        s = int(s)
    except ValueError:
        s = 0
    return s

jeopardy["clean_value"] = jeopardy["Value"].apply(dollar)

In [47]:
jeopardy["AirDate"] = pd.to_datetime(jeopardy["AirDate"])

In [48]:
def match(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    match_count = 0
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    for ans in split_answer:
        if ans in split_question:
            match_count += 1

    return match_count / len(split_answer)    


jeopardy["answer_in_question"] = jeopardy.apply(match,axis=1)
jeopardy["answer_in_question"].mean()

0.060352773854699004

6% of the words are both in answer and question, which is a small fraction to deduce answer from the questions. It is not a good idea to figure out the answer from the question.

In [49]:
question_overlap = []
terms_used = set()

for index,row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    split_question = [word for word in split_question if len(word) > 5]
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
    for word in split_question:
        terms_used.add(word)
    if len(split_question) > 0:
        match_count /= len(split_question)
    question_overlap.append(match_count)
    
jeopardy["question_overlap"] = question_overlap
jeopardy["question_overlap"].mean()



0.6902117143393427

%70 of the words in questions are similar to previous ones. This means it is worth to investigate further.

In [50]:
def high_value(row):
    value = 0
    if row["clean_value"] > 800:
        value = 1
    return value

jeopardy["high_value"] = jeopardy.apply(high_value,axis=1)

def high_word(word):
    low_count = 0
    high_count = 0
    for index,row in jeopardy.iterrows():
        if word in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

observed_expected = []
comparison_terms = list(terms_used)[:5]

for term in comparison_terms:
    counts = high_word(term)
    observed_expected.append(counts)
    
    
observed_expected        

[(1, 0), (1, 1), (0, 1), (0, 1), (0, 2)]

In [59]:
import numpy as np
from scipy.stats import chisquare

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]
chi_squared = []

for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    high_exp = total_prop * high_value_count
    low_exp = total_prop * low_value_count
    
    observed = np.array([obs[0],obs[1]])
    expected = np.array([high_exp,low_exp])

    chi_squared.append(chisquare(observed,expected))
chi_squared

[Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.4448774816612795, pvalue=0.5047776487545996),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.803925692253768, pvalue=0.3699222378079571)]

Chi-squared results

None of the terms had a significant difference in usage between high value and low value rows. Additionally, the frequencies were all lower than 5, so the chi-squared test isn't as valid. It would be better to run this test with only terms that have higher frequencies.
