In [1]:
import pandas as pd
jeopardy = pd.read_csv('jeopardy.csv')

In [32]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value,question_overlap,high_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200,0.0,0
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200,0.0,0
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200,0.0,0
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200,0.0,0
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200,0.0,0


print(jeopardy.columns)
jeopardy.columns = jeopardy.columns.str.strip()


In [7]:
jeopardy.columns = jeopardy.columns.str.strip()

In [8]:
import re

def normal_q(string):
    string = string.lower()
    string = re.sub("[^A-Za-z0-9\s]","",string)
    return string

def normal_a(string):
    string = string.lower()
    string = re.sub("[^A-Za-z0-9\s]","",string)
    return string

In [9]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normal_q)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normal_a)

In [10]:
def normal_d(string):
    string = string.replace('$',"")
    try:
        string = int(string)
    except Exception:
        string = 0
    return string
        

In [11]:
jeopardy['clean_value'] = jeopardy['Value'].apply(normal_d)
jeopardy['Air Date'] = jeopardy['Air Date'].apply(pd.to_datetime)

In [12]:
jeopardy.dtypes

Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value                int64
dtype: object

In [13]:
def split_s(row):
    split_answer = row['clean_answer'].split(" ")
    split_question = row['clean_question'].split(" ")
    match_count = 0
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    for word in split_answer:
        if word in split_question:
            match_count += 1
    return match_count / len(split_answer)

In [14]:
answer_in_question = jeopardy.apply(split_s,axis=1)

In [15]:
answer_in_question.mean()

0.06049325706933587

These results tell us that 6% of the questions contain the same word are the answer; and thus 6% of answers are deducible from the question. That in mind, it's best to not rely on the questions when considering your answer. 

In [16]:
questions_overlap = []
terms_used = set()

for i, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(" ")
    split_question = [word for word in split_question if len(word) > 5]
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
    for word in split_question:
        terms_used.add(word)
    if len(split_question) > 0:
        match_count = match_count / len(split_question)
    questions_overlap.append(match_count)
        

In [17]:
jeopardy['question_overlap'] = questions_overlap


In [18]:
jeopardy['question_overlap'].mean()

0.6908737315671962

In [19]:
def values(row):
    value = 0
    if row['clean_value'] > 800:
        value = 1
    return value


In [20]:
jeopardy['high_value'] = jeopardy.apply(values,axis=1)

In [28]:
def counts(word):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        words = row['clean_question'].split(" ")
        if word in words:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count


In [29]:
observed_expected = []
terms_used = list(terms_used)
comparison_terms = terms_used[:5]

In [30]:
for term in comparison_terms:
    val = counts(term)
    observed_expected.append(val)

In [31]:
observed_expected

[(1, 3), (0, 4), (0, 1), (1, 1), (1, 0)]

In [38]:
high_value_count = jeopardy[jeopardy['high_value'] == 1].shape[0]
low_value_count = jeopardy[jeopardy['high_value'] == 0].shape[0]
chi_squared = []

import numpy as np
from scipy.stats import chisquare 

for item in observed_expected:
    total = sum(item)
    total_prop = total / jeopardy.shape[0]
    exp_high = total_prop * high_value_count
    exp_low = total_prop * low_value_count
    
    observed = np.array([item[0], item[1]])
    expected = np.array([exp_high, exp_low])
    chi_squared.append(chisquare(observed, expected))


In [39]:
chi_squared

[Power_divergenceResult(statistic=4.122707846712507e-05, pvalue=0.9948769527982859),
 Power_divergenceResult(statistic=1.323484394756106, pvalue=0.24996766692297967),
 Power_divergenceResult(statistic=0.3308710986890265, pvalue=0.565146603267378),
 Power_divergenceResult(statistic=0.6765980594008285, pvalue=0.4107606373026974),
 Power_divergenceResult(statistic=3.022325020112631, pvalue=0.08212564786568953)]