In [1]:
import pandas as pd
jeopardy = pd.read_csv("jeopardy.csv")
print(jeopardy.head(5))
print(jeopardy.columns)   

   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  
Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype

In [2]:
jeopardy.columns = ["Show Number", "Air Date", "Round", "Category", "Value", "Question", "Answer"]

In [3]:
import re
def normal(string):
    string = string.lower()
    string = re.sub("[^A-Za-z0-9\s]", "", string)
    return string
jeopardy["clean_question"] = jeopardy["Question"].apply(normal)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normal)

In [4]:
def value(string):
    string = re.sub("[^A-Za-z0-9\s]", "", string)
    try:
        string = int(string)
    except:
        string = 0
    return string
jeopardy["clean_value"] = jeopardy["Value"].apply(value)
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

In [5]:
def ducible(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    match_count = 0
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer)==0:
        return 0
    for i in split_answer:
        if i in split_question:
            match_count +=1
    return match_count / len(split_answer)
jeopardy["answer_in_question"] = jeopardy.apply(ducible, axis =1)
print(jeopardy["answer_in_question"].mean())

0.0604932570693


which means that we can't just figure out the answer from the question, need to read them all.

In [8]:
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    for i in split_question:
        if len(i)<6:
            split_question.remove(i)
    match_count=0
    for i in split_question:
        if i in terms_used:
            match_count +=1
    for i in split_question:
        terms_used.add(i)
    if len(split_question)>0:
        match_count/=len(split_question)
    question_overlap.append(match_count)
jeopardy["question_overlap"] = question_overlap
print(jeopardy["question_overlap"].mean())


0.802789065891


although looks like that the question dataset is just a piece of the whole but nearly 80percents of it is overlap with old ones. it worth to think to recycle some duplicate questions.

In [16]:
def occur(row):
    value =0
    if row["clean_value"] > 800:
        value = 1
    return value
jeopardy["high_value"] = jeopardy.apply(occur, axis = 1)

def o2(word):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        if word in split_question:
            if row["high_value"] ==1:
                high_count +=1
            else: 
                low_count +=1
    return high_count, low_count
observed_expected = []
comparison_terms = list(terms_used)[:5]
for i in comparison_terms:
    observed_expected.append(o2(i))
observed_expected

[(1417, 3301), (1, 1), (0, 1), (2, 2), (1, 1)]

In [17]:
from scipy.stats import chisquare
import numpy as np

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []
for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

chi_squared

[Power_divergenceResult(statistic=4.28257258858768, pvalue=0.038505027441259804),
 Power_divergenceResult(statistic=0.44487748166127949, pvalue=0.50477764875459963),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.88975496332255899, pvalue=0.34554371914834681),
 Power_divergenceResult(statistic=0.44487748166127949, pvalue=0.50477764875459963)]