In [1]:
import pandas as pd

In [2]:
jeopardy = pd.read_csv("jeopardy.csv")

In [3]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [4]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [5]:
jeopardy.columns = jeopardy.columns.str.lstrip()

In [6]:
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [7]:
def normalize(s):
    from string import punctuation as punc
    from string import whitespace
    
    s2 = s.split()
    result = ["".join(c.lower() for c in w if c not in punc) for w in s2]
    result2 = " ".join(w for w in result if w not in whitespace)
    return result2

In [8]:
jeopardy["clean_question"] = jeopardy["Question"].apply(normalize)

In [9]:
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize)

In [10]:
def norm_dollars(s):
    from string import digits
    nums = "".join(c for c in s if c in digits)
    
    try:
        nums = int(nums)
    except Exception:
        nums = 0
    return nums

In [11]:
jeopardy["clean_value"] = jeopardy["Value"].apply(norm_dollars)

In [12]:
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

In [13]:
jeopardy["Air Date"].head()

0   2004-12-31
1   2004-12-31
2   2004-12-31
3   2004-12-31
4   2004-12-31
Name: Air Date, dtype: datetime64[ns]

In [14]:
def match(row):
    split_answer = row["clean_answer"].split()
    split_question = row["clean_question"].split()
    
    match_count = 0
    
    while "the" in split_answer:
        split_answer.remove("the")
    
    if len(split_answer) == 0:
        return 0
    
    match_count = sum(1 for word in split_answer if word in split_question)
    return match_count / len(split_answer)
    

In [15]:
jeopardy["answer_in_question"] = jeopardy.apply(match, axis=1)

In [16]:
jeopardy["answer_in_question"][jeopardy["answer_in_question"] > 0].shape

(2485,)

In [17]:
mean = jeopardy["answer_in_question"].mean()
mean

0.058206961574629956

Since there's only around a 5% chance of the answer being said in the question, we won't be able to determine the answers just from the questions

In [18]:
jeopardy = jeopardy.sort_values("Air Date")
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
    split_question = row["clean_question"].split()
    split_question = [w for w in split_question if len(w) >= 6]
    
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
        terms_used.add(word)
    if len(split_question) > 0:
        match_count = match_count / len(split_question)
    question_overlap.append(match_count)

In [19]:
jeopardy["question_overlap"] = question_overlap

In [20]:
jeopardy["question_overlap"].mean()

0.68890553166203283

Since questions overlap with previous questions 69% of the time, studying old questions and answers could be an effective study method.

In [21]:
def hilow(row):
    if row["clean_value"] > 800:
        value = 1
    else:
        value = 0
    return value

In [22]:
jeopardy["high_value"] = jeopardy.apply(hilow, axis=1)

In [26]:
def valuecount(word):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if word in row["clean_question"].split():
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

In [27]:
observed_expected = []
comparison_terms = list(terms_used)[:5]
for term in comparison_terms:
    val_count = valuecount(term)
    observed_expected.append(val_count)

In [28]:
observed_expected

[(1, 0), (1, 1), (0, 2), (1, 0), (1, 4)]

In [29]:
high_value_count = jeopardy[jeopardy["high_value"]==1].shape[0]
high_value_count

5734

In [30]:
low_value_count = jeopardy[jeopardy["high_value"]==0].shape[0]
low_value_count

14265

In [31]:
from scipy.stats import chisquare
import numpy as np

chi_squared = []
for l in observed_expected:
    total = sum(l)
    total_prop = total / jeopardy.shape[0]
    exp_hv_rows = total_prop * high_value_count
    exp_lv_rows = total_prop * low_value_count
    observed = np.array([l[0], l[1]])
    expected = np.array([exp_hv_rows, exp_lv_rows])
    chi_squared.append(chisquare(observed, expected))
chi_squared

[Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.44487748166127949, pvalue=0.50477764875459963),
 Power_divergenceResult(statistic=0.80392569225376798, pvalue=0.36992223780795708),
 Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.18383953104516373, pvalue=0.66809416232506025)]

None of the results from the words so far are statistically significant enough to determine if they correspond to high or low value questions.