In [1]:
import pandas as pd

jeopardy_data = pd.read_csv("jeopardy.csv")

jeopardy_data.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [2]:
# Let's check the format of the column headers
print(jeopardy_data.columns)

# Since the columns have spaces and could cause issues
# further on in our analysis, let's fix that

jeopardy_data.columns = jeopardy_data.columns.str.replace(' ', '')

print(jeopardy_data.columns)


Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')
Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')


In [3]:
# Let's see the format of all the data
jeopardy_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 7 columns):
ShowNumber    19999 non-null int64
AirDate       19999 non-null object
Round         19999 non-null object
Category      19999 non-null object
Value         19999 non-null object
Question      19999 non-null object
Answer        19999 non-null object
dtypes: int64(1), object(6)
memory usage: 1.1+ MB


In [4]:
# We need to normalize all the text of the questions & answers 
# in order to have an accurate analysis

from string import punctuation

def strip_punctuation(s):
    return ''.join(c for c in s if c not in punctuation)

def normalize_text(text):
        text = text.lower()
        text = strip_punctuation(text)
        return text

def normalize_values(text):
    text = strip_punctuation(text)
    try:
        text = int(text)
    except Exception:
            text = 0
    return text

jeopardy_data["clean_question"] = jeopardy_data["Question"].apply(normalize_text)
jeopardy_data["clean_answer"] = jeopardy_data["Answer"].apply(normalize_text)
jeopardy_data["clean_value"] = jeopardy_data["Value"].apply(normalize_values)
jeopardy_data["AirDate"] = pd.to_datetime(jeopardy_data["AirDate"])


In [5]:
jeopardy_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 10 columns):
ShowNumber        19999 non-null int64
AirDate           19999 non-null datetime64[ns]
Round             19999 non-null object
Category          19999 non-null object
Value             19999 non-null object
Question          19999 non-null object
Answer            19999 non-null object
clean_question    19999 non-null object
clean_answer      19999 non-null object
clean_value       19999 non-null int64
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 1.5+ MB


## Study Method Analysis ##

Let's see if we can find the most effective method for studying in two different ways:

1. How often the answer appears in the question
2. How often new questions are repeats of older questions

Depending on the results, we can tailor our study strategy to be more effective and a better use of our time

In [6]:
def answer_is_in_the_question(data):
    match_count = 0
    split_answer = data["clean_answer"].split(" ")
    split_question = data["clean_question"].split(" ")
    try:
        split_answer.remove("the")
    except Exception:
        split_answer
    
    if len(split_answer) == 0:
        return 0
    else:
        for item in split_answer:
            if item in split_question:
                match_count += 1
        return (match_count / len(split_answer))

jeopardy_data["answer_in_question"] = jeopardy_data.apply(answer_is_in_the_question, axis=1)
    

In [7]:
jeopardy_data["answer_in_question"].mean()

0.06035277385469894

In [8]:
jeopardy_data["answer_in_question"].value_counts(normalize=True) * 100

0.000000    86.904345
0.500000     7.250363
0.333333     2.755138
0.250000     0.850043
1.000000     0.610031
0.666667     0.510026
0.200000     0.410021
0.166667     0.140007
0.400000     0.140007
0.142857     0.095005
0.750000     0.090005
0.285714     0.050003
0.600000     0.045002
0.125000     0.045002
0.428571     0.015001
0.181818     0.010001
0.800000     0.010001
0.571429     0.010001
0.300000     0.010001
0.111111     0.010001
0.307692     0.005000
0.444444     0.005000
0.222222     0.005000
0.375000     0.005000
0.100000     0.005000
0.153846     0.005000
0.875000     0.005000
0.272727     0.005000
Name: answer_in_question, dtype: float64

## Results for study analysis question #1 ##

Given that 86% of answers do not appear in the question, and over 95% of questions have less than one word of the answer in the question, it seems the best decision by default is study old questions, versus trying to "game" and see if you can deduce the answer from the question itself

Let's see if we can see how many new questions are repeats of older questions 

In [9]:
jeopardy_data.sort_values("AirDate", ascending = True)

terms_used = set()
question_overlap = []

for i, row in jeopardy_data.iterrows():
    split_question = row["clean_question"].split(" ")
    split_question = [q for q in split_question if len(q) > 5]
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count +=1
    for word in split_question:
        terms_used.add(word)
    if len(split_question) > 0:
            match_count = (match_count / len(split_question))
    question_overlap.append(match_count)
jeopardy_data["question_overlap"] = question_overlap


In [10]:
jeopardy_data["question_overlap"].mean()

0.6902117143393507

## Results for study analysis question #2 ##

While not looking at phrases and only looking at a partial dataset of all Jeopardy questions, it appears 69% of question terms are re-used in some sort of fashion, so this is worth looking into further as a potential strategy.

Let's do this using a chi-squared function to see if we see a lot more terms versus what we expect, cut by whether a question is high-value or low-value (we'll set high-value in this case to be $800 or more)

In [11]:
def determine_value(row):
    if row["clean_value"] > 800:
        value = 1
        return value
    else: 
        value = 0
        return value
    
jeopardy_data["high_value"] = jeopardy_data.apply(determine_value, axis=1)

In [12]:
def word_usage(word):
    low_count = 0
    high_count = 0
    
    for i, row in jeopardy_data.iterrows():
        split_question = row["clean_question"].split(" ")
        if word in split_question:
            if row["high_value"] == 1:
                high_count += 1
            else: 
                low_count += 1
                
    return high_count, low_count

observed_expected = []

comparison_terms = list(terms_used)[:5]

for term in comparison_terms:
    observed_expected.append(word_usage(term))

In [13]:
import numpy as np
high_value_count = np.sum(jeopardy_data["high_value"] == 1)
low_value_count = np.sum(jeopardy_data["high_value"] == 0)

In [19]:
chi_squared = []
from scipy.stats import chisquare

for observations in observed_expected:
    total = sum(observations)
    total_prop = total / jeopardy_data.shape[0]
    high_value_expected = total_prop * high_value_count
    low_value_expected = total_prop * low_value_count
    
    observed = np.array([observations[0], observations[1]])
    expected = np.array([high_value_expected, low_value_expected])
    chi_squared.append(chisquare(observed, expected))

In [25]:
chi_squared

[Power_divergenceResult(statistic=3.423170782846152e-05, pvalue=0.9953317740648371),
 Power_divergenceResult(statistic=0.803925692253768, pvalue=0.3699222378079571),
 Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047)]

## Results ## 

None of the terms had a significant result, with no significant difference observed between high-value and low-value terms. Note that all p-values above were greater than 0.05. 
