# Winning Jeopardy

## Reading in the Data

In [1]:
import pandas as pd

In [2]:
jeopardy = pd.read_csv("jeopardy.csv")

In [3]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [4]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [5]:
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

In [6]:
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

## Normalizing Text

In [7]:
import string

In [8]:
def normalize_QA(qa):
    qa = qa.lower()
    for c in string.punctuation:
        qa = qa.replace(c, "")
    return qa   
    

In [9]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize_QA)

In [10]:
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize_QA)

In [11]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams


## Normalizing Columns

In [12]:
def normalize_dollar_value(dollar):
    for c in string.punctuation:
        dollar = dollar.replace(c, "")
    try:
        dollar = int(dollar)
    except ValueError:
        dollar = 0
    return dollar    

In [13]:
jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_dollar_value)

In [14]:
pd.to_datetime(jeopardy['Air Date'])

0       2004-12-31
1       2004-12-31
2       2004-12-31
3       2004-12-31
4       2004-12-31
5       2004-12-31
6       2004-12-31
7       2004-12-31
8       2004-12-31
9       2004-12-31
10      2004-12-31
11      2004-12-31
12      2004-12-31
13      2004-12-31
14      2004-12-31
15      2004-12-31
16      2004-12-31
17      2004-12-31
18      2004-12-31
19      2004-12-31
20      2004-12-31
21      2004-12-31
22      2004-12-31
23      2004-12-31
24      2004-12-31
25      2004-12-31
26      2004-12-31
27      2004-12-31
28      2004-12-31
29      2004-12-31
           ...    
19969   2009-05-14
19970   2009-05-14
19971   2009-05-14
19972   2009-05-14
19973   2009-05-14
19974   2009-05-14
19975   2009-05-14
19976   2009-05-14
19977   2009-05-14
19978   2009-05-14
19979   2009-05-14
19980   2009-05-14
19981   2009-05-14
19982   2009-05-14
19983   2009-05-14
19984   2009-05-14
19985   2009-05-14
19986   2009-05-14
19987   2009-05-14
19988   2000-03-14
19989   2000-03-14
19990   2000

In [15]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200


## Answers in Questions

In [16]:
def count_words(row):
    split_answer = row['clean_answer'].split(" ")
    split_question = row['clean_question'].split(" ")
    match_count = 0
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    for word in split_answer:
        if word in split_question:
            match_count += 1
    return match_count / len(split_answer)        
    

In [17]:
jeopardy['answer_in_question'] = jeopardy.apply(count_words, axis=1)

In [18]:
jeopardy['answer_in_question'].mean()

0.060352773854698942

## Finding

The data shows that the answer only appears 6% of the time in the question and it wouldn't be a good strategy to only study the questions to find the answer.

## Recycled Questions

In [19]:
questions_overlap = []
terms_used = set()

In [20]:
for index, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(" ")
    split_question = [q for q in split_question if len(q) > 5]
    match_count = 0
    for term in split_question:
        if term in terms_used:
            match_count += 1
    for term in split_question:
        terms_used.add(term)
    
    if len(split_question) > 0:
        match_count = match_count / len(split_question)
    questions_overlap.append(match_count)    
    

In [21]:
jeopardy['question_overlap'] = questions_overlap

In [22]:
jeopardy['question_overlap'].mean()

0.69021171433935069

## Findings

The data shows that there a 70% overlap on terms used. This is only a subset of the all the questions and phrases have not been looked at so teh reality maybe this is not useful at understanding the questions as first appear. However, it is worth reviewing further. 

## Low Value vs. High Value Questions

In [23]:
def greater_than_800(row):
    if row['clean_value'] > 800:
        value = 1
    else:
        value = 0
    return value    

In [24]:
jeopardy['high_value'] = jeopardy.apply(greater_than_800, axis=1)

In [25]:
def low_high_count(word):
    low_count = 0
    high_count = 0
    for index, row in jeopardy.iterrows():
        split_question = row['clean_question'].split(" ")
        if word in split_question:
            if row['high_value'] == 1: 
                high_count += 1
            else:
                low_count += 1
    return (high_count, low_count)        

In [26]:
observed_expected = []

In [27]:
comparison_terms = list(terms_used)[:5]

In [28]:
comparison_terms

['pretender', 'bouquets', 'madrileno', 'knockout', 'sophomore']

In [29]:
for term in comparison_terms:
    count = low_high_count(term)
    observed_expected.append(count)

In [30]:
observed_expected

[(1, 1), (1, 0), (1, 0), (0, 3), (1, 1)]

## Chi-Squared Test

In [31]:
high_value_count = jeopardy[jeopardy['high_value'] == 1].shape[0]
low_value_count = jeopardy[jeopardy['high_value'] == 0].shape[0]

In [32]:
high_value_count

5734

In [33]:
low_value_count

14265

In [34]:
from scipy.stats import chisquare
import numpy as np

chi_squared = []

for value in observed_expected:
    total = sum(value)
    total_prop = total / jeopardy.shape[0]
    expected_high_value = total_prop * high_value_count
    expected_low_value = total_prop * low_value_count
    
    observed = np.array([value[0], value[0]])
    expected = np.array([expected_high_value, expected_low_value])
    
    chisq, p_value = chisquare(observed, expected)
    chi_squared.append([chisq, p_value])
    

In [35]:
chi_squared

[[0.44487748166127949, 0.50477764875459963],
 [1.889754963322559, 0.16922956195303951],
 [1.889754963322559, 0.16922956195303951],
 [3.0, 0.08326451666355042],
 [0.44487748166127949, 0.50477764875459963]]

## Findings

Only one term had a significant difference between the high and low value questions which overall is not significant. The frequncies were also low for a chi_squared test and would be better utilised with high frequncy terms. 