# Jeopardy statistics
## Reading the data

In [99]:
import pandas as pd
jeopardy = pd.read_csv('jeopardy.csv')
print(jeopardy.head())

   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  


In [100]:
print(jeopardy.columns)

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')


In [101]:
jeopardy.columns = jeopardy.columns.str.strip()
print(jeopardy.columns)

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')


## Normalizing data

In [102]:
import re
def normalize(element):
    return re.sub("[^A-Za-z0-9\s]", "", element.lower())


jeopardy['clean_question'] = jeopardy['Question'].apply(normalize)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize)

In [103]:
def normalize_value(element):
    try:
        return int(normalize(element.replace('$','')))
    except:
        return 0

jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_value)
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

## Identifying what to study
Are answers deducible from questions?
Do questions repeat themselves over the years?

Let's count the matches in answer words and question words.

In [104]:
def answer_match_questions(row):
    split_answer = row['clean_answer'].split(' ')
    while 'the' in split_answer:
        split_answer.remove('the')
    if len(split_answer) == 0:
        return 0
    else:
        split_question = row['clean_question'].split(' ')
        match_count = 0
        for word in split_answer:
            if word in split_question:
                match_count +=1
        return match_count / len(split_answer)

jeopardy['answer_in_question'] = jeopardy.apply(answer_match_questions, axis=1)

In [105]:
print(jeopardy['answer_in_question'].mean())

0.059877607599993714


On average, few words in the answers show up in the questions, only about 6% of the time.

## Question overlap

Let's find out how often terms come back accross questions.

In [106]:
jeopardy = jeopardy.sort_values(by='Air Date')

In [107]:
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(' ')
    split_question = [word for word in split_question if len(word)>5]
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count +=1
    for word in split_question:
        terms_used.add(word)
    if len(split_question) > 0:
        match_count = match_count/len(split_question)
    question_overlap.append(match_count)

jeopardy['question_overlap'] = question_overlap
print(jeopardy['question_overlap'].mean())

0.6876260592169802


It seems that on average, words in questions reappear more often, ie on average 68% of the words in a question appeared in previous questions, this is better than words in answers appearing in corresponding questions, but need more investigation to make sure.

## High value question words

In [108]:
def question_value(row):
    if row['clean_value'] > 800:
        return 1
    return 0

jeopardy['high_value'] = jeopardy.apply(question_value, axis=1)

In [109]:
def word_evaluation(word):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        split_question = row['clean_question'].split(' ')
        if word in split_question:
            if row['high_value'] ==1:
                high_count +=1
            else:
                low_count +=1
    return high_count, low_count

observed_expected = []
comparison_terms = list(terms_used)[:5]
for term in comparison_terms:
    observed_expected.append(word_evaluation(term))

In [110]:
observed_expected

[(0, 1), (0, 1), (0, 1), (0, 1), (0, 2)]

In [114]:
from scipy.stats import chisquare
import numpy as np

# how many high value questions
high_value_count = (jeopardy['high_value'] == 1).shape[0] 
# how many low value questions
low_value_count = (jeopardy['high_value'] == 0).shape[0]
chi_squared = []
n = jeopardy.shape[0]

for element in observed_expected:
    # in how many questions the term appears, both low and high value ones
    total = sum(element)
    # in which proportion of all questions the term appears
    total_prop = total/n
    # in how many high value questions the term should appear
    expected_high_total = total_prop * high_value_count
    # in how many low value questions the term should appear
    expected_low_total = total_prop * low_value_count
    
    observed = np.array([element[0], element[1]])
    expected = np.array([expected_high_total, expected_low_total])
    chi_squared.append(chisquare(observed, expected))
    

In [115]:
chi_squared

[Power_divergenceResult(statistic=1.0, pvalue=0.31731050786291404),
 Power_divergenceResult(statistic=1.0, pvalue=0.31731050786291404),
 Power_divergenceResult(statistic=1.0, pvalue=0.31731050786291404),
 Power_divergenceResult(statistic=1.0, pvalue=0.31731050786291404),
 Power_divergenceResult(statistic=2.0, pvalue=0.15729920705028505)]

In [None]:
The test doesnt reveal anything significant, probably because the values are so low. SHould repeat the test with only