In [83]:
import pandas as pd
import re
from scipy.stats import chisquare
import numpy as np

# Reading the data

In [84]:
jeopardy = pd.read_csv('jeopardy.csv')
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


# Cleaning the dataset

In [85]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [86]:
jeopardy.columns = jeopardy.columns.str.replace(' ', '')
jeopardy.columns

Index(['ShowNumber', 'AirDate', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [87]:
def normalize_qa(string):
    return re.sub('[^a-zA-Z0-9\s]', '', string.lower())

In [88]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize_qa)
jeopardy['clean_question'].head()

0    for the last 8 years of his life galileo was u...
1    no 2 1912 olympian football star at carlisle i...
2    the city of yuma in this state has a record av...
3    in 1963 live on the art linkletter show this c...
4    signer of the dec of indep framer of the const...
Name: clean_question, dtype: object

In [89]:
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize_qa)
jeopardy['clean_answer'].head()

0    copernicus
1    jim thorpe
2       arizona
3     mcdonalds
4    john adams
Name: clean_answer, dtype: object

In [90]:
def norm_to_int(string):
    try:
        value = int(re.sub('[^0-9\.]', '', string.lower()))
    except Exception:
        value = 0
    return value

In [91]:
jeopardy['clean_value'] = jeopardy['Value'].apply(norm_to_int)
jeopardy['clean_value'].head()

0    200
1    200
2    200
3    200
4    200
Name: clean_value, dtype: int64

In [92]:
jeopardy['AirDate'] = pd.to_datetime(jeopardy['AirDate'])
jeopardy['AirDate'].dtype

dtype('<M8[ns]')

# Finding the answer in the question

In [93]:
def answer_in_q(row):
    split_answer = row['clean_answer'].split(' ')
    split_question = row['clean_question'].split(' ')
    match_count = 0
    if 'the' in split_answer:
        split_answer.remove('the')
    if not len(split_answer):
        return 0
    
    for a in split_answer:
      if a in split_question:
       match_count += 1
    return match_count / len(split_answer)

In [94]:
jeopardy['answer_in_question'] = jeopardy.apply(answer_in_q, axis=1)
jeopardy['answer_in_question'].mean()

0.060493257069335872

The answer appears in the question only 6% of the time. Therefore, using the question in the answer will not be a good strategy to win Jeopardy.

# Are Jeopardy questions repeated?

In [95]:
jeopardy = jeopardy.sort_values(by=['AirDate'], ascending=True)
jeopardy.head()

Unnamed: 0,ShowNumber,AirDate,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value,answer_in_question
19325,10,1984-09-21,Final Jeopardy!,U.S. PRESIDENTS,,"Adventurous 26th president, he was 1st to ride...",Theodore Roosevelt,adventurous 26th president he was 1st to ride ...,theodore roosevelt,0,0.0
19301,10,1984-09-21,Double Jeopardy!,LABOR UNIONS,$200,Notorious labor leader missing since '75,Jimmy Hoffa,notorious labor leader missing since 75,jimmy hoffa,200,0.0
19302,10,1984-09-21,Double Jeopardy!,1789,$200,"Washington proclaimed Nov. 26, 1789 this first...",Thanksgiving,washington proclaimed nov 26 1789 this first n...,thanksgiving,200,0.0
19303,10,1984-09-21,Double Jeopardy!,TOURIST TRAPS,$200,Both Ferde Grofe' & the Colorado River dug thi...,the Grand Canyon,both ferde grofe the colorado river dug this ...,the grand canyon,200,0.0
19304,10,1984-09-21,Double Jeopardy!,LITERATURE,$200,"Depending on the book, he could be a ""Jones"", ...",Tom,depending on the book he could be a jones a sa...,tom,200,0.0


In [96]:
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
    match_count = 0
    split_question = row['clean_question'].split(' ')
    split_question = [word for word in split_question if len(word) >= 6]
    for word in split_question:
        if word in terms_used:
            match_count += 1
        terms_used.add(word)
    if len(split_question) > 0:
        match_count /= len(split_question)
    question_overlap.append(match_count)
    
jeopardy['question_overlap'] = question_overlap
jeopardy['question_overlap'].mean()

0.68940313590732449

The results show that almost 69% of words in questions are recycled. This could be a good strategy for winning Jeopardy, the recycled words will need to be further investigated.

# Finding terms corresponding to high value questions

In [97]:
def find_value(row):
    if row['clean_value'] > 800:
        return 1
    else:
        return 0

In [98]:
jeopardy['high_value'] = jeopardy.apply(find_value, axis=1)
jeopardy['high_value'].head()

19325    0
19301    0
19302    0
19303    0
19304    0
Name: high_value, dtype: int64

In [99]:
def count_values(word):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if word in row['clean_question'].split(' '):
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

In [100]:
observed_expected = []
comparison_terms = list(terms_used)[:5]

for term in comparison_terms:
    observed_expected.append(count_values(term))
    
observed_expected

[(2, 0), (1, 0), (0, 1), (1, 1), (1, 1)]

In [101]:
high_value_count = jeopardy[jeopardy['high_value'] == 1].shape[0]
low_value_count = jeopardy.shape[0] - high_value_count

print(high_value_count)
print(low_value_count)

5734
14265


In [102]:
chi_squared = []

for i in observed_expected:
    total = sum(i)
    total_prop = total / jeopardy.shape[0]
    
    high_v_expected = total_prop * high_value_count
    low_v_expected = total_prop * low_value_count
    
    observed = np.array([i[0], i[1]])
    expected = np.array([high_v_expected, low_v_expected])
    
    chi_squared.append(chisquare(observed, expected))
    
chi_squared

[Power_divergenceResult(statistic=4.9755842343913503, pvalue=0.025707519787911092),
 Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.44487748166127949, pvalue=0.50477764875459963),
 Power_divergenceResult(statistic=0.44487748166127949, pvalue=0.50477764875459963)]

In [104]:
comparison_terms

['linguistic', 'interchangeably', 'flattop', 'franks', 'repelled']

It appears that linguistic has a statistically significant chi squared result, showing that it is possible that this term corresponds to higher value questions. However, this was tested on a very small set of data, where each word was only used a maximum of twice.