In [1]:
import pandas as pd

In [2]:
jeopardy = pd.read_csv('jeopardy.csv')
jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [4]:
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 
                    'Category', 'Value', 'Question', 'Answer']

In [5]:
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [11]:
import re

def normalize_text(text):
    text = text.lower()
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    return text

def normalize_value(text):
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    try:
        text = int(text)
    except Exception:
        text = 0
    return text

In [8]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize_text)
jeopardy['clean_question'].head(5)

0    for the last 8 years of his life galileo was u...
1    no 2 1912 olympian football star at carlisle i...
2    the city of yuma in this state has a record av...
3    in 1963 live on the art linkletter show this c...
4    signer of the dec of indep framer of the const...
Name: clean_question, dtype: object

In [9]:
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize_text)
jeopardy['clean_answer'].head(5)

0    copernicus
1    jim thorpe
2       arizona
3     mcdonalds
4    john adams
Name: clean_answer, dtype: object

In [12]:
jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_value)
jeopardy['clean_value'].head(5)

0    200
1    200
2    200
3    200
4    200
Name: clean_value, dtype: int64

In [15]:
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])
jeopardy['Air Date'].head(5)

0   2004-12-31
1   2004-12-31
2   2004-12-31
3   2004-12-31
4   2004-12-31
Name: Air Date, dtype: datetime64[ns]

In [30]:
def answer_in_question(row):
    split_answer = row['clean_answer'].split(' ')
    split_question = row['clean_question'].split(' ')
    match_count = 0
    if 'the' in split_answer:
        split_answer.remove('the')
    if len(split_answer) == 0:
        return 0
    for word in split_answer:
        if word in split_question:
            match_count += 1
    return match_count / len(split_answer)

In [31]:
jeopardy['answer_in_question'] = jeopardy.apply(answer_in_question, axis=1)
jeopardy['answer_in_question'].head(5)

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: answer_in_question, dtype: float64

In [32]:
jeopardy['answer_in_question'].mean()

0.060493257069335872

### Studying Jeopardy Strategy (Answer in Question Route)

The data showed that the .06 words that were in an Answer, showed up in its Question. This is an extremely small. This data leads me to believe that studying for Jeopardy by focusing on using the words in the Question to lead me to its Answer would be a very poor technique.

In [37]:
sorted_jeopardy = jeopardy.sort_values(by=['Air Date'], ascending=False)

In [40]:
question_overlap = []
terms_used = set()
for i, row in sorted_jeopardy.iterrows():
    split_question = row['clean_question'].split(' ')
    split_question = [q for q in split_question if len(q) > 5]
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
    for word in split_question:
            terms_used.add(word)
    if len(split_question) > 0:
            match_count /= len(split_question)
    question_overlap.append(match_count)
jeopardy['question_overlap'] = question_overlap
jeopardy['question_overlap'].mean()

0.69456824383059679

### Studying Jeopardy Strategy (Studying Old Questions)

Based on the data. There is a rather high percentage of words larger than 6 letters being reused throughout show's history. We only have a portion of the total data, so our data maybe slightly skewed if the technique used to generate question's have changed over time. However, if I were studying for jeopardy, I would focus on these 'hot' words. That should help better prepare me for the show. 

In [42]:
def value_assess(df):
    if df['clean_value'] > 800:
        return 1
    else:
        return 0

In [43]:
jeopardy['high_value'] = jeopardy.apply(value_assess, axis=1)

In [44]:
def high_low_count(word):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        split_question = row['clean_question'].split(' ')
        if word in split_question:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

In [47]:
observed_expected = []
terms_used = list(terms_used)
comparison_terms = terms_used[0:5]

In [48]:
for term in comparison_terms:
    observed_expected.append(high_low_count(term))

In [56]:
high_value_count = jeopardy['high_value'][jeopardy['high_value'] == 1].count()
low_value_count = jeopardy['high_value'][jeopardy['high_value'] == 0].count()

In [58]:
from scipy.stats import chisquare
import numpy as np
chi_squared = []
for pair in observed_expected:
    total = sum(pair)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    observed = np.array([pair[0], pair[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))
chi_squared

[Power_divergenceResult(statistic=0.44487748166127949, pvalue=0.50477764875459963),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.42281054506129573, pvalue=0.51553795812945302),
 Power_divergenceResult(statistic=0.013668136264088134, pvalue=0.90693073593149331),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686)]

### Chi-Square Results
All of the p-values were very high. This means that there is no statisically significant results. In other words, the terms that we were looking for didn't lean towards high or low values. Probably running a chi-sqaure test again with the high frequency words would yield better results.

### Potential Next Steps
- Find a better way to eliminate non-informative words than just removing words that are less than 6 characters long. Some ideas:
    - Manually create a list of words to remove, like the, than, etc.
Find a list of stopwords to remove.
Remove words that occur in more than a certain percentage (like 5%) of questions.
Perform the chi-squared test across more terms to see what terms have larger differences. This is hard to do currently because the code is slow, but here are some ideas:
Use the apply method to make the code that calculates frequencies more efficient.
Only select terms that have high frequencies across the dataset, and ignore the others.
Look more into the Category column and see if any interesting analysis can be done with it. Some ideas:
See which categories appear the most often.
Find the probability of each category appearing in each round.
Use the whole Jeopardy dataset (available here) instead of the subset we used in this mission.
Use phrases instead of single words when seeing if there's overlap between questions. Single words don't capture the whole context of the question well
