In [1]:
import pandas as pd
jeopardy = pd.read_csv("jeopardy.csv")
def print_title(title, to_print):
    print("="*len(title))
    print(title)
    print("="*len(title))
    print(to_print)
    
print_title("display head", jeopardy.head())

display head
   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  


Fix Column Names
====

In [2]:
print_title("display columns", jeopardy.columns)
cols = []
for col in list(jeopardy.columns):
    col = col.strip()
    cols.append(col)
jeopardy.columns = cols
print_title("display fixed columns", jeopardy.columns)

display columns
Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')
display fixed columns
Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')


Normalize Columns
==

In [3]:
#question & answer cols
from string import punctuation
def norm_string_col(strg):
    strg = strg.lower()
    strg = ''.join(s for s in strg if s not in punctuation)
    return strg
jeopardy["clean_question"] = jeopardy["Question"].apply(norm_string_col)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(norm_string_col)

#value col
def norm_value(strg):
    try:
        strg = norm_string_col(strg)
        strg = int(strg)
        return strg
    except ValueError:
        return 0
    
jeopardy["clean_value"] = jeopardy["Value"].apply(norm_value)

#air date col
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

#print_title("display head", jeopardy.head())

1. How often the answer is deducible from the question?
==

In [4]:
def percent_match(row):
    match_count = 0
    split_answer = row["clean_answer"].split(" ")
    if "the" in split_answer:
        split_answer.remove('the')
    split_question = row["clean_question"].split(" ")
    if len(split_answer) == 0:
        return 0
    else:
        for item in split_answer:
            if item in split_question:
                match_count += 1
        return match_count/len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(percent_match, axis=1)
avg_answer_in_question = jeopardy["answer_in_question"].mean()
print_title("avg percentage of answer in question", avg_answer_in_question)

avg percentage of answer in question
0.0603527738547


**Findings: The answer is rarely deducible from the question.**

2. How often new questions are repeats of older questions?
==
Investigating how often complex words (> 6 characters) reoccur.

In [5]:
#sort by air date
jeopardy = jeopardy.sort_values(["Air Date"], ascending=1)
#loop over clean_question
question_overlap = []
terms_used = []
col_split_question = []
for i, row in jeopardy.iterrows():
    match_count = 0
    split_question = row["clean_question"].split(" ")
    split_question = [word for word in split_question if len(word)>5]
    col_split_question.append(split_question)
    for word in split_question:
        if word in terms_used:
            match_count += 1
        else:
            terms_used.append(word)
    if len(split_question) > 0:
        question_overlap.append(match_count/len(split_question))
    else:
        question_overlap.append(0)
jeopardy["question_overlap"] = question_overlap
jeopardy["split_question"] = col_split_question
#print(question_overlap)
avg_question_overlap = jeopardy["question_overlap"].mean()
print_title("avg percentage of overlaped questions", avg_question_overlap)
#print(jeopardy.head())

avg percentage of overlaped questions
0.688905531662


3. Low Values vs High Value Questions
==

In [17]:
high_value_count, low_value_count = 0, 0

for i, row in jeopardy.iterrows():
    if row["clean_value"] < 800:
        high_value_count += 1
    else:
        low_value_count += 1

def occurance(word):
    word_count = 0
    word_count_low = 0
    word_count_high = 0
    for i, row in jeopardy.iterrows():
    # split questions into two catogories by value
        if word in row["split_question"]:
            if row["clean_value"] < 800:
                #Find the number of low value questions the word occurs in
                word_count_low += 1
            else:
                #Find the number of high value questions the word occurs in
                word_count_high += 1
            word_count += 1
    #Find the percentage of questions the word occurs in
    per = 1/jeopardy.shape[0]
    #Based on the percentage of questions the word occurs in, find expected counts
    word_count_low_exp = per*low_value_count
    word_count_high_exp = per*high_value_count
    return word_count_low, word_count_high, word_count_low_exp, word_count_high_exp 

observed_expected = []
comparison_terms = list(terms_used)[:5]
for term in comparison_terms:
    count = occurance(term)
    observed_expected.append(count)   
#Compute the chi squared value based on the expected counts and the observed counts for high and low value questions
from scipy.stats import chisquare
chi_squared = []
for data in observed_expected:
    chi_squared.append(chisquare(data[:2], f_exp=data[2:4]))
print_title("chi_squared", chi_squared)

chi_squared
[Power_divergenceResult(statistic=5.9523452957768193, pvalue=0.014697719050307324), Power_divergenceResult(statistic=67851.571860497963, pvalue=0.0), Power_divergenceResult(statistic=48.376061510213447, pvalue=3.5183706698949539e-12), Power_divergenceResult(statistic=32.809381183107277, pvalue=1.0165308487719387e-08), Power_divergenceResult(statistic=168.67061474723423, pvalue=1.4438564237211225e-38)]
