In [4]:
import pandas as pd 
jeopardy = pd.read_csv("jeopardy.csv")
jeopardy.head(5)


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [5]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [6]:
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

In [7]:
import re
def normalize_text(str):
    str = str.lower()
    str = re.sub("[^A-Za-z0-9\s]", "", str)
    return str
    
jeopardy["clean_question"] = jeopardy["Question"].apply(normalize_text)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize_text)
    
    

In [8]:
import re
import pandas
def normalize_value(str):
    str = re.sub("[^A-Za-z0-9\s]", "", str)
    try:
        str = int(str)
    except Exception:
        str = 0
    return str

jeopardy["clean_value"] = jeopardy["Value"].apply(normalize_value)
jeopardy["Air Date"] = pandas.to_datetime(jeopardy["Air Date"])

In [9]:
def count_matches(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    match_count = 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
    return match_count / len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(count_matches, axis=1)

jeopardy["answer_in_question"].mean()



0.060493257069335872

The answer only appears in the question about 6% of the time. This isn't a huge number, and we can't just hope that a question will enable us to figure out the answer. We'll have to study.

In [10]:
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    split_question =  [item for item in split_question if len(item) > 5]
    match_count = 0
    for item in split_question:
        if item in terms_used:
            match_count += 1
        else:
            terms_used.add(item)
    if len(split_question) > 0:
        match_count = match_count / len(split_question)
    question_overlap.append(match_count)
jeopardy["question_overlap"] = question_overlap
jeopardy["question_overlap"].mean()

0.69259600573386471

In [11]:
question_overlap = []
terms_used = set()
def reappearance(item):
        if item in terms_used:
            return 1
        else:
            terms_used.add(item)
            return 0
for i, row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        split_question = [q for q in split_question if len(q) > 5]
        def remove_punctuation(word):
            punctuation = ["'", ",", ":", ";", ".", "?"]
            for p in punctuation:
                word = word.replace(p, "")
            return word
        val = 0
        for word in split_question:
            if word in terms_used:
                val += 1
        for word in split_question:
            terms_used.add(word)
        if len(split_question) > 0:
            val /= len(split_question)
        question_overlap.append(val)
    
jeopardy["question_overlap"] = question_overlap
def count_usage(row):
    split_question = row["clean_question"].split(" ")
    for item in split_question:
        if len(item) < 6:
            split_question.remove(item)
    for item in split_question:
        if item in terms_used:
            return 1
        else:
            terms_used.add(item)
    return 0
jeopardy["question_overlap"] = jeopardy.apply(count_usage, axis=1)

jeopardy["question_overlap"].mean()

0.69087373156719623

There is about 70% overlap between terms in new questions and terms in old questions. It does mean that it's worth looking more into the recycling of questions.

In [12]:
def function(row):
    val = 0
    if row["clean_value"] > 800:
        val = 1
    return val

jeopardy["high_value"] = jeopardy.apply(function, axis=1)

def word_usage(word):
    low_count = 0
    high_count = 0
    
    for i, row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        if word in split_question:
            if row["high_value"] == 1:
                high_count += 1
            else: 
                low_count += 1
    return high_count, low_count
                
observed_expected = []   
comparison_terms = list(terms_used)[:5]

for term in comparison_terms:
    observed_expected.append(word_usage(term))
    
observed_expected
                

[(0, 1), (0, 1), (0, 1), (2, 0), (1, 2)]

In [14]:
high_value_count = len(jeopardy[jeopardy['high_value']==1])
low_value_count = len(jeopardy[jeopardy['high_value']==0])
chi_squared = []


In [16]:
from scipy.stats import chisquare
import numpy as np

for item in observed_expected:
    total = sum(item)
    total_prop = total/jeopardy.shape[0]
    
    high_value_exp = total_prop * high_value_count
    medium_value_exp = total_prop * medium_value_count
    low_value_exp = total_prop * low_value_count
    medium_value_exp = 45
    observed = np.array([item[0], item[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

chi_squared

[Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=4.9755842343913503, pvalue=0.025707519787911092),
 Power_divergenceResult(statistic=0.031881167234403623, pvalue=0.85828871632352932)]

In [None]:
def a():
    return 1
def b():
    return 3
def c():
    return 5
val = a()