In [1]:

# Read the Dataset called jeopardy using Pandas
import pandas as pd
jeopardy = pd.read_csv("jeopardy.csv")
jeopardy.head(5)
jeopardy.columns
# Some of the columns names have spaces in front. I will remove these spaces using a list comprehension
jeopardy.columns = [i.strip(" ") for i in jeopardy.columns]

In [2]:
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [3]:
jeopardy.head(10)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams
5,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$200,"In the title of an Aesop fable, this insect sh...",the ant
6,4680,2004-12-31,Jeopardy!,HISTORY,$400,Built in 312 B.C. to link Rome & the South of ...,the Appian Way
7,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$400,"No. 8: 30 steals for the Birmingham Barons; 2,...",Michael Jordan
8,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$400,"In the winter of 1971-72, a record 1,122 inche...",Washington
9,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$400,This housewares store was named for the packag...,Crate & Barrel


In [4]:
# Normalize the text columns(Question and Answer columns)
# With the remove_punctuation function the jeopardy Question column and Jeopardy Answer column will be normalized
# in order to allow comparison and processing of these text. The function take in a string, convert it to lowercase
# and remove all punctuation in the string. Finally, two more columns would be created called clean_question and clean_answer.

import string 
string.punctuation

def remove_punctuation(w):
    w = w.lower()
    for c in string.punctuation:
        w = w.replace(c,"")
    return w 
jeopardy["clean_question"] = jeopardy["Question"].apply(remove_punctuation)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(remove_punctuation)

In [5]:
jeopardy.head(10)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams
5,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$200,"In the title of an Aesop fable, this insect sh...",the ant,in the title of an aesop fable this insect sha...,the ant
6,4680,2004-12-31,Jeopardy!,HISTORY,$400,Built in 312 B.C. to link Rome & the South of ...,the Appian Way,built in 312 bc to link rome the south of ita...,the appian way
7,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$400,"No. 8: 30 steals for the Birmingham Barons; 2,...",Michael Jordan,no 8 30 steals for the birmingham barons 2306 ...,michael jordan
8,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$400,"In the winter of 1971-72, a record 1,122 inche...",Washington,in the winter of 197172 a record 1122 inches o...,washington
9,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$400,This housewares store was named for the packag...,Crate & Barrel,this housewares store was named for the packag...,crate barrel


In [6]:
# In this step I would normalize the value column. I create the normalize function that take a string an convert the 
# string to an integer. The clean_value column is created with an integer value. 

def normalize(v):
    for c in string.punctuation:
        v = v.replace(c,"")
    try: 
        v = int(v)
    except Exception:
        v = 0
    return v 
jeopardy["clean_value"] = jeopardy["Value"].apply(normalize)


In [7]:
# Convert the Air Date column to a datetime column
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])
jeopardy.head(10)


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200
5,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$200,"In the title of an Aesop fable, this insect sh...",the ant,in the title of an aesop fable this insect sha...,the ant,200
6,4680,2004-12-31,Jeopardy!,HISTORY,$400,Built in 312 B.C. to link Rome & the South of ...,the Appian Way,built in 312 bc to link rome the south of ita...,the appian way,400
7,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$400,"No. 8: 30 steals for the Birmingham Barons; 2,...",Michael Jordan,no 8 30 steals for the birmingham barons 2306 ...,michael jordan,400
8,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$400,"In the winter of 1971-72, a record 1,122 inche...",Washington,in the winter of 197172 a record 1122 inches o...,washington,400
9,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$400,This housewares store was named for the packag...,Crate & Barrel,this housewares store was named for the packag...,crate barrel,400


In [8]:
# Generate a function that return the proportion of times that  words in the correct answer are also in the questions
def jeopardy_row(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    match_count = 0
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0 
    for item in split_answer:
        if item in split_question:
            match_count +=1
    matches = match_count / len(split_answer)
    return matches 
jeopardy["answer_in_question"] = jeopardy.apply(jeopardy_row,axis=1)
jeopardy["answer_in_question"].mean()


0.060352773854699004

We can interpret the answer_in_question variable such as the proportion of the words in each answer that are present in the correspondent question too. If the value is 0.5 means that the half of words in the answer  are in the questions too. 

The mean of the column answer_in_question could be intrepeted as the average percent of words in the answer column, that could be find in the question column. The value of 6% imply that there is a short posibility that anyone deduce the answer from the question. So we conclude that the probability of guess an answer with the help of the question words, is very little. Then is not worth to study the past questions to increment the probability of correct answers.   

In [9]:
jeopardy[0:20]

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value,answer_in_question
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200,0.0
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200,0.0
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200,0.0
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200,0.0
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200,0.0
5,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$200,"In the title of an Aesop fable, this insect sh...",the ant,in the title of an aesop fable this insect sha...,the ant,200,0.0
6,4680,2004-12-31,Jeopardy!,HISTORY,$400,Built in 312 B.C. to link Rome & the South of ...,the Appian Way,built in 312 bc to link rome the south of ita...,the appian way,400,0.0
7,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$400,"No. 8: 30 steals for the Birmingham Barons; 2,...",Michael Jordan,no 8 30 steals for the birmingham barons 2306 ...,michael jordan,400,0.0
8,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$400,"In the winter of 1971-72, a record 1,122 inche...",Washington,in the winter of 197172 a record 1122 inches o...,washington,400,0.0
9,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$400,This housewares store was named for the packag...,Crate & Barrel,this housewares store was named for the packag...,crate barrel,400,0.333333


In [10]:
# In this step, I investigate how often new questions are repeated from older ones to figure out if studying past 
# questions could help to guess the answers. The question_overlap column has the percent of words in each column used before
# The following code will iterate over the rows of the jeopardy dataframe, and will add two inner loops to iterate over the 
# split_question column. The first inner loop will count if the word in clean_question have been used before and increment 
# match_count by one if this is the case. The second inner loop will add the words in question column to the terms_used set and verify if the word


question_overlap = []
terms_used = set()
for index, row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    # Use list comprehension to remove words with less than 5 characters in the split question list
    split_question = [word for word in split_question if len(word) > 5] 
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1 
    for word in split_question:
        terms_used.add(word)  
    # It divides match_count with len(split_question) and assign the result to match_count
    if len(split_question) > 0:
            match_count /= len(split_question)
    question_overlap.append(match_count)

jeopardy["question_overlap"] = question_overlap
jeopardy["question_overlap"].mean()



0.6902117143393427

The mean of the question_overlap column tell that the proportion of words in question column that are repetead from past questions is almost 70%. This imply that there are a high proportion of words in question column that repeated over the time. We can conclude that study the past question could give a clue for new question because many of the words are repeated.

However, to find an exact match along the words in the question column or to have a high probability of a correct answer if the question is know, it will be convenient to take rows with a high proportion of question_overlap value, such us values that are higher than the mean. 

In [11]:
# If someone is interested in study quesitions of high value to earn more money, it will be convenient to create a new 
# column called  high value that takes the value of 1 if the value is higher than 800 and 0 otherwise.

def data_frame_row(row):
    if row["clean_value"] > 800:
        value = 1
    else:
        value = 0
    return value
jeopardy["high_value"] = jeopardy.apply(data_frame_row,axis=1)


In [12]:
# Secondly, I will create a function that takes a word as the parameter and counts how many times the word in 
# clean_question column occurs in low value questions and high value questios.

def function_word(word):
    low_count = 0
    high_count = 0
    for index, row in jeopardy.iterrows():
        if word in row["clean_question"].split(" "): # no me habia dado cuenta de esto 
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count 

In [13]:
# The following code will yield a list of tuples with the number of times that the first 5 words in terms_used are 
# in high value questions and low value questions. The first item of the tupe is high_value and the second is low_value

observed_expected = []
comparison_terms= list(terms_used)[:5]

for item in comparison_terms:
    observed_expected.append(function_word(item))


print(comparison_terms)
print(observed_expected)

['replies', 'foodconducting', 'griesea', 'prepeanuts', 'ensure']
[(0, 1), (1, 0), (0, 1), (0, 1), (0, 1)]


The cell above return a list of tuples with the numbers of times the word in the list observed_expected are in high_value columns and low_value columns. With the number of times that each word occur in high value and low value questions, the total times that the word occur and the number of high value and low value questions, is posible to construct a chi_squared_test to check if there are any significative difference for a word respect the number of times each word is observed and the number of times the word is expected.

In [14]:
# To perform a Chi-Squared Test, we must get the observed and expected counts of the item in the high_value and low_value
# questions. The Chi-Squared Test allow to find the words with the biggest differences in usage between high and low 
# value questions. This would be the case if the value is statistically significant(large values of the Chi-squared test). 


import numpy as np
from scipy.stats import chisquare

# Applying The Chi-Squared Test 
# Find the number of rows in jeopardy where high_value=1 and high_value=0 
high_value_count = len(jeopardy[jeopardy["high_value"] == 1])
low_value_count = len(jeopardy[jeopardy["high_value"] == 0])
chi_squared = []
for item in observed_expected:
    total =  sum(item)
    total_prop = total / jeopardy.shape[0] # Is the percentage of question the word occurs in
    
    # Multiply total_prop by high_value_count to get expected term count for high value rows
    expected_high_values = total_prop * high_value_count
    expected_low_values = total_prop * low_value_count
    
    observed_values = np.array([item[0] , item[1]])
    expected_values = np.array([expected_high_values , expected_low_values]) 
    chi_squared.append(chisquare(observed_values,expected_values))
chi_squared


[Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686)]

The Chi_squared test show that any of the values are statistically significative due to the p-values are greater than 0.05 in all the cases. The difference between observed and expected values don´t show a difference statistically significant for any of the words. This mean that the there are the same probability that any of the words belong to high values or low values at random than in the sample that we use. 

For this sample of words there are not evidence that each word belong to high or low values questions and then we can´t take this information into account to study the questions with high value. 

In [15]:
# In this step I will find the most common words in the clean_question column to figure out which words are 
# the most repeated over the years. Before count these words, I join all of the words in the clean_question column in a 
# string and split them by a space. Then, I eliminate from this list the stop_words and any word that is less than 6 characters. 
# Finally I define the high_frequency_terms list that hold the most common 100 words in the clean_question column. This list 
# will allow to perform a chi_squared_test with the high frequency words. 

from stop_words import get_stop_words
stop_words = get_stop_words("english")
from stop_words import safe_get_stop_words
import collections

clean_question = jeopardy["clean_question"].tolist()

string_questions = " "
for item in clean_question:
    string_questions +=  "".join(item) + " "

split_words_questions = string_questions.split()

# Remove all the stopwords in the split_words_questions list and words that doesn´t add much value in the analysis

split_words_questions = [word for word in split_words_questions if word not in stop_words and len(word) > 5]

count_words_questions = collections.Counter(split_words_questions)
high_frequency_terms = count_words_questions.most_common(100)
high_frequency_terms

[('called', 521),
 ('country', 476),
 ('played', 297),
 ('became', 287),
 ('president', 258),
 ('capital', 257),
 ('american', 256),
 ('famous', 246),
 ('targetblankherea', 244),
 ('french', 243),
 ('island', 216),
 ('people', 184),
 ('national', 183),
 ('largest', 179),
 ('little', 178),
 ('around', 169),
 ('british', 166),
 ('author', 164),
 ('meaning', 162),
 ('century', 159),
 ('family', 155),
 ('musical', 153),
 ('company', 151),
 ('series', 148),
 ('states', 142),
 ('character', 141),
 ('founded', 141),
 ('reports', 141),
 ('targetblankthisa', 140),
 ('include', 138),
 ('million', 129),
 ('number', 125),
 ('school', 120),
 ('popular', 119),
 ('father', 114),
 ('classic', 103),
 ('italian', 99),
 ('german', 99),
 ('george', 98),
 ('former', 98),
 ('another', 97),
 ('america', 97),
 ('leader', 97),
 ('william', 96),
 ('person', 95),
 ('english', 95),
 ('center', 94),
 ('museum', 94),
 ('battle', 93),
 ('countrys', 93),
 ('created', 92),
 ('university', 92),
 ('published', 92),
 ('v

In [16]:
# To perform the Chi_Squared_test that will yield any significant difference between the usage of the word in high and 
# low value questions is necessary to extract the first item of the high_frequency_terms list

observed_high_frequency_words = []
high_frequency_words = [i[0] for i in high_frequency_terms]
high_frequency_words

for word in high_frequency_words :
    observed_high_frequency_words.append(function_word(word))
    


print(observed_high_frequency_words)

[(168, 346), (141, 332), (77, 212), (79, 203), (68, 181), (61, 186), (77, 173), (78, 168), (97, 146), (108, 133), (73, 134), (57, 124), (55, 124), (42, 134), (55, 108), (50, 119), (54, 110), (52, 108), (58, 102), (54, 102), (46, 107), (42, 110), (43, 108), (34, 112), (36, 103), (40, 100), (52, 89), (62, 79), (65, 73), (43, 95), (41, 79), (30, 85), (36, 79), (22, 97), (26, 84), (31, 72), (45, 54), (44, 54), (23, 74), (22, 76), (35, 62), (25, 68), (34, 61), (25, 69), (28, 67), (37, 57), (27, 63), (30, 60), (31, 57), (14, 79), (18, 72), (29, 58), (27, 65), (17, 73), (20, 64), (31, 54), (27, 58), (23, 60), (19, 63), (24, 56), (23, 57), (33, 46), (25, 55), (21, 57), (23, 55), (23, 54), (21, 55), (22, 55), (21, 54), (25, 49), (18, 58), (27, 47), (23, 52), (19, 56), (25, 48), (23, 48), (22, 53), (18, 56), (24, 49), (13, 61), (40, 33), (15, 56), (34, 38), (23, 49), (25, 47), (21, 49), (19, 52), (24, 46), (16, 53), (19, 51), (18, 52), (23, 46), (21, 48), (14, 54), (35, 33), (22, 44), (13, 53), 

The above results show the times that the most common words in clean_question column can be observed in high value questions and low value questions. Is the frequencies that these words are in these columns. 
The next step is to calculate the Chi_squared test for these words to observe if there are any significative difference between the observed values and the expected values of the word. 

In [17]:
chi_squared = []
for item in observed_high_frequency_words:
    total = item[0] + item[1] 
    total_prop = total / jeopardy.shape[0]
    # Multiply total_prop by high_value_count to get expected term count for high value rows
    expected_high_values = total_prop * high_value_count
    expected_low_values = total_prop * low_value_count
    observed = np.array([item[0],item[1]])
    expected = np.array([expected_high_values,expected_low_values])
    chi_squared.append(chisquare(observed,expected))
chi_squared   

[Power_divergenceResult(statistic=4.0483050635345768, pvalue=0.044215717944225866),
 Power_divergenceResult(statistic=0.29967829483482744, pvalue=0.58408417131143131),
 Power_divergenceResult(statistic=0.58109902830391114, pvalue=0.4458818590919339),
 Power_divergenceResult(statistic=0.059565707308401619, pvalue=0.80718367899593324),
 Power_divergenceResult(statistic=0.22592591114717697, pvalue=0.63456129826261032),
 Power_divergenceResult(statistic=1.9084254764809114, pvalue=0.16713826420471323),
 Power_divergenceResult(statistic=0.55386193833867003, pvalue=0.45674398774097136),
 Power_divergenceResult(statistic=1.108644756518943, pvalue=0.29237673820106636),
 Power_divergenceResult(statistic=15.028296538003147, pvalue=0.00010591119029347305),
 Power_divergenceResult(statistic=30.705095602111221, pvalue=3.0037519828539177e-08),
 Power_divergenceResult(statistic=4.4013964134786523, pvalue=0.035909515913188243),
 Power_divergenceResult(statistic=0.70396304318075154, pvalue=0.40145524983

For the majority of the words, there are not a significant difference in usage between high and low value questions. This means that there is not a real difference in the usage of the word, namely the behaviour of the word in high value observed questions not differ of the expected behaviour of the word in high value questions.  

In [18]:
# In this step, I perform a deeper analysis of the Jeopardy dataframe by take the Category column to figure out if 
# there are any category with more probability than others. First I will calculate the probability of each category in 
# entire jeopardy dataframe, and then I will analyze this probability for each round. 

import operator

categories = jeopardy["Category"].unique()
category_prob = dict()
Round = jeopardy["Round"].unique()



for category in categories:
    unique_category = jeopardy[jeopardy["Category"] == category]
    length = len(unique_category)
    category_prob[category] = length / jeopardy.shape[0]

sorted_category_prob = sorted(category_prob.items(), key=operator.itemgetter(1),reverse=True)
sorted_category_prob


[('TELEVISION', 0.002550127506375319),
 ('U.S. GEOGRAPHY', 0.0025001250062503125),
 ('LITERATURE', 0.002250112505625281),
 ('BEFORE & AFTER', 0.00200010000500025),
 ('HISTORY', 0.00200010000500025),
 ('AMERICAN HISTORY', 0.00200010000500025),
 ('AUTHORS', 0.0019500975048752439),
 ('WORD ORIGINS', 0.0019000950047502376),
 ('WORLD CAPITALS', 0.0018500925046252312),
 ('SPORTS', 0.001800090004500225),
 ('BODIES OF WATER', 0.001800090004500225),
 ('SCIENCE & NATURE', 0.0017500875043752187),
 ('SCIENCE', 0.0017500875043752187),
 ('RHYME TIME', 0.0017500875043752187),
 ('MAGAZINES', 0.0017500875043752187),
 ('WORLD GEOGRAPHY', 0.0016500825041252062),
 ('WORLD HISTORY', 0.0016000800040002),
 ('ANNUAL EVENTS', 0.0016000800040002),
 ('HISTORIC NAMES', 0.0016000800040002),
 ('FICTIONAL CHARACTERS', 0.0015500775038751937),
 ('IN THE DICTIONARY', 0.0015500775038751937),
 ('BIRDS', 0.0015500775038751937),
 ('MEDICINE', 0.0015000750037501875),
 ('OPERA', 0.0015000750037501875),
 ('ISLANDS', 0.0015000

The above dictionary show that for the entire jeopardy dataframe, the category that occur most often is TELEVISION with 0.255% of the total , followed by US GEOGRAPHY and LITERATURE with 0.250% and 0.225% respectively.

In [19]:
# Generate the category_prob column in the jeopardy dataframe, with the probability of the category in the entire dataset 
def get_category_prob(df):
    return category_prob[df["Category"]]
jeopardy["category_prob"] = jeopardy.apply(get_category_prob,axis=1)
jeopardy[0:10]

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value,answer_in_question,question_overlap,high_value,category_prob
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus,200,0.0,0.0,0,0.002
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe,200,0.0,0.0,0,0.00025
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200,0.0,0.0,0,0.0002
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds,200,0.0,0.0,0,0.00025
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams,200,0.0,0.0,0,0.00025
5,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$200,"In the title of an Aesop fable, this insect sh...",the ant,in the title of an aesop fable this insect sha...,the ant,200,0.0,0.0,0,0.00025
6,4680,2004-12-31,Jeopardy!,HISTORY,$400,Built in 312 B.C. to link Rome & the South of ...,the Appian Way,built in 312 bc to link rome the south of ita...,the appian way,400,0.0,0.0,0,0.002
7,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$400,"No. 8: 30 steals for the Birmingham Barons; 2,...",Michael Jordan,no 8 30 steals for the birmingham barons 2306 ...,michael jordan,400,0.0,0.0,0,0.00025
8,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$400,"In the winter of 1971-72, a record 1,122 inche...",Washington,in the winter of 197172 a record 1122 inches o...,washington,400,0.0,0.125,0,0.0002
9,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$400,This housewares store was named for the packag...,Crate & Barrel,this housewares store was named for the packag...,crate barrel,400,0.333333,0.0,0,0.00025


In [20]:
# In this step I would calculate the probability of each category in each round of the contest. There are 4 rounds and, 
# the probability of each category is calculated for each round. To reach this, I combine two for loops over the round and 
# the category in each round and then make a dictionary with a tuple as a key to show the probability of the category
# in each round. 


categories = jeopardy["Category"].unique()
category_prob = dict()

Round = jeopardy["Round"].unique()


for item in Round:
    unique_round = jeopardy[jeopardy["Round"] == item]
    for category in categories:
        unique_category = unique_round[unique_round["Category"] == category]
        length = len(unique_category)
        category_prob[(item,category)] = length / unique_round.shape[0]

sorted_category_prob = sorted(category_prob.items(), key=operator.itemgetter(1),reverse=True)
sorted_category_prob
          

[(('Tiebreaker', "CHILD'S PLAY"), 1.0),
 (('Final Jeopardy!', 'WORD ORIGINS'), 0.023880597014925373),
 (('Final Jeopardy!', 'U.S. PRESIDENTS'), 0.014925373134328358),
 (('Final Jeopardy!', 'AUTHORS'), 0.011940298507462687),
 (('Final Jeopardy!', 'FAMOUS NAMES'), 0.011940298507462687),
 (('Final Jeopardy!', 'WORLD LEADERS'), 0.008955223880597015),
 (('Final Jeopardy!', 'AMERICAN LITERATURE'), 0.008955223880597015),
 (('Final Jeopardy!', 'ARTISTS'), 0.008955223880597015),
 (('Final Jeopardy!', 'U.S. STATES'), 0.008955223880597015),
 (('Final Jeopardy!', 'WORLD CITIES'), 0.008955223880597015),
 (('Final Jeopardy!', 'THE 50 STATES'), 0.008955223880597015),
 (('Final Jeopardy!', 'FAMOUS WOMEN'), 0.008955223880597015),
 (('Final Jeopardy!', 'SPACE EXPLORATION'), 0.008955223880597015),
 (('Final Jeopardy!', 'POETS'), 0.008955223880597015),
 (('Final Jeopardy!', 'ASIA'), 0.008955223880597015),
 (('Final Jeopardy!', 'SCIENTISTS'), 0.008955223880597015),
 (('Final Jeopardy!', 'WORLD GEOGRAPHY'),

From this dictionary that show the proportion of each category per round, is possible to observe that in the Jeopardy! round the category Television is the highest with 0.35% of the total. The second category in this round is SPORTS with 0.26% of the total categories. 

In the Double Jeopardy! round, the category LITERATURE is the most asked within this round with 0.36%, followed by the U.S. GEOGRAPHY category with 0.29% of the total categories.

Finally in the Final Jeopardy! round the category with more questions asked is WORD ORIGINS with 2.4% followed by US PRESIDENTS and FAMOUS NAMES with 1.5% and 1.2% respectively. 

It seems that the number of times that each category appear in single questions in a round respect the number of questions in each round is little to concentrate in a few categories to study and get more chances of win. 
In conclusion the probability that each category appear in each round is too short.

In [21]:
import collections
from collections import Counter
clean_question = jeopardy["clean_question"].tolist()

string_questions = " "
for item in clean_question:
    string_questions +=  "".join(item) + " "
split_questions = string_questions.split(" ")
split_questions

['',
 'for',
 'the',
 'last',
 '8',
 'years',
 'of',
 'his',
 'life',
 'galileo',
 'was',
 'under',
 'house',
 'arrest',
 'for',
 'espousing',
 'this',
 'mans',
 'theory',
 'no',
 '2',
 '1912',
 'olympian',
 'football',
 'star',
 'at',
 'carlisle',
 'indian',
 'school',
 '6',
 'mlb',
 'seasons',
 'with',
 'the',
 'reds',
 'giants',
 '',
 'braves',
 'the',
 'city',
 'of',
 'yuma',
 'in',
 'this',
 'state',
 'has',
 'a',
 'record',
 'average',
 'of',
 '4055',
 'hours',
 'of',
 'sunshine',
 'each',
 'year',
 'in',
 '1963',
 'live',
 'on',
 'the',
 'art',
 'linkletter',
 'show',
 'this',
 'company',
 'served',
 'its',
 'billionth',
 'burger',
 'signer',
 'of',
 'the',
 'dec',
 'of',
 'indep',
 'framer',
 'of',
 'the',
 'constitution',
 'of',
 'mass',
 'second',
 'president',
 'of',
 'the',
 'united',
 'states',
 'in',
 'the',
 'title',
 'of',
 'an',
 'aesop',
 'fable',
 'this',
 'insect',
 'shared',
 'billing',
 'with',
 'a',
 'grasshopper',
 'built',
 'in',
 '312',
 'bc',
 'to',
 'link',


In [22]:
# The function phrases will group the words in split_questions list into 5 words phrases. With this function, the 
# split_questions list would be transformed to a list in which each element is a tuple of 5 items.
def phrases(words):
    phrase = []
    for word in words:
        phrase.append(word)
        if len(phrase) > 5:
            phrase.remove(phrase[0])
        if len(phrase) == 5:
            yield tuple(phrase)
group_phrases = list(phrases(split_questions))
group_phrases

[('', 'for', 'the', 'last', '8'),
 ('for', 'the', 'last', '8', 'years'),
 ('the', 'last', '8', 'years', 'of'),
 ('last', '8', 'years', 'of', 'his'),
 ('8', 'years', 'of', 'his', 'life'),
 ('years', 'of', 'his', 'life', 'galileo'),
 ('of', 'his', 'life', 'galileo', 'was'),
 ('his', 'life', 'galileo', 'was', 'under'),
 ('life', 'galileo', 'was', 'under', 'house'),
 ('galileo', 'was', 'under', 'house', 'arrest'),
 ('was', 'under', 'house', 'arrest', 'for'),
 ('under', 'house', 'arrest', 'for', 'espousing'),
 ('house', 'arrest', 'for', 'espousing', 'this'),
 ('arrest', 'for', 'espousing', 'this', 'mans'),
 ('for', 'espousing', 'this', 'mans', 'theory'),
 ('espousing', 'this', 'mans', 'theory', 'no'),
 ('this', 'mans', 'theory', 'no', '2'),
 ('mans', 'theory', 'no', '2', '1912'),
 ('theory', 'no', '2', '1912', 'olympian'),
 ('no', '2', '1912', 'olympian', 'football'),
 ('2', '1912', 'olympian', 'football', 'star'),
 ('1912', 'olympian', 'football', 'star', 'at'),
 ('olympian', 'football', '

In [23]:
# Finally, with this for loop, all the repeated phrase in the instance phrases(split_questions) will increment the value 
# of the key by 1 in the counts dictionary. 
import collections 

counts = collections.defaultdict(int)
for phrase in phrases(split_questions):
    counts[phrase] +=1
diction = collections.Counter(counts)
diction.most_common(50)

[(('of', 'the', 'clue', 'crew', 'reports'), 109),
 (('the', 'clue', 'crew', 'reports', 'from'), 107),
 (('targetblanksarah', 'of', 'the', 'clue', 'crew'), 69),
 (('targetblankjimmy', 'of', 'the', 'clue', 'crew'), 51),
 (('clue', 'crew', 'reports', 'from', 'the'), 47),
 (('of', 'the', 'clue', 'crew', 'shows'), 46),
 (('', '', '', '', ''), 41),
 (('targetblankkelly', 'of', 'the', 'clue', 'crew'), 41),
 (('targetblankjon', 'of', 'the', 'clue', 'crew'), 39),
 (('targetblankcheryl', 'of', 'the', 'clue', 'crew'), 33),
 (('the', 'clue', 'crew', 'shows', 'a'), 25),
 (('the', 'clue', 'crew', 'delivers', 'the'), 21),
 (('of', 'the', 'clue', 'crew', 'delivers'), 21),
 (('clue', 'crew', 'delivers', 'the', 'clue'), 20),
 (('crew', 'delivers', 'the', 'clue', 'from'), 19),
 (('from', 'the', 'latin', 'for', 'to'), 19),
 (('shares', 'its', 'name', 'with', 'a'), 18),
 (('sarah', 'of', 'the', 'clue', 'crew'), 16),
 (('clue', 'crew', 'shows', 'a', 'map'), 15),
 (('crew', 'shows', 'a', 'map', 'on'), 15),
 

With the combination of the tasks described above, is possible to observe the number of times that a phrase occurr in the split_question(clean_question) list of values. In the above cell, I reported the most common 50 phrases in the question column. 