In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chisquare, chi2_contingency
import string

In [3]:
jeopardy = pd.read_csv("jeopardy.csv")
print(jeopardy.head())
print(jeopardy.columns)

   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  
Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype

In [4]:
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value',
       'Question', 'Answer']

In [5]:
def normalize(s):
    s = s.lower()
    for punct in string.punctuation:
        s = s.replace(punct,"")
    return s

jeopardy["clean_question"] = jeopardy["Question"].apply(normalize)
jeopardy["clean_answer"] = jeopardy["Answer"].astype(str,copy=False).apply(normalize)
print(jeopardy["clean_question"].head())
print(jeopardy["clean_answer"].head())

0    for the last 8 years of his life galileo was u...
1    no 2 1912 olympian football star at carlisle i...
2    the city of yuma in this state has a record av...
3    in 1963 live on the art linkletter show this c...
4    signer of the dec of indep framer of the const...
Name: clean_question, dtype: object
0    copernicus
1    jim thorpe
2       arizona
3     mcdonalds
4    john adams
Name: clean_answer, dtype: object


In [6]:
def normalize_dollar(s):
    s=s.replace("$","")
    s=s.replace(",",".")
    if s=="None":
        return 0
    return float(s)

jeopardy["clean_value"] = jeopardy["Value"].apply(normalize_dollar)
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

In [7]:
jeopardy["clean_value"].head(21)

0     200.0
1     200.0
2     200.0
3     200.0
4     200.0
5     200.0
6     400.0
7     400.0
8     400.0
9     400.0
10    400.0
11    400.0
12    600.0
13    600.0
14    600.0
15    600.0
16    600.0
17    600.0
18    800.0
19    800.0
20    800.0
Name: clean_value, dtype: float64

In [8]:
def prob_answer_in_question(row_jeopardy):
# takes an answer, then divides it into words and finally verifies
# the proportion of words in the answer also present in his
# corresponding question
   
    split_answer = row_jeopardy["clean_answer"].split(" ")
    split_question = row_jeopardy["clean_question"].split(" ")
    
    match_count = 0
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer)==0: return 0
    for answer in split_answer:
        if answer in split_question:
            match_count += 1
    return match_count / len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(prob_answer_in_question,axis=1)     

In [9]:
jeopardy["answer_in_question"].mean()

0.060352773854699004

######  First analysis
The function created for the analysis takes each row of the data set "hazard" and count how many words of the answer appear in his question. then this amount is divided by the number of words in the answer:
- This will tell us row by row what percentage of the answer appears in the question, 100% is that all the words of the answer appear in his question, 33.333% that only 1 of the three words of the answer appears in his question, etc. .

Taking the average of all these values ​​generates a value that we can use to estimate the typical percentage of words in the answers that appear in his question.

In [10]:
question_overlap = []
terms_used = set()
jeopardy.sort_values(by="Air Date", inplace=True)

for i,row in jeopardy.iterrows():
    match_count = 0
    split_question = row["clean_question"].split(" ")
    split_question = [q for q in split_question if len(q) > 5]
    for word in split_question:
        if word in terms_used:
            match_count += 1
    for word in split_question:
        terms_used.add(word)
    if len(split_question)>0:
        question_overlap.append(match_count / len(split_question))
    else: question_overlap.append(0)

jeopardy["question_overlap"] = question_overlap
jeopardy["question_overlap"].mean()

0.6871242880966756

##### Conclusions: There is almost a 70% of isolated words that appears in old questions (for old questions we understand questions that occurred at most the immediate previous date). This value doesn't mean much since there are isolated words and not complete phrases . We need to investigate further.



In [11]:
def over_800USD(row):
    if float(row["clean_value"])>800:
        value=1
    else: value=0
    return value

jeopardy["high_value"] = jeopardy.apply(over_800USD,axis=1)

In [12]:
def count_words(word):
    low_count=0
    high_count=0
    for i,row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        if word in split_question:
            if row["high_value"]==1:
                high_count += 1
            else: low_count += 1
    return high_count, low_count

observed_expected = []

comparison_terms = list(terms_used)[:25]

for term in comparison_terms:
    result = count_words(term)
    observed_expected.append(result)

In [13]:
observed_expected

[(0, 3),
 (1, 1),
 (0, 1),
 (0, 1),
 (1, 3),
 (1, 0),
 (0, 1),
 (0, 1),
 (0, 1),
 (1, 1),
 (0, 3),
 (0, 1),
 (0, 2),
 (0, 1),
 (0, 2),
 (0, 1),
 (0, 1),
 (0, 1),
 (1, 0),
 (4, 2),
 (1, 0),
 (1, 1),
 (0, 1),
 (0, 1),
 (2, 2)]

In [14]:
high_value_count = jeopardy[jeopardy["high_value"]==1]["high_value"].count()
low_value_count = jeopardy[jeopardy["high_value"]==0]["high_value"].count()

In [15]:
chi_squared = []
for each in observed_expected:
    total = each[0]+each[1]
    total_prop = total/jeopardy.shape[0]
    expected_high_value_counts = total_prop*high_value_count
    expected_low_value_counts = total_prop*low_value_count
    expected = np.array([expected_high_value_counts,expected_low_value_counts])
    observed = np.array([each[0],each[1]])
    chisquare_value, pvalue = chisquare(observed, expected)
    chi_squared.append((chisquare_value,pvalue))

print(chi_squared)


[(0.9926132960670793, 0.31910449982424866), (0.6765980594008285, 0.4107606373026975), (0.3308710986890265, 0.565146603267378), (0.3308710986890265, 0.565146603267378), (4.122707846712507e-05, 0.9948769527982859), (3.022325020112631, 0.08212564786568953), (0.3308710986890265, 0.565146603267378), (0.3308710986890265, 0.565146603267378), (0.3308710986890265, 0.565146603267378), (0.6765980594008285, 0.4107606373026975), (0.9926132960670793, 0.31910449982424866), (0.3308710986890265, 0.565146603267378), (0.661742197378053, 0.4159455550913673), (0.3308710986890265, 0.565146603267378), (0.661742197378053, 0.4159455550913673), (0.3308710986890265, 0.565146603267378), (0.3308710986890265, 0.565146603267378), (0.3308710986890265, 0.565146603267378), (3.022325020112631, 0.08212564786568953), (5.6134474527597, 0.017823164380903502), (3.022325020112631, 0.08212564786568953), (0.6765980594008285, 0.4107606373026975), (0.3308710986890265, 0.565146603267378), (0.3308710986890265, 0.565146603267378), (

In [16]:
chis, pval = 0, 0

for each in chi_squared:
    chis += each[0]
    pval += each[1]

chis_squared_promediate = chis/len(chi_squared)
pvalues_promediate = pval/len(chi_squared)

print("chis_squared_promediate: ",chis_squared_promediate)
print("pvalues_promediate: ",pvalues_promediate)

chis_squared_promediate:  1.0137047283335516
pvalues_promediate:  0.43951753859981535


##### Observing p-values and chi-squared values:

- Generlly we haven't low p-values indicating that the chi-squared value obtained  of the categorical distribution of words present in low and high USD values questions is not very rare. 
Then there is high probability of get this distribution by chance. Hence There isn't a Statistical significance in data for the study of "high-low USD-value words".

#### Some further analytis:
- Increasing the list of english stopwords to remove from questions: 

In [17]:
stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
stopwords2 = ['a', 'about', 'above', 'after',
 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'few', 'for',
 'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's", 'i', "i'd", "i'll", "i'm", "i've",'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself', "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours\tourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where', "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's", 'with', 
              "won't", 'would', "wouldn't", 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours','yourself','yourselves']


In [19]:
question_overlap2 = []
terms_used2 = set()
jeopardy.sort_values(by="Air Date", inplace=True)

for i,row in jeopardy.iterrows():
    match_count = 0
    split_question = row["clean_question"].split(" ")
    split_question = [q for q in split_question if (q not in stopwords2)] # and (len(q)>5)]
    for word in split_question:
        if word in terms_used2:
            match_count += 1
    for word in split_question:
        terms_used2.add(word)
    if len(split_question)>0:
        question_overlap2.append(match_count / len(split_question))
    else: question_overlap2.append(0)

jeopardy["question_overlap2"] = question_overlap2
jeopardy[["question_overlap2","Air Date"]].head(21)

Unnamed: 0,question_overlap2,Air Date
19325,0.0,1984-09-21
19324,0.0,1984-09-21
19301,0.0,1984-09-21
19302,0.0,1984-09-21
19303,0.25,1984-09-21
19304,0.0,1984-09-21
19305,0.0,1984-09-21
19306,0.0,1984-09-21
19308,0.0,1984-09-21
19309,0.0,1984-09-21


In [20]:
jeopardy["question_overlap2"].mean()

0.7968490723431718

###### New results: 79% of the words used in old questions appears on new ones. This time we do the cut with english stopwords rather than words with lenght < 5.

##### Let´s go to obtain the frequency-value-words-appear more efficient using pandas.

In [21]:
len(terms_used)

24532

In [22]:
jeopardy.shape

(19999, 14)

In [23]:
terms_used_list = []
for word in terms_used:
    if len(word)<10:
        terms_used_list.append(word)
print(len(terms_used_list))

17859


In [26]:
terms_used_df = pd.DataFrame(terms_used_list,columns=["word"]).sample(n=60)
terms_used_df.head(30)

Unnamed: 0,word
8419,xstatic
416,notebook
17076,hatter
11206,starkists
5741,herbert
10154,pensee
619,recover
2217,regulates
17064,inflate
13635,marple


In [27]:
def count_words2(word):
    low_count=0
    high_count=0
    for i,row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        if word in split_question:
            if row["high_value"]==1:
                high_count += 1
            else: low_count += 1
    return high_count, low_count, word

terms_used_df["high-low value-word"] = terms_used_df["word"].apply(count_words2)

In [31]:
terms_used_df.head(30)

Unnamed: 0,word,high-low value-word
8419,xstatic,"(0, 1, xstatic)"
416,notebook,"(0, 2, notebook)"
17076,hatter,"(0, 2, hatter)"
11206,starkists,"(0, 1, starkists)"
5741,herbert,"(0, 12, herbert)"
10154,pensee,"(0, 1, pensee)"
619,recover,"(1, 2, recover)"
2217,regulates,"(1, 1, regulates)"
17064,inflate,"(1, 1, inflate)"
13635,marple,"(0, 2, marple)"


In [32]:
high_value_count2 = jeopardy[jeopardy["high_value"]==1]["high_value"].count()
low_value_count2 = jeopardy[jeopardy["high_value"]==0]["high_value"].count()
print(high_value_count2,low_value_count2)

4972 15027


In [34]:
def chi_squared_df(term):
    
    total = term[0]+term[1]
    total_prop = total/jeopardy.shape[0]
    expected_high_value_counts = total_prop*high_value_count
    expected_low_value_counts = total_prop*low_value_count
    expected = np.array([expected_high_value_counts,expected_low_value_counts])
    observed = np.array([term[0],term[1]])
    chisquare_value, pvalue = chisquare(observed, expected)
    chi_squared = (chisquare_value,pvalue,term[2])
    return chi_squared

chi_squared = terms_used_df["high-low value-word"].apply(chi_squared_df)
print(chi_squared.tolist()[:30])

[(0.3308710986890265, 0.565146603267378, 'xstatic'), (0.661742197378053, 0.4159455550913673, 'notebook'), (0.661742197378053, 0.4159455550913673, 'hatter'), (0.3308710986890265, 0.565146603267378, 'starkists'), (3.970453184268317, 0.046305308527765786, 'herbert'), (0.3308710986890265, 0.565146603267378, 'pensee'), (0.11526980495624546, 0.7342224981885828, 'recover'), (0.6765980594008285, 0.4107606373026975, 'regulates'), (0.6765980594008285, 0.4107606373026975, 'inflate'), (0.661742197378053, 0.4159455550913673, 'marple'), (3.022325020112631, 0.08212564786568953, 'assures'), (0.6765980594008285, 0.4107606373026975, 'fireproof'), (0.11526980495624546, 0.7342224981885828, 'existence'), (3.022325020112631, 0.08212564786568953, 'cheerful'), (0.661742197378053, 0.4159455550913673, 'blossoms'), (2.80672372637985, 0.09386990525628017, 'upside'), (3.022325020112631, 0.08212564786568953, 'withstand'), (0.3308710986890265, 0.565146603267378, 'commands'), (0.6765980594008285, 0.4107606373026975, 

In [36]:
list_of_words = []
for each in chi_squared.tolist():
    
    if each[1]<0.1:
        word_significant = each[2]
        print(word_significant)
        list_of_words.append(word_significant)

        
def determining_low_pvalues(df):
    
    word = df["word"]
    if word in list_of_words:
        return df["high-low value-word"]

print(terms_used_df.apply(determining_low_pvalues,axis=1).head(30))

herbert
assures
cheerful
upside
withstand
wilshire
hurston
titled
harlan
orators
metalcore
8419                  None
416                   None
17076                 None
11206                 None
5741      (0, 12, herbert)
10154                 None
619                   None
2217                  None
17064                 None
13635                 None
15040      (1, 0, assures)
15482                 None
14472                 None
8696      (1, 0, cheerful)
15043                 None
6696        (2, 1, upside)
13424    (1, 0, withstand)
16546                 None
14277                 None
9535                  None
4780                  None
11306                 None
3896                  None
15437                 None
15354                 None
746                   None
5101                  None
11941                 None
14941                 None
10995                 None
dtype: object


##### the chi-squared test shows than only a few words in a sample of 60 have statistical significance. The words are:
- sparta
- alfalfa
- santini
- denied
- honcho
- numeral
- backcourt
- dollpuss
- tuxedos
- strapped
- predating

All these words have appearances on high value questions (over 800 USD), the first one in the amount of appearances is "sparta" with 2.

The numbers are not very high but we must to consider that we have only used a sample of n = 60 terms used in questions selected ramdomly. If we make a gross and possibly incorrect extrapolation, but only for the purpose of generating some ideas, 2 in 60 are the 3.33% and the whole set of terms used in older questions are approximatelly 17859 words, then the 3.33% of this number is 594 an insteresting amount of times that "sparta" appears on high-value questions from a total of 20,000 questions colected in the dataset.

##### The next steps in all this complex analysis are the taking of a most bigger data-set (on external workplaces like google colab I have taked a data-set of 200,000 rows, the percentages of words used in older questions goes up as well as others but in these cases we need a most powerfull way to do computations).


###### Some last analysis: counting phrases overlaping on older questions.

In [38]:
question_overlap3 = []
phrases_used = set()
jeopardy.sort_values(by="Air Date", inplace=True)

for i,row in jeopardy.iterrows():
    match_count = 0
    split_question = row["clean_question"].split(",")
    split_question = [q for q in split_question if (q not in stopwords2)] # and (len(q)>5)]
    for phrase in split_question:
        if phrase in phrases_used:
            match_count += 1
    for phrase in split_question:
        phrases_used.add(phrase)
    if len(split_question)>0:
        question_overlap3.append(match_count / len(split_question))
    else: question_overlap3.append(0)

jeopardy["question_overlap3"] = question_overlap3
jeopardy[["question_overlap3","Air Date"]].head(30)

Unnamed: 0,question_overlap3,Air Date
19325,0.0,1984-09-21
19285,0.0,1984-09-21
19324,0.0,1984-09-21
19301,0.0,1984-09-21
19302,0.0,1984-09-21
19303,0.0,1984-09-21
19304,0.0,1984-09-21
19305,0.0,1984-09-21
19308,0.0,1984-09-21
19309,0.0,1984-09-21


In [39]:
jeopardy["question_overlap3"].mean()

0.000600030001500075

##### The percentage are very very low, this can indicate that phrases have high varations over time. This doesn't mean that the subject of the questions has the same variations. Jeopardy can make small variances on questions to generate some aparent variability without varyng their instrinsict subject.

In [40]:
jeopardy["Category"].value_counts().head(30)

TELEVISION              51
U.S. GEOGRAPHY          50
LITERATURE              45
HISTORY                 40
AMERICAN HISTORY        40
BEFORE & AFTER          40
AUTHORS                 39
WORD ORIGINS            38
WORLD CAPITALS          37
BODIES OF WATER         36
SPORTS                  36
SCIENCE & NATURE        35
RHYME TIME              35
SCIENCE                 35
MAGAZINES               35
WORLD GEOGRAPHY         33
HISTORIC NAMES          32
WORLD HISTORY           32
ANNUAL EVENTS           32
IN THE DICTIONARY       31
BIRDS                   31
FICTIONAL CHARACTERS    31
U.S. PRESIDENTS         30
ISLANDS                 30
OPERA                   30
TRAVEL & TOURISM        30
MEDICINE                30
POTPOURRI               30
BALLET                  29
ART                     28
Name: Category, dtype: int64

###### Looking the rankings in "Categories":
- TELEVISION                        51
- U.S. GEOGRAPHY                    50
- LITERATURE                        45
- BEFORE & AFTER                    40
- AMERICAN HISTORY                  40
- HISTORY                           40
- AUTHORS                           39
- WORD ORIGINS                      38
- WORLD CAPITALS                    37
- SPORTS                            36
- BODIES OF WATER                   36
- RHYME TIME                        35
- SCIENCE                           35

Based on this ranking of categories, we can surmise that the questions with the greatest variability fall on the categories most frequently ocurred. So, in order to have more chances of winning Jeopardy, the participant needs to have a vast knowledge in the top 5 categories, and a general culture for the rest.
