In [2]:
#Load Required Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#For cleaning Strings
import re

pd.set_option('display.max_colwidth', None)

In [3]:
#Load Dataset
df_jeopardy = pd.read_csv('jeopardy.csv')

#Glimpse of DataFrame
df_jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,"No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves",Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,"The city of Yuma in this state has a record average of 4,055 hours of sunshine each year",Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", this company served its billionth burger",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Constitution of Mass., second President of the United States",John Adams


In [4]:
#Check Key Information about DataFrame
df_jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216930 entries, 0 to 216929
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Show Number  216930 non-null  int64 
 1    Air Date    216930 non-null  object
 2    Round       216930 non-null  object
 3    Category    216930 non-null  object
 4    Value       216930 non-null  object
 5    Question    216930 non-null  object
 6    Answer      216928 non-null  object
dtypes: int64(1), object(6)
memory usage: 11.6+ MB


In [5]:
#Clean Column Names
df_jeopardy.columns = ['show_number', 'air_date',
'round', 'category', 'value', 'question', 'answer']

df_jeopardy.head(3)

Unnamed: 0,show_number,air_date,round,category,value,question,answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,"No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves",Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,"The city of Yuma in this state has a record average of 4,055 hours of sunshine each year",Arizona


In [54]:
base = r'^{}'
expr = '(?=.*{})'
words = ['apple', 'banana', 'cat']  # example
base.format(''.join(expr.format(w) for w in words))

'^(?=.*apple\\s)(?=.*banana\\s)(?=.*cat\\s)'

In [74]:
#Function to return relevant rows in the DataFrame based on the list of Keywords
def relevant_rows(df, list_of_words):
    base = r'^{}'
    expr = '(?=.*{}\s)' #?= Means Positive Lookahead in Regular Expressions. 
    #For more info - https://www.stefanjudis.com/blog/a-regular-expression-lookahead-lookbehind-cheat-sheet/
    pattern = base.format(''.join(expr.format(word) for word in list_of_words)) #Ensure that all words are in the sentence
    print(pattern) # Verify Pattern
    result_df = df[df.question_clean.str.contains(pattern, case = False)] # Make sure that case insensitive matching is being done.
    return result_df

In [8]:
result = relevant_rows(df_jeopardy, ["India", "Gandhi"])
result

Unnamed: 0,show_number,air_date,round,category,value,question,answer
42320,1605,1991-07-19,Jeopardy!,RELIGION,$500,A religious retreat or commune in India; Gandhi had one at Wardha,an ashram
62306,2948,1997-05-28,Jeopardy!,HISTORY,$300,In 1942 he replaced Gandhi as leader of India's National Congress Party,Nehru
89179,3203,1998-07-01,Double Jeopardy!,ACTORS & ACTRESSES,$1000,"This ""Gandhi"" star is the son of an Indian physician who emigrated to England from South Africa",Ben Kingsley
121606,4248,2003-02-05,Double Jeopardy!,"GOING, GOING, GANDHI",$2000,"While still a teenager, Gandhi left India & traveled to this capital to get his law degree",London
126154,4363,2003-07-16,Double Jeopardy!,ALL IN THE FAMILY,$400,This was Indira Gandhi's maiden name (her father was India's first P.M.),Nehru


In [12]:
# Using Regular Expressions to Clean Text
s = "string. With. Punctuation? england's"
s = re.sub(r'[^\w\s]',' ',s)
s = re.sub(' +', ' ', s)
s = s.lower()
s

'string with punctuation england s'

**Notes on Regular Expression**

* **Optional Quantifiers** - Denoted by a `?`. Indicates character can appear either `0` or `1` time. `humou?r` will match both `humour` or `humor`.
* **Character Sets** - Denoted by a `[]`. Match any of the characters included in the brackets. `con[sc]en[sc]us` will match any of the spellings `consensus`, `concensus`, `consencus`, and `concencus`.
* **Literals** - Match exact set of literals. The regex `monkey` will completely match the text `monkey` but will also match `monkey` in text `The monkeys like to eat bananas`.
* **Fixed Quantifiers** - Denoted by a `{}`. Matches exact quantity or the quantity range of characters to be matched. The regular expression `roa{3}r` will match the text `roaaar`, while the regular expression `roa{3,6}r` will match `roaaar`, `roaaaar`, `roaaaaar`, or `roaaaaaar`.
* **Alternation** - Denoted by a `|`. Allows for the matching of either of two subexpressions. The regex `baboons|gorillas` will match the text `baboons` as well as the text `gorillas`.
* **Anchors** - Anchors (hat  and dollar sign) are used in regular expressions to match text at the start and end of a string. 
* **Wildcards** - Denoted by a `.`. Can match any single character. the regular expression `.........` will match the text orangutan, marsupial, or any other 9-character text.
* **Kleene** - `*` indicates that the preceding character can occur 0 or more times.  `+` indicates that the preceding character can occur 1 or more times.
* **Grouping** - Accomplished by open `(` and close parenthesis `)`. Thus the regular expression `I love (baboons|gorillas)` will match the text `I love baboons` as well as `I love gorillas`, as the grouping limits the reach of the `|` to the text within the parentheses.

In [13]:
#Find Rows where Value is equal to None
df_jeopardy[df_jeopardy.value == 'None']

Unnamed: 0,show_number,air_date,round,category,value,question,answer
55,4680,2004-12-31,Final Jeopardy!,THE SOLAR SYSTEM,,Objects that pass closer to the sun than Mercury have been named for this mythological figure,Icarus
116,5957,2010-07-06,Final Jeopardy!,HISTORIC WOMEN,,"She was born in Virginia around 1596 & died in Kent, England in 1617",Pocahontas
174,3751,2000-12-18,Final Jeopardy!,SPORTS LEGENDS,,"If Joe DiMaggio's hitting streak had gone one more game in 1941, this company would have given him a $10,000 contract",H.J. Heinz (Heinz 57 Varieties)
235,3673,2000-07-19,Final Jeopardy!,THE MAP OF EUROPE,,"Bordering Italy, Austria, Hungary & Croatia, it's one of the world's newest independent countries",Slovenia
296,4931,2006-02-06,Final Jeopardy!,FAMOUS SHIPS,,"On December 27, 1831 it departed Plymouth, England to map the coastline of South America",the HMS Beagle
...,...,...,...,...,...,...,...
216686,3940,2001-10-19,Final Jeopardy!,MAJOR LEAGUE BASEBALL TEAM NAMES,,"This team received its name after an 1890 incident in which it ""stole"" away an important player from another team",Pittsburgh Pirates
216746,6044,2010-12-16,Final Jeopardy!,SKYSCRAPERS,,"After a construction boom fueled by oil & gas money, this capital city now has Europe's tallest building",Moscow
216807,5070,2006-09-29,Final Jeopardy!,NATIONAL CAPITALS,,"This city's website calls it ""the last divided capital in Europe""",Nicosia
216868,5195,2007-03-23,Final Jeopardy!,BESTSELLING AUTHORS,,"He had the year's bestselling novel a record 7 years in a row with 7 different titles, ending in 2000",John Grisham


In [14]:
#Convert None to $0 using loc
df_jeopardy.loc[df_jeopardy.value == 'None', 'value'] = '$0'

In [15]:
#Convert Dollar String to Float
df_jeopardy['value_float'] = df_jeopardy['value'].replace('[\$,]', '', regex=True).astype(float)
df_jeopardy.head()

Unnamed: 0,show_number,air_date,round,category,value,question,answer,value_float
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory",Copernicus,200.0
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,"No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves",Jim Thorpe,200.0
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,"The city of Yuma in this state has a record average of 4,055 hours of sunshine each year",Arizona,200.0
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", this company served its billionth burger",McDonald's,200.0
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Constitution of Mass., second President of the United States",John Adams,200.0


In [21]:
#Clean Question Column
# Replace All Characters except Alphabets and Digits with space.
df_jeopardy['question_clean'] = df_jeopardy['question'].apply(lambda x: re.sub(r'[^\w\s]',' ',x))
# Remove Extra Spaces
df_jeopardy['question_clean'] = df_jeopardy['question_clean'].apply(lambda x: re.sub(' +', ' ',x))
# Change to lower case
df_jeopardy['question_clean'] = df_jeopardy['question_clean'].str.lower()
# Verify Results
df_jeopardy

Unnamed: 0,show_number,air_date,round,category,value,question,answer,value_float,question_clean
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory",Copernicus,200.0,for the last 8 years of his life galileo was under house arrest for espousing this man s theory
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,"No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves",Jim Thorpe,200.0,no 2 1912 olympian football star at carlisle indian school 6 mlb seasons with the reds giants braves
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,"The city of Yuma in this state has a record average of 4,055 hours of sunshine each year",Arizona,200.0,the city of yuma in this state has a record average of 4 055 hours of sunshine each year
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", this company served its billionth burger",McDonald's,200.0,in 1963 live on the art linkletter show this company served its billionth burger
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Constitution of Mass., second President of the United States",John Adams,200.0,signer of the dec of indep framer of the constitution of mass second president of the united states
...,...,...,...,...,...,...,...,...,...
216925,4999,2006-05-11,Double Jeopardy!,RIDDLE ME THIS,$2000,This Puccini opera turns on the solution to 3 riddles posed by the heroine,Turandot,2000.0,this puccini opera turns on the solution to 3 riddles posed by the heroine
216926,4999,2006-05-11,Double Jeopardy!,"""T"" BIRDS",$2000,"In North America this term is properly applied to only 4 species that are crested, including the tufted",a titmouse,2000.0,in north america this term is properly applied to only 4 species that are crested including the tufted
216927,4999,2006-05-11,Double Jeopardy!,AUTHORS IN THEIR YOUTH,$2000,"In Penny Lane, where this ""Hellraiser"" grew up, the barber shaves another customer--then flays him alive!",Clive Barker,2000.0,in penny lane where this hellraiser grew up the barber shaves another customer then flays him alive
216928,4999,2006-05-11,Double Jeopardy!,QUOTATIONS,$2000,"From Ft. Sill, Okla. he made the plea, Arizona is my land, my home, my father's land, to which I now ask to... return""",Geronimo,2000.0,from ft sill okla he made the plea arizona is my land my home my father s land to which i now ask to return


In [80]:
Relevant_Questions = relevant_rows(df_jeopardy, ["Indira","Gandhi"])
Relevant_Questions

^(?=.*Indira\s)(?=.*Gandhi\s)


Unnamed: 0,show_number,air_date,round,category,value,question,answer,value_float,question_clean
11545,4461,2004-01-19,Final Jeopardy!,RANKS & TITLES,$0,"Queen Victoria & Indira Gandhi each held this title, though only one was official",Empress of India,0.0,queen victoria indira gandhi each held this title though only one was official
24869,5271,2007-07-09,Double Jeopardy!,AIR APPARENT,$2000,"1984 news included Indira Gandhi's death, as well as a deadly gas leak at this company's chemical plant in Bhopal",Union Carbide,2000.0,1984 news included indira gandhi s death as well as a deadly gas leak at this company s chemical plant in bhopal
56837,4380,2003-09-26,Double Jeopardy!,FORMER WORLD LEADERS,$800,Indira Gandhi was the first & only female prime minister so far of this Asian nation,India,800.0,indira gandhi was the first only female prime minister so far of this asian nation
57589,6034,2010-12-02,Jeopardy!,FASHION TRENDS,$600,"The Beatles helped popularize this jacket, the original of which was worn by Indira Gandhi's father",the Nehru jacket,600.0,the beatles helped popularize this jacket the original of which was worn by indira gandhi s father
92417,3565,2000-02-18,Jeopardy!,WORLD CAPITALS,$200,Indira Gandhi International Airport lies just outside this capital,New Delhi,200.0,indira gandhi international airport lies just outside this capital
126154,4363,2003-07-16,Double Jeopardy!,ALL IN THE FAMILY,$400,This was Indira Gandhi's maiden name (her father was India's first P.M.),Nehru,400.0,this was indira gandhi s maiden name her father was india s first p m
165757,4673,2004-12-22,Jeopardy!,"""N""TICE ME!",$600,"When you land at Indira Gandhi International Airport, you're in this city",New Delhi,600.0,when you land at indira gandhi international airport you re in this city
186628,4277,2003-03-18,Jeopardy!,CAPITAL CITY UNIVERSITIES,$200,Indira Gandhi National Open University,New Delhi,200.0,indira gandhi national open university
198777,5534,2008-10-02,Double Jeopardy!,ELEANOR ROOSEVELT GOES TO THE THEATRE,$1200,"Eleanor took Indira Gandhi to see this man's play ""The Night Of The Iguana""; it left both ladies ""a little baffled""",Tennessee Williams,1200.0,eleanor took indira gandhi to see this man s play the night of the iguana it left both ladies a little baffled


In [79]:
#Average Value of Questions with Relevant Keywords
Relevant_Questions.value_float.mean()

819.5121951219512