# Jeopardy Dataset Cleaning and Preparation

In [None]:
import pandas as pd
import pathlib
import numpy as np
import json

## Add a combined column of questions and answers, with stopwords removed, to the Jeopardy dataset. The dataset is a sample of 500+ rows taken from the raw data and will serve as the base data for our machine learning process.

In [None]:
import os
# Pin the Spark release to install. Available releases are listed at
# http://www-us.apache.org/dist/spark/ (NOTE(review): confirm this mirror still serves the pinned version).
# For example:
spark_version = 'spark-2.4.8'
# spark_version = 'spark-2.<enter version>'
# Export the version so the shell commands below can expand $SPARK_VERSION.
os.environ['SPARK_VERSION']=spark_version

# Install Java 8, download and unpack the Spark binary distribution, and install findspark
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark
# Point JAVA_HOME / SPARK_HOME at the installs above.
# Assumes the archive was unpacked in /content (e.g. a Colab working directory) — adjust elsewhere.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Initialise findspark so pyspark can be imported (presumably it locates Spark via SPARK_HOME);
# the SparkSession itself is created in a later cell.
import findspark
findspark.init()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:6 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Hit:8 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:11 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:13 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease [15.9 kB]
Get:14 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Hit:15 

In [None]:
# Create the notebook's Spark session, or reuse one that is already running
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("NLPTokens")
    .getOrCreate()
)

In [None]:
# Read the validated-category CSV into a Spark DataFrame.
# The file escapes quotes inside quoted fields by doubling them (""), but Spark's
# CSV reader escapes with a backslash by default. Without escape='"' such fields
# are split at the wrong comma and the remaining columns shift, so set it explicitly.
raw_df = spark.read.csv(
    "jeopardy-category-cleanup - cat_validated.csv",
    sep=",",
    header=True,
    quote='"',
    escape='"',
)

In [None]:
from pyspark.ml.feature import Tokenizer

In [None]:
# Configure a tokenizer that reads the "combined" text column and writes a "words" column
tokenizer = Tokenizer().setInputCol("combined").setOutputCol("words")

In [None]:
# Apply the tokenizer to the raw frame, then preview the result with full-width columns
tokenized = tokenizer.transform(raw_df)
tokenized.show(n=20, truncate=False)

+-----------+----------+---------+-------------------------------+-----+--------------------------------------------------------------------------------------------------------------------------+------------------------------------------+-------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|show_number|air_date  |round    |category_title                 |value|data_q                                                                                                                    |data_a                                    |classes      |combined                                                                                                                               

In [None]:
# import stopwords library
from pyspark.ml.feature import StopWordsRemover

In [None]:
# Configure a stop-word remover that reads "words" and writes the surviving tokens to "filtered"
remover = StopWordsRemover().setInputCol("words").setOutputCol("filtered")

In [None]:
# Apply the stop-word remover to the tokenized frame, then preview with full-width columns
filtered_df = remover.transform(tokenized)
filtered_df.show(n=20, truncate=False)

+-----------+----------+---------+-------------------------------+-----+--------------------------------------------------------------------------------------------------------------------------+------------------------------------------+-------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|show_number|air_date  |round    |category_title                 |value|data_q                                                                                                                    |data_a                       

In [None]:
# Convert the Spark DataFrame (with its "words" and "filtered" token columns) to a
# pandas DataFrame, then display it as the cell output.
jeopardy_df_raw = filtered_df.toPandas()
jeopardy_df_raw

Unnamed: 0,show_number,air_date,round,category_title,value,data_q,data_a,classes,combined,words,filtered
0,4680,12/31/2004,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,people,"HISTORY For the last 8 years of his life, Gali...","[history, for, the, last, 8, years, of, his, l...","[history, last, 8, years, life,, galileo, hous..."
1,4680,12/31/2004,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,people,ESPN's TOP 10 ALL-TIME ATHLETES No. 2: 1912 Ol...,"[espn's, top, 10, all-time, athletes, no., 2:,...","[espn's, top, 10, all-time, athletes, no., 2:,..."
2,4680,12/31/2004,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,geography,EVERYBODY TALKS ABOUT IT... The city of Yuma i...,"[everybody, talks, about, it..., the, city, of...","[everybody, talks, it..., city, yuma, state, r..."
3,4680,12/31/2004,Jeopardy!,THE COMPANY LINE,$200,"""In 1963, live on """"The Art Linkletter Show""""","this company served its billionth burger""",McDonald's,organizations,[organizations],[organizations]
4,4680,12/31/2004,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,people,EPITAPHS & TRIBUTES Signer of the Dec. of Inde...,"[epitaphs, &, tributes, signer, of, the, dec.,...","[epitaphs, &, tributes, signer, dec., indep.,,..."
...,...,...,...,...,...,...,...,...,...,...,...
531,5243,5/30/2007,Double Jeopardy!,DOWN MEXICO WAY,"$2,000",This popular resort island lies north of Cozum...,Cancun,geography,DOWN MEXICO WAY This popular resort island lie...,"[down, mexico, way, this, popular, resort, isl...","[mexico, way, popular, resort, island, lies, n..."
532,5243,5/30/2007,Double Jeopardy!,TAKE A PILL,"$2,000",This tranquilizer that sounds like a village w...,Miltown,history,TAKE A PILL This tranquilizer that sounds like...,"[take, a, pill, this, tranquilizer, that, soun...","[take, pill, tranquilizer, sounds, like, villa..."
533,5243,5/30/2007,Double Jeopardy!,TRANSPORTATION,"$1,000",Since 1899 these stalwart animals used in tran...,mules,history,TRANSPORTATION Since 1899 these stalwart anima...,"[transportation, since, 1899, these, stalwart,...","[transportation, since, 1899, stalwart, animal..."
534,5243,5/30/2007,Double Jeopardy!,AN E FOR AN I,"$2,000","""""""To replenish"""" becomes """"to knock down""""""",to fill & to fell,language,"""AN E FOR AN I """"To replenish"""" becomes """"to k...","[""an, e, for, an, i, """"to, replenish"""", become...","[""an, e, """"to, replenish"""", becomes, """"to, kno..."


In [None]:
# Pull out the 'filtered' column and convert it to a plain Python list (one token list per row)
filtered_words = jeopardy_df_raw.loc[:, 'filtered']
filtered_list = filtered_words.tolist()

In [None]:
# Rebuild one space-separated sentence per row from that row's filtered tokens
append_list = [' '.join(tokens) for tokens in filtered_list]

In [None]:
# Display the joined sentences (one string per row of the filtered data)
append_list

["history last 8 years life, galileo house arrest espousing man's theory copernicus",
 "espn's top 10 all-time athletes no. 2: 1912 olympian; football star carlisle indian school; 6 mlb seasons reds, giants & braves jim thorpe",
 'everybody talks it... city yuma state record average 4,055 hours sunshine year arizona',
 'organizations',
 'epitaphs & tributes signer dec. indep., framer constitution mass., second president united states john adams',
 '3-letter words title aesop fable, insect shared billing grasshopper ant',
 'history built 312 b.c. link rome & south italy, still use today appian way',
 "espn's top 10 all-time athletes no. 8: 30 steals birmingham barons; 2,306 steals bulls michael jordan",
 'everybody talks it... winter 1971-72, record 1,122 inches snow fell rainier paradise ranger station state washington',
 'company line housewares store named packaging merchandise came & first displayed crate & barrel',
 '"epitaphs & tributes ""and away go"" jackie gleason"',
 '3-letter

In [None]:
# Make an independent copy of the dataframe so later edits cannot touch jeopardy_df_raw.
# Plain assignment would only bind a second name to the same DataFrame object.
jeopardy_df = jeopardy_df_raw.copy()

In [None]:
# Replace the intermediate token columns with a single 'filtered_sentence' column
jeopardy_df = (
    jeopardy_df
    .drop(columns=['words', 'filtered'])
    .assign(filtered_sentence=append_list)
)

In [None]:
# Display the cleaned dataframe, which now carries the 'filtered_sentence' column
jeopardy_df

Unnamed: 0,show_number,air_date,round,category_title,value,data_q,data_a,classes,combined,filtered_sentence
0,4680,12/31/2004,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,people,"HISTORY For the last 8 years of his life, Gali...","history last 8 years life, galileo house arres..."
1,4680,12/31/2004,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,people,ESPN's TOP 10 ALL-TIME ATHLETES No. 2: 1912 Ol...,espn's top 10 all-time athletes no. 2: 1912 ol...
2,4680,12/31/2004,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,geography,EVERYBODY TALKS ABOUT IT... The city of Yuma i...,everybody talks it... city yuma state record a...
3,4680,12/31/2004,Jeopardy!,THE COMPANY LINE,$200,"""In 1963, live on """"The Art Linkletter Show""""","this company served its billionth burger""",McDonald's,organizations,organizations
4,4680,12/31/2004,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,people,EPITAPHS & TRIBUTES Signer of the Dec. of Inde...,"epitaphs & tributes signer dec. indep., framer..."
...,...,...,...,...,...,...,...,...,...,...
531,5243,5/30/2007,Double Jeopardy!,DOWN MEXICO WAY,"$2,000",This popular resort island lies north of Cozum...,Cancun,geography,DOWN MEXICO WAY This popular resort island lie...,mexico way popular resort island lies north co...
532,5243,5/30/2007,Double Jeopardy!,TAKE A PILL,"$2,000",This tranquilizer that sounds like a village w...,Miltown,history,TAKE A PILL This tranquilizer that sounds like...,take pill tranquilizer sounds like village int...
533,5243,5/30/2007,Double Jeopardy!,TRANSPORTATION,"$1,000",Since 1899 these stalwart animals used in tran...,mules,history,TRANSPORTATION Since 1899 these stalwart anima...,transportation since 1899 stalwart animals use...
534,5243,5/30/2007,Double Jeopardy!,AN E FOR AN I,"$2,000","""""""To replenish"""" becomes """"to knock down""""""",to fill & to fell,language,"""AN E FOR AN I """"To replenish"""" becomes """"to k...","""an e """"to replenish"""" becomes """"to knock down..."


In [None]:
# Write the filtered sample dataset out as a CSV file
jeopardy_df.to_csv(path_or_buf='filtered_smaller_sample.csv')

## Next, prepare the JSON file for the raw Jeopardy dataset. This will serve as the base dataset for our web keyword searches.

In [None]:
# Load the raw Jeopardy questions file and parse its text into Python objects
with open('JEOPARDY_QUESTIONS1.json', mode='r') as json_file:
    json_object = json.loads(json_file.read())

In [None]:
# Serialize the parsed data back to a JSON string, pretty-printed with a one-space indent.
# Note: this rebinds `json_file` (previously the input file handle) to the output string.
json_file = json.dumps(json_object, indent=1)

In [None]:
# Save the prettified string to a JSON file.
# The context manager closes the file even if the write raises, so the handle cannot leak.
with open("jeopardy.json", "w") as jsonFile:
    jsonFile.write(json_file)