In [5]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import *       # for datatype conversion
from pyspark.sql.functions import *   # for col() function
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Tokenizer
import re
import pyspark.sql.functions as f
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sc = SparkContext.getOrCreate()
sqlCtx = SQLContext(sc)

spark = SparkSession \
    .builder \
    .master("local") \
    .appName("Jeopardy Calculation") \
    .config("spark.executor.memory", '2g') \
    .config('spark.executor.cores', '1') \
    .config('spark.cores.max', '1') \
    .config("spark.driver.memory",'1g') \
    .getOrCreate()

In [6]:
jeopardy = spark.read.json("JEOPARDY_QUESTIONS1.json")

In [7]:
jeopardy.show(30)


+----------+--------------------+--------------------+--------------------+----------------+-----------+------+
|  air_date|              answer|            category|            question|           round|show_number| value|
+----------+--------------------+--------------------+--------------------+----------------+-----------+------+
|2004-12-31|          Copernicus|             HISTORY|'For the last 8 y...|       Jeopardy!|       4680|  $200|
|2004-12-31|          Jim Thorpe|ESPN's TOP 10 ALL...|'No. 2: 1912 Olym...|       Jeopardy!|       4680|  $200|
|2004-12-31|             Arizona|EVERYBODY TALKS A...|'The city of Yuma...|       Jeopardy!|       4680|  $200|
|2004-12-31|         McDonald\'s|    THE COMPANY LINE|'In 1963, live on...|       Jeopardy!|       4680|  $200|
|2004-12-31|          John Adams| EPITAPHS & TRIBUTES|'Signer of the De...|       Jeopardy!|       4680|  $200|
|2004-12-31|             the ant|      3-LETTER WORDS|'In the title of ...|       Jeopardy!|       4680|

In [8]:
j_categoryCount = jeopardy.groupBy("category").count()
j_categoryCount.sort(desc("count")).show()

+--------------------+-----+
|            category|count|
+--------------------+-----+
|      BEFORE & AFTER|  547|
|             SCIENCE|  519|
|          LITERATURE|  496|
|    AMERICAN HISTORY|  418|
|           POTPOURRI|  401|
|       WORLD HISTORY|  377|
|        WORD ORIGINS|  371|
|COLLEGES & UNIVER...|  351|
|             HISTORY|  349|
|              SPORTS|  342|
|         U.S. CITIES|  339|
|     WORLD GEOGRAPHY|  338|
|     BODIES OF WATER|  327|
|             ANIMALS|  324|
|      STATE CAPITALS|  314|
| BUSINESS & INDUSTRY|  311|
|             ISLANDS|  301|
|      WORLD CAPITALS|  300|
|      U.S. GEOGRAPHY|  299|
|            RELIGION|  297|
+--------------------+-----+
only showing top 20 rows



In [9]:
jeopardy = jeopardy.withColumn("stripped", f.regexp_replace(f.col("question"), "[\!@#$%^&*)(><,';:]", ""))


In [10]:
tokenizer = Tokenizer(inputCol = "stripped", outputCol = "words")
tokenized = tokenizer.transform(jeopardy)
tokenized.select("stripped","words").take(5)

[Row(stripped='For the last 8 years of his life Galileo was under house arrest for espousing this mans theory', words=['for', 'the', 'last', '8', 'years', 'of', 'his', 'life', 'galileo', 'was', 'under', 'house', 'arrest', 'for', 'espousing', 'this', 'mans', 'theory']),
 Row(stripped='No. 2 1912 Olympian football star at Carlisle Indian School 6 MLB seasons with the Reds Giants  Braves', words=['no.', '2', '1912', 'olympian', 'football', 'star', 'at', 'carlisle', 'indian', 'school', '6', 'mlb', 'seasons', 'with', 'the', 'reds', 'giants', '', 'braves']),
 Row(stripped='The city of Yuma in this state has a record average of 4055 hours of sunshine each year', words=['the', 'city', 'of', 'yuma', 'in', 'this', 'state', 'has', 'a', 'record', 'average', 'of', '4055', 'hours', 'of', 'sunshine', 'each', 'year']),
 Row(stripped='In 1963 live on "The Art Linkletter Show" this company served its billionth burger', words=['in', '1963', 'live', 'on', '"the', 'art', 'linkletter', 'show"', 'this', 'com

In [11]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
#remover.transform(tokenized).show(truncate=False)
jeopardyStopRemoved = remover.transform(tokenized)
jeopardyStopRemoved.show(truncate=False)

+----------+---------------------+-------------------------------+------------------------------------------------------------------------------------------------------------------------+---------+-----------+-----+----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------+
|air_date  |answer               |category                       |question                                                                                                                |round    |show_number|value|stripped                                                                                                        |words                                                                                            

In [12]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

In [13]:
def stem(in_vec):
    out_vec = []
    for t in in_vec:
        t_stem = stemmer.stem(t)
        out_vec.append(t_stem)       
    return out_vec


In [14]:
from pyspark.sql.types import *
stemmer_udf = udf(lambda x: stem(x), ArrayType(StringType()))

In [15]:
jeopardyStemmed = jeopardyStopRemoved.withColumn("stemmed", stemmer_udf("filtered"))
jeopardyStemmed.show(truncate=False)

+----------+---------------------+-------------------------------+------------------------------------------------------------------------------------------------------------------------+---------+-----------+-----+----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------+
|air_date  |answer               |category                       |question                                                                                                                |round    |show_number|value|stripped                                                                                                    

In [19]:
hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures", numFeatures=10000)
featurizedData = hashingTF.transform(jeopardyStemmed)
featurizedData.show(truncate=False)

+----------+---------------------+-------------------------------+------------------------------------------------------------------------------------------------------------------------+---------+-----------+-----+----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+
|air_date  |answer               |category                       |question                                                                                                  