In [59]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import *       # for datatype conversion
from pyspark.sql.functions import *   # for col() function
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Tokenizer
import re
import pyspark.sql.functions as f
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from numpy import array
from math import sqrt
from pyspark.ml.clustering import KMeans
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer


# Build the SparkSession first so that the master, app name and resource
# settings below are the ones the underlying SparkContext is created with.
# Calling SparkContext.getOrCreate() beforehand starts a default context that
# the builder then simply reuses, so these options would not take effect.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Jeopardy Calculation")
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", "1")
    .config("spark.cores.max", "1")
    .config("spark.driver.memory", "1g")
    .getOrCreate()
)

# Legacy handles, kept under their original names for any cell that uses them.
sc = spark.sparkContext
sqlCtx = SQLContext(sc)

In [2]:
# Load the Jeopardy questions JSON file into a Spark DataFrame
jeopardy = spark.read.format("json").load("JEOPARDY_QUESTIONS1-Copy1.json")

In [3]:
# Preview the first 30 rows of the Jeopardy DataFrame
jeopardy.show(n=30)

+----------+--------------------+--------------------+--------------------+----------------+-----------+------+
|  air_date|              answer|            category|            question|           round|show_number| value|
+----------+--------------------+--------------------+--------------------+----------------+-----------+------+
|2004-12-31|          Copernicus|             HISTORY|'For the last 8 y...|       Jeopardy!|       4680|  $200|
|2004-12-31|          Jim Thorpe|ESPN's TOP 10 ALL...|'No. 2: 1912 Olym...|       Jeopardy!|       4680|  $200|
|2004-12-31|             Arizona|EVERYBODY TALKS A...|'The city of Yuma...|       Jeopardy!|       4680|  $200|
|2004-12-31|         McDonald\'s|    THE COMPANY LINE|'In 1963, live on...|       Jeopardy!|       4680|  $200|
|2004-12-31|          John Adams| EPITAPHS & TRIBUTES|'Signer of the De...|       Jeopardy!|       4680|  $200|
|2004-12-31|             the ant|      3-LETTER WORDS|'In the title of ...|       Jeopardy!|       4680|

In [4]:
# Count questions per category, then display the 20 largest categories with their counts
j_categoryCount = jeopardy.groupBy("category").count()
j_categoryCount.orderBy(col("count").desc()).show()

+--------------------+-----+
|            category|count|
+--------------------+-----+
|      BEFORE & AFTER|  547|
|             SCIENCE|  519|
|          LITERATURE|  496|
|    AMERICAN HISTORY|  418|
|           POTPOURRI|  401|
|       WORLD HISTORY|  377|
|        WORD ORIGINS|  371|
|COLLEGES & UNIVER...|  351|
|             HISTORY|  349|
|              SPORTS|  342|
|         U.S. CITIES|  339|
|     WORLD GEOGRAPHY|  338|
|     BODIES OF WATER|  327|
|             ANIMALS|  324|
|      STATE CAPITALS|  314|
| BUSINESS & INDUSTRY|  311|
|             ISLANDS|  301|
|      WORLD CAPITALS|  300|
|      U.S. GEOGRAPHY|  299|
|            RELIGION|  297|
+--------------------+-----+
only showing top 20 rows



In [5]:
# Number of distinct categories in the dataset: j_categoryCount holds one row
# per category (it is the result of groupBy("category").count()).
j_categoryCount.count()

27995

In [7]:
#This is the number of categories in the dataset which have greater than 100 observations
j_categoryCount = jeopardy.groupBy("category").count()
# Row order does not affect a count, so no sort is needed before counting.
count100 = j_categoryCount.filter(j_categoryCount["count"] > 100).count()
count100

145

In [8]:
#This is a list of all categories which have a count greater than 100, most frequent first
# Filter on the threshold directly instead of sorting and taking the first
# `count100` rows: the result no longer depends on a value computed in another
# cell, and the column is collected without a pandas round-trip.
top_categories = [
    row["category"]
    for row in (
        j_categoryCount
        .filter(col("count") > 100)
        .sort(desc("count"))
        .select("category")
        .collect()
    )
]
top_categories

['BEFORE & AFTER',
 'SCIENCE',
 'LITERATURE',
 'AMERICAN HISTORY',
 'POTPOURRI',
 'WORLD HISTORY',
 'WORD ORIGINS',
 'COLLEGES & UNIVERSITIES',
 'HISTORY',
 'SPORTS',
 'U.S. CITIES',
 'WORLD GEOGRAPHY',
 'BODIES OF WATER',
 'ANIMALS',
 'STATE CAPITALS',
 'BUSINESS & INDUSTRY',
 'ISLANDS',
 'WORLD CAPITALS',
 'U.S. GEOGRAPHY',
 'RELIGION',
 'OPERA',
 'SHAKESPEARE',
 'LANGUAGES',
 'BALLET',
 'TELEVISION',
 'FICTIONAL CHARACTERS',
 'TRANSPORTATION',
 'RHYME TIME',
 'PEOPLE',
 'STUPID ANSWERS',
 'ART & ARTISTS',
 'THE BIBLE',
 'ART',
 'BOOKS & AUTHORS',
 'U.S. HISTORY',
 'FOOD',
 'MUSEUMS',
 'AMERICANA',
 'SCIENCE & NATURE',
 'COMMON BONDS',
 'HOLIDAYS & OBSERVANCES',
 'GEOGRAPHY',
 '3-LETTER WORDS',
 'ANNUAL EVENTS',
 'AMERICAN LITERATURE',
 'CLASSICAL MUSIC',
 'AUTHORS',
 'POP MUSIC',
 'POETS & POETRY',
 'QUOTATIONS',
 'HODGEPODGE',
 'MYTHOLOGY',
 'NONFICTION',
 'THE MOVIES',
 'WORLD CITIES',
 'MUSICAL INSTRUMENTS',
 'AROUND THE WORLD',
 'THE CIVIL WAR',
 'MUSIC',
 'U.S. PRESIDENTS',
 'C

In [9]:
# Keep only the questions whose category is in top_categories, then preview 5 rows
jeo_f = jeopardy.filter(col("category").isin(top_categories))
jeo_f.show(n=5)

+----------+--------------------+--------------+--------------------+---------+-----------+-----+
|  air_date|              answer|      category|            question|    round|show_number|value|
+----------+--------------------+--------------+--------------------+---------+-----------+-----+
|2004-12-31|          Copernicus|       HISTORY|'For the last 8 y...|Jeopardy!|       4680| $200|
|2004-12-31|             the ant|3-LETTER WORDS|'In the title of ...|Jeopardy!|       4680| $200|
|2004-12-31|      the Appian Way|       HISTORY|'Built in 312 B.C...|Jeopardy!|       4680| $400|
|2004-12-31|             the cud|3-LETTER WORDS|'Cows regurgitate...|Jeopardy!|       4680| $400|
|2004-12-31|Ceylon (or Sri La...|       HISTORY|'In 1000 Rajaraja...|Jeopardy!|       4680| $600|
+----------+--------------------+--------------+--------------------+---------+-----------+-----+
only showing top 5 rows



In [10]:
# Number of questions remaining after restricting to the categories in top_categories
jeo_f.count()

29252

In [3]:
#stripping punctuation, tokenizing, and stop word removing for the whole dataset
#jeopardy = jeopardy.withColumn("stripped", f.regexp_replace(f.col("question"), "[\!@#$%^&*)(><,';:]", ""))
#jeopardy.cache()
#tokenizer = Tokenizer(inputCol = "stripped", outputCol = "words")
#tokenized = tokenizer.transform(jeopardy)
#tokenized.cache()
#remover = StopWordsRemover(inputCol="words", outputCol="filtered")
#jeopardyStopRemoved = remover.transform(tokenized)
#jeopardyStopRemoved.cache()

DataFrame[air_date: string, answer: string, category: string, question: string, round: string, show_number: string, value: string, stripped: string, words: array<string>, filtered: array<string>]

In [5]:
#jeopardy.select("question").show(1,truncate=False)

+--------------------------------------------------------------------------------------------------+
|question                                                                                          |
+--------------------------------------------------------------------------------------------------+
|'For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory'|
+--------------------------------------------------------------------------------------------------+
only showing top 1 row



In [4]:
#jeopardyStopRemoved.select("filtered").show(1,truncate=False)

+-----------------------------------------------------------------------+
|filtered                                                               |
+-----------------------------------------------------------------------+
|[last, 8, years, life, galileo, house, arrest, espousing, mans, theory]|
+-----------------------------------------------------------------------+
only showing top 1 row



In [11]:
#stripping punctuation, tokenizing, and stop word removing for the modified dataset
# The pattern is a raw string: "\!" is not a valid Python escape sequence, so a
# plain literal triggers an invalid-escape warning. The regex itself is unchanged.
jeo_fpunc = jeo_f.withColumn("stripped", f.regexp_replace(f.col("question"), r"[\!@#$%^&*)(><,';:]", ""))
jeo_fpunc.cache()
# Split the cleaned question text into a "words" array column
tokenizer = Tokenizer(inputCol = "stripped", outputCol = "words")
tokenized_f = tokenizer.transform(jeo_fpunc)
tokenized_f.cache()
# Drop stop words from "words", writing the result to "filtered"
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
jeo_fStopRemoved = remover.transform(tokenized_f)
jeo_fStopRemoved.cache()

DataFrame[air_date: string, answer: string, category: string, question: string, round: string, show_number: string, value: string, stripped: string, words: array<string>, filtered: array<string>]

In [12]:
# Porter stemmer from NLTK. One shared instance is created here and used by
# the stem() helper defined in a later cell.
# NOTE(review): wildcard import; only PorterStemmer is used in the visible cells.
from nltk.stem.porter import *
stemmer = PorterStemmer()

In [13]:
def stem(in_vec):
    """Return a new list holding the stem of every token in ``in_vec``.

    Each token is passed to the ``stem`` method of the module-level
    ``stemmer`` object. The order and length of the input are preserved.
    """
    return [stemmer.stem(token) for token in in_vec]


In [14]:
from pyspark.sql.types import *
# Spark UDF wrapping stem(): takes an array of tokens and returns an array of strings
stemmer_udf = udf(stem, returnType=ArrayType(StringType()))

In [50]:
#jeopardyStemmed = jeopardyStopRemoved.withColumn("stemmed", stemmer_udf("filtered"))
#jeopardyStemmed.cache()

+----------+---------------------+-------------------------------+------------------------------------------------------------------------------------------------------------------------+---------+-----------+-----+----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------+
|air_date  |answer               |category                       |question                                                                                                                |round    |show_number|value|stripped                                                                                                    

In [15]:
# Apply the stemming UDF to the stop-word-filtered tokens and cache the result
jeo_fStemmed = jeo_fStopRemoved.withColumn("stemmed", stemmer_udf(col("filtered"))).cache()
jeo_fStemmed

DataFrame[air_date: string, answer: string, category: string, question: string, round: string, show_number: string, value: string, stripped: string, words: array<string>, filtered: array<string>, stemmed: array<string>]

In [37]:
# Term-frequency step: hash the stemmed tokens into a 50000-dimensional vector
hashingTF = HashingTF(numFeatures=50000, inputCol="stemmed", outputCol="rawFeatures")
featurizedData = hashingTF.transform(jeo_fStemmed)

# Inverse-document-frequency step: rescale the raw term frequencies
# NOTE(review): the IDF model is fit on all rows, i.e. before any train/test split.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData).cache()
rescaledData


DataFrame[air_date: string, answer: string, category: string, question: string, round: string, show_number: string, value: string, stripped: string, words: array<string>, filtered: array<string>, stemmed: array<string>, rawFeatures: vector, features: vector]

In [38]:
# 80/20 split with a fixed seed, then fit k-means (k=50) on the training features only
training, test = rescaledData.randomSplit(weights=[0.8, 0.2], seed=1)
kmeans = KMeans().setK(50).setSeed(1)
clusters = kmeans.fit(training.select("features"))
# Apply the fitted model to the held-out rows; this adds an integer "prediction" column
transformed = clusters.transform(test).cache()
transformed

DataFrame[air_date: string, answer: string, category: string, question: string, round: string, show_number: string, value: string, stripped: string, words: array<string>, filtered: array<string>, stemmed: array<string>, rawFeatures: vector, features: vector, prediction: int]

In [39]:
# Number of held-out rows assigned to each cluster, highest cluster id first
#transformed.select('category','prediction').show(100)
transformed.groupBy("prediction").count().orderBy(col("prediction").desc()).show(n=50)

+----------+-----+
|prediction|count|
+----------+-----+
|        47|   29|
|        46|    2|
|        44|   12|
|        39|    1|
|        36|   11|
|        22|  136|
|        19|    1|
|        18|   45|
|        17|    4|
|        16|   26|
|        13|    1|
|        12|   10|
|        11|   30|
|         8|  161|
|         6|   54|
|         4|    2|
|         3|   68|
|         0| 5285|
+----------+-----+



In [28]:
# Number of rows in `test`, the 20% side of the randomSplit
test.count()

5878

In [61]:
# Inspect the category and full question text of rows assigned to cluster 3
transformed.where(col("prediction") == 3).select("category", "question").show(truncate=False)

+--------------------+------------------------------------------------------------------------------------------------------------+
|category            |question                                                                                                    |
+--------------------+------------------------------------------------------------------------------------------------------------+
|QUOTES              |'"Reading is to the mind as" this is "to the body"'                                                         |
|10-LETTER WORDS     |'Mercy killing'                                                                                             |
|WORD ORIGINS        |'From Old French "manoeuvrer" meaning "to work by hand", which was how this item was put into soil'         |
|BIRDS               |'Falcons kill their prey by doing this'                                                                     |
|PLAYWRIGHTS         |'His last play, "What The Butler Saw", was produced in

In [32]:
#gmm = GaussianMixture().setK(10).setSeed(538009335)
#model = gmm.fit(training.select("features"))
#model.gaussiansDF.show(truncate=False)

In [63]:
# Encode each category string as a numeric "label" column
indexer = StringIndexer(inputCol="category", outputCol="label")
indexer_model = indexer.fit(rescaledData)
indexed = indexer_model.transform(rescaledData)
indexed.select("label", "category").show(truncate=False)

+-----+-----------------+
|label|category         |
+-----+-----------------+
|8.0  |HISTORY          |
|42.0 |3-LETTER WORDS   |
|8.0  |HISTORY          |
|42.0 |3-LETTER WORDS   |
|8.0  |HISTORY          |
|42.0 |3-LETTER WORDS   |
|8.0  |HISTORY          |
|42.0 |3-LETTER WORDS   |
|8.0  |HISTORY          |
|42.0 |3-LETTER WORDS   |
|81.0 |IN THE DICTIONARY|
|81.0 |IN THE DICTIONARY|
|81.0 |IN THE DICTIONARY|
|81.0 |IN THE DICTIONARY|
|81.0 |IN THE DICTIONARY|
|77.0 |TRAVEL & TOURISM |
|77.0 |TRAVEL & TOURISM |
|77.0 |TRAVEL & TOURISM |
|77.0 |TRAVEL & TOURISM |
|77.0 |TRAVEL & TOURISM |
+-----+-----------------+
only showing top 20 rows



In [64]:
# Preview a single row of the label-indexed frame
indexed.show(n=1)

+----------+----------+--------+--------------------+---------+-----------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|  air_date|    answer|category|            question|    round|show_number|value|            stripped|               words|            filtered|             stemmed|         rawFeatures|            features|label|
+----------+----------+--------+--------------------+---------+-----------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|2004-12-31|Copernicus| HISTORY|'For the last 8 y...|Jeopardy!|       4680| $200|For the last 8 ye...|[for, the, last, ...|[last, 8, years, ...|[last, 8, year, l...|(50000,[7848,8230...|(50000,[7848,8230...|  8.0|
+----------+----------+--------+--------------------+---------+-----------+-----+--------------------+--------------------+--------------------+

In [66]:
# 80/20 seeded split of the label-indexed data (rebinds the training/test names)
training, test = indexed.randomSplit(weights=[0.8, 0.2], seed=1)

In [None]:
# Logistic regression on the "features"/"label" columns with elastic-net regularisation
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lrModel = lr.fit(training)