In [1]:
#lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10, regParam=0.3, elasticNetParam=0.8)
#lrModel = lr.fit(training)
#predictions = lrModel.transform(test)
#predictions.cache()

In [2]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import *       # for datatype conversion
from pyspark.sql.functions import *   # for col() function
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Tokenizer
import re
import pyspark.sql.functions as f
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from numpy import array
from math import sqrt
from pyspark.ml.clustering import KMeans
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import NGram
from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import numpy as np
import matplotlib.pyplot as plt
from pyspark.ml.clustering import BisectingKMeans

# Build the SparkSession before anything else creates a SparkContext.
# When a context already exists, the builder's getOrCreate() reuses it and the
# master / memory / cores options below are not applied to that context, so
# the session must be the thing that starts the context.
# NOTE(review): if the kernel already holds a running context (e.g. the cell is
# re-run), these options are still ignored until the kernel is restarted.
spark = SparkSession \
    .builder \
    .master("local") \
    .appName("Jeopardy Calculation") \
    .config("spark.executor.memory", '2g') \
    .config('spark.executor.cores', '1') \
    .config('spark.cores.max', '1') \
    .config("spark.driver.memory",'1g') \
    .getOrCreate()

# Legacy handles keep their original names but are derived from the session,
# so every entry point shares the same underlying context.
sc = spark.sparkContext
sqlCtx = SQLContext(sc)

In [3]:
from nltk.stem.porter import *
stemmer = PorterStemmer()
def stem(in_vec):
    """Return a new list containing the stem of every token in ``in_vec``.

    Each token is passed through the module-level ``stemmer``; the result
    preserves the order of the input sequence.
    """
    return [stemmer.stem(token) for token in in_vec]
from pyspark.sql.types import *
# Spark UDF that applies stem() to a column value; its declared return type
# is an array of strings.
stemmer_udf = udf(lambda x: stem(x), ArrayType(StringType()))

# Exploratory Data Analysis
### and preprocessing to build a new dataset containing only categories with more than 100 observations

In [4]:
# Read the Jeopardy questions JSON file into a Spark DataFrame.
# NOTE(review): the path is relative, presumably resolved against the
# notebook's working directory — confirm the file location before running.
jeopardy = spark.read.json("JEOPARDY_QUESTIONS1-Copy1.json")

In [5]:
# Display the first 30 observations of the jeopardy data set
jeopardy.show(30)

+----------+--------------------+--------------------+--------------------+----------------+-----------+------+
|  air_date|              answer|            category|            question|           round|show_number| value|
+----------+--------------------+--------------------+--------------------+----------------+-----------+------+
|2004-12-31|          Copernicus|             HISTORY|'For the last 8 y...|       Jeopardy!|       4680|  $200|
|2004-12-31|          Jim Thorpe|ESPN's TOP 10 ALL...|'No. 2: 1912 Olym...|       Jeopardy!|       4680|  $200|
|2004-12-31|             Arizona|EVERYBODY TALKS A...|'The city of Yuma...|       Jeopardy!|       4680|  $200|
|2004-12-31|         McDonald\'s|    THE COMPANY LINE|'In 1963, live on...|       Jeopardy!|       4680|  $200|
|2004-12-31|          John Adams| EPITAPHS & TRIBUTES|'Signer of the De...|       Jeopardy!|       4680|  $200|
|2004-12-31|             the ant|      3-LETTER WORDS|'In the title of ...|       Jeopardy!|       4680|

In [6]:
# Count the observations per category, then display the 20 categories with
# the highest counts (show() prints 20 rows by default).
j_categoryCount = jeopardy.groupBy("category").count()
j_categoryCount.orderBy(f.col("count").desc()).show()

+--------------------+-----+
|            category|count|
+--------------------+-----+
|      BEFORE & AFTER|  547|
|             SCIENCE|  519|
|          LITERATURE|  496|
|    AMERICAN HISTORY|  418|
|           POTPOURRI|  401|
|       WORLD HISTORY|  377|
|        WORD ORIGINS|  371|
|COLLEGES & UNIVER...|  351|
|             HISTORY|  349|
|              SPORTS|  342|
|         U.S. CITIES|  339|
|     WORLD GEOGRAPHY|  338|
|     BODIES OF WATER|  327|
|             ANIMALS|  324|
|      STATE CAPITALS|  314|
| BUSINESS & INDUSTRY|  311|
|             ISLANDS|  301|
|      WORLD CAPITALS|  300|
|      U.S. GEOGRAPHY|  299|
|            RELIGION|  297|
+--------------------+-----+
only showing top 20 rows



In [7]:
# Number of categories in the dataset: j_categoryCount holds one row per
# category, so its row count is the category count.
j_categoryCount.count()

27995

In [8]:
# Number of categories in the dataset which have more than 100 observations.
# A row count does not depend on row order, so the frame is filtered directly
# instead of being sorted first.
j_categoryCount = jeopardy.groupBy("category").count()
count100 = j_categoryCount.filter(j_categoryCount["count"] > 100).count()
count100

145

In [9]:
# List of all categories which have a count greater than 100, most frequent
# first. The threshold is applied directly rather than via sort + limit(n),
# so the result does not depend on a count computed in a separate cell.
frequent_categories = (
    j_categoryCount
    .filter(j_categoryCount["count"] > 100)
    .sort(desc("count"))
    .select("category")
)
top_categories = [row["category"] for row in frequent_categories.collect()]

In [10]:
# Build the modified dataset: keep only rows whose category is in
# top_categories (categories with more than 100 observations), then show
# 5 observations of it.
jeo_f = jeopardy.where(col("category").isin(top_categories))
jeo_f.show(5)

+----------+--------------------+--------------+--------------------+---------+-----------+-----+
|  air_date|              answer|      category|            question|    round|show_number|value|
+----------+--------------------+--------------+--------------------+---------+-----------+-----+
|2004-12-31|          Copernicus|       HISTORY|'For the last 8 y...|Jeopardy!|       4680| $200|
|2004-12-31|             the ant|3-LETTER WORDS|'In the title of ...|Jeopardy!|       4680| $200|
|2004-12-31|      the Appian Way|       HISTORY|'Built in 312 B.C...|Jeopardy!|       4680| $400|
|2004-12-31|             the cud|3-LETTER WORDS|'Cows regurgitate...|Jeopardy!|       4680| $400|
|2004-12-31|Ceylon (or Sri La...|       HISTORY|'In 1000 Rajaraja...|Jeopardy!|       4680| $600|
+----------+--------------------+--------------+--------------------+---------+-----------+-----+
only showing top 5 rows



In [11]:
# Number of observations in the modified (filtered) dataset
jeo_f.count()

29252

# Data preprocessing for the full jeopardy dataset

In [12]:
#stripping punctuation, tokenizing, and stop word removing for the whole dataset
# The pattern is written as a raw string: "\!" is not a valid Python string
# escape, so the non-raw literal triggers an invalid-escape warning. The regex
# handed to Spark is character-for-character the same as before.
jeopardy = jeopardy.withColumn("stripped", f.regexp_replace(f.col("question"), r"[\!@#$%^&*)(><,';:]", ""))
jeopardy.cache()
# Split the stripped question text into a "words" array column
tokenizer = Tokenizer(inputCol = "stripped", outputCol = "words")
tokenized = tokenizer.transform(jeopardy)
tokenized.cache()
# Drop stop words from "words", writing the result to "filtered"
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
jeopardyStopRemoved = remover.transform(tokenized)
jeopardyStopRemoved.cache()

DataFrame[air_date: string, answer: string, category: string, question: string, round: string, show_number: string, value: string, stripped: string, words: array<string>, filtered: array<string>]

In [13]:
#jeopardy.select("question").show(1,truncate=False)

In [14]:
#jeopardyStopRemoved.select("filtered").show(1,truncate=False)

In [15]:
# Stem the stop-word-filtered tokens of the whole dataset: stemmer_udf is
# applied to the "filtered" column and the result stored in "stemmed".
jeopardyStemmed = jeopardyStopRemoved.withColumn("stemmed", stemmer_udf("filtered"))
jeopardyStemmed.cache()

DataFrame[air_date: string, answer: string, category: string, question: string, round: string, show_number: string, value: string, stripped: string, words: array<string>, filtered: array<string>, stemmed: array<string>]

In [16]:
# TF-IDF on the stemmed words of the whole dataset.
# Step 1: hash the "stemmed" tokens into a 50000-dimensional term-frequency
# vector column ("rawFeatures").
hashingTF1 = HashingTF(inputCol="stemmed", outputCol="rawFeatures", numFeatures=50000)
featurizedData1 = hashingTF1.transform(jeopardyStemmed)
# Step 2: fit IDF on those vectors and write the rescaled vectors to "features".
idf1 = IDF(inputCol="rawFeatures", outputCol="features")
idfModel1 = idf1.fit(featurizedData1)
rescaledData1 = idfModel1.transform(featurizedData1)
rescaledData1.cache()

DataFrame[air_date: string, answer: string, category: string, question: string, round: string, show_number: string, value: string, stripped: string, words: array<string>, filtered: array<string>, stemmed: array<string>, rawFeatures: vector, features: vector]

# K-means Clustering for the full jeopardy dataset

In [17]:
# 80/20 training/test split of the TF-IDF features for the whole dataset;
# the seed is fixed so the split is repeatable.
training,test = rescaledData1.randomSplit([0.8,0.2], seed = 1) 

In [18]:
# k-means clustering for the whole dataset: fit 50 clusters on the training
# split's "features" column, then assign every test row to a cluster.
kmeans1 = KMeans(k=50, seed=1)
clusters1 = kmeans1.fit(training.select('features'))
transformed1 = clusters1.transform(test)
transformed1.cache()
# Count of test rows per cluster, listed by descending cluster id
transformed1.groupBy('prediction').count().sort(desc("prediction")).show(50)

+----------+-----+
|prediction|count|
+----------+-----+
|        43|  192|
|        33|   62|
|        31|   11|
|        30|  172|
|        29|    2|
|        28|   83|
|        21|    8|
|        19|  113|
|        17|  384|
|        16|    1|
|        15|   18|
|        11|   36|
|        10|    1|
|         8|   19|
|         5|  307|
|         4|  794|
|         3|  273|
|         2|   40|
|         1|  652|
|         0|40272|
+----------+-----+



In [19]:
# Inspect cluster 5: show category and full question text of its test rows
transformed1.filter(transformed1["prediction"] == 5).select("category","question").show(truncate=False)

+-------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|category           |question                                                                                                                                                                                                                                                                                                                |
+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Data preprocessing for the modified jeopardy dataset
### The modified jeopardy dataset contains only the observations whose category has more than 100 observations

In [20]:
#stripping punctuation, tokenizing, and stop word removing for the modified dataset
# The pattern is written as a raw string: "\!" is not a valid Python string
# escape, so the non-raw literal triggers an invalid-escape warning. The regex
# handed to Spark is character-for-character the same as before.
jeo_fpunc = jeo_f.withColumn("stripped", f.regexp_replace(f.col("question"), r"[\!@#$%^&*)(><,';:]", ""))
jeo_fpunc.cache()
# Split the stripped question text into a "words" array column
tokenizer = Tokenizer(inputCol = "stripped", outputCol = "words")
tokenized_f = tokenizer.transform(jeo_fpunc)
tokenized_f.cache()
# Drop stop words from "words", writing the result to "filtered"
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
jeo_fStopRemoved = remover.transform(tokenized_f)
jeo_fStopRemoved.cache()

DataFrame[air_date: string, answer: string, category: string, question: string, round: string, show_number: string, value: string, stripped: string, words: array<string>, filtered: array<string>]

In [21]:
# Stem the stop-word-filtered tokens of the modified dataset: stemmer_udf is
# applied to the "filtered" column and the result stored in "stemmed".
jeo_fStemmed = jeo_fStopRemoved.withColumn("stemmed", stemmer_udf("filtered"))
jeo_fStemmed.cache()

DataFrame[air_date: string, answer: string, category: string, question: string, round: string, show_number: string, value: string, stripped: string, words: array<string>, filtered: array<string>, stemmed: array<string>]

In [22]:
# 80/20 split of the stemmed (not yet featurized) modified dataset.
# training1 is what the cross-validated pipeline at the end of the notebook
# is fitted on.
training1,test1 = jeo_fStemmed.randomSplit([0.8,0.2], seed = 1)

In [23]:
# Performing TF-IDF on the modified dataset.
# Step 1: hash the "stemmed" tokens into a 50000-dimensional term-frequency
# vector column ("rawFeatures").
hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures", numFeatures=50000)
featurizedData = hashingTF.transform(jeo_fStemmed)
# Step 2: fit IDF and write the rescaled vectors to "features".
# NOTE(review): IDF is fitted on the entire modified dataset, and the
# train/test splits are taken from rescaledData afterwards, so rows that end
# up in the test split also contribute to the IDF weights — confirm this is
# acceptable for the evaluation.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.cache()

DataFrame[air_date: string, answer: string, category: string, question: string, round: string, show_number: string, value: string, stripped: string, words: array<string>, filtered: array<string>, stemmed: array<string>, rawFeatures: vector, features: vector]

In [24]:
#scaler = MaxAbsScaler(inputCol="sfeatures", outputCol="features")
#scalerModel = scaler.fit(rescaledData)
#scaledData = scalerModel.transform(rescaledData)

# K-means Clustering for the modified jeopardy dataset

In [25]:
# 80/20 training/test split for the modified dataset
training, test = rescaledData.randomSplit([0.8, 0.2], seed=1)

# Elbow search: for every k in 2..49, fit k-means on a seeded 10% sample of
# the training split and record the cost measured on the full training split.
# cost is indexed by k, so entries 0 and 1 stay at zero and are never plotted.
cost = np.zeros(50)
for k in range(2, 50):
    kmeans = KMeans(k=k, seed=1, featuresCol="features")
    model = kmeans.fit(training.sample(False, 0.1, seed=42))
    cost[k] = model.computeCost(training)

# Cost as a function of k
fig, ax = plt.subplots(1, 1, figsize=(8, 6))
ax.plot(range(2, 50), cost[2:50])
ax.set_xlabel('k')
ax.set_ylabel('cost')

In [26]:
# k-means clustering for the modified dataset: fit 50 clusters on the training
# split's "features" column, then assign every test row to a cluster.
kmeans = KMeans(k=50, seed=1)
clusters = kmeans.fit(training.select('features'))
transformed = clusters.transform(test)
transformed.cache()

DataFrame[air_date: string, answer: string, category: string, question: string, round: string, show_number: string, value: string, stripped: string, words: array<string>, filtered: array<string>, stemmed: array<string>, rawFeatures: vector, features: vector, prediction: int]

In [27]:
# Count of test rows per k-means cluster, listed by descending cluster id
transformed.groupBy('prediction').count().sort(desc("prediction")).show(50)

+----------+-----+
|prediction|count|
+----------+-----+
|        47|   29|
|        46|    2|
|        44|   12|
|        39|    1|
|        36|   11|
|        22|  136|
|        19|    1|
|        18|   45|
|        17|    4|
|        16|   26|
|        13|    1|
|        12|   10|
|        11|   30|
|         8|  161|
|         6|   54|
|         4|    2|
|         3|   68|
|         0| 5285|
+----------+-----+



In [28]:
# Inspect cluster 3: show category and full question text of its test rows
transformed.filter(transformed["prediction"] == 3).select("category","question").show(truncate=False)

+--------------------+------------------------------------------------------------------------------------------------------------+
|category            |question                                                                                                    |
+--------------------+------------------------------------------------------------------------------------------------------------+
|QUOTES              |'"Reading is to the mind as" this is "to the body"'                                                         |
|10-LETTER WORDS     |'Mercy killing'                                                                                             |
|WORD ORIGINS        |'From Old French "manoeuvrer" meaning "to work by hand", which was how this item was put into soil'         |
|BIRDS               |'Falcons kill their prey by doing this'                                                                     |
|PLAYWRIGHTS         |'His last play, "What The Butler Saw", was produced in

# Bisecting K-means for Modified Dataset

In [29]:
# 80/20 training/test split for the modified dataset
training, test = rescaledData.randomSplit([0.8, 0.2], seed=1)

# Elbow search for bisecting k-means. Only every 10th k is fitted
# (2, 12, 22, 32, 42); each model is trained on a seeded 10% sample of the
# training split and its cost is measured on the full training split.
k_values = list(range(2, 52, 10))
cost = np.zeros(55)
for k in k_values:
    bkmeans = BisectingKMeans().setK(k).setSeed(1).setFeaturesCol("features")
    model = bkmeans.fit(training.sample(False, 0.1, seed=42))
    cost[k] = model.computeCost(training)

# Plot only the k values that were actually evaluated. Plotting cost[2:50]
# would also draw the untouched zero entries for every k that was skipped,
# making the curve drop to zero between the real measurements.
fig, ax = plt.subplots(1, 1, figsize=(8, 6))
ax.plot(k_values, cost[k_values], marker='o')
ax.set_xlabel('k')
ax.set_ylabel('cost')

In [32]:
# Fit bisecting k-means with 50 clusters (seeded) on the training split
bkm = BisectingKMeans().setK(50).setSeed(1)
model = bkm.fit(training)

In [33]:
# Assign every test row to one of the bisecting k-means clusters
bkmpredict = model.transform(test)

In [34]:
# Count of test rows per bisecting k-means cluster, by descending cluster id
bkmpredict.groupBy('prediction').count().sort(desc("prediction")).show(50)

+----------+-----+
|prediction|count|
+----------+-----+
|        49|    1|
|        48|    5|
|        47|    4|
|        46|   12|
|        45|    8|
|        44|    9|
|        43|   14|
|        42|   14|
|        41|   57|
|        40|   30|
|        39|   30|
|        38|   32|
|        37|  109|
|        36|   34|
|        35|  116|
|        34|   17|
|        33|   45|
|        32|  159|
|        31|  538|
|        30|  139|
|        29|  131|
|        28|   81|
|        27|   84|
|        26|  136|
|        25|  150|
|        24|   20|
|        23|   39|
|        22|  144|
|        21|   27|
|        20|  497|
|        19|  237|
|        18|  124|
|        17|  156|
|        16|  138|
|        15|  240|
|        14|  300|
|        13|  797|
|        12|   49|
|        11|  147|
|        10|   26|
|         9|   38|
|         8|   48|
|         7|   44|
|         6|   76|
|         5|  259|
|         4|   69|
|         3|  108|
|         2|  188|
|         1|   35|
|         0|

In [35]:
# Inspect cluster 1: show category and full question text of its test rows
bkmpredict.filter(bkmpredict["prediction"] == 1).select("category","question").show(truncate=False)

+---------------+--------------------------------------------------------------------------------------------------+
|category       |question                                                                                          |
+---------------+--------------------------------------------------------------------------------------------------+
|LAKES & RIVERS |'Scottish word for lake'                                                                          |
|LAKES & RIVERS |'The end of a river that empties into a lake or ocean'                                            |
|LAKES & RIVERS |'This lake in the Banff National Park is the most visited place in the Canadian Rockies'          |
|BODIES OF WATER|'Once part of the Gulf of California, it's now the largest natural lake entirely within the state'|
|U.S. GEOGRAPHY |'This lake was created when Hoover Dam was built on the Colorado River'                           |
|ISLANDS        |'Over 500 bison live in the Bison Refuge on Ant

# Naive Bayes Classification for Modified Dataset

In [36]:
# Make a numerical label for each category: StringIndexer is fitted on the
# "category" column of the TF-IDF dataset and writes the index to "label".
indexer = StringIndexer(inputCol="category", outputCol="label")
indexed = indexer.fit(rescaledData).transform(rescaledData)
# Show the label/category pairs for the first rows
indexed.select("label","category").show(truncate=False)

+-----+-----------------+
|label|category         |
+-----+-----------------+
|8.0  |HISTORY          |
|42.0 |3-LETTER WORDS   |
|8.0  |HISTORY          |
|42.0 |3-LETTER WORDS   |
|8.0  |HISTORY          |
|42.0 |3-LETTER WORDS   |
|8.0  |HISTORY          |
|42.0 |3-LETTER WORDS   |
|8.0  |HISTORY          |
|42.0 |3-LETTER WORDS   |
|81.0 |IN THE DICTIONARY|
|81.0 |IN THE DICTIONARY|
|81.0 |IN THE DICTIONARY|
|81.0 |IN THE DICTIONARY|
|81.0 |IN THE DICTIONARY|
|77.0 |TRAVEL & TOURISM |
|77.0 |TRAVEL & TOURISM |
|77.0 |TRAVEL & TOURISM |
|77.0 |TRAVEL & TOURISM |
|77.0 |TRAVEL & TOURISM |
+-----+-----------------+
only showing top 20 rows



In [37]:
# 80/20 training/test split of the labelled dataset for classification.
# This rebinds training/test, which earlier cells used for clustering.
training,test = indexed.randomSplit([0.8,0.2], seed = 1) 

In [38]:
# Train a multinomial Naive Bayes classifier (smoothing 1.0) on the "features"
# and "label" columns of the training split, then predict on the test split.
nb = NaiveBayes(smoothing=1.0, modelType="multinomial", featuresCol = "features", labelCol = "label")
model = nb.fit(training)
predictions = model.transform(test)
predictions.cache()

DataFrame[air_date: string, answer: string, category: string, question: string, round: string, show_number: string, value: string, stripped: string, words: array<string>, filtered: array<string>, stemmed: array<string>, rawFeatures: vector, features: vector, label: double, rawPrediction: vector, probability: vector, prediction: double]

In [39]:
# Evaluate the Naive Bayes predictions: accuracy of "prediction" against
# "label" on the test split.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.2689690370874447


# Pipeline for Naive Bayes Classification for Modified Dataset

# Pipeline stages: hashed term frequencies -> IDF -> numeric label -> Naive Bayes.
# All stages consume raw columns ("stemmed", "category"), so the pipeline must
# be fed data that has NOT already been featurized.
hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
indexer = StringIndexer(inputCol="category", outputCol="label")
nb = NaiveBayes(modelType="multinomial", featuresCol = idf.getOutputCol(), labelCol = "label")
pipeline = Pipeline(stages=[hashingTF,idf,indexer,nb])

# Hyper-parameter grid: three hashing dimensions, one smoothing value
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [1000,10000,50000]) \
    .addGrid(nb.smoothing, [1.0]) \
    .build()

# 5-fold cross-validation over the grid. The seed fixes the fold assignment so
# the selected model is repeatable, matching the seeded splits used elsewhere.
# NOTE(review): the evaluator here uses its default metric, while accuracy is
# what gets reported below — confirm that selecting on the default is intended.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(),
                          numFolds=5,
                          seed=1)
cvModel = crossval.fit(training1)

# Score the held-out half of the SAME split the pipeline was trained on.
# `test` comes from the already-featurized, already-labelled dataset: it holds
# "rawFeatures", "features" and "label" columns, which collide with the
# pipeline's own output columns. test1 carries only the raw columns.
prediction = cvModel.transform(test1)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(prediction)
print("Test set accuracy = " + str(accuracy))