In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import *       # for datatype conversion
from pyspark.sql.functions import *   # for col() function
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Tokenizer
import re
import pyspark.sql.functions as f
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from numpy import array
from math import sqrt
from pyspark.ml.clustering import KMeans
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import NGram
from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import numpy as np
import matplotlib.pyplot as plt
from pyspark.ml.clustering import BisectingKMeans

sc = SparkContext.getOrCreate()
sqlCtx = SQLContext(sc)

spark = SparkSession \
    .builder \
    .master("local") \
    .appName("Jeopardy Calculation") \
    .config("spark.executor.memory", '2g') \
    .config('spark.executor.cores', '1') \
    .config('spark.cores.max', '1') \
    .config("spark.driver.memory",'1g') \
    .getOrCreate()

from nltk.stem.porter import *
stemmer = PorterStemmer()
def stem(in_vec):
    out_vec = []
    for t in in_vec:
        t_stem = stemmer.stem(t)
        out_vec.append(t_stem)       
    return out_vec
from pyspark.sql.types import *
stemmer_udf = udf(lambda x: stem(x), ArrayType(StringType()))

In [2]:
#reading in the jeopardy data set
jeopardy = spark.read.json("JEOPARDY_QUESTIONS1-Copy1.json")
j_categoryCount = jeopardy.groupBy("category").count()
#This is the number of categories in the dataset which have greater than 100 observations
count100 = j_categoryCount.sort(desc("count")).filter(j_categoryCount["count"] > 100).count()
#This is a list of all categories which have a count greater than 100
top_categories = list(j_categoryCount.sort(desc("count")).select("category").limit(count100).toPandas().category)
#new dataset that only contains categories that have greater than 100 osbervations
jeo_f = jeopardy.where(col("category").isin(top_categories))

In [3]:
#stripping punctuation, tokenizing, stop word removing, and stemming for the modified dataset
jeo_fpunc = jeo_f.withColumn("stripped", f.regexp_replace(f.col("question"), "[\!@#$%^&*)(><,';:]", ""))
jeo_fpunc.cache()
tokenizer = Tokenizer(inputCol = "stripped", outputCol = "words")
tokenized_f = tokenizer.transform(jeo_fpunc)
tokenized_f.cache()
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
jeo_fStopRemoved = remover.transform(tokenized_f)
jeo_fStopRemoved.cache()
jeo_fStemmed = jeo_fStopRemoved.withColumn("stemmed", stemmer_udf("filtered"))
jeo_fStemmed.cache()


DataFrame[air_date: string, answer: string, category: string, question: string, round: string, show_number: string, value: string, stripped: string, words: array<string>, filtered: array<string>, stemmed: array<string>]

# Naive Bayes on all 145 Categories

In [4]:
training,test = jeo_fStemmed.randomSplit([0.8,0.2], seed = 1) 

In [5]:
hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures", numFeatures = 50000)
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
indexer = StringIndexer(inputCol="category", outputCol="label")
nb = NaiveBayes(modelType="multinomial", featuresCol = "features", labelCol = "label")

pipelineNB = Pipeline(stages=[hashingTF,idf,indexer,nb])
paramGrid = ParamGridBuilder() \
    .addGrid(nb.smoothing, [0.1,1.0,5.0,10.0,50.0]) \
    .build()

crossval = CrossValidator(estimator=pipelineNB,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(),
                          numFolds=5)
cvModel = crossval.fit(training)

In [6]:
prediction = cvModel.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(prediction)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.29431779516842466


In [7]:
cvModel.avgMetrics

[0.23848231181459423,
 0.2517210629295972,
 0.2598820587524988,
 0.26410048731126273,
 0.2552501343817924]

In [8]:
prediction.groupBy('prediction').count().sort(desc("prediction")).show(145)

+----------+-----+
|prediction|count|
+----------+-----+
|     143.0|    3|
|     142.0|   14|
|     140.0|    9|
|     139.0|   12|
|     138.0|   19|
|     137.0|    4|
|     136.0|    7|
|     135.0|    1|
|     134.0|    9|
|     133.0|    5|
|     132.0|    5|
|     130.0|   13|
|     129.0|   11|
|     128.0|    9|
|     127.0|   16|
|     125.0|    1|
|     123.0|   11|
|     122.0|   13|
|     121.0|    4|
|     120.0|   12|
|     119.0|    3|
|     117.0|    4|
|     115.0|    4|
|     113.0|   14|
|     112.0|    3|
|     111.0|    6|
|     110.0|    9|
|     109.0|    3|
|     108.0|   18|
|     107.0|    2|
|     106.0|    1|
|     105.0|   11|
|     104.0|    4|
|     103.0|    8|
|     101.0|    2|
|     100.0|    3|
|      99.0|    1|
|      98.0|   16|
|      97.0|    4|
|      96.0|    1|
|      95.0|   18|
|      94.0|    2|
|      93.0|    6|
|      92.0|    5|
|      91.0|    2|
|      90.0|    9|
|      89.0|    7|
|      88.0|   21|
|      87.0|   13|
|      86.0|

In [9]:
prediction.filter(prediction['prediction'] == 1).select("category","question").sort("category") \
    .show(424,truncate=False)

+-----------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|category               |question                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [10]:
prediction.select("label","category").sort(asc("label")).distinct().show()

+-----+--------------------+
|label|            category|
+-----+--------------------+
|  0.0|      BEFORE & AFTER|
|  1.0|             SCIENCE|
|  2.0|          LITERATURE|
|  3.0|    AMERICAN HISTORY|
|  4.0|           POTPOURRI|
|  5.0|       WORLD HISTORY|
|  6.0|        WORD ORIGINS|
|  7.0|             HISTORY|
|  8.0|COLLEGES & UNIVER...|
|  9.0|              SPORTS|
| 10.0|     WORLD GEOGRAPHY|
| 11.0|         U.S. CITIES|
| 12.0|             ANIMALS|
| 13.0| BUSINESS & INDUSTRY|
| 14.0|      STATE CAPITALS|
| 15.0|     BODIES OF WATER|
| 16.0|               OPERA|
| 17.0|      WORLD CAPITALS|
| 18.0|            RELIGION|
| 19.0|      TRANSPORTATION|
+-----+--------------------+
only showing top 20 rows



# Naive Bayes on Human Defined Clusters


In [11]:
humanclust = [9, 0, 5, 7, 9, 7, 9, 2, 7, 8, 2, 2, 3, 1, 2, 8, 3, 2, 2, 5, 6, 5, 5, 8, 8, 5, 9, 4, 0, 6,
              9, 5, 6, 5, 7, 1, 8, 0, 7, 9, 2, 8, 9, 8, 5, 6, 4, 6, 5, 9, 9, 5, 5, 8, 2, 6, 2, 7, 6, 4,
              4, 0, 1, 9, 7, 0, 2, 1, 9, 7, 0, 4, 4, 4, 0, 0, 4, 2, 1, 0, 8, 9, 4, 8, 0, 1, 9, 9, 8, 8,
              9, 0, 7, 0, 7, 4, 2, 9, 4, 4, 0, 7, 9, 0, 4, 4, 3, 2, 1, 9, 8, 4, 4, 9, 5, 9, 1, 0, 2, 4,
              1, 9, 4, 4, 9, 6, 4, 4, 5, 4, 8, 9, 6, 5, 5, 0, 2, 0, 0, 4, 8, 8, 4, 7, 2]
human_udf = udf(lambda x: int(humanclust[top_categories.index(x)]))
jeo_g = jeo_fStemmed.withColumn("human", human_udf(jeo_f.category).cast("integer"))


In [12]:
training,test = jeo_g.randomSplit([0.8,0.2], seed = 1)

In [13]:
hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures", numFeatures = 50000)
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
indexer = StringIndexer(inputCol="human", outputCol="label")
nb = NaiveBayes(modelType="multinomial", featuresCol = "features", labelCol = "label")

pipelineNB = Pipeline(stages=[hashingTF,idf,indexer,nb])
paramGrid = ParamGridBuilder() \
    .addGrid(nb.smoothing, [0.1,1.0,5.0,10.0,50.0, 100]) \
    .build()

crossval = CrossValidator(estimator=pipelineNB,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(),
                          numFolds=5)
cvModel = crossval.fit(training)


In [14]:
prediction = cvModel.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(prediction)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.4703980945899966


In [15]:
cvModel.avgMetrics

[0.3889696910842948,
 0.4353400667858076,
 0.4624241775570728,
 0.46818954344286046,
 0.46649395219294015,
 0.4632680381251408]

In [16]:
prediction.groupBy('prediction').count().sort(desc("prediction")).show(145)


+----------+-----+
|prediction|count|
+----------+-----+
|       9.0|   31|
|       8.0|  233|
|       7.0|  172|
|       6.0|  576|
|       5.0|  661|
|       4.0|  662|
|       3.0|  763|
|       2.0|  967|
|       1.0|  718|
|       0.0| 1095|
+----------+-----+



In [17]:
prediction.filter(prediction['prediction'] == 0).select("category","question").sort("category") \
    .show(424,truncate=False)

+-----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|category               |question                                                                                                                                                                                                                                                                                                                                                                                           |
+-----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------

Key:  
0: Science   
1: Life (food)   
2: Human Geography  
3: Water Geo  
4: People  
5: Literature  
6: Music /Art  
7: History  
8: Entertainment  
9: Wordplay  

In [22]:
prediction.select("label","human").sort(asc("human")).distinct().show()

+-----+-----+
|label|human|
+-----+-----+
|  4.0|    0|
|  8.0|    1|
|  2.0|    2|
|  9.0|    3|
|  1.0|    4|
|  3.0|    5|
|  7.0|    6|
|  6.0|    7|
|  5.0|    8|
|  0.0|    9|
+-----+-----+

