In [None]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import *       # for datatype conversion
from pyspark.sql.functions import *   # for col() function
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Tokenizer
import re
import pyspark.sql.functions as f
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from numpy import array
from math import sqrt
from pyspark.ml.clustering import KMeans
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import NGram
from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import numpy as np
import matplotlib.pyplot as plt
from pyspark.ml.clustering import BisectingKMeans

# Reuse the kernel's SparkContext when one is already running; otherwise start one.
sc = SparkContext.getOrCreate()
# SQLContext wrapper bound to that same context.
sqlCtx = SQLContext(sc)

# SparkSession used for every DataFrame operation in this notebook.
# NOTE(review): a SparkContext already exists when this builder runs, so some of
# the settings below may not take effect on the reused context -- confirm.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Jeopardy Calculation")
    .config("spark.executor.memory", '2g')
    .config('spark.executor.cores', '1')
    .config('spark.cores.max', '1')
    .config("spark.driver.memory", '1g')
    .getOrCreate()
)

# Preprocessing


In [None]:
# Porter stemming for the question tokens that remain after stop-word removal.
from nltk.stem.porter import *
stemmer = PorterStemmer()


def stem(in_vec):
    """Return a new list holding the Porter stem of every token in ``in_vec``.

    Args:
        in_vec: iterable of token strings.

    Returns:
        list of stemmed tokens, in the same order as the input.
    """
    return [stemmer.stem(token) for token in in_vec]


from pyspark.sql.types import *
# Spark UDF wrapper: array<string> in, array<string> out.
stemmer_udf = udf(stem, ArrayType(StringType()))

# Read the Jeopardy question data set (JSON) into a DataFrame.
jeopardy = spark.read.json("JEOPARDY_QUESTIONS1-Copy1.json")
# Number of questions per category.
j_categoryCount = jeopardy.groupBy("category").count()
# Categories that have more than 100 observations, most frequent first.
# One filtered + sorted query replaces the previous "count the matches, then
# limit(count)" pair, which ran the same aggregation twice for the same result.
# NOTE(review): the sort does not fix the order of categories with equal counts,
# yet `humanclust` is matched to this list by position -- confirm the order is stable.
top_categories = [
    row["category"]
    for row in (
        j_categoryCount
        .filter(j_categoryCount["count"] > 100)
        .sort(desc("count"))
        .select("category")
        .collect()
    )
]
# Number of categories with more than 100 observations (expected to be 145).
count100 = len(top_categories)
# New dataset that only contains categories with more than 100 observations.
jeo_f = jeopardy.where(col("category").isin(top_categories))
# Strip punctuation from the question text. The pattern is a raw string so the
# backslash reaches the regex engine without Python's invalid-escape warning;
# the resulting pattern text is unchanged.
jeo_fpunc = jeo_f.withColumn(
    "stripped", f.regexp_replace(f.col("question"), r"[\!@#$%^&*)(><,';:]", "")
)
# Tokenize, remove stop words, then stem.
tokenizer = Tokenizer(inputCol="stripped", outputCol="words")
tokenized_f = tokenizer.transform(jeo_fpunc)
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
jeo_fStopRemoved = remover.transform(tokenized_f)
jeo_fStemmed = jeo_fStopRemoved.withColumn("stemmed", stemmer_udf("filtered"))
# Only the final frame is reused by later cells, so it is the only one cached;
# caching each single-use intermediate just duplicated the data in memory.
jeo_fStemmed.cache()
# Show what the new dataset looks like.
jeo_fStemmed.show(truncate=False)

# Naive Bayes on 145 Categories

In [None]:
# Training and test split.
training, test = jeo_fStemmed.randomSplit([0.8, 0.2], seed=1)
# Pipeline: hashing TF -> IDF -> label indexing -> multinomial naive Bayes.
# Several smoothing values are compared through 5-fold cross validation.
hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures", numFeatures=50000)
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
indexer = StringIndexer(inputCol="category", outputCol="label")
nb = NaiveBayes(modelType="multinomial", featuresCol="features", labelCol="label")

pipelineNB = Pipeline(stages=[hashingTF, idf, indexer, nb])
paramGrid = ParamGridBuilder() \
    .addGrid(nb.smoothing, [0.1, 1.0, 5.0, 10.0, 50.0]) \
    .build()

# The evaluator's defaults (label/prediction columns, f1 metric) are spelled out
# so it is clear which metric drives model selection.
crossval = CrossValidator(estimator=pipelineNB,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(labelCol="label",
                                                                      predictionCol="prediction",
                                                                      metricName="f1"),
                          numFolds=5)
cvModel = crossval.fit(training)

prediction = cvModel.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
# Output accuracy metric for our model.
accuracy = evaluator.evaluate(prediction)
print("Test set accuracy = " + str(accuracy))
# Output f1 score metric for our model.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="f1")
f1 = evaluator.evaluate(prediction)
print("Test set f1 = " + str(f1))
# Cross-validation average f1 for each smoothing value, in grid order. It is
# printed because a bare expression in the middle of a cell is never displayed.
print("Cross-validation average f1 per smoothing value = " + str(cvModel.avgMetrics))
# Shows counts for all predictions on the test set.
prediction.groupBy('prediction').count().sort(desc("prediction")).show(145)
# Shows category and question for the prediction category '1'.
prediction.filter(prediction['prediction'] == 1).select("category", "question").sort("category") \
    .show(424, truncate=False)
# Shows each category with its label. distinct() runs before the sort so the
# ordering survives, and all 145 pairs are shown instead of the default 20 rows.
prediction.select("label", "category").distinct().sort(asc("label")).show(145, truncate=False)

# Creating Computer Defined Clusters

In [None]:
# TF-IDF features over the stemmed tokens of every question.
hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures", numFeatures=50000)
featurizedData = hashingTF.transform(jeo_fStemmed)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.cache()
# The clustering is fitted on the full data set, so no train/test split is made
# here (the previous split was never used; later cells build their own).
# 10 computer defined clusters.
bkm10 = BisectingKMeans().setK(10).setSeed(1)
modelf = bkm10.fit(rescaledData)
bkmclustf = modelf.transform(rescaledData)
# Number of questions assigned to each cluster.
bkmclustf.groupBy('prediction').count().sort(desc("prediction")).show(10)
# Persist the cluster assignments. "overwrite" lets the cell be re-run; the
# default save mode raises an error once the output path already exists.
bkmclustf.select("prediction").write.mode("overwrite").csv("bkmf2.csv")

In [None]:
# Attach the computer-defined cluster to every question as the "computer" column.
# The cluster is taken straight from `bkmclustf` (the clustered frame built in
# the clustering cell) rather than re-reading the CSV and joining on
# monotonically_increasing_id(): those generated ids depend on partitioning, so
# they do not guarantee that row i of the CSV matches row i of the questions.
# This also removes the dependency on `jeo_g`, which is only defined in a later
# cell and made this cell fail on a fresh top-to-bottom run.
# The value is cast to string to match what reading the CSV produced, and the
# TF-IDF columns are dropped because the pipelines below recreate them under the
# same names.
jeo_i = (
    bkmclustf
    .withColumn("computer", col("prediction").cast("string"))
    .drop("prediction", "rawFeatures", "features")
)

# Naive Bayes on Computer Defined Clusters

In [None]:
# Training and test split.
training, test = jeo_i.randomSplit([0.8, 0.2], seed=1)
# Pipeline: hashing TF -> IDF -> label indexing -> multinomial naive Bayes.
# Several smoothing values are compared through 5-fold cross validation.
hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures", numFeatures=50000)
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
indexer = StringIndexer(inputCol="computer", outputCol="label")
nb = NaiveBayes(modelType="multinomial", featuresCol="features", labelCol="label")

pipelineNB = Pipeline(stages=[hashingTF, idf, indexer, nb])
# Grid values are written as floats (same values as before).
# NOTE(review): smoothing appears to be a floating-point param, and plain ints
# may not be converted on every Spark version -- confirm.
paramGrid = ParamGridBuilder() \
    .addGrid(nb.smoothing, [5000.0, 20000.0, 30000.0, 40000.0, 50000.0]) \
    .build()

# The evaluator's defaults (label/prediction columns, f1 metric) are spelled out
# so it is clear which metric drives model selection.
crossval = CrossValidator(estimator=pipelineNB,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(labelCol="label",
                                                                      predictionCol="prediction",
                                                                      metricName="f1"),
                          numFolds=5)
cvModel = crossval.fit(training)


prediction = cvModel.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
# Output accuracy metric for our model.
accuracy = evaluator.evaluate(prediction)
print("Test set accuracy = " + str(accuracy))
# Cross-validation average f1 for each smoothing value, in grid order. It is
# printed because a bare expression in the middle of a cell is never displayed.
print("Cross-validation average f1 per smoothing value = " + str(cvModel.avgMetrics))
# Shows counts for all predictions on the test set.
prediction.groupBy('prediction').count().sort(desc("prediction")).show(145)
# Shows category and question for the prediction cluster '0'.
prediction.filter(prediction['prediction'] == 0).select("category", "question").sort("category") \
    .show(424, truncate=False)



# Logistic Regression on Computer Defined Clusters

In [None]:
# Training and test split.
training, test = jeo_i.randomSplit([0.8, 0.2], seed=1)
# Pipeline: hashing TF -> IDF -> label indexing -> multinomial logistic regression.
# Several regularization values are compared through 5-fold cross validation.
hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures", numFeatures=50000)
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
indexer = StringIndexer(inputCol="computer", outputCol="label")
lr = LogisticRegression(featuresCol='features', labelCol='label', maxIter=10, elasticNetParam=0.8,
                        family="multinomial")

pipelineLR = Pipeline(stages=[hashingTF, idf, indexer, lr])
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.001, 0.0025, 0.005, 0.0075, .01]) \
    .build()

# The evaluator's defaults (label/prediction columns, f1 metric) are spelled out
# so it is clear which metric drives model selection.
crossval = CrossValidator(estimator=pipelineLR,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(labelCol="label",
                                                                      predictionCol="prediction",
                                                                      metricName="f1"),
                          numFolds=5)
cvModel = crossval.fit(training)
prediction = cvModel.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
# Output accuracy metric for our model.
accuracy = evaluator.evaluate(prediction)
print("Test set accuracy = " + str(accuracy))
# Cross-validation average f1 for each regularization value, in grid order. It
# is printed because a bare expression in the middle of a cell is never displayed.
print("Cross-validation average f1 per regParam value = " + str(cvModel.avgMetrics))
# Shows counts for all predictions on the test set.
prediction.groupBy('prediction').count().sort(desc("prediction")).show(145)

# Creating Human Defined Labels

In [None]:
# Human-defined cluster (0-9) for each category.
# The categories were grouped manually in an Excel file using a key; entry i is
# the cluster of top_categories[i], so the two lists are matched by position.
humanclust = [9, 0, 5, 7, 9, 7, 9, 2, 7, 8, 2, 2, 3, 1, 2, 8, 3, 2, 2, 5, 6, 5, 5, 8, 8, 5, 9, 4, 0, 6,
              9, 5, 6, 5, 7, 1, 8, 0, 7, 9, 2, 8, 9, 8, 5, 6, 4, 6, 5, 9, 9, 5, 5, 8, 2, 6, 2, 7, 6, 4,
              4, 0, 1, 9, 7, 0, 2, 1, 9, 7, 0, 4, 4, 4, 0, 0, 4, 2, 1, 0, 8, 9, 4, 8, 0, 1, 9, 9, 8, 8,
              9, 0, 7, 0, 7, 4, 2, 9, 4, 4, 0, 7, 9, 0, 4, 4, 3, 2, 1, 9, 8, 4, 4, 9, 5, 9, 1, 0, 2, 4,
              1, 9, 4, 4, 9, 6, 4, 4, 5, 4, 8, 9, 6, 5, 5, 0, 2, 0, 0, 4, 8, 8, 4, 7, 2]
# The mapping is positional, so a length mismatch would label categories wrongly
# or fail inside the UDF; check it up front where the message is readable.
assert len(humanclust) == len(top_categories), (
    "humanclust has %d entries but top_categories has %d"
    % (len(humanclust), len(top_categories))
)
# category -> human cluster lookup, built once instead of scanning the category
# list for every row.
human_by_category = dict(zip(top_categories, humanclust))
# The UDF declares an integer result, so no cast from string is needed afterwards.
human_udf = udf(lambda x: human_by_category[x], IntegerType())
# The column is referenced by name on the frame being extended rather than
# through the ancestor DataFrame jeo_f.
jeo_g = jeo_fStemmed.withColumn("human", human_udf(col("category")))

# Naive Bayes on Human Defined Labels

In [None]:
# Training and test split.
training, test = jeo_g.randomSplit([0.8, 0.2], seed=1)
# Pipeline: hashing TF -> IDF -> label indexing -> multinomial naive Bayes.
# Several smoothing values are compared through 5-fold cross validation.
hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures", numFeatures=50000)
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
indexer = StringIndexer(inputCol="human", outputCol="label")
nb = NaiveBayes(modelType="multinomial", featuresCol="features", labelCol="label")

pipelineNB = Pipeline(stages=[hashingTF, idf, indexer, nb])
# All grid values are floats (100 was previously a lone int; same value).
paramGrid = ParamGridBuilder() \
    .addGrid(nb.smoothing, [0.1, 1.0, 5.0, 10.0, 50.0, 100.0]) \
    .build()

# The evaluator's defaults (label/prediction columns, f1 metric) are spelled out
# so it is clear which metric drives model selection.
crossval = CrossValidator(estimator=pipelineNB,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(labelCol="label",
                                                                      predictionCol="prediction",
                                                                      metricName="f1"),
                          numFolds=5)
cvModel = crossval.fit(training)


prediction = cvModel.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
# Output accuracy metric for our model.
accuracy = evaluator.evaluate(prediction)
print("Test set accuracy = " + str(accuracy))
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="f1")
# Output f1 score for our model.
f1 = evaluator.evaluate(prediction)
print("Test set f1 = " + str(f1))
# Cross-validation average f1 for each smoothing value, in grid order. It is
# printed because a bare expression in the middle of a cell is never displayed.
print("Cross-validation average f1 per smoothing value = " + str(cvModel.avgMetrics))
# Shows counts for all predictions on the test set.
prediction.groupBy('prediction').count().sort(desc("prediction")).show(145)
# Shows category and question for the prediction category '0'.
prediction.filter(prediction['prediction'] == 0).select("category", "question").sort("category") \
    .show(424, truncate=False)
# Shows the label for each human defined cluster. distinct() runs before the
# sort so the ordering survives.
prediction.select("label", "human").distinct().sort(asc("human")).show()

# Repeat the naive Bayes cross validation on two more random splits to see how
# much the test accuracy depends on the split.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
for split_seed in [123, 300]:
    training, test = jeo_g.randomSplit([0.8, 0.2], seed=split_seed)
    cvModel = crossval.fit(training)
    prediction = cvModel.transform(test)
    accuracy = evaluator.evaluate(prediction)
    print("Test set accuracy = " + str(accuracy))
    print("Cross-validation average f1 per smoothing value = " + str(cvModel.avgMetrics))
# End of the repeated naive Bayes cross validations.

# Logistic Regression on Human Defined Clusters

In [None]:
# Training and test split.
training, test = jeo_g.randomSplit([0.8, 0.2], seed=1)
# Pipeline: hashing TF -> IDF -> label indexing -> multinomial logistic regression.
# Several regularization values are compared through 5-fold cross validation.
hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures", numFeatures=50000)
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
indexer = StringIndexer(inputCol="human", outputCol="label")
lr = LogisticRegression(featuresCol='features', labelCol='label', maxIter=10, elasticNetParam=0.8,
                        family="multinomial")

pipelineLR = Pipeline(stages=[hashingTF, idf, indexer, lr])
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.001, 0.0025, 0.005, 0.0075, .01]) \
    .build()

# The evaluator's defaults (label/prediction columns, f1 metric) are spelled out
# so it is clear which metric drives model selection.
crossval = CrossValidator(estimator=pipelineLR,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(labelCol="label",
                                                                      predictionCol="prediction",
                                                                      metricName="f1"),
                          numFolds=5)
cvModel = crossval.fit(training)
prediction = cvModel.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
# Output accuracy metric for our model.
accuracy = evaluator.evaluate(prediction)
print("Test set accuracy = " + str(accuracy))
# Cross-validation average f1 for each regularization value, in grid order. It
# is printed because a bare expression in the middle of a cell is never displayed.
print("Cross-validation average f1 per regParam value = " + str(cvModel.avgMetrics))
# Shows counts for all predictions on the test set.
prediction.groupBy('prediction').count().sort(desc("prediction")).show(145)

# Repeat the logistic regression cross validation on two more random splits to
# see how much the test accuracy depends on the split. After the loop,
# `prediction` holds the test predictions of the last split (seed 300).
for split_seed in [123, 300]:
    training, test = jeo_g.randomSplit([0.8, 0.2], seed=split_seed)
    cvModel = crossval.fit(training)
    prediction = cvModel.transform(test)
    accuracy = evaluator.evaluate(prediction)
    print("Test set accuracy = " + str(accuracy))
    print("Cross-validation average f1 per regParam value = " + str(cvModel.avgMetrics))
# End of the repeated logistic regression cross validations.

# Code for Creating WordClouds

In [None]:
import os
import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from matplotlib import pyplot as mp

# This word cloud code was replicated for many of the models and question sets
# we created; only one example is shown because the code is repetitive.

# Remember to run `pip install wordcloud` in a terminal first.

# Create the word cloud from the questions predicted as class 9.
text = prediction.filter(prediction["prediction"] == 9).select("question").toPandas()
# Join the actual question strings. str(DataFrame) is only the printed preview
# (column header, row index, shortened cells, "..." for skipped rows), so the
# cloud would be built from that preview instead of the full questions.
questions = " ".join(text["question"].dropna().astype(str))
wordcloud = WordCloud(max_words=100, width=800, height=400, background_color="white").generate(questions)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
# Save the word cloud to a file, creating the target folder if it is missing so
# the save does not fail on a fresh checkout.
output_file = "LR_Human_images/watergeographyLRH.png"
os.makedirs(path.dirname(output_file), exist_ok=True)
wordcloud.to_file(output_file)