In [1]:
# Start with loading all necessary libraries
import numpy as np
import pandas as pd
from os import path
from PIL import Image
#from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import *       # for datatype conversion
from pyspark.sql.functions import *   # for col() function
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Tokenizer
import re
import pyspark.sql.functions as f
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from numpy import array
from math import sqrt
from pyspark.ml.clustering import KMeans
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import NGram
from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import numpy as np
import matplotlib.pyplot as plt
from pyspark.ml.clustering import BisectingKMeans

sc = SparkContext.getOrCreate()
sqlCtx = SQLContext(sc)

spark = SparkSession \
    .builder \
    .master("local") \
    .appName("Jeopardy Calculation") \
    .config("spark.executor.memory", '2g') \
    .config('spark.executor.cores', '1') \
    .config('spark.cores.max', '1') \
    .config("spark.driver.memory",'1g') \
    .getOrCreate()


from nltk.stem.porter import *
stemmer = PorterStemmer()
def stem(in_vec):
    out_vec = []
    for t in in_vec:
        t_stem = stemmer.stem(t)
        out_vec.append(t_stem)       
    return out_vec
from pyspark.sql.types import *
stemmer_udf = udf(lambda x: stem(x), ArrayType(StringType()))

#reading in the jeopardy data set
jeopardy = spark.read.json("data/JEOPARDY_QUESTIONS1.json")

#This is the number of categories in the dataset which have greater than 100 observations
j_categoryCount = jeopardy.groupBy("category").count()
count100 = j_categoryCount.sort(desc("count")).filter(j_categoryCount["count"] > 100).count()
count100

#This is a list of all categories which have a count greater than 100
top_categories = list(j_categoryCount.sort(desc("count")).select("category").limit(50).toPandas().category)

jeo_f = jeopardy.where(col("category").isin(top_categories))

In [2]:
jeo_f.count()

15012

In [3]:
#stripping punctuation, tokenizing, and stop word removing for the modified dataset
jeo_fpunc = jeo_f.withColumn("stripped", f.regexp_replace(f.col("question"), "[\!@#$%^&*)(><,';:]", ""))
jeo_fpunc.cache()
tokenizer = Tokenizer(inputCol = "stripped", outputCol = "words")
tokenized_f = tokenizer.transform(jeo_fpunc)
tokenized_f.cache()
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
jeo_fStopRemoved = remover.transform(tokenized_f)
jeo_fStopRemoved.cache()

#stemming words in modified dataset
jeo_fStemmed = jeo_fStopRemoved.withColumn("stemmed", stemmer_udf("filtered"))
jeo_fStemmed.cache()

DataFrame[air_date: string, answer: string, category: string, question: string, round: string, show_number: string, value: string, stripped: string, words: array<string>, filtered: array<string>, stemmed: array<string>]

In [4]:
#peforming tf-idf on modified dataset
hashingTF = HashingTF(inputCol="stemmed", outputCol="rawFeatures", numFeatures=50000)
featurizedData = hashingTF.transform(jeo_fStemmed)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.cache()

DataFrame[air_date: string, answer: string, category: string, question: string, round: string, show_number: string, value: string, stripped: string, words: array<string>, filtered: array<string>, stemmed: array<string>, rawFeatures: vector, features: vector]

In [5]:
#making numerical labels for each category
indexer = StringIndexer(inputCol="category", outputCol="label")
indexed = indexer.fit(rescaledData).transform(rescaledData)
indexed.distinct().select("label","category").show(50,truncate=False)

+-----+-----------------------+
|label|category               |
+-----+-----------------------+
|6.0  |WORD ORIGINS           |
|5.0  |WORLD HISTORY          |
|37.0 |SCIENCE & NATURE       |
|27.0 |RHYME TIME             |
|4.0  |POTPOURRI              |
|48.0 |POETS & POETRY         |
|4.0  |POTPOURRI              |
|18.0 |U.S. GEOGRAPHY         |
|15.0 |BUSINESS & INDUSTRY    |
|34.0 |U.S. HISTORY           |
|3.0  |AMERICAN HISTORY       |
|29.0 |ART & ARTISTS          |
|32.0 |ART                    |
|41.0 |HOLIDAYS & OBSERVANCES |
|28.0 |TRANSPORTATION         |
|10.0 |U.S. CITIES            |
|22.0 |LANGUAGES              |
|23.0 |BALLET                 |
|23.0 |BALLET                 |
|27.0 |RHYME TIME             |
|48.0 |POETS & POETRY         |
|34.0 |U.S. HISTORY           |
|7.0  |COLLEGES & UNIVERSITIES|
|32.0 |ART                    |
|31.0 |THE BIBLE              |
|16.0 |ISLANDS                |
|8.0  |HISTORY                |
|46.0 |AUTHORS                |
|33.0 |B

In [6]:
training,test = indexed.randomSplit([0.8,0.2], seed = 300)

In [7]:
training.columns

['air_date',
 'answer',
 'category',
 'question',
 'round',
 'show_number',
 'value',
 'stripped',
 'words',
 'filtered',
 'stemmed',
 'rawFeatures',
 'features',
 'label']

In [8]:
training.count()

11984

In [9]:
test.count()

3028

In [10]:
testgroup = test.groupby("label").count()
testgroup.sort(desc("label")).show(30)

+-----+-----+
|label|count|
+-----+-----+
| 49.0|   51|
| 48.0|   53|
| 47.0|   40|
| 46.0|   43|
| 45.0|   49|
| 44.0|   48|
| 43.0|   42|
| 42.0|   45|
| 41.0|   47|
| 40.0|   56|
| 39.0|   47|
| 38.0|   53|
| 37.0|   60|
| 36.0|   39|
| 35.0|   51|
| 34.0|   67|
| 33.0|   65|
| 32.0|   62|
| 31.0|   47|
| 30.0|   62|
| 29.0|   47|
| 28.0|   59|
| 27.0|   55|
| 26.0|   57|
| 25.0|   53|
| 24.0|   58|
| 23.0|   59|
| 22.0|   63|
| 21.0|   61|
| 20.0|   53|
+-----+-----+
only showing top 30 rows



In [11]:
lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10, regParam=0.005, elasticNetParam=0.8, family="multinomial")
lrModel = lr.fit(training)
predictions = lrModel.transform(test)
predictions.cache()

DataFrame[air_date: string, answer: string, category: string, question: string, round: string, show_number: string, value: string, stripped: string, words: array<string>, filtered: array<string>, stemmed: array<string>, rawFeatures: vector, features: vector, label: double, rawPrediction: vector, probability: vector, prediction: double]

In [12]:
predictions.select("category", "label","prediction").show(100, truncate=False)

+--------------------+-----+----------+
|category            |label|prediction|
+--------------------+-----+----------+
|THE BIBLE           |31.0 |44.0      |
|TRANSPORTATION      |28.0 |28.0      |
|OPERA               |20.0 |0.0       |
|TRANSPORTATION      |28.0 |0.0       |
|U.S. HISTORY        |34.0 |3.0       |
|ANIMALS             |13.0 |13.0      |
|ANIMALS             |13.0 |2.0       |
|GEOGRAPHY           |40.0 |41.0      |
|GEOGRAPHY           |40.0 |40.0      |
|STATE CAPITALS      |14.0 |17.0      |
|GEOGRAPHY           |40.0 |12.0      |
|BUSINESS & INDUSTRY |15.0 |15.0      |
|STATE CAPITALS      |14.0 |0.0       |
|TRANSPORTATION      |28.0 |28.0      |
|SHAKESPEARE         |21.0 |21.0      |
|SHAKESPEARE         |21.0 |42.0      |
|SCIENCE             |1.0  |13.0      |
|LITERATURE          |2.0  |49.0      |
|LITERATURE          |2.0  |44.0      |
|SPORTS              |9.0  |47.0      |
|LITERATURE          |2.0  |25.0      |
|U.S. HISTORY        |34.0 |49.0      |


In [13]:
predictions.filter(predictions["label"] == predictions["prediction"]).select("category","label","prediction").show(20)

+-------------------+-----+----------+
|           category|label|prediction|
+-------------------+-----+----------+
|     TRANSPORTATION| 28.0|      28.0|
|            ANIMALS| 13.0|      13.0|
|          GEOGRAPHY| 40.0|      40.0|
|BUSINESS & INDUSTRY| 15.0|      15.0|
|     TRANSPORTATION| 28.0|      28.0|
|        SHAKESPEARE| 21.0|      21.0|
|     WORLD CAPITALS| 17.0|      17.0|
|             SPORTS|  9.0|       9.0|
|             SPORTS|  9.0|       9.0|
|             SPORTS|  9.0|       9.0|
|           RELIGION| 19.0|      19.0|
|        U.S. CITIES| 10.0|      10.0|
|          THE BIBLE| 31.0|      31.0|
|      WORLD HISTORY|  5.0|       5.0|
|            SCIENCE|  1.0|       1.0|
|   AMERICAN HISTORY|  3.0|       3.0|
|            ANIMALS| 13.0|      13.0|
|          THE BIBLE| 31.0|      31.0|
|            SCIENCE|  1.0|       1.0|
|           RELIGION| 19.0|      19.0|
+-------------------+-----+----------+
only showing top 20 rows



In [14]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.39531043593130777
