In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import *       # for datatype conversion
from pyspark.sql.functions import *   # for col() function
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Tokenizer
import re
import pyspark.sql.functions as f
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from numpy import array
from math import sqrt
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.sql.types import *       # for datatype conversion
from pyspark.sql.functions import *   # for col() function
from pyspark.ml.linalg import DenseVector
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import LinearRegression
import pandas as pd
# Build the SparkSession FIRST so the builder options below are actually used.
# If a SparkContext already exists when getOrCreate() runs, the session is
# attached to that context and the master / memory / core settings given here
# are not applied to it.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Jeopardy Calculation")
    .config("spark.executor.memory", '2g')
    .config('spark.executor.cores', '1')
    .config('spark.cores.max', '1')
    .config("spark.driver.memory", '1g')
    .getOrCreate()
)

# Derive the lower-level handles from the session so they all share one context.
sc = spark.sparkContext
sqlCtx = SQLContext(sc)

# Load the Jeopardy question archive (path is relative to the working directory).
jeopardy = spark.read.json('data/JEOPARDY_QUESTIONS1.json')

In [2]:
# Number of categories that appear in more than 100 questions.
j_categoryCount = jeopardy.groupBy("category").count()
# No sort is needed here: row order has no effect on a row count.
count100 = j_categoryCount.filter(j_categoryCount["count"] > 100).count()
count100

145

In [3]:
# Names of the categories with more than 100 questions, most frequent first.
# Taking the first `count100` rows of the count-descending ordering selects the
# same categories that were counted in `count100`.
# The single column is collected straight into a Python list; no pandas
# round-trip is needed for that.
top_categories = [
    row["category"]
    for row in j_categoryCount.sort(desc("count")).select("category").limit(count100).collect()
]
top_categories

['BEFORE & AFTER',
 'SCIENCE',
 'LITERATURE',
 'AMERICAN HISTORY',
 'POTPOURRI',
 'WORLD HISTORY',
 'WORD ORIGINS',
 'COLLEGES & UNIVERSITIES',
 'HISTORY',
 'SPORTS',
 'U.S. CITIES',
 'WORLD GEOGRAPHY',
 'BODIES OF WATER',
 'ANIMALS',
 'STATE CAPITALS',
 'BUSINESS & INDUSTRY',
 'ISLANDS',
 'WORLD CAPITALS',
 'U.S. GEOGRAPHY',
 'RELIGION',
 'OPERA',
 'SHAKESPEARE',
 'LANGUAGES',
 'BALLET',
 'TELEVISION',
 'FICTIONAL CHARACTERS',
 'RHYME TIME',
 'PEOPLE',
 'TRANSPORTATION',
 'ART & ARTISTS',
 'STUPID ANSWERS',
 'THE BIBLE',
 'ART',
 'BOOKS & AUTHORS',
 'U.S. HISTORY',
 'FOOD',
 'MUSEUMS',
 'SCIENCE & NATURE',
 'AMERICANA',
 'COMMON BONDS',
 'GEOGRAPHY',
 'HOLIDAYS & OBSERVANCES',
 '3-LETTER WORDS',
 'ANNUAL EVENTS',
 'AMERICAN LITERATURE',
 'CLASSICAL MUSIC',
 'AUTHORS',
 'POP MUSIC',
 'POETS & POETRY',
 'QUOTATIONS',
 'HODGEPODGE',
 'MYTHOLOGY',
 'NONFICTION',
 'THE MOVIES',
 'WORLD CITIES',
 'MUSICAL INSTRUMENTS',
 'AROUND THE WORLD',
 'THE CIVIL WAR',
 'MUSIC',
 'U.S. PRESIDENTS',
 'C

In [4]:
# Keep only the questions whose category is one of the frequent categories.
in_top_category = jeopardy["category"].isin(top_categories)
jeo_f = jeopardy.filter(in_top_category)

In [5]:
# Text clean-up for the filtered dataset: strip punctuation, tokenize, remove stop words.
# The pattern is a raw string so that `\!` reaches the regex engine unchanged
# without Python treating it as an (invalid) string escape sequence.
jeo_fpunc = jeo_f.withColumn(
    "stripped",
    f.regexp_replace(f.col("question"), r"[\!@#$%^&*)(><,';:]", ""),
)
jeo_fpunc.cache()

# Split the cleaned question text into a "words" array column.
tokenizer = Tokenizer(inputCol="stripped", outputCol="words")
tokenized_f = tokenizer.transform(jeo_fpunc)
tokenized_f.cache()

# Drop stop words from "words" into a new "filtered" array column.
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
jeo_fStopRemoved = remover.transform(tokenized_f)
jeo_fStopRemoved.cache()

DataFrame[air_date: string, answer: string, category: string, question: string, round: string, show_number: string, value: string, stripped: string, words: array<string>, filtered: array<string>]

In [6]:
# Import only the class that is used; a wildcard import would also pull the
# module's other public names into the notebook namespace.
from nltk.stem.porter import PorterStemmer

# Module-level Porter stemmer instance, used by `stem` below.
stemmer = PorterStemmer()

def stem(in_vec):
    """Stem every token in ``in_vec`` with the module-level ``stemmer``.

    Args:
        in_vec: Iterable of token strings, or ``None``.

    Returns:
        A new list with the stemmed tokens in their original order, or
        ``None`` when ``in_vec`` is ``None``.
    """
    # A null array value arrives as None when this runs inside a Spark UDF;
    # pass it through instead of failing when trying to iterate over it.
    if in_vec is None:
        return None
    return [stemmer.stem(t) for t in in_vec]

from pyspark.sql.types import *
# Spark UDF wrapper around `stem`; declared to return an array of strings.
stemmer_udf = udf(lambda tokens: stem(tokens), returnType=ArrayType(StringType()))

In [7]:
# Add a "stemmed" column by applying the stemming UDF to the stop-word-filtered tokens.
# cache() returns the same DataFrame, so it can be chained onto the assignment.
jeo_fStemmed = jeo_fStopRemoved.withColumn("stemmed", stemmer_udf(f.col("filtered"))).cache()
jeo_fStemmed

DataFrame[air_date: string, answer: string, category: string, question: string, round: string, show_number: string, value: string, stripped: string, words: array<string>, filtered: array<string>, stemmed: array<string>]

In [8]:
# Term frequencies: hash the stemmed tokens into a 50000-dimensional vector.
hashingTF = HashingTF(numFeatures=50000, inputCol="stemmed", outputCol="rawFeatures")
featurizedData = hashingTF.transform(jeo_fStemmed)

# Fit IDF on the raw term frequencies and write the rescaled vectors to "features".
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)

# Cache the TF-IDF frame; the bare name on the last line displays its schema.
rescaledData = idfModel.transform(featurizedData).cache()
rescaledData

The history saving thread hit an unexpected error (OperationalError('disk I/O error',)).History will not be written to the database.


DataFrame[air_date: string, answer: string, category: string, question: string, round: string, show_number: string, value: string, stripped: string, words: array<string>, filtered: array<string>, stemmed: array<string>, rawFeatures: vector, features: vector]

In [9]:
# 80/20 random split with a fixed seed.
# NOTE: the IDF model above was fitted on the full dataset before this split.
training, test = rescaledData.randomSplit(weights=[0.8, 0.2], seed=1)

In [11]:
from pyspark.ml.clustering import BisectingKMeans, BisectingKMeansModel

In [12]:
# Bisecting k-means with 145 clusters and a fixed seed, fitted on the training split.
# 145 equals the value displayed for `count100` earlier in the notebook.
bkm = BisectingKMeans(k=145, seed=1)
model = bkm.fit(training)

In [13]:
# Assign each row of the held-out split to a cluster (adds a "prediction" column).
bkmpredict = model.transform(dataset=test)

In [14]:
# Number of test rows per predicted cluster, highest cluster id first (up to 145 rows shown).
cluster_sizes = bkmpredict.groupBy('prediction').count()
cluster_sizes.orderBy(f.col('prediction').desc()).show(n=145)

+----------+-----+
|prediction|count|
+----------+-----+
|       144|    1|
|       138|    1|
|       137|    4|
|       133|    4|
|       130|    5|
|       129|    7|
|       127|    6|
|       125|    2|
|       122|    1|
|       121|    8|
|       120|    2|
|       119|    1|
|       118|    6|
|       117|    5|
|       115|    5|
|       114|    4|
|       113|    5|
|       112|    1|
|       111|   10|
|       110|   13|
|       109|   33|
|       108|    4|
|       107|    8|
|       106|    6|
|       105|   12|
|       104|    3|
|       103|   16|
|       102|    4|
|       101|    7|
|       100|    9|
|        99|   35|
|        98|   28|
|        97|   69|
|        96|   13|
|        95|   18|
|        94|   42|
|        93|   77|
|        92|    2|
|        91|    3|
|        90|    8|
|        89|   49|
|        88|   35|
|        87|   98|
|        86|   50|
|        85|   38|
|        84|   57|
|        83|  111|
|        82|   56|
|        81|  252|
|        80|

In [20]:
# Show category and full (untruncated) question text for rows assigned to cluster 118.
in_cluster_118 = bkmpredict['prediction'] == 118
bkmpredict.where(in_cluster_118).select('category', 'question').show(truncate=False)

+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|category           |question                                                                                                                                                                                                                                                                                                                                                                  |
+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------