In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import *       # for datatype conversion
from pyspark.sql.functions import *   # for col() function
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Tokenizer
import re
import pyspark.sql.functions as f
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# Build the SparkSession first. getOrCreate() reuses any SparkContext that is
# already running, so creating a bare context beforehand would leave the
# master / appName / memory settings below without effect.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Jeopardy Calculation")
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", "1")
    .config("spark.cores.max", "1")
    .config("spark.driver.memory", "1g")
    .getOrCreate()
)

# Legacy entry points, derived from the session so everything shares one context.
sc = spark.sparkContext
sqlCtx = SQLContext(sc)

# Load the question data set into a DataFrame.
jeopardy = spark.read.json('data/JEOPARDY_QUESTIONS1.json')

In [2]:
jeopardy.columns

['air_date', 'answer', 'category', 'question', 'round', 'show_number', 'value']

In [3]:
# Remove the listed punctuation characters from each question. The pattern is a
# raw string so the backslash reaches the regex engine as written instead of
# being an invalid Python escape sequence ("\!").
jeopardy = jeopardy.withColumn(
    "stripped",
    f.regexp_replace(f.col("question"), r"[\!@#$%^&*)(><,';:]", ""),
)

# Tokenize the cleaned text into the "words" column.
tokenizer = Tokenizer(inputCol="stripped", outputCol="words")
tokenized = tokenizer.transform(jeopardy)

# Drop stop words, writing the remaining tokens to the "filtered" column.
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
jeopardyStopRemoved = remover.transform(tokenized)


# Import only the class that is used instead of every public name in the module.
from nltk.stem.porter import PorterStemmer

# Single stemmer instance, referenced by name inside `stem`.
stemmer = PorterStemmer()


def stem(in_vec):
    """Return a new list with the stemmed form of every token in ``in_vec``.

    Each token is passed to the module-level ``stemmer``'s ``stem`` method;
    the output keeps the same order and length as the input.
    """
    return [stemmer.stem(token) for token in in_vec]

# Expose `stem` as a Spark UDF returning an array of strings. The function is
# passed directly; wrapping it in `lambda x: stem(x)` added nothing.
# (ArrayType / StringType come from the pyspark.sql.types import at the top of
# the notebook, so the duplicate wildcard import that used to sit here is gone.)
stemmer_udf = udf(stem, ArrayType(StringType()))

# Add a "stemmed" column computed from the stop-word-filtered tokens.
jeopardy_processed = jeopardyStopRemoved.withColumn("stemmed", stemmer_udf("filtered"))

In [4]:
# RDD of single-field Rows, each holding one question's stemmed token list.
jeopardy_stemmed = jeopardy_processed.select("stemmed").rdd

# Flatten on the field itself. Iterating a Row yields its column values, so
# `flatMap(lambda r: r)` emitted one whole token list per question rather than
# the individual words. `or []` skips rows whose token list is null.
jeopardy_words = jeopardy_stemmed.flatMap(lambda r: r.stemmed or [])

In [5]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix


In [6]:
# FIXME(review): `jeo` is never defined, so this cell raises NameError (see the
# recorded output). The intended filter condition is unclear — supply a
# Column / SQL expression here or delete the cell.
jeopardyStopRemoved.filter(jeo)

NameError: name 'jeo' is not defined

In [24]:
# Count questions per category. Defined in this cell so it does not depend on a
# later cell having been executed first (it previously only worked because of
# out-of-order execution).
j_categoryCount = jeopardy.groupBy("category").count()

# The 20 categories with the most questions, as a list of Rows.
j_categoryCount.sort(desc("count")).select("category").take(20)

[Row(category='BEFORE & AFTER'),
 Row(category='SCIENCE'),
 Row(category='LITERATURE'),
 Row(category='AMERICAN HISTORY'),
 Row(category='POTPOURRI'),
 Row(category='WORLD HISTORY'),
 Row(category='WORD ORIGINS'),
 Row(category='COLLEGES & UNIVERSITIES'),
 Row(category='HISTORY'),
 Row(category='SPORTS'),
 Row(category='U.S. CITIES'),
 Row(category='WORLD GEOGRAPHY'),
 Row(category='BODIES OF WATER'),
 Row(category='ANIMALS'),
 Row(category='STATE CAPITALS'),
 Row(category='BUSINESS & INDUSTRY'),
 Row(category='ISLANDS'),
 Row(category='WORLD CAPITALS'),
 Row(category='U.S. GEOGRAPHY'),
 Row(category='RELIGION')]

In [50]:
# Rank categories by question count, keep the first 20, and pull the names
# into a plain Python list via pandas.
ranked_categories = j_categoryCount.sort(desc("count")).select("category")
top_categories_pdf = ranked_categories.limit(20).toPandas()
top_categories = top_categories_pdf["category"].tolist()
top_categories

['BEFORE & AFTER',
 'SCIENCE',
 'LITERATURE',
 'AMERICAN HISTORY',
 'POTPOURRI',
 'WORLD HISTORY',
 'WORD ORIGINS',
 'COLLEGES & UNIVERSITIES',
 'HISTORY',
 'SPORTS',
 'U.S. CITIES',
 'WORLD GEOGRAPHY',
 'BODIES OF WATER',
 'ANIMALS',
 'STATE CAPITALS',
 'BUSINESS & INDUSTRY',
 'ISLANDS',
 'WORLD CAPITALS',
 'U.S. GEOGRAPHY',
 'RELIGION']

In [54]:
# Keep only the questions whose category is one of `top_categories`.
in_top_category = col("category").isin(top_categories)
jeo_f = jeopardy.filter(in_top_category)
jeo_f.show(5)

+----------+--------------------+--------+--------------------+---------+-----------+-----+--------------------+
|  air_date|              answer|category|            question|    round|show_number|value|            stripped|
+----------+--------------------+--------+--------------------+---------+-----------+-----+--------------------+
|2004-12-31|          Copernicus| HISTORY|'For the last 8 y...|Jeopardy!|       4680| $200|For the last 8 ye...|
|2004-12-31|      the Appian Way| HISTORY|'Built in 312 B.C...|Jeopardy!|       4680| $400|Built in 312 B.C....|
|2004-12-31|Ceylon (or Sri La...| HISTORY|'In 1000 Rajaraja...|Jeopardy!|       4680| $600|In 1000 Rajaraja ...|
|2004-12-31|   the International| HISTORY|'Karl led the fir...|Jeopardy!|       4680| $800|Karl led the firs...|
|2004-12-31|  the Congress Party| HISTORY|'This Asian polit...|Jeopardy!|       4680|$1000|This Asian politi...|
+----------+--------------------+--------+--------------------+---------+-----------+-----+-----

In [8]:
# Number of questions in each category.
j_categoryCount = jeopardy.groupBy("category").count()

# How many categories have more than 50 questions. The earlier sort was dropped:
# row order cannot change the result of a count.
j_categoryCount.filter(j_categoryCount["count"] > 50).count()

332

In [None]:
# Categories that have more than 50 questions.
j50counts = j_categoryCount.where(col("count") > 50)

In [None]:
# Exploratory check: display the Python type of `j50counts`.
type(j50counts)

In [None]:
# Exploratory check: display the Python type of `jeopardy`.
type(jeopardy)

In [None]:
# Print the category names from `j50counts` (show() with its default row limit).
j50counts.select(j50counts["category"]).show()

In [None]:
# Small hand-picked list of category names.
test = [
    "LIBRARIES",
    "ACTRESSES",
    "THE 50 STATES",
]

In [None]:
# FIXME(review): `jeopardywhatever` is not assigned in any cell of this notebook,
# so running this as-is would raise NameError — confirm which DataFrame was meant.
jeopardywhatever.take(2)

In [None]:
# Keep the per-category counts as a DataFrame. `.show()` only prints and returns
# None, so chaining it into the assignment bound `jeopardycategories` to None.
jeopardycategories = jeopardy.groupBy("category").count()
jeopardycategories.show(1)


#.filter(lambda x : x['category'].count() > 20)

In [None]:
# Print the category of the first 10 rows.
jeopardy.select(jeopardy["category"]).show(n=10)

In [None]:
# NOTE(review): DataFrame.filter is documented to take a Column or a SQL-expression
# string; a Python lambda is expected to be rejected with a TypeError — confirm,
# then replace it with a Column condition (or filter through `jeopardy.rdd`).
jeopardy.filter(lambda r: r).show()