In [84]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import *       # for datatype conversion
from pyspark.sql.functions import *   # for col() function
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Tokenizer
import re
import pyspark.sql.functions as f

sc = SparkContext.getOrCreate()
sqlCtx = SQLContext(sc)

spark = SparkSession \
    .builder \
    .appName("Jeoaprdy Clustering") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [7]:
jeopardy = spark.read.json("JEOPARDY_QUESTIONS1.json")

In [10]:
jeopardy.show(30)


+----------+--------------------+--------------------+--------------------+----------------+-----------+------+
|  air_date|              answer|            category|            question|           round|show_number| value|
+----------+--------------------+--------------------+--------------------+----------------+-----------+------+
|2004-12-31|          Copernicus|             HISTORY|'For the last 8 y...|       Jeopardy!|       4680|  $200|
|2004-12-31|          Jim Thorpe|ESPN's TOP 10 ALL...|'No. 2: 1912 Olym...|       Jeopardy!|       4680|  $200|
|2004-12-31|             Arizona|EVERYBODY TALKS A...|'The city of Yuma...|       Jeopardy!|       4680|  $200|
|2004-12-31|         McDonald\'s|    THE COMPANY LINE|'In 1963, live on...|       Jeopardy!|       4680|  $200|
|2004-12-31|          John Adams| EPITAPHS & TRIBUTES|'Signer of the De...|       Jeopardy!|       4680|  $200|
|2004-12-31|             the ant|      3-LETTER WORDS|'In the title of ...|       Jeopardy!|       4680|

In [20]:
j_categoryCount = jeopardy.groupBy("category").count()
j_categoryCount.sort(desc("count")).show()

+--------------------+-----+
|            category|count|
+--------------------+-----+
|      BEFORE & AFTER|  547|
|             SCIENCE|  519|
|          LITERATURE|  496|
|    AMERICAN HISTORY|  418|
|           POTPOURRI|  401|
|       WORLD HISTORY|  377|
|        WORD ORIGINS|  371|
|COLLEGES & UNIVER...|  351|
|             HISTORY|  349|
|              SPORTS|  342|
|         U.S. CITIES|  339|
|     WORLD GEOGRAPHY|  338|
|     BODIES OF WATER|  327|
|             ANIMALS|  324|
|      STATE CAPITALS|  314|
| BUSINESS & INDUSTRY|  311|
|             ISLANDS|  301|
|      WORLD CAPITALS|  300|
|      U.S. GEOGRAPHY|  299|
|            RELIGION|  297|
+--------------------+-----+
only showing top 20 rows



In [93]:
#This is replacing with RDDs
'''
questionOnly = jeopardy.select("question")
questionRdd = questionOnly.rdd.map(lambda x: [col.replace(",","").replace("'","") for col in x])
a = '!"#$%&\()*+,-/:;<=>?@\\^_`{|}~]['
questionRdd1 = questionRdd.map(lambda x: [col.replace(";",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace('"','') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("!",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("#",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("$",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("%",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("&",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace('/','') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("(",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace(")",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("*",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("+",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace(",",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("-",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace(":",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("<",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("=",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace(">",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("?",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("@",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("^",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("_",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("{",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("}",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("|",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("~",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("`",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("[",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("]",'') for col in x])
questionRdd1 = questionRdd1.map(lambda x: [col.replace("'",'') for col in x])
questionRdd1.take(20)
''' and None


In [90]:
jeopardy = jeopardy.withColumn("stripped", f.regexp_replace(f.col("question"), "[\!@#$%^&*)(><,';:]", ""))


[Row(air_date='2004-12-31', answer='Copernicus', category='HISTORY', question="'For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory'", round='Jeopardy!', show_number='4680', value='$200', stripped='For the last 8 years of his life Galileo was under house arrest for espousing this mans theory'),
 Row(air_date='2004-12-31', answer='Jim Thorpe', category="ESPN's TOP 10 ALL-TIME ATHLETES", question="'No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves'", round='Jeopardy!', show_number='4680', value='$200', stripped='No. 2 1912 Olympian football star at Carlisle Indian School 6 MLB seasons with the Reds Giants  Braves')]

[['For the last 8 years of his life Galileo was under house arrest for espousing this mans theory'],
 ['No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds Giants & Braves'],
 ['The city of Yuma in this state has a record average of 4055 hours of sunshine each year'],
 ['In 1963 live on "The Art Linkletter Show" this company served its billionth burger'],
 ['Signer of the Dec. of Indep. framer of the Constitution of Mass. second President of the United States']]

In [91]:
tokenizer = Tokenizer(inputCol = "stripped", outputCol = "words")
tokenized = tokenizer.transform(jeopardy)
tokenized.select("stripped","words").take(5)

[Row(stripped='For the last 8 years of his life Galileo was under house arrest for espousing this mans theory', words=['for', 'the', 'last', '8', 'years', 'of', 'his', 'life', 'galileo', 'was', 'under', 'house', 'arrest', 'for', 'espousing', 'this', 'mans', 'theory']),
 Row(stripped='No. 2 1912 Olympian football star at Carlisle Indian School 6 MLB seasons with the Reds Giants  Braves', words=['no.', '2', '1912', 'olympian', 'football', 'star', 'at', 'carlisle', 'indian', 'school', '6', 'mlb', 'seasons', 'with', 'the', 'reds', 'giants', '', 'braves']),
 Row(stripped='The city of Yuma in this state has a record average of 4055 hours of sunshine each year', words=['the', 'city', 'of', 'yuma', 'in', 'this', 'state', 'has', 'a', 'record', 'average', 'of', '4055', 'hours', 'of', 'sunshine', 'each', 'year']),
 Row(stripped='In 1963 live on "The Art Linkletter Show" this company served its billionth burger', words=['in', '1963', 'live', 'on', '"the', 'art', 'linkletter', 'show"', 'this', 'com

In [92]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
remover.transform(tokenized).show(truncate=False)

+----------+---------------------+-------------------------------+------------------------------------------------------------------------------------------------------------------------+---------+-----------+-----+----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------+
|air_date  |answer               |category                       |question                                                                                                                |round    |show_number|value|stripped                                                                                                        |words                                                                                            