In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import *       # for datatype conversion
from pyspark.sql.functions import *   # for col() function
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Tokenizer
import re
import pyspark.sql.functions as f
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from numpy import array
from math import sqrt
from pyspark.ml.clustering import KMeans
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import NGram
from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import numpy as np
import matplotlib.pyplot as plt

# Build the SparkSession first. `getOrCreate()` reuses an already-running
# SparkContext, so creating the context beforehand (as this cell used to do)
# means the master / appName / memory options below are not used to start it.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Jeopardy Calculation")
    .config("spark.executor.memory", '2g')
    .config('spark.executor.cores', '1')
    .config('spark.cores.max', '1')
    .config("spark.driver.memory", '1g')
    .getOrCreate()
)

# Derive the lower-level handles from the session so all three names refer to
# the same underlying context. `sc` and `sqlCtx` are kept for cells that use them.
sc = spark.sparkContext
sqlCtx = SQLContext(sc)

In [38]:
from wordcloud import WordCloud, STOPWORDS
def word_tokenize(x, tagger=None):
  """Tokenize one piece of text with MeCab and return the tagger's output.

  Args:
    x: Text to tokenize, as ``str`` or UTF-8 encoded ``bytes``.
    tagger: Optional object with a ``parse(text)`` method (e.g. a
      ``MeCab.Tagger``). When omitted, a tagger is created for this call
      with the ``-Owakati`` options below; pass one in to reuse it across
      many calls instead of rebuilding it every time.

  Returns:
    Whatever ``tagger.parse`` returns for the text.
  """
  # The Python 3 MeCab binding takes str, not bytes, so normalise the input
  # to str instead of encoding it.
  if isinstance(x, bytes):
    x = x.decode('utf-8')

  if tagger is None:
    # Imported lazily so the function can be shipped to Spark workers and
    # only needs MeCab where it actually runs.
    import MeCab as mc
    tagger = mc.Tagger("-Owakati  -d ./MECAB/mecab_env/lib/mecab/dic/ipadic -r ./MECAB/mecab_env/etc/mecabrc")

  return tagger.parse(x)

# Copy wordcloud's STOPWORDS collection into a set named `stopwords`.
# NOTE(review): `stopwords` is not referenced by any other cell visible here.
stopwords = set(STOPWORDS)

ModuleNotFoundError: No module named 'wordcloud'

In [2]:
# Load the Jeopardy questions JSON file into a Spark DataFrame.
questions_json = "JEOPARDY_QUESTIONS1-Copy1.json"
jeopardy = spark.read.json(questions_json)

In [3]:
# Number of rows per category.
j_categoryCount = jeopardy.groupBy("category").count()

# How many categories have more than 100 rows. Counting rows does not depend
# on their order, so the frame is filtered directly without sorting it first.
count100 = j_categoryCount.filter(j_categoryCount["count"] > 100).count()

In [4]:
# Names of the `count100` largest categories, largest first, as a plain list.
ranked_categories = j_categoryCount.sort(desc("count")).select("category")
top_categories = [row["category"] for row in ranked_categories.limit(count100).collect()]

In [5]:
# Keep only the rows whose category is one of `top_categories`.
in_top_category = jeopardy["category"].isin(top_categories)
jeo_f = jeopardy.filter(in_top_category)

In [6]:
# Project the filtered rows down to the single "question" column.
jeopardyUse = jeo_f.select(jeo_f["question"])

In [7]:
# Write the "question" column as plain text, coalesced to a single partition.
# - mode("overwrite") keeps the cell idempotent: with "append", every re-run
#   added another copy of the questions to the same directory.
# - The "header" option was dropped because it has no meaning for the text format.
(
    jeopardyUse
    .coalesce(1)
    .write
    .format("text")
    .mode("overwrite")
    .save("<path>")
)

In [29]:
# Read the text file as an RDD of lines through the session's SparkContext.
questions_path = "useforRdd.txt"
text_file = spark.sparkContext.textFile(questions_path)

In [30]:
# Preview: fetch the first five elements of the RDD.
text_file.take(5)

['For the last 8 years of his life Galileo was under house arrest for espousing this mans theory',
 'No. 2 1912 Olympian football star at Carlisle Indian School 6 MLB seasons with the Reds Giants  Braves',
 'The city of Yuma in this state has a record average of 4055 hours of sunshine each year',
 'In 1963 live on "The Art Linkletter Show" this company served its billionth burger',
 'Signer of the Dec. of Indep. framer of the Constitution of Mass. second President of the United States']

In [35]:
def _as_text(value):
    """Return `value` as str, decoding it as UTF-8 when it is bytes."""
    return value.decode('utf-8') if isinstance(value, bytes) else value


# Word-count pipeline: tokenize each line, split it on single spaces, keep
# tokens of at least 3 characters, and sum the occurrences of each token.
# The tokenizer output is normalised to text once, up front. The previous
# per-token `.decode('utf-8')` raises AttributeError when tokens are already str.
counts = (
    text_file
    .map(lambda line: _as_text(word_tokenize(line)))
    .flatMap(lambda line: line.split(" "))
    .filter(lambda word: len(word) >= 3)
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)

In [None]:
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Minimum number of occurrences a word needs in order to be kept.
# `threshold` is not assigned in any cell visible here, which would raise a
# NameError on a fresh kernel. Fall back to 1 (keep every counted word) only
# when no value has been set. NOTE(review): confirm the intended cut-off.
if "threshold" not in globals():
    threshold = 1

# Two-column schema for the (word, count) pairs.
schema = StructType([StructField("word", StringType(), True),
                     StructField("frequency", IntegerType(), True)])

filtered = counts.filter(lambda pair: pair[1] >= threshold)
counts_df = spark.createDataFrame(filtered, schema)

# Bring the table to the driver and reshape it into a one-element list holding
# a single {word: frequency} dict (one row after the transpose).
frequencies = counts_df.toPandas().set_index('word').T.to_dict('records')