In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import *       # for datatype conversion
from pyspark.sql.functions import *   # for col() function
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Tokenizer
import re
import pyspark.sql.functions as f
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from numpy import array
from math import sqrt
from pyspark.ml.clustering import KMeans
from pyspark.ml.clustering import GaussianMixture


# Build the SparkSession first so that the settings below are the ones the
# underlying SparkContext is created with. Previously a bare SparkContext was
# created before the builder ran; the builder then reused that context and the
# master / app name / memory / core settings were silently ignored.
# If a context already exists in this kernel, getOrCreate() still reuses it,
# so restart the kernel after changing any of these values.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Jeopardy Calculation")
    .config("spark.executor.memory", '2g')
    .config('spark.executor.cores', '1')
    .config('spark.cores.max', '1')
    .config("spark.driver.memory", '1g')
    .getOrCreate()
)

# Derive the context handles from the session so all three names share the
# same underlying SparkContext.
sc = spark.sparkContext
sqlCtx = SQLContext(sc, sparkSession=spark)

In [2]:
from nltk.stem.porter import *
# Shared Porter stemmer instance; stem() calls stemmer.stem() on every token.
stemmer = PorterStemmer()
def stem(in_vec):
    """Return the stemmed form of every token in ``in_vec``.

    Args:
        in_vec: Iterable of token strings, or None (a null array value
            reaches a Python UDF as None).

    Returns:
        A new list with one stemmed token per input token, in input order,
        or None when ``in_vec`` is None.
    """
    # Pass nulls through instead of raising TypeError while iterating None.
    if in_vec is None:
        return None
    return [stemmer.stem(t) for t in in_vec]
from pyspark.sql.types import *
# Spark UDF around stem(): declared to return an array of strings.
stemmer_udf = udf(
    lambda tokens: stem(tokens),
    ArrayType(StringType()),
)

In [3]:
# Load rspct.tsv as a tab-delimited CSV with a header row, letting Spark
# infer the column types.
df = (
    sqlCtx.read
    .format("csv")
    .options(header="true", delimiter="\t", inferSchema="true")
    .load("rspct.tsv")
)


In [4]:
# Cache df: the following cells reuse it (show, count, and the cleaning steps).
df.cache()

DataFrame[id: string, subreddit: string, title: string, selftext: string]

In [5]:
# Display a single row with no column truncation to inspect the raw text.
df.show(1, truncate=False)

+------+--------------------+--------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [6]:
# Check the column names and the types produced by schema inference.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- title: string (nullable = true)
 |-- selftext: string (nullable = true)



In [7]:
# Number of rows in the loaded DataFrame.
df.count()

1013000

In [149]:
# Swap every "<lb>" marker in selftext for a space; the result is stored in "cleaned".
df_lb = df.withColumn(
    "cleaned",
    f.regexp_replace(f.col("selftext"), "<lb>", " "),
)

In [204]:
# Punctuation stripping and space normalisation. Each step reads the column
# written by the step before it and stores its result in a new column; the
# text used downstream is "stripped_final".
#
# Patterns are raw strings: "\!" and "\[" are not valid Python escapes and
# trigger invalid-escape warnings in non-raw literals. The regex text passed
# to Spark is unchanged.
df_punc = (
    df_lb
    # Delete common punctuation characters outright.
    .withColumn("stripped_5", f.regexp_replace(f.col("cleaned"), r"[\!@#$%^&*)(><,.;:?-]", ""))
    # Commas -> space.
    # NOTE(review): the previous step already deletes commas, so this step
    # currently changes nothing. If commas are meant to separate words, drop
    # "," from the character class above - confirm the intent.
    .withColumn("stripped_4", f.regexp_replace(f.col("stripped_5"), r"[,]", " "))
    # Double quotes. Reads stripped_4 (it previously read stripped_5, which
    # left the comma step out of the chain).
    .withColumn("stripped_3", f.regexp_replace(f.col("stripped_4"), r'["]', ""))
    # Single quotes / apostrophes.
    .withColumn("stripped_2", f.regexp_replace(f.col("stripped_3"), r"[']", ""))
    # Square brackets.
    .withColumn("stripped", f.regexp_replace(f.col("stripped_2"), r"[\[\]]", ""))
    # Collapse every run of two or more spaces to one space in a single pass.
    # The former five "two spaces -> one" passes only fully collapsed runs of
    # up to 32 spaces.
    .withColumn("stripped_final5", f.regexp_replace(f.col("stripped"), r" {2,}", " "))
    # The remaining columns are plain copies, kept only so that existing
    # references to these column names keep working; they can be dropped once
    # nothing reads them.
    .withColumn("stripped_final4", f.col("stripped_final5"))
    .withColumn("stripped_final3", f.col("stripped_final4"))
    .withColumn("stripped_final2", f.col("stripped_final3"))
    .withColumn("stripped_final", f.col("stripped_final2"))
)



In [205]:
# Inspect the fully cleaned text of one row, untruncated.
df_punc.select("stripped_final").show(1,truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|stripped_final                                                                                                                                                                      

In [207]:
# Tokenize the cleaned text column "stripped_final" into the "words" column.
tokenizer = Tokenizer(
    inputCol="stripped_final",
    outputCol="words",
)
tokenized = tokenizer.transform(df_punc)

In [208]:
# Run StopWordsRemover over "words"; the surviving tokens land in "filtered".
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)
dfStopRemoved = remover.transform(tokenized)

In [209]:
# Apply the stemming UDF to the "filtered" tokens and store them as "stemmed".
dfStemmed = dfStopRemoved.withColumn(
    "stemmed",
    stemmer_udf(f.col("filtered")),
)

In [210]:
# Inspect the stemmed tokens of the first two rows, untruncated.
dfStemmed.select("stemmed").show(2,truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|stemmed                                                                                                                                                                                                                                                                                                                                                                                         