In [1]:
# Run findspark.init() before the pyspark imports in the cells below; per
# findspark's documentation it locates the Spark installation and makes
# pyspark importable from this kernel.
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
# Use getOrCreate() rather than the bare constructor: only one SparkContext may
# be active at a time, so calling SparkContext() again when this cell is
# re-executed raises an error. getOrCreate() returns the running context if
# there is one and creates it otherwise.
sc = SparkContext.getOrCreate()
sc

In [3]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used by the rest of the notebook (created on first
# run, reused afterwards), requesting the application name "final_text".
spark = (
    SparkSession.builder
    .appName("final_text")
    .getOrCreate()
)

In [4]:
# Read the Parquet sample from S3 into a Spark DataFrame.
data = spark.read.parquet(
    "s3://ppol567lab2/datasample.parquet"
)

In [6]:
from pyspark.sql.functions import col, lower, regexp_replace

# Function to clean up text and remove URL information.
def clean(text):
    """Build a Column expression that normalizes free text for tokenization.

    Parameters
    ----------
    text : Column
        String column holding the raw text.

    Returns
    -------
    Column
        Lower-cased text with a leading "rt " marker, hyperlinks, @mentions,
        punctuation and digits removed. Removals are plain deletions, so runs
        of whitespace may remain where content was stripped.

    Notes
    -----
    Order matters. Hyperlinks and mentions must be removed *before* the
    punctuation pass: once "://", ".", "/" and "@" have been stripped, the URL
    and mention patterns can no longer match, and link fragments / user names
    would be left behind as ordinary words.

    Patterns are raw strings so that backslashes reach the regex engine
    unchanged and Python does not warn about invalid escape sequences.
    """
    t = lower(text)
    # Strip a leading retweet marker.
    t = regexp_replace(t, r"^rt ", "")
    # Remove hyperlinks: scheme plus everything up to the next whitespace.
    # (A second, narrower http(s) pattern is unnecessary: anything it could
    # match is already consumed by this one.)
    t = regexp_replace(t, r"https?://\S+", "")
    # Remove @mentions while the "@" is still present.
    t = regexp_replace(t, r"@\w+", "")
    # Drop every remaining character that is not a letter, digit or whitespace.
    t = regexp_replace(t, r"[^a-zA-Z0-9\s]", "")
    # Remove every run of digits (including digits embedded in words).
    t = regexp_replace(t, r"\d+", "")
    return t

# Keep every original column and append the cleaned text as "body_clean".
clean_text_df = data.select(
    "*",
    clean(col("body")).alias("body_clean"),
)

# Confirm the new column is present and preview the first 10 rows.
clean_text_df.printSchema()
clean_text_df.show(10)

root
 |-- author: string (nullable = true)
 |-- author_cakeday: boolean (nullable = true)
 |-- body: string (nullable = true)
 |-- author_patreon_flair: boolean (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- score: long (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- comment: integer (nullable = true)
 |-- post: integer (nullable = true)
 |-- moderator: integer (nullable = true)
 |-- admin: integer (nullable = true)
 |-- body_clean: string (nullable = true)

+----------------+--------------+--------------------+--------------------+--------+----------------+---------+-----+------------------+-------+----+---------+-----+--------------------+
|          author|author_cakeday|                body|author_patreon_flair|can_gild|controversiality|collapsed|score|         subreddit|comment|post|moderator|admin|          body_clean|
+----------------+--------------+---------

In [7]:
from pyspark.ml.feature import Tokenizer

# Tokenize the cleaned text into an array-of-strings column.
tokenizer = Tokenizer(inputCol="body_clean", outputCol="body_tokenized")

# This cell overwrites clean_text_df with its own output, so on a second run
# the frame already contains "body_tokenized" and the transformer rejects it
# ("Output column ... already exists"). Dropping the column first keeps the
# cell re-runnable; drop() is a no-op when the column is absent, so the first
# run behaves exactly as before.
clean_text_df = tokenizer.transform(clean_text_df.drop("body_tokenized"))

In [8]:
from pyspark.ml.feature import StopWordsRemover

# Stop-word filter using the library's default word list, reading the token
# array and writing the filtered array to "body_no_stopw".
remover = StopWordsRemover(inputCol="body_tokenized", outputCol="body_no_stopw")

# Kept for inspection: the stop-word list the remover will apply.
stopwords = remover.getStopWords()

# This cell overwrites clean_text_df with its own output, so a second run would
# fail because "body_no_stopw" already exists. Drop any stale copy first;
# drop() is a no-op when the column is absent, so the first run is unchanged.
clean_text_df = remover.transform(clean_text_df.drop("body_no_stopw"))

# Display the resulting schema and a small sample.
clean_text_df.printSchema()
clean_text_df.show(5)

root
 |-- author: string (nullable = true)
 |-- author_cakeday: boolean (nullable = true)
 |-- body: string (nullable = true)
 |-- author_patreon_flair: boolean (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- score: long (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- comment: integer (nullable = true)
 |-- post: integer (nullable = true)
 |-- moderator: integer (nullable = true)
 |-- admin: integer (nullable = true)
 |-- body_clean: string (nullable = true)
 |-- body_tokenized: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- body_no_stopw: array (nullable = true)
 |    |-- element: string (containsNull = true)

+-------------+--------------+--------------------+--------------------+--------+----------------+---------+-----+------------------+-------+----+---------+-----+--------------------+--------------------+--------------------+
|    

In [9]:
from pyspark.ml.feature import HashingTF

# Hash the stop-word-filtered tokens into term-frequency vectors
# ("rawfeatures"); the result is a new frame, clean_text_df is left as is.
hashingTF = HashingTF(
    inputCol="body_no_stopw",
    outputCol="rawfeatures",
)
feature_data = hashingTF.transform(clean_text_df)

In [10]:
from pyspark.ml.feature import IDF

# Fit IDF weights on the hashed term frequencies and write the rescaled
# vectors to "feats".
idf = IDF(inputCol="rawfeatures", outputCol="feats")

# This cell overwrites feature_data with its own output, so on a second run the
# frame already carries "feats" and the estimator rejects the existing output
# column. Drop any stale copy first; drop() is a no-op when the column is
# absent, so the first run is unchanged.
feature_data = feature_data.drop("feats")
idfModel = idf.fit(feature_data)

feature_data = idfModel.transform(feature_data)

In [11]:
# Preview the featurized frame, including the "rawfeatures" and "feats" columns.
feature_data.show()

+----------------+--------------+-------------------------------------+--------------------+--------+----------------+---------+-----+------------------+-------+----+---------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|          author|author_cakeday|                                 body|author_patreon_flair|can_gild|controversiality|collapsed|score|         subreddit|comment|post|moderator|admin|          body_clean|      body_tokenized|       body_no_stopw|         rawfeatures|               feats|
+----------------+--------------+-------------------------------------+--------------------+--------+----------------+---------+-----+------------------+-------+----+---------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|         DashHen|          null|                 But actually what...|               false|    true|               0|    false|    7|  

In [None]:
# Persist the featurized frame as Parquet on S3 for the modeling step,
# replacing whatever was previously written at this path.
feature_data.write.mode("overwrite").parquet("s3://ppol567lab2/data_text.parquet")