In [91]:
import findspark
findspark.init('/opt/spark')

import os
from dotenv import load_dotenv
load_dotenv('../.env')
access = os.environ.get('AWS_ACCESS')
secret = os.environ.get('AWS_SECRET')

In [92]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import col, size, udf

conf = SparkConf() \
    .set("fs.s3a.awsAccessKeyId", access) \
    .set("fs.s3a.awsSecretAccessKey", secret) \
    .set("fs.s3a.endpoint", "s3.us-east-1.amazonaws.com") \
    .set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .set("fs.s3a.impl","org.apache.hadoop.fs.s3native.NativeS3FileSystem") \
    .set("com.amazonaws.services.s3.enableV4", "true") \
    .set("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2") \
    .set("spark.hadoop.fs.s3a.fast.upload","true") \
    .set("spark.sql.parquet.filterPushdown", "true")

spark = SparkSession.builder.appName('data-cleaning').config(conf=conf).getOrCreate()

In [93]:
import sparknlp
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer,
                                LemmatizerModel, StopWordsCleaner)
from sparknlp.pretrained import PretrainedPipeline
from pyspark.ml import Pipeline

In [94]:
filename = ''
df = spark.read.parquet(filename).drop('geo', 'coordinates', 'place', 'Unnamed: 0', 'Unnamed: 0.1')

In [95]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- full_text: string (nullable = true)
 |-- retweet_count: double (nullable = true)
 |-- favorite_count: double (nullable = true)



In [96]:
import preprocessor as p

In [97]:
def cleanTweet(text):
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER, p.OPT.MENTION)
    text = p.clean(text)
    return text
tweetPreprocessor_udf = udf(cleanTweet, StringType())

In [98]:
def extractHashtag(text):
    result = []
    parse = p.parse(text)

    if parse.hashtags == None:
        return result
    
    for i in range(len(parse.hashtags)):
        result.append(parse.hashtags[i].match[1:])
    return result

extractHashtag_udf = udf(extractHashtag, ArrayType(StringType()))

In [100]:
df = df.withColumn('hashtags', extractHashtag_udf('full_text'))

In [101]:
df = df.withColumn('clean', tweetPreprocessor_udf('full_text'))

+--------+
|hashtags|
+--------+
|      []|
|      []|
|      []|
|      []|
|      []|
|      []|
|      []|
|      []|
|      []|
|      []|
|      []|
|      []|
|      []|
|      []|
|      []|
|      []|
|      []|
|      []|
|      []|
|      []|
+--------+
only showing top 20 rows



In [102]:
# The nltk stopword won't download onto emr without some setup so it's just easier to copy and paste
stopwords_list = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'youre', 'youve', 'youll', 'youd', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'shes', 'her', 'hers', 'herself', 'it', 'its', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'thatll', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'dont', 'should', 'shouldve', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'arent', 'couldn', 'couldnt', 'didn', 'didnt', 'doesn', 'doesnt', 'hadn', 'hadnt', 'hasn', 'hasnt', 'haven', 'havent', 'isn', 'isnt', 'ma', 'mightn', 'mightnt', 'mustn', 'mustnt', 'needn', 'neednt', 'shan', 'shant', 'shouldn', 'shouldnt', 'wasn', 'wasnt', 'weren', 'werent', 'won', 'wont', 'wouldn', 'wouldnt', 'like', 'hes', 'let', 'lot', 'ok', 'yes']

In [103]:
documentAssembler = DocumentAssembler() \
    .setInputCol('clean') \
    .setOutputCol('document') \
    .setCleanupMode('shrink_full')

tokenizer = Tokenizer() \
    .setInputCols(['document']) \
    .setOutputCol('token')

normalizer = Normalizer() \
    .setInputCols(['token']) \
    .setOutputCol('normalized') \
    .setLowercase(True)

stopwords_cleaner = StopWordsCleaner() \
    .setInputCols(['normalized']) \
    .setOutputCol('normalized_no_stop') \
    .setCaseSensitive(False) \
    .setStopWords(stopwords_list)

lemmatizer = LemmatizerModel.pretrained() \
    .setInputCols(['normalized_no_stop']) \
    .setOutputCol('lemma')

finisher = Finisher() \
     .setInputCols(['lemma']) \
     .setCleanAnnotations(True)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [104]:
pipeline = Pipeline() \
     .setStages([
           documentAssembler,
           tokenizer,
           normalizer,
           stopwords_cleaner,
           lemmatizer,
           finisher
     ])

In [105]:
df = pipeline.fit(df).transform(df)
df = df.drop('clean').withColumnRenamed('finished_lemma', 'clean_text')

In [106]:
df = df.where(size(col('clean_text')) > 4)

In [107]:
df.select('clean_text').show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|clean_text                                                                                                                                                        |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[impeach, mikepence, murder, +, american, many, supporter]                                                                                                        |
|[kind, gloss, domestic, terrorism, blue, livestrump, supporter, trumpsterrorist, revoke, trumpfailure]                                                            |
|[let, start, prosecute, white, house, violation, hatch, act]                                                                                                      |
|[let, tal

Writing will take forever

In [108]:
filename = ''
df.write.parquet(filename, mode='overwrite')