In [11]:
import os

import findspark
from dotenv import load_dotenv

# Initialise findspark with the Spark installation at /opt/spark before any
# pyspark module is imported by the cells below.
findspark.init('/opt/spark')

# Load variables from the .env file one directory up, then read the AWS
# credentials from the environment (either value is None when unset).
load_dotenv('../.env')
access = os.getenv('AWS_ACCESS')
secret = os.getenv('AWS_SECRET')

In [68]:
import sparknlp

from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

from nltk.corpus import stopwords
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer,
                                LemmatizerModel, StopWordsCleaner)
from sparknlp.pretrained import PretrainedPipeline
from pyspark.ml import Pipeline

In [69]:
# Build the Spark configuration for reading/writing S3 through the S3A connector.
#
# Hadoop options are given with the "spark.hadoop." prefix: Spark only copies
# properties carrying that prefix into the Hadoop configuration that the
# filesystem connector reads.
#
# "fs.s3a.impl" is set exactly once, to the S3A filesystem class. The original
# cell set it a second time to the s3n NativeS3FileSystem, which overrode the
# S3A implementation.
conf = SparkConf() \
    .set("spark.hadoop.fs.s3a.endpoint", "s3.us-east-1.amazonaws.com") \
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .set("com.amazonaws.services.s3.enableV4", "true")
# NOTE(review): "com.amazonaws.services.s3.enableV4" looks like an AWS SDK JVM
# system property rather than a Spark/Hadoop option; it is kept unchanged from
# the original cell but may have no effect when set here -- confirm.

# Only set explicit credentials when both were found in the environment.
# SparkConf.set() stringifies its value, so passing None would silently
# configure the literal string "None" as a key. When they are absent the
# connector is left to its default credential lookup.
if access and secret:
    conf = conf \
        .set("spark.hadoop.fs.s3a.access.key", access) \
        .set("spark.hadoop.fs.s3a.secret.key", secret)

# Single-threaded local session; getOrCreate() reuses an existing session if
# one is already running in this kernel.
spark = SparkSession.builder.master('local').appName('cool').config(conf=conf).getOrCreate()

In [133]:
# NOTE(review): s3_file is an empty string here -- presumably a redacted
# placeholder; set it to the input parquet location before running.
s3_file = ''

# Columns removed from the loaded data before any processing.
unused_columns = ('geo', 'coordinates', 'place')
df = spark.read.parquet(s3_file).drop(*unused_columns)

In [134]:
# Print the schema of the loaded DataFrame to check column names and types.
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- full_text: string (nullable = true)
 |-- retweet_count: double (nullable = true)
 |-- favorite_count: double (nullable = true)



In [136]:
import preprocessor as p

In [137]:
def tweetPreprocessor(text):
    """Clean one tweet with the `preprocessor` module (imported as ``p``).

    The module is configured with the URL, EMOJI, SMILEY and NUMBER options
    and the text is then passed through ``p.clean``.

    Args:
        text: The raw tweet text, or None when the source column is null.

    Returns:
        The cleaned text, or None when ``text`` is None.
    """
    # The source column is nullable, so a null row arrives here as None;
    # pass it through instead of handing None to the cleaner.
    if text is None:
        return None
    # Options are set on every call, as in the original.
    # NOTE(review): presumably required so the options are also applied inside
    # Spark's Python worker processes -- confirm before hoisting this out.
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER)
    return p.clean(text)
# Wrap tweetPreprocessor as a Spark UDF whose declared return type is string.
tweetPreprocessor_udf = udf(tweetPreprocessor, StringType())

In [138]:
# Apply the cleaning UDF to 'full_text' and store the result in a new
# 'clean' column.
cleaned_text = tweetPreprocessor_udf(df['full_text'])
df = df.withColumn('clean', cleaned_text)

+-------------------+--------------------+-------------+--------------+--------------------+
|                 id|           full_text|retweet_count|favorite_count|               clean|
+-------------------+--------------------+-------------+--------------+--------------------+
|1300070747059167233|@kmiranda1973 @Mi...|          0.0|           2.0|@kmiranda1973 @Mi...|
|1300070747453427713|@realDonaldTrump ...|          0.0|           0.0|@realDonaldTrump ...|
|1300070747566755845|@realDonaldTrump ...|          0.0|           1.0|@realDonaldTrump ...|
|1300070748116131840|@EricTrump @realD...|          0.0|           1.0|@EricTrump @realD...|
|1300070748359454721|Impeached @realdo...|          0.0|           1.0|Impeached @realdo...|
+-------------------+--------------------+-------------+--------------+--------------------+
only showing top 5 rows



In [139]:
# English stop words from the NLTK corpus, handed to the StopWordsCleaner below.
stopwords_list = stopwords.words('english')

# Stage 1: turn the 'clean' text column into a 'document' annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol('clean')
    .setOutputCol('document')
    .setCleanupMode('shrink_full')
)

# Stage 2: split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

# Stage 3: normalise the tokens, lowercasing them.
normalizer = (
    Normalizer()
    .setInputCols(['token'])
    .setOutputCol('normalized')
    .setLowercase(True)
)

# Stage 4: lemmatise with the default pretrained lemmatizer model.
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(['normalized'])
    .setOutputCol('lemma')
)

# Stage 5: remove stop words from the lemmas, ignoring case.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(['lemma'])
    .setOutputCol('clean_text')
    .setCaseSensitive(False)
    .setStopWords(stopwords_list)
)

# Stage 6: finish the 'clean_text' annotations, with annotation cleanup enabled.
finisher = (
    Finisher()
    .setInputCols(['clean_text'])
    .setCleanAnnotations(True)
)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [140]:
# Stages run in list order: each annotator consumes the previous one's output column.
nlp_stages = [
    documentAssembler,
    tokenizer,
    normalizer,
    lemmatizer,
    stopwords_cleaner,
    finisher,
]
pipeline = Pipeline(stages=nlp_stages)

In [141]:
# Fit the pipeline on the tweets and run the same DataFrame through it.
nlp_model = pipeline.fit(df)
annotated = nlp_model.transform(df)

# Drop the intermediate 'clean' column and rename the Finisher's output
# column to 'clean_text'.
df = annotated.drop('clean').withColumnRenamed('finished_clean_text', 'clean_text')
df.show()

+-------------------+--------------------+-------------+--------------+--------------------+
|                 id|           full_text|retweet_count|favorite_count|          clean_text|
+-------------------+--------------------+-------------+--------------+--------------------+
|1300070747059167233|@kmiranda1973 @Mi...|          0.0|           2.0|[kmiranda, militi...|
|1300070747453427713|@realDonaldTrump ...|          0.0|           0.0|[realdonaldtrump,...|
|1300070747566755845|@realDonaldTrump ...|          0.0|           1.0|[realdonaldtrump,...|
|1300070748116131840|@EricTrump @realD...|          0.0|           1.0|[erictrump, reald...|
|1300070748359454721|Impeached @realdo...|          0.0|           1.0|[impeach, realdon...|
|1300070748367785985|@PollWatch2020 @r...|          0.0|           0.0|[pollwatch, reald...|
|1300070748631916548|@RichLowry @realD...|          0.0|           0.0|[richlowry, reald...|
|1300070748942344192|@realDonaldTrump ...|          0.0|           1.0

In [144]:
# NOTE(review): s3_write is an empty string here -- presumably a redacted
# placeholder; set it to the output parquet location before running.
s3_write = ''

# Write the processed DataFrame as parquet, replacing any existing output.
df.write.mode('overwrite').parquet(s3_write)