In [21]:
# import findspark
# findspark.init('/opt/spark')

# import os
# from dotenv import load_dotenv
# load_dotenv('../.env')
# access = os.environ.get('AWS_ACCESS')
# secret = os.environ.get('AWS_SECRET')

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql import Row
from pyspark.sql.types import ArrayType, StringType, TimestampType, DateType, StructType, DoubleType, IntegerType, FloatType
from pyspark.sql import functions as f

from pyspark.ml.functions import vector_to_array
from pyspark.ml.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.ml.feature import CountVectorizer, IDF, StopWordsRemover
from pyspark.ml import Pipeline, PipelineModel

conf = SparkConf() \
    .set("fs.s3a.awsAccessKeyId", access) \
    .set("fs.s3a.awsSecretAccessKey", secret) \
    .set("fs.s3a.endpoint", "s3.us-east-1.amazonaws.com") \
    .set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .set("fs.s3a.impl","org.apache.hadoop.fs.s3native.NativeS3FileSystem") \
    .set("com.amazonaws.services.s3.enableV4", "true")

spark = SparkSession.builder.appName('LDA').config(conf=conf).getOrCreate()

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1619723587027_0002,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [74]:
files = []
df = spark.read.parquet(files).drop('geo', 'coordinates', 'place', 'retweet_count', 'favorite_count')
df = df.withColumn('datetime', df['created_at'].cast(TimestampType())).drop('created_at')
df.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+--------------------+--------------------+--------------------+---------------+-------------------+
|                 id|           full_text|          clean_text|           sentiment|sentiment_class|           datetime|
+-------------------+--------------------+--------------------+--------------------+---------------+-------------------+
|1316679802288513027|@realDonaldTrump ...|[yeah, read, one,...| [0.155, 0.0, 0.845]|              1|2020-10-15 09:58:22|
|1312125637600010240|Remember how nice...|[remember, nice, ...|[0.351, 0.263, 0....|              1|2020-10-02 20:21:44|
|1321059480394747905|You're insane.  P...|[insane, presiden...|[0.318, 0.233, 0.45]|              1|2020-10-27 12:01:38|
|1317133069783097345|@RepMattGaetz @re...|[democrat, protec...|[0.497, 0.081, 0....|              1|2020-10-16 15:59:29|
|1312853349126021128|It fuck up how pe...|[fuck, people, ce...|[0.35, 0.324, 0.327]|              1|2020-10-04 20:33:24|
+-------------------+-----------

In [75]:
temp = df.drop('id','full_text','hashtags','datetime') \
         .withColumn('word', f.explode('clean_text')) \
         .groupBy('word') \
         .count() \
         .sort('count', ascending=False)

remove_list = temp.drop('count').limit(25).collect()
to_remove = []
for row in remove_list:
    to_remove.append(row['word'])
to_remove = to_remove + ['let', 'cant', 'lol', 'way', 'also', 'for', 'and', 'nor', 'but', 'or', 'yet', 'so','give','come', '+', 'still', 'hear', 'already', 'feel', 'really', 'long', 'without', 'ask', 'via', 
                         'oh', 'please', 'seriously', 'that','every', 'must', 'stay', 'put', 'keep', 'ever', 'fit', 'gonna', 'theyre', 'anyone', 'thing', 'wow', 'yeah', 'sure', 'use', 'actually', 'much', 'help',
                        'everything', 'interest', 'remember', 'guess', 'there', 'tell', 'do', 'around', 'enough', 'question', 'speak', 'well', 'believe', 'become', 'call', 'look']

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [76]:
rm_freq_words = StopWordsRemover(inputCol='clean_text', outputCol='clean_rm_frequent', stopWords=to_remove)
cv = CountVectorizer(inputCol='clean_rm_frequent', outputCol='raw_features', vocabSize=5000, minDF=5)
idf = IDF(inputCol="raw_features", outputCol="features")
lda = LDA(k=6, maxIter=400, topicConcentration=.04, docConcentration=[.05], optimizer='online')

pipeline = Pipeline(stages=[rm_freq_words, cv, idf, lda])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [77]:
model = pipeline.fit(df)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [78]:
topics = model.stages[-1].describeTopics(maxTermsPerTopic=9)
vocabArray = model.stages[1].vocabulary

def covertToWord(indices):
    result = []
    for i in indices:
        result.append(vocabArray[i])
    return result

udf_convertToWord = f.udf(covertToWord, ArrayType(StringType()))
topics = topics.withColumn('word', udf_convertToWord('termIndices'))
topics.select('word').show(truncate=False)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------------------------------------------------------------------------------+
|word                                                                                |
+------------------------------------------------------------------------------------+
|[state, america, god, ballot, bless, stand, last, country, act, lead]               |
|[plan, health, care, maga, security, social, world, try, life, part]                |
|[party, everyone, another, wish, never, bad, republican, twitter, conservative, try]|
|[house, tax, peace, mask, else, white, wear, today, rich, break]                    |
|[man, woman, black, court, word, wrong, support, supreme, donald, voter]            |
|[news, debate, test, first, covid, positive, funny, new, virus, even]               |
+------------------------------------------------------------------------------------+

In [79]:
len(vocabArray)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

1890

In [80]:
transformed = model.transform(df).drop('hashtags', 'clean_rm_frequent', 'raw_features', 'features')
transformed.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+--------------------+--------------------+--------------------+---------------+-------------------+--------------------+
|                 id|           full_text|          clean_text|           sentiment|sentiment_class|           datetime|   topicDistribution|
+-------------------+--------------------+--------------------+--------------------+---------------+-------------------+--------------------+
|1316679802288513027|@realDonaldTrump ...|[yeah, read, one,...| [0.155, 0.0, 0.845]|              1|2020-10-15 09:58:22|[0.47098284125424...|
|1312125637600010240|Remember how nice...|[remember, nice, ...|[0.351, 0.263, 0....|              1|2020-10-02 20:21:44|[0.41687262796477...|
|1321059480394747905|You're insane.  P...|[insane, presiden...|[0.318, 0.233, 0.45]|              1|2020-10-27 12:01:38|[0.08655836429115...|
|1317133069783097345|@RepMattGaetz @re...|[democrat, protec...|[0.497, 0.081, 0....|              1|2020-10-16 15:59:29|[0.36164059912514...|
|13128

In [81]:
max_index = f.udf(lambda x: x.tolist().index(max(x)), IntegerType())
transformed = transformed.withColumn('topic_id', max_index('topicDistribution'))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [82]:
transformed.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+--------------------+--------------------+--------------------+---------------+-------------------+--------------------+--------+
|                 id|           full_text|          clean_text|           sentiment|sentiment_class|           datetime|   topicDistribution|topic_id|
+-------------------+--------------------+--------------------+--------------------+---------------+-------------------+--------------------+--------+
|1316679802288513027|@realDonaldTrump ...|[yeah, read, one,...| [0.155, 0.0, 0.845]|              1|2020-10-15 09:58:22|[0.47098284125424...|       0|
|1312125637600010240|Remember how nice...|[remember, nice, ...|[0.351, 0.263, 0....|              1|2020-10-02 20:21:44|[0.41687262796477...|       0|
|1321059480394747905|You're insane.  P...|[insane, presiden...|[0.318, 0.233, 0.45]|              1|2020-10-27 12:01:38|[0.08655836429115...|       4|
|1317133069783097345|@RepMattGaetz @re...|[democrat, protec...|[0.497, 0.081, 0....|          

In [83]:
def extractSentiment(row):
    pos = row[0]
    neg = row[1]
    if pos > neg:
        return pos
    else:
        return neg

extractSentiment_udf = f.udf(extractSentiment, FloatType())

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [84]:
transformed = transformed.withColumn('sentiment_score', extractSentiment_udf('sentiment'))
transformed.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+--------------------+--------------------+--------------------+---------------+-------------------+--------------------+--------+---------------+
|                 id|           full_text|          clean_text|           sentiment|sentiment_class|           datetime|   topicDistribution|topic_id|sentiment_score|
+-------------------+--------------------+--------------------+--------------------+---------------+-------------------+--------------------+--------+---------------+
|1316679802288513027|@realDonaldTrump ...|[yeah, read, one,...| [0.155, 0.0, 0.845]|              1|2020-10-15 09:58:22|[0.47098284125424...|       0|          0.155|
|1312125637600010240|Remember how nice...|[remember, nice, ...|[0.351, 0.263, 0....|              1|2020-10-02 20:21:44|[0.41687262796477...|       0|          0.351|
|1321059480394747905|You're insane.  P...|[insane, presiden...|[0.318, 0.233, 0.45]|              1|2020-10-27 12:01:38|[0.08655836429115...|       4|          0.318

In [85]:
transformed = transformed.groupBy('topic_id').agg(f.avg('sentiment_score'))
transformed.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+--------------------+
|topic_id|avg(sentiment_score)|
+--------+--------------------+
|       1| 0.31303439483426176|
|       3| 0.31083252128304506|
|       4|  0.3083450519043254|
|       5|  0.2992143931015321|
|       2| 0.31084645213138673|
|       0| 0.30439868577534634|
+--------+--------------------+

In [86]:
topics = topics.drop('termIndices', 'termWeights')
result = transformed.join(topics, transformed.topic_id == topics.topic, 'inner').drop('topic').sort(f.col('avg(sentiment_score)'), ascending=False)
result.show(truncate=False)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+--------------------+------------------------------------------------------------------------------------+
|topic_id|avg(sentiment_score)|word                                                                                |
+--------+--------------------+------------------------------------------------------------------------------------+
|1       |0.31303439483426176 |[plan, health, care, maga, security, social, world, try, life, part]                |
|2       |0.31084645213138673 |[party, everyone, another, wish, never, bad, republican, twitter, conservative, try]|
|3       |0.31083252128304506 |[house, tax, peace, mask, else, white, wear, today, rich, break]                    |
|4       |0.3083450519043254  |[man, woman, black, court, word, wrong, support, supreme, donald, voter]            |
|0       |0.30439868577534634 |[state, america, god, ballot, bless, stand, last, country, act, lead]               |
|5       |0.2992143931015321  |[news, debate, test, first, covid