In [21]:
import os

import findspark
from dotenv import load_dotenv

# Point findspark at the Spark installation under /opt/spark; this has to
# run before any pyspark module is imported.
findspark.init('/opt/spark')

# Load variables from the .env file one directory up, then read the AWS
# credentials from the environment (each is None when the variable is unset).
load_dotenv('../.env')
access = os.getenv('AWS_ACCESS')
secret = os.getenv('AWS_SECRET')

In [99]:
from pyspark import SparkConf
from pyspark.ml import Pipeline
from pyspark.ml.clustering import LDA, LDAModel
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

In [23]:
# Spark configuration for reading from S3 through the s3a connector.
# Hadoop filesystem options are only copied into the Hadoop Configuration
# when they carry the "spark.hadoop." prefix, and the S3A credential options
# are named fs.s3a.access.key / fs.s3a.secret.key.
conf = (
    SparkConf()
    .set("spark.hadoop.fs.s3a.endpoint", "s3.us-east-1.amazonaws.com")
    # A single implementation entry: s3a:// URLs must be served by
    # S3AFileSystem (NativeS3FileSystem belongs to the s3n scheme).
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # NOTE(review): this looks like an AWS SDK JVM system property rather than
    # a Spark option; as a plain SparkConf key it presumably has no effect.
    # Confirm whether it needs to go through extraJavaOptions instead.
    .set("com.amazonaws.services.s3.enableV4", "true")
)

# Only pass explicit credentials when both were found in the environment;
# otherwise the literal string "None" would be stored as the key/secret.
if access and secret:
    conf = (
        conf
        .set("spark.hadoop.fs.s3a.access.key", access)
        .set("spark.hadoop.fs.s3a.secret.key", secret)
    )

# master() expects a cluster URL such as local[*], spark://host:port or yarn;
# 'LDA' is not a valid master, so run locally on all available cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('cool')
    .config(conf=conf)
    .getOrCreate()
)

In [24]:
# Alternative source kept for reference: one part file of the processed
# dataset on S3.
# filename = 's3a://patellism/processed_data/2020-08-cleaned.parquet.snappy/part-00004-d2d9c5cf-46de-47d9-86df-28fefd1709e5-c000.snappy.parquet'
filename = '2020-08-cleaned-small.snappy.parquet'

# Read the parquet file and drop the 'geo', 'coordinates' and 'place' columns.
df = (
    spark.read
    .parquet(filename)
    .drop('geo', 'coordinates', 'place')
)
df.show(n=5)

+-------------------+--------------------+-------------+--------------+--------------------+
|                 id|           full_text|retweet_count|favorite_count|          clean_text|
+-------------------+--------------------+-------------+--------------+--------------------+
|1300070747059167233|@kmiranda1973 @Mi...|          0.0|           2.0|[kmiranda, militi...|
|1300070747453427713|@realDonaldTrump ...|          0.0|           0.0|[realdonaldtrump,...|
|1300070747566755845|@realDonaldTrump ...|          0.0|           1.0|[realdonaldtrump,...|
|1300070748116131840|@EricTrump @realD...|          0.0|           1.0|[erictrump, reald...|
|1300070748359454721|Impeached @realdo...|          0.0|           1.0|[impeach, realdon...|
+-------------------+--------------------+-------------+--------------+--------------------+
only showing top 5 rows



In [None]:
# TF-IDF variant: token counts -> IDF re-weighting -> LDA.
# The vectorizer keeps at most 10000 terms, each appearing in >= 35 documents.
cv_idf = CountVectorizer(
    inputCol='clean_text',
    outputCol='raw_features',
    vocabSize=10000,
    minDF=35,
)

# IDF writes to 'features', the column LDA reads by default.
idf = IDF(
    inputCol="raw_features",
    outputCol="features",
)

lda_tfidf = LDA(
    k=10,
    maxIter=2,
)

tfidf_pipeline = Pipeline(
    stages=[cv_idf, idf, lda_tfidf],
)

In [None]:
# Term-frequency variant: raw token counts go straight into LDA, with no
# IDF re-weighting step. The vectorizer writes directly to 'features'.
cv_tf = CountVectorizer(inputCol='clean_text', outputCol='features', vocabSize=10000, minDF=35)
lda_tf = LDA(k=10, maxIter=2)

# Wire in the two stages built in this cell (cv_tf, lda_tf).
tf_pipeline = Pipeline(stages=[cv_tf, lda_tf])

In [None]:
# Three candidate groupings of the tweets:
#   - Ungrouped
#   - By time
#   - By hashtag
#       TODO: determine similarity metrics and a method for handling tweets with no hashtags

In [None]:
# Fit the TF-IDF pipeline (CountVectorizer -> IDF -> LDA) on the loaded
# DataFrame; the fitted stages are read back via model.stages further down.
model = tfidf_pipeline.fit(df)

In [None]:
# Vocabulary of the first pipeline stage (the fitted CountVectorizer); used
# to translate term indices back into words.
vocabArray = model.stages[0].vocabulary

# Up to five terms per topic, taken from the last pipeline stage (the LDA model).
topics = model.stages[-1].describeTopics(maxTermsPerTopic=5)

def covertToWord(indices):
    """Translate vocabulary indices into their words.

    Args:
        indices: iterable of integer positions into the module-level
            ``vocabArray``.

    Returns:
        A list with ``vocabArray[i]`` for each ``i`` in ``indices``, in the
        same order.
    """
    return [vocabArray[position] for position in indices]

# Wrap the index-to-word lookup as a Spark UDF that returns an array of strings.
udf_convertToWord = udf(
    covertToWord,
    returnType=ArrayType(StringType()),
)

# Add a 'word' column computed from each topic's 'termIndices', then print
# that column without truncation.
topics = topics.withColumn(
    'word',
    udf_convertToWord('termIndices'),
)
topics.select('word').show(truncate=False)