## Big Data Project: Twitter Sentiment Analysis
Laiba Shah, Part Time Data Science Bootcamp

## 1. Installing Necessary Packages

In [0]:
#optional- create wordcloud in notebook
pip install wordcloud

Python interpreter will be restarted.
Collecting wordcloud
  Downloading wordcloud-1.8.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (458 kB)
Installing collected packages: wordcloud
Successfully installed wordcloud-1.8.2.2
Python interpreter will be restarted.


In [0]:
#install texblob NLP library for sentiment analysis
pip install textblob

Python interpreter will be restarted.
Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
Collecting nltk>=3.1
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting tqdm
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
Collecting regex>=2021.8.3
  Downloading regex-2022.10.31-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (769 kB)
Installing collected packages: tqdm, regex, nltk, textblob
Successfully installed nltk-3.8.1 regex-2022.10.31 textblob-0.17.1 tqdm-4.64.1
Python interpreter will be restarted.


## 2. Import

In [0]:
#import SparkContext and SparkConf
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
#import all the libraries of pyspark.sql
from pyspark.sql import *
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.ml.feature import StopWordsRemover, CountVectorizer, Tokenizer, HashingTF, IDF, StringIndexer
from textblob import TextBlob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, NaiveBayes #import all machine learning libraries
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator



## 3. Mounting Twitter s3 Buckets

In [0]:
#function to mount s3 bucket
def mount_s3_bucket(access_key, secret_key, bucket_name, mount_folder):
    ACCESS_KEY_ID=access_key
    SECRET_ACCESS_KEY=secret_key
    ENCODED_SECRET_KEY=SECRET_ACCESS_KEY.replace("/","%2F")
    
    print("Mounting", bucket_name)
    
    try:
        dbutils.fs.unmount("/mnt/%s" % mount_folder)
        
    except:
        print("Directory not unmounted: ", mount_folder)
        
    finally:
        dbutils.fs.mount("s3a://%s:%s@%s" % (ACCESS_KEY_ID, ENCODED_SECRET_KEY, bucket_name),"/mnt/%s" % mount_folder)
        print("The bucket ", bucket_name, " was mounted to ", mount_folder, "\n")

In [0]:
#specify access and secret access keys to access mounted s3 bucket
ACCESS_KEY="AKIA5NF3A6CQHV5Z7VH7"
SECRET_ACCESS_KEY="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"

In [0]:
#call function to mount to s3 elon musk twitter bucket
mount_s3_bucket(ACCESS_KEY, SECRET_ACCESS_KEY, "weclouddata/twitter/ElonMusk/2022/", "twitter_Musk")

Mounting weclouddata/twitter/ElonMusk/2022/
/mnt/twitter_Musk has been unmounted.
The bucket  weclouddata/twitter/ElonMusk/2022/  was mounted to  twitter_Musk 



## 4. Reading and Formatting Data from Mounted Bucket

In [0]:
#create spark session 
spark = (SparkSession
        .builder
        .appName('elon_musk')
        .getOrCreate())
print('Session created')

#initialize SQLContext from spark cluster
sc=spark.sparkContext

Session created


In [0]:
#create schema to structure data from s3 bucket
schema = StructType([StructField("ID", StringType(), True),
                     StructField("Name", StringType(), True),
                     StructField("Handle", StringType(), True),
                     StructField("Tweet", StringType(), True),
                     StructField("Follower_count", IntegerType(), True),
                     StructField("Location", StringType(), True),
                     StructField("Geo", StringType(), True),
                     StructField("Created_at", StringType(), True)])

In [0]:
#read elon musk twitter data from s3 bucket using schema specified above into spark dataframe
df= spark.read.option('header', 'true').option('delimiter', '\t').schema(schema).csv('/mnt/twitter_Musk/11/*/*/*')
df.cache()
df.display()

ID,Name,Handle,Tweet,Follower_count,Location,Geo,Created_at
1.594755515485143e+18,Javier Perdomo,Javierperdomo,"RT @MattGertz: Elon Musk interacting with sycophantic right-wing influencers this weekend, a thread.",469.0,,,Mon Nov 21 18:11:58 +0000 2022
1.5947555175741645e+18,Casey Reilley,caseyreilley,"RT @MattGertz: Elon Musk interacting with sycophantic right-wing influencers this weekend, a thread.",180.0,"Hawaii, USA",,Mon Nov 21 18:11:59 +0000 2022
1.5947555198680433e+18,Val Ornelas,_surfcowgirl,"RT @elizableu: I’d like to make something else clear, I don’t work for Twitter, Elon Musk, any government, political party, group etc. I ru…",195.0,Merica,,Mon Nov 21 18:11:59 +0000 2022
1.594755519981437e+18,Name Can't be Blank,adrenaline1073,RT @disclosetv: JUST IN - Elon Musk has reinstated Rep. Marjorie Taylor Greene's (R-GA) personal Twitter account.,915.0,"North Coast, Ohio USA",,Mon Nov 21 18:11:59 +0000 2022
1.5947555205267784e+18,The Original Johnboy 🇺🇸uLtRA MAgA🇺🇸 #WPS,johnboy02131989,RT @BehizyTweets: BREAKING: Elon Musk just reinstated Marjorie Taylor Greene's account https://t.co/4hwc46k6Yx,203.0,,,Mon Nov 21 18:11:59 +0000 2022
1.5947555207909376e+18,⚡️Gideon Henry⚡️🇺🇸🍊,GideonHenry,RT @w_terrence: Elon Musk should purchase the rights to the “ THE VIEW “on ABC. I would love to see the look on their faces and watch them…,3684.0,America,,Mon Nov 21 18:11:59 +0000 2022
1.594755521197785e+18,Leora Smoot,MomOfTwins57,"RT @DashDobrofsky: CNN's Jake Tapper asked Hakeem Jeffries what his ""reaction"" was to Donald Trump being reinstated on Twitter by Elon Musk…",772.0,,,Mon Nov 21 18:12:00 +0000 2022
1.594755522191917e+18,Howard Lovy,Howard_Lovy,RT @JTAnews: Elon Musk bantered with Kanye West and trolled the Anti-Defamation League this weekend. https://t.co/zvKrrAOP5C,8820.0,"Traverse City, MI",,Mon Nov 21 18:12:00 +0000 2022
1.594755524024746e+18,CNM MERCOSUL,CNMMERCOSUL,"RT @TradutordoBR: Jair Bolsonaro: ""Elon Musk, here they call me a myth, I don't know why, but you really are the myth of our freedom."" 🇧🇷🇺🇸…",1598.0,America do Sul,,Mon Nov 21 18:12:00 +0000 2022
1.5947555240037745e+18,Simon.Mount,pseuderman,@whotheFisfran Watching Elon Musk masterbate,300.0,London,,Mon Nov 21 18:12:00 +0000 2022


## 5. Cleaning Tweet Column

In [0]:
#drop any rows where tweet is null
tweets_clean= df.dropna(subset=['Tweet'])

#reformat tweets to remove unnecessary information
tweets_clean = tweets_clean.withColumn('Tweet', F.regexp_replace('Tweet', r"http\S+", "")) \
                    .withColumn('Tweet', F.regexp_replace('Tweet', r"[^a-zA-z]", " ")) \
                    .withColumn('Tweet', F.regexp_replace('Tweet', r"\s+", " ")) \
                    .withColumn('Tweet', F.lower('Tweet')) \
                    .withColumn('Tweet', F.trim('Tweet')) 

tweets_clean.display()

ID,Name,Handle,Tweet,Follower_count,Location,Geo,Created_at
1594755515485143052,Javier Perdomo,Javierperdomo,rt mattgertz elon musk interacting with sycophantic right wing influencers this weekend a thread,469,,,Mon Nov 21 18:11:58 +0000 2022
1594755517574164481,Casey Reilley,caseyreilley,rt mattgertz elon musk interacting with sycophantic right wing influencers this weekend a thread,180,"Hawaii, USA",,Mon Nov 21 18:11:59 +0000 2022
1594755519868043264,Val Ornelas,_surfcowgirl,rt elizableu i d like to make something else clear i don t work for twitter elon musk any government political party group etc i ru,195,Merica,,Mon Nov 21 18:11:59 +0000 2022
1594755519981436961,Name Can't be Blank,adrenaline1073,rt disclosetv just in elon musk has reinstated rep marjorie taylor greene s r ga personal twitter account,915,"North Coast, Ohio USA",,Mon Nov 21 18:11:59 +0000 2022
1594755520526778368,The Original Johnboy 🇺🇸uLtRA MAgA🇺🇸 #WPS,johnboy02131989,rt behizytweets breaking elon musk just reinstated marjorie taylor greene s account,203,,,Mon Nov 21 18:11:59 +0000 2022
1594755520790937611,⚡️Gideon Henry⚡️🇺🇸🍊,GideonHenry,rt w_terrence elon musk should purchase the rights to the the view on abc i would love to see the look on their faces and watch them,3684,America,,Mon Nov 21 18:11:59 +0000 2022
1594755521197785106,Leora Smoot,MomOfTwins57,rt dashdobrofsky cnn s jake tapper asked hakeem jeffries what his reaction was to donald trump being reinstated on twitter by elon musk,772,,,Mon Nov 21 18:12:00 +0000 2022
1594755522191917056,Howard Lovy,Howard_Lovy,rt jtanews elon musk bantered with kanye west and trolled the anti defamation league this weekend,8820,"Traverse City, MI",,Mon Nov 21 18:12:00 +0000 2022
1594755524024746055,CNM MERCOSUL,CNMMERCOSUL,rt tradutordobr jair bolsonaro elon musk here they call me a myth i don t know why but you really are the myth of our freedom,1598,America do Sul,,Mon Nov 21 18:12:00 +0000 2022
1594755524003774476,Simon.Mount,pseuderman,whothefisfran watching elon musk masterbate,300,London,,Mon Nov 21 18:12:00 +0000 2022


## 6. Sentiment Analysis on Tweets

In [0]:
#create function using textblob library for processing tweets to determine polarity value
def get_sentiment(text):
    sentiment = TextBlob(text).sentiment.polarity
    if sentiment > 0:
        return 'positive'
    elif sentiment < 0:
        return 'negative'
    else:
        return 'neutral'

In [0]:
#call function and create column in new df to save results of sentiment analysis
sentiment_score = F.udf(lambda x: get_sentiment(x))
df_sent = tweets_clean.withColumn('sentiment', sentiment_score('Tweet'))

df_sent.display()

ID,Name,Handle,Tweet,Follower_count,Location,Geo,Created_at,sentiment
1594755515485143052,Javier Perdomo,Javierperdomo,rt mattgertz elon musk interacting with sycophantic right wing influencers this weekend a thread,469,,,Mon Nov 21 18:11:58 +0000 2022,positive
1594755517574164481,Casey Reilley,caseyreilley,rt mattgertz elon musk interacting with sycophantic right wing influencers this weekend a thread,180,"Hawaii, USA",,Mon Nov 21 18:11:59 +0000 2022,positive
1594755519868043264,Val Ornelas,_surfcowgirl,rt elizableu i d like to make something else clear i don t work for twitter elon musk any government political party group etc i ru,195,Merica,,Mon Nov 21 18:11:59 +0000 2022,positive
1594755519981436961,Name Can't be Blank,adrenaline1073,rt disclosetv just in elon musk has reinstated rep marjorie taylor greene s r ga personal twitter account,915,"North Coast, Ohio USA",,Mon Nov 21 18:11:59 +0000 2022,neutral
1594755520526778368,The Original Johnboy 🇺🇸uLtRA MAgA🇺🇸 #WPS,johnboy02131989,rt behizytweets breaking elon musk just reinstated marjorie taylor greene s account,203,,,Mon Nov 21 18:11:59 +0000 2022,neutral
1594755520790937611,⚡️Gideon Henry⚡️🇺🇸🍊,GideonHenry,rt w_terrence elon musk should purchase the rights to the the view on abc i would love to see the look on their faces and watch them,3684,America,,Mon Nov 21 18:11:59 +0000 2022,positive
1594755521197785106,Leora Smoot,MomOfTwins57,rt dashdobrofsky cnn s jake tapper asked hakeem jeffries what his reaction was to donald trump being reinstated on twitter by elon musk,772,,,Mon Nov 21 18:12:00 +0000 2022,neutral
1594755522191917056,Howard Lovy,Howard_Lovy,rt jtanews elon musk bantered with kanye west and trolled the anti defamation league this weekend,8820,"Traverse City, MI",,Mon Nov 21 18:12:00 +0000 2022,neutral
1594755524024746055,CNM MERCOSUL,CNMMERCOSUL,rt tradutordobr jair bolsonaro elon musk here they call me a myth i don t know why but you really are the myth of our freedom,1598,America do Sul,,Mon Nov 21 18:12:00 +0000 2022,positive
1594755524003774476,Simon.Mount,pseuderman,whothefisfran watching elon musk masterbate,300,London,,Mon Nov 21 18:12:00 +0000 2022,neutral


## 7. Machine Learning Analysis

7a. Feature Transformers

In [0]:
#create transformers for the ML LR and later pipeline model
tokenizer = Tokenizer(inputCol="Tweet", outputCol="tokens")
tweets_tokenized = tokenizer.transform(df_sent)

#stopword removal
stopword_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
tweets_stopword = stopword_remover.transform(tweets_tokenized)

#term frequency using countVectorizer
cv = CountVectorizer(vocabSize=2**16, inputCol="filtered", outputCol="cv")
cv_model = cv.fit(tweets_stopword)
tweets_cv = cv_model.transform(tweets_stopword)

#feature transformation using TF-IDF Vectorization
idf = IDF(inputCol="cv", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
idf_model = idf.fit(tweets_cv)
tweets_idf = idf_model.transform(tweets_cv)

#code sentiment categories with values between 0-2
label_encoder = StringIndexer(inputCol = "sentiment", outputCol = "label")
le_model = label_encoder.fit(tweets_idf)
tweets_label = le_model.transform(tweets_idf)

tweets_label.display()

ID,Name,Handle,Tweet,Follower_count,Location,Geo,Created_at,sentiment,tokens,filtered,cv,features,label
1594755515485143052,Javier Perdomo,Javierperdomo,rt mattgertz elon musk interacting with sycophantic right wing influencers this weekend a thread,469,,,Mon Nov 21 18:11:58 +0000 2022,positive,"List(rt, mattgertz, elon, musk, interacting, with, sycophantic, right, wing, influencers, this, weekend, a, thread)","List(rt, mattgertz, elon, musk, interacting, sycophantic, right, wing, influencers, weekend, thread)","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 19, 45, 51, 86, 96, 104, 107, 108), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 19, 45, 51, 86, 96, 104, 107, 108), values -> List(0.25209607074085033, 0.27868325941847955, 0.2749297788265887, 3.289216376689384, 3.8560088855384462, 3.911237189067705, 4.074051667863911, 4.13403828114073, 4.182362508602561, 4.188570599032641, 4.191588406372737))",0.0
1594755517574164481,Casey Reilley,caseyreilley,rt mattgertz elon musk interacting with sycophantic right wing influencers this weekend a thread,180,"Hawaii, USA",,Mon Nov 21 18:11:59 +0000 2022,positive,"List(rt, mattgertz, elon, musk, interacting, with, sycophantic, right, wing, influencers, this, weekend, a, thread)","List(rt, mattgertz, elon, musk, interacting, sycophantic, right, wing, influencers, weekend, thread)","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 19, 45, 51, 86, 96, 104, 107, 108), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 19, 45, 51, 86, 96, 104, 107, 108), values -> List(0.25209607074085033, 0.27868325941847955, 0.2749297788265887, 3.289216376689384, 3.8560088855384462, 3.911237189067705, 4.074051667863911, 4.13403828114073, 4.182362508602561, 4.188570599032641, 4.191588406372737))",0.0
1594755519868043264,Val Ornelas,_surfcowgirl,rt elizableu i d like to make something else clear i don t work for twitter elon musk any government political party group etc i ru,195,Merica,,Mon Nov 21 18:11:59 +0000 2022,positive,"List(rt, elizableu, i, d, like, to, make, something, else, clear, i, don, t, work, for, twitter, elon, musk, any, government, political, party, group, etc, i, ru)","List(rt, elizableu, d, like, make, something, else, clear, work, twitter, elon, musk, government, political, party, group, etc, ru)","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 3, 13, 123, 129, 137, 221, 345, 371, 446, 939, 964, 1303, 1626, 2580, 5435), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 3, 13, 123, 129, 137, 221, 345, 371, 446, 939, 964, 1303, 1626, 2580, 5435), values -> List(0.25209607074085033, 0.27868325941847955, 0.2749297788265887, 0.8334641350576671, 3.122005138886446, 4.344564412036965, 4.3966118123194065, 4.447345491395334, 4.6933212349908695, 5.063637649295379, 5.128851096939793, 5.375546579364405, 6.294474095890863, 6.353964872304168, 6.812476123627297, 7.088225975572108, 7.857166995501586, 9.117835143504069))",0.0
1594755519981436961,Name Can't be Blank,adrenaline1073,rt disclosetv just in elon musk has reinstated rep marjorie taylor greene s r ga personal twitter account,915,"North Coast, Ohio USA",,Mon Nov 21 18:11:59 +0000 2022,neutral,"List(rt, disclosetv, just, in, elon, musk, has, reinstated, rep, marjorie, taylor, greene, s, r, ga, personal, twitter, account)","List(rt, disclosetv, elon, musk, reinstated, rep, marjorie, taylor, greene, r, ga, personal, twitter, account)","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 3, 7, 23, 56, 58, 60, 101, 120, 130, 133, 179), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 3, 7, 23, 56, 58, 60, 101, 120, 130, 133, 179), values -> List(0.25209607074085033, 0.27868325941847955, 0.2749297788265887, 0.8334641350576671, 2.8037840031880874, 3.4140526688478667, 3.936519867629343, 3.9451439900734404, 3.9413715869828962, 4.1643582632477605, 4.319980613686475, 4.388973485173427, 4.412316983681695, 4.54714138704595))",1.0
1594755520526778368,The Original Johnboy 🇺🇸uLtRA MAgA🇺🇸 #WPS,johnboy02131989,rt behizytweets breaking elon musk just reinstated marjorie taylor greene s account,203,,,Mon Nov 21 18:11:59 +0000 2022,neutral,"List(rt, behizytweets, breaking, elon, musk, just, reinstated, marjorie, taylor, greene, s, account)","List(rt, behizytweets, breaking, elon, musk, reinstated, marjorie, taylor, greene, account)","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 7, 17, 23, 56, 58, 60, 5703), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 7, 17, 23, 56, 58, 60, 5703), values -> List(0.25209607074085033, 0.27868325941847955, 0.2749297788265887, 2.8037840031880874, 3.2375349588737126, 3.4140526688478667, 3.936519867629343, 3.9451439900734404, 3.9413715869828962, 9.204846520493698))",1.0
1594755520790937611,⚡️Gideon Henry⚡️🇺🇸🍊,GideonHenry,rt w_terrence elon musk should purchase the rights to the the view on abc i would love to see the look on their faces and watch them,3684,America,,Mon Nov 21 18:11:59 +0000 2022,positive,"List(rt, w_terrence, elon, musk, should, purchase, the, rights, to, the, the, view, on, abc, i, would, love, to, see, the, look, on, their, faces, and, watch, them)","List(rt, w_terrence, elon, musk, purchase, rights, view, abc, love, see, look, faces, watch)","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 116, 118, 125, 185, 248, 309, 329, 343, 354, 355), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 116, 118, 125, 185, 248, 309, 329, 343, 354, 355), values -> List(0.25209607074085033, 0.27868325941847955, 0.2749297788265887, 4.260825975479426, 4.2668905381429365, 4.34785698321473, 4.579873707209426, 4.7682743100797635, 4.972938302119136, 5.014732747015558, 5.057871174882977, 5.0840862683318315, 5.0850705203797935))",0.0
1594755521197785106,Leora Smoot,MomOfTwins57,rt dashdobrofsky cnn s jake tapper asked hakeem jeffries what his reaction was to donald trump being reinstated on twitter by elon musk,772,,,Mon Nov 21 18:12:00 +0000 2022,neutral,"List(rt, dashdobrofsky, cnn, s, jake, tapper, asked, hakeem, jeffries, what, his, reaction, was, to, donald, trump, being, reinstated, on, twitter, by, elon, musk)","List(rt, dashdobrofsky, cnn, jake, tapper, asked, hakeem, jeffries, reaction, donald, trump, reinstated, twitter, elon, musk)","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 3, 4, 20, 23, 128, 244, 259, 289, 292, 304, 307, 308), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 3, 4, 20, 23, 128, 244, 259, 289, 292, 304, 307, 308), values -> List(0.25209607074085033, 0.27868325941847955, 0.2749297788265887, 0.8334641350576671, 2.4597171286007526, 3.310610659012671, 3.4140526688478667, 4.360898069043994, 4.755798653706688, 4.8025717249898685, 4.870531086824862, 4.873316049834348, 4.9185468191204835, 4.929865321842562, 4.932398037421846))",1.0
1594755522191917056,Howard Lovy,Howard_Lovy,rt jtanews elon musk bantered with kanye west and trolled the anti defamation league this weekend,8820,"Traverse City, MI",,Mon Nov 21 18:12:00 +0000 2022,neutral,"List(rt, jtanews, elon, musk, bantered, with, kanye, west, and, trolled, the, anti, defamation, league, this, weekend)","List(rt, jtanews, elon, musk, bantered, kanye, west, trolled, anti, defamation, league, weekend)","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 51, 226, 320, 928, 1527, 1528, 9616, 16560, 28465), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 51, 226, 320, 928, 1527, 1528, 9616, 16560, 28465), values -> List(0.25209607074085033, 0.27868325941847955, 0.2749297788265887, 3.911237189067705, 4.73511430540071, 4.97381897148473, 6.281359153813035, 6.997571607303977, 7.000910508569492, 10.216447432172178, 0.0, 0.0))",1.0
1594755524024746055,CNM MERCOSUL,CNMMERCOSUL,rt tradutordobr jair bolsonaro elon musk here they call me a myth i don t know why but you really are the myth of our freedom,1598,America do Sul,,Mon Nov 21 18:12:00 +0000 2022,positive,"List(rt, tradutordobr, jair, bolsonaro, elon, musk, here, they, call, me, a, myth, i, don, t, know, why, but, you, really, are, the, myth, of, our, freedom)","List(rt, tradutordobr, jair, bolsonaro, elon, musk, call, myth, know, really, myth, freedom)","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 49, 132, 135, 138, 278, 374, 378, 379), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 49, 132, 135, 138, 278, 374, 378, 379), values -> List(0.25209607074085033, 0.27868325941847955, 0.2749297788265887, 3.8571619557152093, 4.411814597336036, 4.445006309042162, 10.229101093595194, 4.874113180207367, 5.125769430402386, 5.1350430671877145, 5.1365970690544485))",0.0
1594755524003774476,Simon.Mount,pseuderman,whothefisfran watching elon musk masterbate,300,London,,Mon Nov 21 18:12:00 +0000 2022,neutral,"List(whothefisfran, watching, elon, musk, masterbate)","List(whothefisfran, watching, elon, musk, masterbate)","Map(vectorType -> sparse, length -> 57241, indices -> List(1, 2, 776, 45764, 53398), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 57241, indices -> List(1, 2, 776, 45764, 53398), values -> List(0.27868325941847955, 0.2749297788265887, 6.073312705780645, 0.0, 0.0))",1.0


7b. Logistic Regression Classifier

In [0]:
#logistic regression classification without (train/test) paramter tuning
lr = LogisticRegression(maxIter=100)
lr_model = lr.fit(tweets_label)
predictions = lr_model.transform(tweets_label)

#model evaluation using multiclassfication evaluator for lr classification
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)

#print accuracy score and prediction dataframe
print("Accuracy Score: {0:.4f}".format(accuracy))
predictions.display()

Accuracy Score: 0.9952


ID,Name,Handle,Tweet,Follower_count,Location,Geo,Created_at,sentiment,tokens,filtered,cv,features,label,rawPrediction,probability,prediction
1594755515485143052,Javier Perdomo,Javierperdomo,rt mattgertz elon musk interacting with sycophantic right wing influencers this weekend a thread,469,,,Mon Nov 21 18:11:58 +0000 2022,positive,"List(rt, mattgertz, elon, musk, interacting, with, sycophantic, right, wing, influencers, this, weekend, a, thread)","List(rt, mattgertz, elon, musk, interacting, sycophantic, right, wing, influencers, weekend, thread)","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 19, 45, 51, 86, 96, 104, 107, 108), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 19, 45, 51, 86, 96, 104, 107, 108), values -> List(0.25209607074085033, 0.27868325941847955, 0.2749297788265887, 3.289216376689384, 3.8560088855384462, 3.911237189067705, 4.074051667863911, 4.13403828114073, 4.182362508602561, 4.188570599032641, 4.191588406372737))",0.0,"Map(vectorType -> dense, length -> 3, values -> List(7.069273715158641, -3.65405866762168, -3.415215047536963))","Map(vectorType -> dense, length -> 3, values -> List(0.9999500105954332, 2.2023899057402388E-5, 2.7965505509446068E-5))",0.0
1594755517574164481,Casey Reilley,caseyreilley,rt mattgertz elon musk interacting with sycophantic right wing influencers this weekend a thread,180,"Hawaii, USA",,Mon Nov 21 18:11:59 +0000 2022,positive,"List(rt, mattgertz, elon, musk, interacting, with, sycophantic, right, wing, influencers, this, weekend, a, thread)","List(rt, mattgertz, elon, musk, interacting, sycophantic, right, wing, influencers, weekend, thread)","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 19, 45, 51, 86, 96, 104, 107, 108), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 19, 45, 51, 86, 96, 104, 107, 108), values -> List(0.25209607074085033, 0.27868325941847955, 0.2749297788265887, 3.289216376689384, 3.8560088855384462, 3.911237189067705, 4.074051667863911, 4.13403828114073, 4.182362508602561, 4.188570599032641, 4.191588406372737))",0.0,"Map(vectorType -> dense, length -> 3, values -> List(7.069273715158641, -3.65405866762168, -3.415215047536963))","Map(vectorType -> dense, length -> 3, values -> List(0.9999500105954332, 2.2023899057402388E-5, 2.7965505509446068E-5))",0.0
1594755519868043264,Val Ornelas,_surfcowgirl,rt elizableu i d like to make something else clear i don t work for twitter elon musk any government political party group etc i ru,195,Merica,,Mon Nov 21 18:11:59 +0000 2022,positive,"List(rt, elizableu, i, d, like, to, make, something, else, clear, i, don, t, work, for, twitter, elon, musk, any, government, political, party, group, etc, i, ru)","List(rt, elizableu, d, like, make, something, else, clear, work, twitter, elon, musk, government, political, party, group, etc, ru)","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 3, 13, 123, 129, 137, 221, 345, 371, 446, 939, 964, 1303, 1626, 2580, 5435), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 3, 13, 123, 129, 137, 221, 345, 371, 446, 939, 964, 1303, 1626, 2580, 5435), values -> List(0.25209607074085033, 0.27868325941847955, 0.2749297788265887, 0.8334641350576671, 3.122005138886446, 4.344564412036965, 4.3966118123194065, 4.447345491395334, 4.6933212349908695, 5.063637649295379, 5.128851096939793, 5.375546579364405, 6.294474095890863, 6.353964872304168, 6.812476123627297, 7.088225975572108, 7.857166995501586, 9.117835143504069))",0.0,"Map(vectorType -> dense, length -> 3, values -> List(9.209121888495966, -14.473426613570622, 5.264304725074653))","Map(vectorType -> dense, length -> 3, values -> List(0.9810127389282202, 5.0871496677539024E-11, 0.018987261020908454))",0.0
1594755519981436961,Name Can't be Blank,adrenaline1073,rt disclosetv just in elon musk has reinstated rep marjorie taylor greene s r ga personal twitter account,915,"North Coast, Ohio USA",,Mon Nov 21 18:11:59 +0000 2022,neutral,"List(rt, disclosetv, just, in, elon, musk, has, reinstated, rep, marjorie, taylor, greene, s, r, ga, personal, twitter, account)","List(rt, disclosetv, elon, musk, reinstated, rep, marjorie, taylor, greene, r, ga, personal, twitter, account)","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 3, 7, 23, 56, 58, 60, 101, 120, 130, 133, 179), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 3, 7, 23, 56, 58, 60, 101, 120, 130, 133, 179), values -> List(0.25209607074085033, 0.27868325941847955, 0.2749297788265887, 0.8334641350576671, 2.8037840031880874, 3.4140526688478667, 3.936519867629343, 3.9451439900734404, 3.9413715869828962, 4.1643582632477605, 4.319980613686475, 4.388973485173427, 4.412316983681695, 4.54714138704595))",1.0,"Map(vectorType -> dense, length -> 3, values -> List(-7.000850037810684, 8.75428058572866, -1.753430547917976))","Map(vectorType -> dense, length -> 3, values -> List(1.4375460947797208E-7, 0.999972532067261, 2.7324178129508598E-5))",1.0
1594755520526778368,The Original Johnboy 🇺🇸uLtRA MAgA🇺🇸 #WPS,johnboy02131989,rt behizytweets breaking elon musk just reinstated marjorie taylor greene s account,203,,,Mon Nov 21 18:11:59 +0000 2022,neutral,"List(rt, behizytweets, breaking, elon, musk, just, reinstated, marjorie, taylor, greene, s, account)","List(rt, behizytweets, breaking, elon, musk, reinstated, marjorie, taylor, greene, account)","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 7, 17, 23, 56, 58, 60, 5703), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 7, 17, 23, 56, 58, 60, 5703), values -> List(0.25209607074085033, 0.27868325941847955, 0.2749297788265887, 2.8037840031880874, 3.2375349588737126, 3.4140526688478667, 3.936519867629343, 3.9451439900734404, 3.9413715869828962, 9.204846520493698))",1.0,"Map(vectorType -> dense, length -> 3, values -> List(-3.4986998160735316, 11.382881548664011, -7.884181732590479))","Map(vectorType -> dense, length -> 3, values -> List(3.4435879535183834E-7, 0.9999996513515673, 4.289637399228817E-9))",1.0
1594755520790937611,⚡️Gideon Henry⚡️🇺🇸🍊,GideonHenry,rt w_terrence elon musk should purchase the rights to the the view on abc i would love to see the look on their faces and watch them,3684,America,,Mon Nov 21 18:11:59 +0000 2022,positive,"List(rt, w_terrence, elon, musk, should, purchase, the, rights, to, the, the, view, on, abc, i, would, love, to, see, the, look, on, their, faces, and, watch, them)","List(rt, w_terrence, elon, musk, purchase, rights, view, abc, love, see, look, faces, watch)","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 116, 118, 125, 185, 248, 309, 329, 343, 354, 355), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 116, 118, 125, 185, 248, 309, 329, 343, 354, 355), values -> List(0.25209607074085033, 0.27868325941847955, 0.2749297788265887, 4.260825975479426, 4.2668905381429365, 4.34785698321473, 4.579873707209426, 4.7682743100797635, 4.972938302119136, 5.014732747015558, 5.057871174882977, 5.0840862683318315, 5.0850705203797935))",0.0,"Map(vectorType -> dense, length -> 3, values -> List(8.961150913129748, 0.5540392755004726, -9.515190188630223))","Map(vectorType -> dense, length -> 3, values -> List(0.9997767665635156, 2.2322397999242205E-4, 9.456491877657635E-9))",0.0
1594755521197785106,Leora Smoot,MomOfTwins57,rt dashdobrofsky cnn s jake tapper asked hakeem jeffries what his reaction was to donald trump being reinstated on twitter by elon musk,772,,,Mon Nov 21 18:12:00 +0000 2022,neutral,"List(rt, dashdobrofsky, cnn, s, jake, tapper, asked, hakeem, jeffries, what, his, reaction, was, to, donald, trump, being, reinstated, on, twitter, by, elon, musk)","List(rt, dashdobrofsky, cnn, jake, tapper, asked, hakeem, jeffries, reaction, donald, trump, reinstated, twitter, elon, musk)","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 3, 4, 20, 23, 128, 244, 259, 289, 292, 304, 307, 308), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 3, 4, 20, 23, 128, 244, 259, 289, 292, 304, 307, 308), values -> List(0.25209607074085033, 0.27868325941847955, 0.2749297788265887, 0.8334641350576671, 2.4597171286007526, 3.310610659012671, 3.4140526688478667, 4.360898069043994, 4.755798653706688, 4.8025717249898685, 4.870531086824862, 4.873316049834348, 4.9185468191204835, 4.929865321842562, 4.932398037421846))",1.0,"Map(vectorType -> dense, length -> 3, values -> List(-4.123717908684958, 8.091908116481427, -3.968190207796471))","Map(vectorType -> dense, length -> 3, values -> List(4.952407364215534E-6, 0.9999892618223997, 5.785770236121185E-6))",1.0
1594755522191917056,Howard Lovy,Howard_Lovy,rt jtanews elon musk bantered with kanye west and trolled the anti defamation league this weekend,8820,"Traverse City, MI",,Mon Nov 21 18:12:00 +0000 2022,neutral,"List(rt, jtanews, elon, musk, bantered, with, kanye, west, and, trolled, the, anti, defamation, league, this, weekend)","List(rt, jtanews, elon, musk, bantered, kanye, west, trolled, anti, defamation, league, weekend)","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 51, 226, 320, 928, 1527, 1528, 9616, 16560, 28465), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 51, 226, 320, 928, 1527, 1528, 9616, 16560, 28465), values -> List(0.25209607074085033, 0.27868325941847955, 0.2749297788265887, 3.911237189067705, 4.73511430540071, 4.97381897148473, 6.281359153813035, 6.997571607303977, 7.000910508569492, 10.216447432172178, 0.0, 0.0))",1.0,"Map(vectorType -> dense, length -> 3, values -> List(0.5695981788955957, 15.693096813414266, -16.26269499230986))","Map(vectorType -> dense, length -> 3, values -> List(2.703633870459164E-7, 0.9999997296365997, 1.3236581453888276E-14))",1.0
1594755524024746055,CNM MERCOSUL,CNMMERCOSUL,rt tradutordobr jair bolsonaro elon musk here they call me a myth i don t know why but you really are the myth of our freedom,1598,America do Sul,,Mon Nov 21 18:12:00 +0000 2022,positive,"List(rt, tradutordobr, jair, bolsonaro, elon, musk, here, they, call, me, a, myth, i, don, t, know, why, but, you, really, are, the, myth, of, our, freedom)","List(rt, tradutordobr, jair, bolsonaro, elon, musk, call, myth, know, really, myth, freedom)","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 49, 132, 135, 138, 278, 374, 378, 379), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 57241, indices -> List(0, 1, 2, 49, 132, 135, 138, 278, 374, 378, 379), values -> List(0.25209607074085033, 0.27868325941847955, 0.2749297788265887, 3.8571619557152093, 4.411814597336036, 4.445006309042162, 10.229101093595194, 4.874113180207367, 5.125769430402386, 5.1350430671877145, 5.1365970690544485))",0.0,"Map(vectorType -> dense, length -> 3, values -> List(7.839300310369076, -1.588642074858058, -6.2506582355110165))","Map(vectorType -> dense, length -> 3, values -> List(0.999918802048448, 8.043802172997586E-5, 7.599298222088999E-7))",0.0
1594755524003774476,Simon.Mount,pseuderman,whothefisfran watching elon musk masterbate,300,London,,Mon Nov 21 18:12:00 +0000 2022,neutral,"List(whothefisfran, watching, elon, musk, masterbate)","List(whothefisfran, watching, elon, musk, masterbate)","Map(vectorType -> sparse, length -> 57241, indices -> List(1, 2, 776, 45764, 53398), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 57241, indices -> List(1, 2, 776, 45764, 53398), values -> List(0.27868325941847955, 0.2749297788265887, 6.073312705780645, 0.0, 0.0))",1.0,"Map(vectorType -> dense, length -> 3, values -> List(-0.9693481405274076, 4.15912173620875, -3.1897735956813427))","Map(vectorType -> dense, length -> 3, values -> List(0.00588694955144831, 0.9934739460721733, 6.391043763784413E-4))",1.0


## 8. ML Pipeline

Modeling a pipeline reusable for predicting future tweet sentiment by putting all transformers and estimators in a `Pipeline` object.

In [0]:
#function to determine model accuracy
def evaluate_model(model_name, model_predictions):
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
    accuracy = evaluator.evaluate(model_predictions)
    print(str(model_name) + " Accuracy Score: {0:.4f}".format(accuracy))

#function to run model as specified below
def ml_model(model_name):
    model = model_name.fit(trainData)
    predictions = model.transform(testData)
    evaluate_model(model_name, predictions)
    #predictions.display()

In [0]:
#build and fit the pipeline model
pipeline = Pipeline(stages=[tokenizer, stopword_remover, cv, idf, label_encoder])
pipeline_fit = pipeline.fit(df_sent)
dataset = pipeline_fit.transform(df_sent)

In [0]:
#use 70% cases for training, 30% cases for testing
(trainData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)

print("Training Dataset Count: " + str(trainData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 229355
Test Dataset Count: 98836


In [0]:
#logistic regression model with parameter tuning
logistic_reg = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

#cross-validation to tune hyper parameters;only tuning the count vectors Logistic Regression
paramGrid = (ParamGridBuilder()
             .addGrid(logistic_reg.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(logistic_reg.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
             .build())

cross_val = CrossValidator(estimator=logistic_reg, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)

In [0]:
#Naive Bayes model
naive_bayes = NaiveBayes(smoothing=1)

In [0]:
#random forest model- grayed out because it kept timing out and throwing errors
#random_forest = RandomForestClassifier(labelCol="label", \
#                            featuresCol="features", \
#                            numTrees = 100, \
#                            maxDepth = 4, \
#                            maxBins = 32)

In [0]:
#calling the ml model function for each machine learning model as specified below
ml_model(logistic_reg)
ml_model(cross_val)
#ml_model(random_forest)
ml_model(naive_bayes)

LogisticRegression_debc837a0756 Accuracy Score: 0.9392
CrossValidator_e5182ae6c632 Accuracy Score: 0.9516
NaiveBayes_c2181b950024 Accuracy Score: 0.9109


## 9. Write Prediction and Raw Data into s3 bucket

In [0]:
#write predictions and raw data into my s3 bucket
mount_s3_bucket(ACCESS_KEY, SECRET_ACCESS_KEY, 'ptb2-shah', 'my_bucket')

Mounting ptb2-shah
/mnt/my_bucket has been unmounted.
The bucket  ptb2-shah  was mounted to  my_bucket 



In [0]:
#write predictions into parquet file due to array columns
cols = ("tokens","cv","features","rawPrediction","probability") #remove any unnecessary columns before writing to parquet file
predictions=predictions.drop(*cols)
predictions.write.option('header','false').parquet('/mnt/my_bucket/twitter/em_predictions_21.parquet')

In [0]:
#write raw data into csv file
df_sent.write.option('header','false').option('delimiter', '\t').csv('/mnt/my_bucket/twitter/rawdata_21.csv')