## Creating a sentiment analyzer to classify the sentiment of a twitter post as positive or negative

### Using the Standford labelled twitter data to train the model

In [1]:
#Importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import zipfile

In [2]:
import findspark

findspark.init("/Users/Monika/Documents/Apache_Spark/spark-3.1.1-bin-hadoop2.7")

#ref: https://kevinvecmanis.io/python/pyspark/install/2019/05/31/Installing-Apache-Spark.html
#'Users/vanaurum/server/spark-2.4.3-bin-hadoop2.7'
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import format_number as fmt
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.ml.feature import Tokenizer,Word2Vec,StopWordsRemover,StringIndexer


In [3]:
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [4]:
#AAPL labelled tweets data: https://data.world/crowdflower/apple-twitter-sentiment

#https://stackoverflow.com/questions/2272908/too-many-open-files-how-many-are-open-what-they-are-and-how-many-can-the-jvm

### Loading the data

In [5]:
#Extracting the data from tar file

with zipfile.ZipFile("training.1600000.processed.noemoticon.csv.zip", 'r') as zip_ref:
    zip_ref.extractall("./data")

In [6]:
spark = (SparkSession
    .builder
    .appName('Sentiment_Analyser')
    .config('spark.executor.memory', "1G") 
    .config('spark.driver.memory', "2G") 
    .config('spark.sql.shuffle.partitions',2) 
    .config('spark.worker.cleanup.enabled', 'True')
    .config("spark.local.dir", "/tmp/spark-temp")
    .getOrCreate())

#ref: https://stackoverflow.com/questions/61328134/standalone-pyspark-error-too-many-open-files

In [7]:
#Reading the data from csv file to spark dataframe

data = spark.read.csv("data/training.1600000.processed.noemoticon.csv")

data = data.dropna()
data = data.dropDuplicates()
print("Labelled Tweets count is: ", data.count())


Labelled Tweets count is:  1600000


In [8]:
data.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)



In [9]:
data.show(10)

+---+----------+--------------------+--------+---------------+--------------------+
|_c0|       _c1|                 _c2|     _c3|            _c4|                 _c5|
+---+----------+--------------------+--------+---------------+--------------------+
|  0|1467811592|Mon Apr 06 22:20:...|NO_QUERY|        mybirch|         Need a hug |
|  0|1467811594|Mon Apr 06 22:20:...|NO_QUERY|           coZZ|@LOLTrish hey  lo...|
|  0|1467811795|Mon Apr 06 22:20:...|NO_QUERY|2Hood4Hollywood|@Tatiana_K nope t...|
|  0|1467812416|Mon Apr 06 22:20:...|NO_QUERY| erinx3leannexo|spring break in p...|
|  0|1467812723|Mon Apr 06 22:20:...|NO_QUERY|           TLeC|@caregiving I cou...|
|  0|1467812799|Mon Apr 06 22:20:...|NO_QUERY|     HairByJess|@iamjazzyfizzle I...|
|  0|1467812964|Mon Apr 06 22:20:...|NO_QUERY| lovesongwriter|Hollis' death sce...|
|  0|1467813985|Mon Apr 06 22:20:...|NO_QUERY|         quanvu|@alydesigns i was...|
|  0|1467813992|Mon Apr 06 22:20:...|NO_QUERY|     swinspeedx|one of my frie

In [10]:
data = data.withColumnRenamed("_c0","sentiment_value")
data = data.withColumnRenamed("_c5","tweet_content")

In [11]:
data.show(2)

+---------------+----------+--------------------+--------+-------+--------------------+
|sentiment_value|       _c1|                 _c2|     _c3|    _c4|       tweet_content|
+---------------+----------+--------------------+--------+-------+--------------------+
|              0|1467811592|Mon Apr 06 22:20:...|NO_QUERY|mybirch|         Need a hug |
|              0|1467811594|Mon Apr 06 22:20:...|NO_QUERY|   coZZ|@LOLTrish hey  lo...|
+---------------+----------+--------------------+--------+-------+--------------------+
only showing top 2 rows



In [12]:
data.select("sentiment_value","tweet_content").show(10)

+---------------+--------------------+
|sentiment_value|       tweet_content|
+---------------+--------------------+
|              0|         Need a hug |
|              0|@LOLTrish hey  lo...|
|              0|@Tatiana_K nope t...|
|              0|spring break in p...|
|              0|@caregiving I cou...|
|              0|@iamjazzyfizzle I...|
|              0|Hollis' death sce...|
|              0|@alydesigns i was...|
|              0|one of my friend ...|
|              0|this week is not ...|
+---------------+--------------------+
only showing top 10 rows



In [13]:
data.select("sentiment_value").distinct().count()

2

In [14]:
data.select("sentiment_value").distinct().show()

+---------------+
|sentiment_value|
+---------------+
|              4|
|              0|
+---------------+



In [15]:
data.select("sentiment_value").groupBy("sentiment_value").count().show()

+---------------+------+
|sentiment_value| count|
+---------------+------+
|              4|800000|
|              0|800000|
+---------------+------+



In [16]:
data = data.withColumn("sentiment_value",regexp_replace("sentiment_value","4","1")) #ref:https://dwgeek.com/replace-pyspark-dataframe-column-value-methods.html/
data.groupBy("sentiment_value").count().show()

+---------------+------+
|sentiment_value| count|
+---------------+------+
|              1|800000|
|              0|800000|
+---------------+------+



In [17]:
data.show(3)

+---------------+----------+--------------------+--------+---------------+--------------------+
|sentiment_value|       _c1|                 _c2|     _c3|            _c4|       tweet_content|
+---------------+----------+--------------------+--------+---------------+--------------------+
|              0|1467811592|Mon Apr 06 22:20:...|NO_QUERY|        mybirch|         Need a hug |
|              0|1467811594|Mon Apr 06 22:20:...|NO_QUERY|           coZZ|@LOLTrish hey  lo...|
|              0|1467811795|Mon Apr 06 22:20:...|NO_QUERY|2Hood4Hollywood|@Tatiana_K nope t...|
+---------------+----------+--------------------+--------+---------------+--------------------+
only showing top 3 rows



### Cleaning the data

In [18]:
def clean_pattern(text, pattern):
    """
    Function to identify the given pattern in a string and remove all the occurences
    """
    r = re.findall(pattern, text)
    for i in r:
        text = re.sub(i, '', text)        
    return text

def clean_tweet(text):
    
    """
      Function to remove tweet mentions, retweet tags, URL's, numbers and special characters
    """
    text = clean_pattern(text, 'RT @[\w]*:')

    text = clean_pattern(text, '@[\w]*')
    
    text = clean_pattern(text, 'https?://[A-Za-z0-9./]*')

    text = re.sub('[^A-Za-z]+', ' ', text)
    
    return text

In [19]:
clean_tweet_udf = udf(clean_tweet,StringType())

data = data.withColumn("cleaned_tweet", clean_tweet_udf("tweet_content"))

In [20]:
data.select("tweet_content","cleaned_tweet").show(5)

+--------------------+--------------------+
|       tweet_content|       cleaned_tweet|
+--------------------+--------------------+
|         Need a hug |         Need a hug |
|@LOLTrish hey  lo...| hey long time no...|
|@Tatiana_K nope t...| nope they didn t...|
|spring break in p...|spring break in p...|
|@caregiving I cou...| I couldn t bear ...|
+--------------------+--------------------+
only showing top 5 rows



In [21]:
data = data.withColumn("label",data["sentiment_value"].cast(IntegerType()))

### Train test split

In [22]:
#train_data,test_data = data.randomSplit([0.7,0.3])

#Performing split such that equal proportion of labels are present in both train and test datasets

#ref: https://stackoverflow.com/questions/47637760/stratified-sampling-with-pyspark
# split dataframes between 0s and 1s
zeros = data.filter(data["sentiment_value"]==0)
ones = data.filter(data["sentiment_value"]==1)

# split datasets into training and testing
train0, test0 = zeros.randomSplit([0.8,0.2])
train1, test1 = ones.randomSplit([0.8,0.2])

# stack datasets back together
train_data = train0.union(train1)
test_data = test0.union(test1)

In [23]:
train_data.groupby("sentiment_value").count().show()

+---------------+------+
|sentiment_value| count|
+---------------+------+
|              1|640292|
|              0|640266|
+---------------+------+



### Stop words Removal and Vectorisation of the text

In [24]:
tokenizer = Tokenizer(inputCol='cleaned_tweet', outputCol='tokenized_tweet')

#As this model is a sentiment analyser, removing the "not" word from the stop words list
stop_words = StopWordsRemover.loadDefaultStopWords("english")

stopword_remover = StopWordsRemover(inputCol='tokenized_tweet',outputCol='cleaned_tokens',stopWords=stop_words)
word2Vec = Word2Vec(vectorSize=100, minCount=5, inputCol='cleaned_tokens', outputCol='features')


### Model creation and execution

In [25]:
pipeline= Pipeline(stages=[tokenizer,stopword_remover,word2Vec])

### Training and evaluation of logistic regression, random forest models

In [26]:
data_cleaning_model = pipeline.fit(train_data)

In [27]:
train_data_vectorised = data_cleaning_model.transform(train_data)

In [28]:
train_data_vectorised.show()

+---------------+----------+--------------------+--------+---------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+
|sentiment_value|       _c1|                 _c2|     _c3|            _c4|       tweet_content|       cleaned_tweet|label|     tokenized_tweet|      cleaned_tokens|            features|
+---------------+----------+--------------------+--------+---------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+
|              0|1467811592|Mon Apr 06 22:20:...|NO_QUERY|        mybirch|         Need a hug |         Need a hug |    0|      [need, a, hug]|         [need, hug]|[0.25837088376283...|
|              0|1467811795|Mon Apr 06 22:20:...|NO_QUERY|2Hood4Hollywood|@Tatiana_K nope t...| nope they didn t...|    0|[, nope, they, di...|      [, nope, didn]|[-0.1228572515149...|
|              0|1467812416|Mon Apr 06 22:20:...|NO_QUERY| erinx3leann

In [29]:
test_data_vectorised = data_cleaning_model.transform(test_data)

In [30]:
test_data_vectorised.show()

+---------------+----------+--------------------+--------+---------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+
|sentiment_value|       _c1|                 _c2|     _c3|            _c4|       tweet_content|       cleaned_tweet|label|     tokenized_tweet|      cleaned_tokens|            features|
+---------------+----------+--------------------+--------+---------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+
|              0|1467811594|Mon Apr 06 22:20:...|NO_QUERY|           coZZ|@LOLTrish hey  lo...| hey long time no...|    0|[, hey, long, tim...|[, hey, long, tim...|[0.05560346074903...|
|              0|1467812723|Mon Apr 06 22:20:...|NO_QUERY|           TLeC|@caregiving I cou...| I couldn t bear ...|    0|[, i, couldn, t, ...|[, couldn, bear, ...|[-0.0161670572124...|
|              0|1467819812|Mon Apr 06 22:22:...|NO_QUERY|      IrisJu

In [31]:
test_data_vectorised.columns

['sentiment_value',
 '_c1',
 '_c2',
 '_c3',
 '_c4',
 'tweet_content',
 'cleaned_tweet',
 'label',
 'tokenized_tweet',
 'cleaned_tokens',
 'features']

### Creating a logistic regression model

In [32]:

lr = LogisticRegression(maxIter=100)
model_lr = lr.fit(train_data_vectorised)

In [33]:
lr_prediction = model_lr.transform(test_data_vectorised)

In [34]:
lr_prediction.show()

+---------------+----------+--------------------+--------+---------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|sentiment_value|       _c1|                 _c2|     _c3|            _c4|       tweet_content|       cleaned_tweet|label|     tokenized_tweet|      cleaned_tokens|            features|       rawPrediction|         probability|prediction|
+---------------+----------+--------------------+--------+---------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|              0|1467811594|Mon Apr 06 22:20:...|NO_QUERY|           coZZ|@LOLTrish hey  lo...| hey long time no...|    0|[, hey, long, tim...|[, hey, long, tim...|[0.05560346074903...|[-0.5610878529843...|[0.36329578891170...|       1.0|
|              0|1467812723|Mon Apr 06 22:20

In [35]:
lr_prediction.printSchema()

root
 |-- sentiment_value: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- tweet_content: string (nullable = true)
 |-- cleaned_tweet: string (nullable = true)
 |-- label: integer (nullable = true)
 |-- tokenized_tweet: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- cleaned_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [36]:
lr_prediction.select(['cleaned_tweet','features','label','rawPrediction','probability','prediction']).show(5)

+--------------------+--------------------+-----+--------------------+--------------------+----------+
|       cleaned_tweet|            features|label|       rawPrediction|         probability|prediction|
+--------------------+--------------------+-----+--------------------+--------------------+----------+
| hey long time no...|[0.05560346074903...|    0|[-0.5610878529843...|[0.36329578891170...|       1.0|
| I couldn t bear ...|[-0.0161670572124...|    0|[1.97295653056637...|[0.87792831930739...|       0.0|
|Oh man was ironin...|[0.01970249810256...|    0|[-0.8912347647744...|[0.29085508106901...|       1.0|
| Not sure what th...|[0.03242030080694...|    0|[0.60232840362207...|[0.64618882743656...|       0.0|
| I hate when that...|[-0.1298839425047...|    0|[2.32798440489947...|[0.91116832840511...|       0.0|
+--------------------+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



## #Creating a randomforest classifier

In [37]:


rf = RandomForestClassifier()
model_rf = rf.fit(train_data_vectorised)

In [38]:
rf_prediction = model_rf.transform(test_data_vectorised)

In [39]:
rf_prediction.show()

+---------------+----------+--------------------+--------+---------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|sentiment_value|       _c1|                 _c2|     _c3|            _c4|       tweet_content|       cleaned_tweet|label|     tokenized_tweet|      cleaned_tokens|            features|       rawPrediction|         probability|prediction|
+---------------+----------+--------------------+--------+---------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|              0|1467811594|Mon Apr 06 22:20:...|NO_QUERY|           coZZ|@LOLTrish hey  lo...| hey long time no...|    0|[, hey, long, tim...|[, hey, long, tim...|[0.05560346074903...|[10.5454892478080...|[0.52727446239040...|       0.0|
|              0|1467812723|Mon Apr 06 22:20

In [40]:
rf_prediction.select(['cleaned_tweet','features','label','rawPrediction','probability','prediction']).show(5)

+--------------------+--------------------+-----+--------------------+--------------------+----------+
|       cleaned_tweet|            features|label|       rawPrediction|         probability|prediction|
+--------------------+--------------------+-----+--------------------+--------------------+----------+
| hey long time no...|[0.05560346074903...|    0|[10.5454892478080...|[0.52727446239040...|       0.0|
| I couldn t bear ...|[-0.0161670572124...|    0|[9.50408600097941...|[0.47520430004897...|       1.0|
|Oh man was ironin...|[0.01970249810256...|    0|[11.4660164334713...|[0.57330082167356...|       0.0|
| Not sure what th...|[0.03242030080694...|    0|[10.5724233823917...|[0.52862116911958...|       0.0|
| I hate when that...|[-0.1298839425047...|    0|[12.1063624571341...|[0.60531812285670...|       0.0|
+--------------------+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



### Accuracy metrics computation

In [41]:
evaluator = BinaryClassificationEvaluator()
roc_accuracy=evaluator.evaluate(lr_prediction)
print('ROC-Accuracy of logistic classifier model at predicting sentiment is: {:.4f}'.format(roc_accuracy))

ROC-Accuracy of logistic classifier model at predicting sentiment is: 0.7777


In [42]:
accuracy = (lr_prediction.filter(lr_prediction['label'] == lr_prediction['prediction']).count())/(test_data.select(["sentiment_value"]).count())
print('Accuracy of logistic classifier model at predicting sentiment is: {:.4f}'.format(accuracy))

Accuracy of logistic classifier model at predicting sentiment is: 0.7047


In [43]:
evaluator = BinaryClassificationEvaluator()
roc_accuracy=evaluator.evaluate(rf_prediction)
print('ROC-Accuracy of random forest classifier model at predicting sentiment is: {:.4f}'.format(roc_accuracy))

ROC-Accuracy of random forest classifier model at predicting sentiment is: 0.7206


In [44]:
accuracy = rf_prediction.filter(rf_prediction['label'] == rf_prediction['prediction']).count()/test_data.select(["sentiment_value"]).count()
print('Accuracy of random forest classifier model at predicting sentiment is: {:.4f}'.format(accuracy))

Accuracy of random forest classifier model at predicting sentiment is: 0.6570


## Based on the accouracy scores, I conclude that the logistic classifier  is better than random forest classifier

In [46]:
model_lr.save("Sentiment_analyser.model")