# Setup

In [1]:
! pip install -q pyspark==3.3.0 spark-nlp==4.2.4

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.3/281.3 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m448.4/448.4 KB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
import os
import sys

import sparknlp

from sparknlp.base import *
from sparknlp.common import *
from sparknlp.annotator import *

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

import pandas as pd

spark = sparknlp.start()

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

spark

Spark NLP version:  4.2.4
Apache Spark version:  3.3.0


In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
%cd /content/drive/MyDrive/Thesis/Code/GenerativeDataAugmentation/Coba18 - Testing All Dataset

/content/drive/MyDrive/Thesis/Code/GenerativeDataAugmentation/Coba18 - Testing All Dataset


# Load Data

In [6]:
pd.options.mode.chained_assignment = None
dataset = pd.read_csv("augJoin - AllDataset - filtered.csv", index_col=0, encoding="utf-8")
dataset

Unnamed: 0,text,emotion
0,im not talking about a studio. im talking abou...,anger
1,. . . Wait a fucking second,anger
2,"Oh [NAME], someone did it already, and it was ...",surprise
3,see im on here having fun meming on all yall b...,joy
4,Yeah - thanks - I vaguely remember this. I won...,surprise
...,...,...
78005,Soon: Mass death of pink-skinned Aussies.,disgust
78006,Ohhh sorry. I got it now. Thanks bruv,joy
78007,"Don't listen to the idiot, we love your posts!",anger
78008,i can feel more emotional now,sadness


In [8]:
dataset = dataset.rename({'text': 'Text', 'emotion': 'Emotion'}, axis=1)  # new method

In [9]:
dataset = dataset[dataset.Emotion != 'love']
dataset

Unnamed: 0,Text,Emotion
0,im not talking about a studio. im talking abou...,anger
1,. . . Wait a fucking second,anger
2,"Oh [NAME], someone did it already, and it was ...",surprise
3,see im on here having fun meming on all yall b...,joy
4,Yeah - thanks - I vaguely remember this. I won...,surprise
...,...,...
78005,Soon: Mass death of pink-skinned Aussies.,disgust
78006,Ohhh sorry. I got it now. Thanks bruv,joy
78007,"Don't listen to the idiot, we love your posts!",anger
78008,i can feel more emotional now,sadness


In [10]:
df = spark.createDataFrame(dataset)

In [11]:
df.show(truncate=20)

+--------------------+--------+
|                Text| Emotion|
+--------------------+--------+
|im not talking ab...|   anger|
|. . . Wait a fuck...|   anger|
|Oh [NAME], someon...|surprise|
|see im on here ha...|     joy|
|Yeah - thanks - I...|surprise|
|In that video, th...|surprise|
|   That’s terrifying|    fear|
|i feel comfortabl...|     joy|
|If it sounds weir...| disgust|
|i feel hesitant a...|    fear|
|The rare times wh...| disgust|
|14 and im quite g...| sadness|
|Stranger than fic...| sadness|
|Here come the ang...|   anger|
|Wow. I didn’t rea...|surprise|
|Thats exactly wha...|    fear|
|You’d be surprise...|surprise|
|Ay just like my l...|     joy|
|i feel so worthle...| sadness|
|$70... Now that's...|     joy|
+--------------------+--------+
only showing top 20 rows



In [12]:
from pyspark.sql.functions import col

df.groupBy("Emotion") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------+-----+
| Emotion|count|
+--------+-----+
|     joy|14744|
| sadness|14168|
|    fear|13037|
|   anger|12987|
|surprise|12330|
| disgust|10744|
+--------+-----+



# Feature Extraction

In [13]:
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF, OneHotEncoder, StringIndexer, VectorAssembler, SQLTransformer

In [14]:
%%time

document_assembler = DocumentAssembler() \
      .setInputCol("Text") \
      .setOutputCol("document")
    
tokenizer = Tokenizer() \
      .setInputCols(["document"]) \
      .setOutputCol("token")
      
normalizer = Normalizer() \
      .setInputCols(["token"]) \
      .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

stemmer = Stemmer() \
      .setInputCols(["cleanTokens"]) \
      .setOutputCol("stem")

finisher = Finisher() \
      .setInputCols(["stem"]) \
      .setOutputCols(["token_features"]) \
      .setOutputAsArray(True) \
      .setCleanAnnotations(False)

countVectors = CountVectorizer(inputCol="token_features", outputCol="features", vocabSize=10000, minDF=5)

label_stringIdx = StringIndexer(inputCol = "Emotion", outputCol = "label")

nlp_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            stemmer, 
            finisher,
            countVectors,
            label_stringIdx])

nlp_model = nlp_pipeline.fit(df)

processed = nlp_model.transform(df)

processed.count()

CPU times: user 346 ms, sys: 58.6 ms, total: 405 ms
Wall time: 21 s


78010

In [15]:
from pyspark.ml.feature import HashingTF, IDF

hashingTF = HashingTF(inputCol="token_features", outputCol="rawFeatures", numFeatures=10000)

idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms

nlp_pipeline_tf = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            stemmer, 
            finisher,
            hashingTF,
            idf,
            label_stringIdx])

nlp_model_tf = nlp_pipeline_tf.fit(df)

processed_tf = nlp_model_tf.transform(df)

processed_tf.count()

78010

In [16]:
(trainingData, testData) = processed_tf.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 54642
Test Dataset Count: 23368


# Classifier

## Random Forest

In [17]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)

# Train model with Training Data
rfModel = rf.fit(trainingData)
predictions_rf = rfModel.transform(testData)

In [18]:
predictions_rf.select("Text","Emotion","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+-------+------------------------------+-----+----------+
|                          Text|Emotion|                   probability|label|prediction|
+------------------------------+-------+------------------------------+-----+----------+
|Amazing, I would also be de...|    joy|[0.25843910312407864,0.1657...|  0.0|       0.0|
|i feel happy now that i am ...|    joy|[0.2574705726993286,0.17432...|  0.0|       0.0|
|im feeling mellow and am en...|    joy|[0.257035267304163,0.172306...|  0.0|       0.0|
|So many nice things here. G...|    joy|[0.2529496537748046,0.17393...|  0.0|       0.0|
|So many nice things here. G...|    joy|[0.2529496537748046,0.17393...|  0.0|       0.0|
|So many nice things here. G...|    joy|[0.2529496537748046,0.17393...|  0.0|       0.0|
|Happy birthday! Enjoy your ...|    joy|[0.25160878729078473,0.1704...|  0.0|       0.0|
|enjoy it and happy New Year...|    joy|[0.24777643483777798,0.1707...|  0.0|       0.0|
|enjoy it and happy N

In [19]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

y_true = predictions_rf.select("label")
y_true = y_true.toPandas()

y_pred = predictions_rf.select("prediction")
y_pred = y_pred.toPandas()

print(classification_report(y_true.label, y_pred.prediction, zero_division=0))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.23      0.97      0.37      4316
         1.0       0.64      0.27      0.38      4286
         2.0       0.90      0.34      0.49      3922
         3.0       0.70      0.17      0.28      3838
         4.0       0.88      0.19      0.32      3743
         5.0       0.85      0.03      0.06      3263

    accuracy                           0.35     23368
   macro avg       0.70      0.33      0.32     23368
weighted avg       0.69      0.35      0.33     23368

0.3512067785005135


## Logistic Regression

In [20]:
# training the logistic regression madel
from pyspark.ml.classification import LogisticRegression
lr =  LogisticRegression(featuresCol = "features", labelCol="label", maxIter=10, regParam=0.3, elasticNetParam=0.8)

lrModel = lr.fit(trainingData)
predictions_lr = lrModel.transform(testData)

In [21]:
predictions_lr.select("Text","Emotion","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                          Text| Emotion|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
| "Bing" is a terrible enoug...| disgust|[0.19084168696397316,0.1808...|  5.0|       0.0|
| ### A surprise, to be sure...|surprise|[0.19084168696397316,0.1808...|  4.0|       0.0|
| *snerk*...no wonder she's ...|surprise|[0.19084168696397316,0.1808...|  4.0|       0.0|
| > And they did enjoy their...|    fear|[0.19084168696397316,0.1808...|  2.0|       0.0|
| >[NAME] is still extremely...| disgust|[0.19084168696397316,0.1808...|  5.0|       0.0|
| >it's a very dangerous thi...|    fear|[0.19084168696397316,0.1808...|  2.0|       0.0|
| A couple things: 1) it was...|    fear|[0.19084168696397316,0.1808...|  2.0|       0.0|
| A girl walked past with a ...|surprise|[0.19084168696397316,0.1808...|  4.0|       0.0|
| Ah yes, 

In [22]:
y_true = predictions_lr.select("label")
y_true = y_true.toPandas()

y_pred = predictions_lr.select("prediction")
y_pred = y_pred.toPandas()

print(classification_report(y_true.label, y_pred.prediction, zero_division=0))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.18      1.00      0.31      4316
         1.0       0.00      0.00      0.00      4286
         2.0       0.00      0.00      0.00      3922
         3.0       0.00      0.00      0.00      3838
         4.0       0.00      0.00      0.00      3743
         5.0       0.00      0.00      0.00      3263

    accuracy                           0.18     23368
   macro avg       0.03      0.17      0.05     23368
weighted avg       0.03      0.18      0.06     23368

0.18469702156795617


## Decision Tree

In [23]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol = "features", labelCol="label")
dtModel = dt.fit(trainingData)
predictions_dt = dtModel.transform(testData)

In [24]:
predictions_dt.select("Text","Emotion","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                          Text| Emotion|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|   I feel personally attacked.|    fear|[0.27721107499159287,0.2892...|  2.0|       1.0|
| i also want to say that th...|    fear|[0.27721107499159287,0.2892...|  2.0|       1.0|
| I know but it feels too da...|    fear|[0.27721107499159287,0.2892...|  2.0|       1.0|
| I'm not the only one [NAME...|    fear|[0.27721107499159287,0.2892...|  2.0|       1.0|
| It’s scary and I feel like...|    fear|[0.27721107499159287,0.2892...|  2.0|       1.0|
| Just tell them how it feel...|    fear|[0.27721107499159287,0.2892...|  2.0|       1.0|
| No one else feels the same...|surprise|[0.27721107499159287,0.2892...|  4.0|       1.0|
| This just made me feel cre...|    fear|[0.27721107499159287,0.2892...|  2.0|       1.0|
| i actual

In [25]:
y_true = predictions_dt.select("label")
y_true = y_true.toPandas()

y_pred = predictions_dt.select("prediction")
y_pred = y_pred.toPandas()

print(classification_report(y_true.label, y_pred.prediction, zero_division=0))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      4316
         1.0       0.30      0.52      0.38      4286
         2.0       0.96      0.12      0.22      3922
         3.0       0.67      0.13      0.21      3838
         4.0       0.89      0.27      0.41      3743
         5.0       0.22      0.93      0.36      3263

    accuracy                           0.31     23368
   macro avg       0.51      0.33      0.26     23368
weighted avg       0.50      0.31      0.26     23368

0.31029613146182816


## Naive Bayes

In [26]:
# from pyspark.ml.classification import RandomForestClassifier

# rf = RandomForestClassifier(labelCol="label", \
#                             featuresCol="features", \
#                             numTrees = 100, \
#                             maxDepth = 4, \
#                             maxBins = 32)

# # Train model with Training Data
# rfModel = rf.fit(trainingData)
# predictions_rf = rfModel.transform(testData)

from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
nbModel = nb.fit(trainingData)
predictions_nb = nbModel.transform(testData)

In [27]:
predictions_nb.select("Text","Emotion","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+-------+------------------------------+-----+----------+
|                          Text|Emotion|                   probability|label|prediction|
+------------------------------+-------+------------------------------+-----+----------+
|i feel if you re learning a...|    joy|[1.0,9.869971376993206E-17,...|  0.0|       0.0|
|I’m glad I was able to find...|    joy|[1.0,9.781079041857067E-17,...|  0.0|       0.0|
|i would lie in bed and feel...|    joy|[1.0,3.344572338386437E-17,...|  0.0|       0.0|
|Lol! I’m glad you feel luck...|    joy|[1.0,2.724573708860178E-17,...|  0.0|       0.0|
|i may be smitten or shy and...|    joy|[1.0,2.679226118186214E-17,...|  0.0|       0.0|
|i was living with when i fi...|    joy|[1.0,2.24516133811146E-17,7...|  0.0|       0.0|
|i was challenged by the cli...|sadness|[1.0,1.894106677124369E-17,...|  1.0|       0.0|
|i just feel like i need a s...|    joy|[1.0,1.616253046396235E-17,...|  0.0|       0.0|
|i just love the feel

In [28]:
y_true = predictions_nb.select("label")
y_true = y_true.toPandas()

y_pred = predictions_nb.select("prediction")
y_pred = y_pred.toPandas()

print(classification_report(y_true.label, y_pred.prediction, zero_division=0))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.71      0.68      0.70      4316
         1.0       0.68      0.63      0.65      4286
         2.0       0.74      0.73      0.74      3922
         3.0       0.59      0.56      0.58      3838
         4.0       0.64      0.66      0.65      3743
         5.0       0.53      0.61      0.57      3263

    accuracy                           0.65     23368
   macro avg       0.65      0.65      0.65     23368
weighted avg       0.65      0.65      0.65     23368

0.6501626155426223
