# Setup

In [1]:
! pip install -q pyspark==3.3.0 spark-nlp==4.2.4

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.3/281.3 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m448.4/448.4 KB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
import os
import sys

import sparknlp

from sparknlp.base import *
from sparknlp.common import *
from sparknlp.annotator import *

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

import pandas as pd

spark = sparknlp.start()

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

spark

Spark NLP version:  4.2.4
Apache Spark version:  3.3.0


In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
%cd /content/drive/MyDrive/Thesis/Code/GenerativeDataAugmentation/Coba18 - Testing All Dataset

/content/drive/MyDrive/Thesis/Code/GenerativeDataAugmentation/Coba18 - Testing All Dataset


In [5]:
!ls

'augJoin - AllDataset - filtered.csv'
'augJoin - AllDataset - unfiltered.csv'
'BERT - All Dataset - base.ipynb'
'BERT - All Dataset - filtered.ipynb'
'BERT - All Dataset - unfiltered.ipynb'
'BiLSTM - All Dataset - base.ipynb'
'BiLSTM - All Dataset - filtered.ipynb'
'BiLSTM - All Dataset - unfiltered.ipynb'
 dataset.csv
'LSTM - All Dataset - base.ipynb'
'LSTM - All Dataset - filtered.ipynb'
'LSTM - All Dataset - unfiltered.ipynb'
'ML - All Dataset - base.ipynb'
'RoBERTa - All Dataset - base.ipynb'
'RoBERTa - All Dataset - filtered.ipynb'
'RoBERTa - All Dataset - unfiltered.ipynb'


# Load Data

In [6]:
pd.options.mode.chained_assignment = None
dataset = pd.read_csv("dataset.csv", index_col=0, encoding="utf-8")
dataset

Unnamed: 0,Text,Emotion
0,Thanks! I'm glad you got a kick out of it.,joy
1,I'm more surprised the dash cam was functionin...,surprise
2,i dont know how i feel about my beloved teams ...,love
3,when i learnt that my best friend had failed t...,anger
4,i feel as though that talking for a month is a...,joy
...,...,...
65023,I love truck drivers!!,love
65024,I knew I'd see him here eventually. I really l...,joy
65025,Nope what’s it about?,surprise
65026,Removed and warned. R1,anger


In [7]:
dataset = dataset[dataset.Emotion != 'love']
dataset

Unnamed: 0,Text,Emotion
0,Thanks! I'm glad you got a kick out of it.,joy
1,I'm more surprised the dash cam was functionin...,surprise
3,when i learnt that my best friend had failed t...,anger
4,i feel as though that talking for a month is a...,joy
5,i feel like the addition of sweet fresh corn r...,joy
...,...,...
65022,I had smoothies in the past with these packets...,anger
65024,I knew I'd see him here eventually. I really l...,joy
65025,Nope what’s it about?,surprise
65026,Removed and warned. R1,anger


In [8]:
df = spark.createDataFrame(dataset)

In [9]:
df.show(truncate=20)

+--------------------+--------+
|                Text| Emotion|
+--------------------+--------+
|Thanks! I'm glad ...|     joy|
|I'm more surprise...|surprise|
|when i learnt tha...|   anger|
|i feel as though ...|     joy|
|i feel like the a...|     joy|
|They have disagre...| disgust|
|im trying to be i...|    fear|
|It’s good to see ...|     joy|
|i could have just...| sadness|
|i tell people it ...|     joy|
|i feel shy at the...|    fear|
|Holy shit I didn’...|surprise|
|[NAME] done fucke...|   anger|
|I’m sorry he’s tr...| sadness|
|Trier is low key ...| disgust|
|That is utterly h...| sadness|
|i have the right ...|   anger|
|As a computer sci...| sadness|
|LMAO why are [NAM...| disgust|
|This is the stran...|     joy|
+--------------------+--------+
only showing top 20 rows



In [10]:
from pyspark.sql.functions import col

df.groupBy("Emotion") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------+-----+
| Emotion|count|
+--------+-----+
|     joy|14744|
| sadness|12555|
|   anger|10793|
|surprise| 6233|
|    fear| 5570|
| disgust| 5301|
+--------+-----+



# Feature Extraction

In [11]:
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF, OneHotEncoder, StringIndexer, VectorAssembler, SQLTransformer

In [12]:
%%time

document_assembler = DocumentAssembler() \
      .setInputCol("Text") \
      .setOutputCol("document")
    
tokenizer = Tokenizer() \
      .setInputCols(["document"]) \
      .setOutputCol("token")
      
normalizer = Normalizer() \
      .setInputCols(["token"]) \
      .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

stemmer = Stemmer() \
      .setInputCols(["cleanTokens"]) \
      .setOutputCol("stem")

finisher = Finisher() \
      .setInputCols(["stem"]) \
      .setOutputCols(["token_features"]) \
      .setOutputAsArray(True) \
      .setCleanAnnotations(False)

countVectors = CountVectorizer(inputCol="token_features", outputCol="features", vocabSize=10000, minDF=5)

label_stringIdx = StringIndexer(inputCol = "Emotion", outputCol = "label")

nlp_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            stemmer, 
            finisher,
            countVectors,
            label_stringIdx])

nlp_model = nlp_pipeline.fit(df)

processed = nlp_model.transform(df)

processed.count()

CPU times: user 215 ms, sys: 55.7 ms, total: 271 ms
Wall time: 15.9 s


55196

In [13]:
from pyspark.ml.feature import HashingTF, IDF

hashingTF = HashingTF(inputCol="token_features", outputCol="rawFeatures", numFeatures=10000)

idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms

nlp_pipeline_tf = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            stemmer, 
            finisher,
            hashingTF,
            idf,
            label_stringIdx])

nlp_model_tf = nlp_pipeline_tf.fit(df)

processed_tf = nlp_model_tf.transform(df)

processed_tf.count()

55196

In [14]:
(trainingData, testData) = processed_tf.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 38555
Test Dataset Count: 16641


# Classifier

## Random Forest

In [15]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)

# Train model with Training Data
rfModel = rf.fit(trainingData)
predictions_rf = rfModel.transform(testData)

In [16]:
predictions_rf.select("Text","Emotion","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+-------+------------------------------+-----+----------+
|                          Text|Emotion|                   probability|label|prediction|
+------------------------------+-------+------------------------------+-----+----------+
|  Thanks, glad you enjoyed it!|    joy|[0.3379611195658484,0.20341...|  0.0|       0.0|
|  Thanks, glad you enjoyed it!|    joy|[0.3379611195658484,0.20341...|  0.0|       0.0|
|Thank you so much for the w...|    joy|[0.3341905736483746,0.20039...|  0.0|       0.0|
|Aw thank you! Glad you enjo...|    joy|[0.3337133506116945,0.20297...|  0.0|       0.0|
|Glad you enjoyed man, yeah ...|    joy|[0.32910397003021463,0.2076...|  0.0|       0.0|
|Cool thanks, posted on ther...|    joy|[0.32907052596975206,0.2042...|  0.0|       0.0|
|I heard someone say one... ...|    joy|[0.32894945674446274,0.2086...|  0.0|       0.0|
|This is a really lovely end...|    joy|[0.32773605293213287,0.2090...|  0.0|       0.0|
|its ok, glad i watch

In [17]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

y_true = predictions_rf.select("label")
y_true = y_true.toPandas()

y_pred = predictions_rf.select("prediction")
y_pred = y_pred.toPandas()

print(classification_report(y_true.label, y_pred.prediction, zero_division=0))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.27      1.00      0.43      4410
         1.0       0.83      0.09      0.16      3760
         2.0       0.00      0.00      0.00      3240
         3.0       0.00      0.00      0.00      1908
         4.0       0.00      0.00      0.00      1720
         5.0       0.00      0.00      0.00      1603

    accuracy                           0.28     16641
   macro avg       0.18      0.18      0.10     16641
weighted avg       0.26      0.28      0.15     16641

0.2848987440658614


## Logistic Regression

In [18]:
# training the logistic regression madel
from pyspark.ml.classification import LogisticRegression
lr =  LogisticRegression(featuresCol = "features", labelCol="label", maxIter=10, regParam=0.3, elasticNetParam=0.8)

lrModel = lr.fit(trainingData)
predictions_lr = lrModel.transform(testData)

In [19]:
predictions_lr.select("Text","Emotion","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                          Text| Emotion|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
| No mention of [NAME]? For ...| disgust|[0.26803198985607085,0.2281...|  5.0|       0.0|
| looks unreal, like it's a ...|surprise|[0.26803198985607085,0.2281...|  3.0|       0.0|
| yay!!! Congratulations! Yo...|     joy|[0.26803198985607085,0.2281...|  0.0|       0.0|
|"...And what do I find out?...|   anger|[0.26803198985607085,0.2281...|  2.0|       0.0|
|"If you don't support no go...|   anger|[0.26803198985607085,0.2281...|  2.0|       0.0|
|   "Lazy ass" Oh my, the irony| disgust|[0.26803198985607085,0.2281...|  5.0|       0.0|
|"Oh, we just happen to have...|surprise|[0.26803198985607085,0.2281...|  3.0|       0.0|
|"Sad" but "true" There you ...| sadness|[0.26803198985607085,0.2281...|  1.0|       0.0|
|         

In [20]:
y_true = predictions_lr.select("label")
y_true = y_true.toPandas()

y_pred = predictions_lr.select("prediction")
y_pred = y_pred.toPandas()

print(classification_report(y_true.label, y_pred.prediction, zero_division=0))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.27      1.00      0.42      4410
         1.0       0.00      0.00      0.00      3760
         2.0       0.00      0.00      0.00      3240
         3.0       0.00      0.00      0.00      1908
         4.0       0.00      0.00      0.00      1720
         5.0       0.00      0.00      0.00      1603

    accuracy                           0.27     16641
   macro avg       0.04      0.17      0.07     16641
weighted avg       0.07      0.27      0.11     16641

0.2650081124932396


## Decision Tree

In [21]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol = "features", labelCol="label")
dtModel = dt.fit(trainingData)
predictions_dt = dtModel.transform(testData)

In [22]:
predictions_dt.select("Text","Emotion","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+-------+------------------------------+-----+----------+
|                          Text|Emotion|                   probability|label|prediction|
+------------------------------+-------+------------------------------+-----+----------+
|  Aye but happy cake day mate.|    joy|[0.894431554524362,0.048723...|  0.0|       0.0|
|Don’t be clinically depress...|    joy|[0.894431554524362,0.048723...|  0.0|       0.0|
|Because love is a lie and t...|   fear|[0.894431554524362,0.048723...|  4.0|       0.0|
|   A video that makes me happy|    joy|[0.894431554524362,0.048723...|  0.0|       0.0|
|              Bee happy! ' v')|    joy|[0.894431554524362,0.048723...|  0.0|       0.0|
|And I'm happy we all agree ...|    joy|[0.894431554524362,0.048723...|  0.0|       0.0|
|CONGRATS! Happy for both yo...|    joy|[0.894431554524362,0.048723...|  0.0|       0.0|
|CONGRATS! So happy for you....|    joy|[0.894431554524362,0.048723...|  0.0|       0.0|
|Definitely going to 

In [23]:
y_true = predictions_dt.select("label")
y_true = y_true.toPandas()

y_pred = predictions_dt.select("prediction")
y_pred = y_pred.toPandas()

print(classification_report(y_true.label, y_pred.prediction, zero_division=0))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.42      0.59      0.49      4410
         1.0       0.88      0.07      0.12      3760
         2.0       0.25      0.77      0.38      3240
         3.0       0.54      0.02      0.04      1908
         4.0       0.58      0.03      0.05      1720
         5.0       0.25      0.00      0.00      1603

    accuracy                           0.33     16641
   macro avg       0.49      0.25      0.18     16641
weighted avg       0.50      0.33      0.24     16641

0.32678324619914667


## Naive Bayes

In [24]:
# from pyspark.ml.classification import RandomForestClassifier

# rf = RandomForestClassifier(labelCol="label", \
#                             featuresCol="features", \
#                             numTrees = 100, \
#                             maxDepth = 4, \
#                             maxBins = 32)

# # Train model with Training Data
# rfModel = rf.fit(trainingData)
# predictions_rf = rfModel.transform(testData)

from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
nbModel = nb.fit(trainingData)
predictions_nb = nbModel.transform(testData)

In [25]:
predictions_nb.select("Text","Emotion","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+-------+------------------------------+-----+----------+
|                          Text|Emotion|                   probability|label|prediction|
+------------------------------+-------+------------------------------+-----+----------+
|i wind up feeling like the ...|    joy|[1.0,1.0557482955095165E-16...|  0.0|       0.0|
|i am feeling more energetic...|    joy|[1.0,7.723696859623125E-17,...|  0.0|       0.0|
|i feel so ecstatic and reli...|    joy|[1.0,4.266660159103172E-17,...|  0.0|       0.0|
|i feel the most glamorous i...|    joy|[1.0,3.228119095395262E-17,...|  0.0|       0.0|
|i wonder if this is what ma...|    joy|[1.0,2.534469041486088E-17,...|  0.0|       0.0|
|i tend to think that it kin...|    joy|[1.0,2.1007555683534404E-17...|  0.0|       0.0|
|i got the feeling brig is s...|    joy|[1.0,1.908213296946685E-17,...|  0.0|       0.0|
|i feel that students in my ...|    joy|[1.0,8.608377321219521E-18,...|  0.0|       0.0|
|i felt joyful then i

In [26]:
y_true = predictions_nb.select("label")
y_true = y_true.toPandas()

y_pred = predictions_nb.select("prediction")
y_pred = y_pred.toPandas()

print(classification_report(y_true.label, y_pred.prediction, zero_division=0))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.74      0.69      0.71      4410
         1.0       0.66      0.61      0.64      3760
         2.0       0.56      0.54      0.55      3240
         3.0       0.47      0.53      0.50      1908
         4.0       0.53      0.58      0.56      1720
         5.0       0.35      0.43      0.38      1603

    accuracy                           0.59     16641
   macro avg       0.55      0.56      0.56     16641
weighted avg       0.60      0.59      0.59     16641

0.5871041403761793
