# Setup

In [1]:
! pip install -q pyspark==3.3.0 spark-nlp==4.2.4

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.3/281.3 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m448.4/448.4 KB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
import os
import sys

import sparknlp

from sparknlp.base import *
from sparknlp.common import *
from sparknlp.annotator import *

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

import pandas as pd

spark = sparknlp.start()

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

spark

Spark NLP version:  4.2.4
Apache Spark version:  3.3.0


In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
%cd /content/drive/MyDrive/Thesis/Code/GenerativeDataAugmentation/Coba18 - Testing All Dataset

/content/drive/MyDrive/Thesis/Code/GenerativeDataAugmentation/Coba18 - Testing All Dataset


# Load Data

In [5]:
pd.options.mode.chained_assignment = None
dataset = pd.read_csv("augJoin - AllDataset - unfiltered.csv", index_col=0, encoding="utf-8")
dataset

Unnamed: 0,Text,Emotion
0,i am feeling ungrateful all the time,sadness
1,i am not feeling like ive missed anything,sadness
2,"But why will it ""activate""??? Will this make m...",sadness
3,i didn t mind going but as i neared the end of...,sadness
4,i wish i could do this but unfortunately i fee...,sadness
...,...,...
65023,I love truck drivers!!,love
65024,I knew I'd see him here eventually. I really l...,joy
65025,Nope what’s it about?,surprise
65026,Removed and warned. R1,anger


In [6]:
dataset = dataset.rename({'text': 'Text', 'emotion': 'Emotion'}, axis=1)  # new method

In [7]:
dataset = dataset[dataset.Emotion != 'love']
dataset

Unnamed: 0,Text,Emotion
0,i am feeling ungrateful all the time,sadness
1,i am not feeling like ive missed anything,sadness
2,"But why will it ""activate""??? Will this make m...",sadness
3,i didn t mind going but as i neared the end of...,sadness
4,i wish i could do this but unfortunately i fee...,sadness
...,...,...
65022,I had smoothies in the past with these packets...,anger
65024,I knew I'd see him here eventually. I really l...,joy
65025,Nope what’s it about?,surprise
65026,Removed and warned. R1,anger


In [8]:
df = spark.createDataFrame(dataset)

In [9]:
df.show(truncate=20)

+--------------------+-------+
|                Text|Emotion|
+--------------------+-------+
|i am feeling ungr...|sadness|
|i am not feeling ...|sadness|
|But why will it "...|sadness|
|i didn t mind goi...|sadness|
|i wish i could do...|sadness|
|i feel so shitty ...|sadness|
|i think about the...|sadness|
|The last page had...|sadness|
|i feel so emotion...|sadness|
|The most depressi...|sadness|
|           I feel ya|sadness|
|       You're lonely|sadness|
|i need not give a...|sadness|
|ive been a very u...|sadness|
|i feel extremely ...|sadness|
|i feel lame for b...|sadness|
|So sad!!! Im just...|sadness|
|i have a feeling ...|sadness|
|I am sorry it hap...|sadness|
|i really feel sor...|sadness|
+--------------------+-------+
only showing top 20 rows



In [10]:
from pyspark.sql.functions import col

df.groupBy("Emotion") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------+-----+
| Emotion|count|
+--------+-----+
|     joy|14744|
| sadness|14480|
|   anger|13940|
|    fear|13870|
|surprise|13284|
| disgust|13037|
+--------+-----+



# Feature Extraction

In [11]:
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF, OneHotEncoder, StringIndexer, VectorAssembler, SQLTransformer

In [12]:
%%time

document_assembler = DocumentAssembler() \
      .setInputCol("Text") \
      .setOutputCol("document")
    
tokenizer = Tokenizer() \
      .setInputCols(["document"]) \
      .setOutputCol("token")
      
normalizer = Normalizer() \
      .setInputCols(["token"]) \
      .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

stemmer = Stemmer() \
      .setInputCols(["cleanTokens"]) \
      .setOutputCol("stem")

finisher = Finisher() \
      .setInputCols(["stem"]) \
      .setOutputCols(["token_features"]) \
      .setOutputAsArray(True) \
      .setCleanAnnotations(False)

countVectors = CountVectorizer(inputCol="token_features", outputCol="features", vocabSize=10000, minDF=5)

label_stringIdx = StringIndexer(inputCol = "Emotion", outputCol = "label")

nlp_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            stemmer, 
            finisher,
            countVectors,
            label_stringIdx])

nlp_model = nlp_pipeline.fit(df)

processed = nlp_model.transform(df)

processed.count()

CPU times: user 387 ms, sys: 43.2 ms, total: 431 ms
Wall time: 22.2 s


83355

In [13]:
from pyspark.ml.feature import HashingTF, IDF

hashingTF = HashingTF(inputCol="token_features", outputCol="rawFeatures", numFeatures=10000)

idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms

nlp_pipeline_tf = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            stemmer, 
            finisher,
            hashingTF,
            idf,
            label_stringIdx])

nlp_model_tf = nlp_pipeline_tf.fit(df)

processed_tf = nlp_model_tf.transform(df)

processed_tf.count()

83355

In [14]:
(trainingData, testData) = processed_tf.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 58397
Test Dataset Count: 24958


# Classifier

## Random Forest

In [15]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)

# Train model with Training Data
rfModel = rf.fit(trainingData)
predictions_rf = rfModel.transform(testData)

In [16]:
predictions_rf.select("Text","Emotion","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+-------+------------------------------+-----+----------+
|                          Text|Emotion|                   probability|label|prediction|
+------------------------------+-------+------------------------------+-----+----------+
|Happy birthday! Enjoy your ...|    joy|[0.2390328616758345,0.16379...|  0.0|       0.0|
|The only happy ending I enj...|    joy|[0.23630381255084543,0.1659...|  0.0|       0.0|
|Happy Cake Day [NAME]! Enjo...|    joy|[0.23396372038951926,0.1644...|  0.0|       0.0|
|Happy Cake Day [NAME]! Enjo...|    joy|[0.23396372038951926,0.1644...|  0.0|       0.0|
|Amazing, I would also be de...|    joy|[0.2331882962550037,0.16528...|  0.0|       0.0|
|      So happy for you, enjoy |    joy|[0.23205501241716955,0.1666...|  0.0|       0.0|
|Yeah, that will reduce the ...|    joy|[0.2312694673979316,0.16358...|  0.0|       0.0|
|Now you’re repeating yourse...|    joy|[0.2257983953043564,0.16757...|  0.0|       0.0|
|  Thanks, glad you e

In [17]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

y_true = predictions_rf.select("label")
y_true = y_true.toPandas()

y_pred = predictions_rf.select("prediction")
y_pred = y_pred.toPandas()

print(classification_report(y_true.label, y_pred.prediction, zero_division=0))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.29      0.91      0.44      4404
         1.0       0.54      0.54      0.54      4264
         2.0       0.58      0.28      0.37      4202
         3.0       0.86      0.44      0.58      4192
         4.0       0.71      0.39      0.51      3943
         5.0       0.69      0.12      0.20      3953

    accuracy                           0.45     24958
   macro avg       0.61      0.45      0.44     24958
weighted avg       0.61      0.45      0.44     24958

0.4530811763763122


## Logistic Regression

In [18]:
# training the logistic regression madel
from pyspark.ml.classification import LogisticRegression
lr =  LogisticRegression(featuresCol = "features", labelCol="label", maxIter=10, regParam=0.3, elasticNetParam=0.8)

lrModel = lr.fit(trainingData)
predictions_lr = lrModel.transform(testData)

In [19]:
predictions_lr.select("Text","Emotion","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+-------+------------------------------+-----+----------+
|                          Text|Emotion|                   probability|label|prediction|
+------------------------------+-------+------------------------------+-----+----------+
| " 'Don't go near him' soun...|   fear|[0.17706293829029898,0.1749...|  3.0|       0.0|
| "I don't have food on me" ...|   fear|[0.17706293829029898,0.1749...|  3.0|       0.0|
| "I love him, but he should...|   fear|[0.17706293829029898,0.1749...|  3.0|       0.0|
| "I'm not a doctor but if I...|   fear|[0.17706293829029898,0.1749...|  3.0|       0.0|
| "I've been threatened with...|   fear|[0.17706293829029898,0.1749...|  3.0|       0.0|
| "We are too damn rich, so ...|   fear|[0.17706293829029898,0.1749...|  3.0|       0.0|
| "We'll do our best" is an ...|   fear|[0.17706293829029898,0.1749...|  3.0|       0.0|
| *I’m crying, just read the...|   fear|[0.17706293829029898,0.1749...|  3.0|       0.0|
| *]How?..* Because i

In [20]:
y_true = predictions_lr.select("label")
y_true = y_true.toPandas()

y_pred = predictions_lr.select("prediction")
y_pred = y_pred.toPandas()

print(classification_report(y_true.label, y_pred.prediction, zero_division=0))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.18      1.00      0.30      4404
         1.0       0.00      0.00      0.00      4264
         2.0       0.00      0.00      0.00      4202
         3.0       0.00      0.00      0.00      4192
         4.0       0.00      0.00      0.00      3943
         5.0       0.00      0.00      0.00      3953

    accuracy                           0.18     24958
   macro avg       0.03      0.17      0.05     24958
weighted avg       0.03      0.18      0.05     24958

0.17645644683067555


## Decision Tree

In [21]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol = "features", labelCol="label")
dtModel = dt.fit(trainingData)
predictions_dt = dtModel.transform(testData)

In [22]:
predictions_dt.select("Text","Emotion","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                          Text| Emotion|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
| I'm afraid of that happeni...|    fear|[0.8105820105820106,0.04867...|  3.0|       0.0|
|         Happy birthday to him|surprise|[0.8105820105820106,0.04867...|  4.0|       0.0|
| Sooo unexpected. I’m so ha...|    fear|[0.8105820105820106,0.04867...|  3.0|       0.0|
| i would want her to share ...|    fear|[0.8105820105820106,0.04867...|  3.0|       0.0|
|Ah, well I am very happy fo...|surprise|[0.8105820105820106,0.04867...|  4.0|       0.0|
| I can’t imagine the panic ...|    fear|[0.8105820105820106,0.04867...|  3.0|       0.0|
|        Happy Birthday [NAME]!|surprise|[0.8105820105820106,0.04867...|  4.0|       0.0|
| i have never loved or been...|    fear|[0.8105820105820106,0.04867...|  3.0|       0.0|
|         

In [23]:
y_true = predictions_dt.select("label")
y_true = y_true.toPandas()

y_pred = predictions_dt.select("prediction")
y_pred = y_pred.toPandas()

print(classification_report(y_true.label, y_pred.prediction, zero_division=0))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.81      0.08      0.15      4404
         1.0       0.29      0.53      0.37      4264
         2.0       0.56      0.12      0.20      4202
         3.0       0.93      0.12      0.21      4192
         4.0       0.84      0.22      0.35      3943
         5.0       0.24      0.86      0.38      3953

    accuracy                           0.32     24958
   macro avg       0.61      0.32      0.28     24958
weighted avg       0.61      0.32      0.28     24958

0.3184950717204904


## Naive Bayes

In [24]:
# from pyspark.ml.classification import RandomForestClassifier

# rf = RandomForestClassifier(labelCol="label", \
#                             featuresCol="features", \
#                             numTrees = 100, \
#                             maxDepth = 4, \
#                             maxBins = 32)

# # Train model with Training Data
# rfModel = rf.fit(trainingData)
# predictions_rf = rfModel.transform(testData)

from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
nbModel = nb.fit(trainingData)
predictions_nb = nbModel.transform(testData)

In [25]:
predictions_nb.select("Text","Emotion","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+-------+------------------------------+-----+----------+
|                          Text|Emotion|                   probability|label|prediction|
+------------------------------+-------+------------------------------+-----+----------+
|i feel spiritually invigora...|    joy|[1.0,9.891487111217217E-17,...|  0.0|       0.0|
|i like being in church on s...|    joy|[1.0,8.36286877717925E-17,2...|  0.0|       0.0|
|ive continued to feel energ...|    joy|[1.0,5.957581441700276E-17,...|  0.0|       0.0|
|im feeling that joy every d...|    joy|[1.0,5.846349430844143E-17,...|  0.0|       0.0|
|i worked today on writing a...|    joy|[1.0,4.6291574559934914E-17...|  0.0|       0.0|
|i feel that if i surrender ...|    joy|[1.0,2.6804885330711342E-17...|  0.0|       0.0|
|i don t whoop and holler un...|    joy|[1.0,2.5939469161941842E-17...|  0.0|       0.0|
|i felt a lot of guilt for n...|    joy|[1.0,2.3130253756080574E-17...|  0.0|       0.0|
|im going to go do my

In [26]:
y_true = predictions_nb.select("label")
y_true = y_true.toPandas()

y_pred = predictions_nb.select("prediction")
y_pred = y_pred.toPandas()

print(classification_report(y_true.label, y_pred.prediction, zero_division=0))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.69      0.65      0.67      4404
         1.0       0.62      0.61      0.61      4264
         2.0       0.52      0.50      0.51      4202
         3.0       0.67      0.64      0.66      4192
         4.0       0.56      0.58      0.57      3943
         5.0       0.41      0.46      0.43      3953

    accuracy                           0.57     24958
   macro avg       0.58      0.57      0.57     24958
weighted avg       0.58      0.57      0.58     24958

0.5746454042791891
