In [1]:
pip install spark-nlp



In [2]:
# Import libraries
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, concat_ws, rand
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from sparknlp.pretrained import PretrainedPipeline
from sklearn.metrics import classification_report

In [3]:
spark = sparknlp.start()

In [4]:
# Read data
data = spark.read.parquet("/content/drive/MyDrive/cleaned_data/")
data.show()
data.printSchema()

+------+---------+---------------+--------------------+-----------+--------------------+--------------------+--------------------+
|ItemID|Sentiment|SentimentSource|       SentimentText|text_length|               words|      filtered_words|    lemmatized_words|
+------+---------+---------------+--------------------+-----------+--------------------+--------------------+--------------------+
|    62|        1|   Sentiment140|i always get what...|         27|[i, always, get, ...| [always, get, want]| [always, get, want]|
|   194|        1|   Sentiment140|tell  i said happ...|         44|[tell, , i, said,...|[tell, , said, ha...|[tell, , said, ha...|
|   436|        0|   Sentiment140|i hope everyone i...|        136|[i, hope, everyon...|[hope, everyone, ...|[hope, everyone, ...|
|   474|        0|   Sentiment140|all my friends ar...|        101|[all, my, friends...|[friends, gone, h...|[friend, gone, ha...|
|   619|        0|   Sentiment140|bcds closed i gue...|         36|[bcds, closed, i

In [5]:
data.groupBy("Sentiment").count().show(truncate=False)

+---------+------+
|Sentiment|count |
+---------+------+
|1        |790018|
|0        |788387|
+---------+------+



In [6]:
# Take sample size
sample_size = 100000

positive_df = data.filter(data["Sentiment"] == "1").limit(sample_size)
negative_df = data.filter(data["Sentiment"] == "0").limit(sample_size)

balanced_dataset = positive_df.union(negative_df)
balanced_dataset.groupBy("Sentiment").count().show(truncate=False)

# Split data
trainingData = balanced_dataset.orderBy(rand())
(trainData, testData) = trainingData.randomSplit([0.8, 0.2], seed = 100)

+---------+------+
|Sentiment|count |
+---------+------+
|1        |100000|
|0        |100000|
+---------+------+



In [7]:
# ClassifierDLApproach + BertEmbeddings configs

document_assembler = DocumentAssembler() \
    .setInputCol("SentimentText") \
    .setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

bert_embeddings = BertEmbeddings().pretrained(name='small_bert_L4_256', lang='en') \
    .setInputCols(["document", 'token']) \
    .setOutputCol("embeddings")

embeddingsSentence = SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

classsifierdl = ClassifierDLApproach() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("class") \
    .setLabelColumn("Sentiment") \
    .setMaxEpochs(10) \
    .setLr(0.001) \
    .setBatchSize(16)\
    .setEnableOutputLogs(True)\
    .setOutputLogsPath('logs')\
    .setVerbose(1)\
    .setValidationSplit(0.2)\
    .setDropout(0.2)\

classsifierdl_pipeline = Pipeline(stages=[
    document_assembler,
    tokenizer,
    bert_embeddings,
    embeddingsSentence,
    classsifierdl
])

small_bert_L4_256 download started this may take some time.
Approximate size to download 40.5 MB
[OK!]


In [8]:
classifierdl_model = classsifierdl_pipeline.fit(trainData)

preds = classifierdl_model.transform(testData)

preds_df = preds.select('Sentiment','SentimentText',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

preds_df['result'] = preds_df['result'].astype(int)

print (classification_report(preds_df['result'], preds_df['Sentiment']))

              precision    recall  f1-score   support

           0       0.73      0.80      0.76     18213
           1       0.82      0.75      0.78     22004

    accuracy                           0.77     40217
   macro avg       0.77      0.77      0.77     40217
weighted avg       0.78      0.77      0.77     40217



In [9]:
classifierdl_model.save("/content/classifierdl_model")

In [11]:
log_file_path = '/content/logs/ClassifierDLApproach_62d083ebf2b6.log'

with open(log_file_path, 'r') as file:
    log_content = file.read()

print(log_content)

Training started - epochs: 10 - learning_rate: 0.001 - batch_size: 16 - training_examples: 127836 - classes: 2
Epoch 0/10 - 71.49s - loss: 4619.7197 - acc: 0.71590626 - batches: 7990
Quality on validation dataset (20.0%), validation examples = 31958
Epoch 1/10 - 68.64s - loss: 4488.7236 - acc: 0.73769665 - batches: 7990
Quality on validation dataset (20.0%), validation examples = 31958
Epoch 2/10 - 66.44s - loss: 4418.5386 - acc: 0.7510691 - batches: 7990
Quality on validation dataset (20.0%), validation examples = 31958
Epoch 3/10 - 70.56s - loss: 4347.473 - acc: 0.7628536 - batches: 7990
Quality on validation dataset (20.0%), validation examples = 31958
Epoch 4/10 - 71.33s - loss: 4278.4243 - acc: 0.7747684 - batches: 7990
Quality on validation dataset (20.0%), validation examples = 31958
Epoch 5/10 - 71.13s - loss: 4213.424 - acc: 0.7848135 - batches: 7990
Quality on validation dataset (20.0%), validation examples = 31958
Epoch 6/10 - 68.58s - loss: 4151.4697 - acc: 0.7951167 - batc

In [13]:
model = PipelineModel.load("/content/classifierdl_model")

preds = model.transform(testData)

preds_df = preds.select('Sentiment','SentimentText',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

preds_df['result'] = preds_df['result'].astype(int)

print (classification_report(preds_df['result'], preds_df['Sentiment']))

              precision    recall  f1-score   support

           0       0.73      0.80      0.76     18171
           1       0.82      0.75      0.79     22046

    accuracy                           0.78     40217
   macro avg       0.78      0.78      0.78     40217
weighted avg       0.78      0.78      0.78     40217

