# [Sentiment Analysis Shared Task](https://github.com/blp-workshop/blp_task2) at [BLP Workshop](https://blp-workshop.github.io/) @EMNLP 2023

The main objective of this task is to detect the sentiment expressed in a given text. This is a multi-class classification task: each text must be classified as Positive, Negative, or Neutral.



### Downloading the dataset from GitHub

In [None]:
# !wget https://raw.githubusercontent.com/blp-workshop/blp_task2/main/data/blp23_sentiment_train.tsv
# !wget https://raw.githubusercontent.com/blp-workshop/blp_task2/main/data/blp23_sentiment_dev.tsv
# !wget https://raw.githubusercontent.com/blp-workshop/blp_task2/main/data/blp23_sentiment_dev_test.tsv

In [None]:
# Locations of the tab-separated data splits, relative to the notebook's working directory.
train_file = 'data/blp23_sentiment_train.tsv'
validation_file = 'data/blp23_sentiment_dev.tsv'
# Test split that is fed to the model for prediction.
test_file = 'data/blp23_sentiment_test.tsv'
# Test split including the `label` column; used for statistics and final scoring.
test_file_with_label = 'data/blp23_sentiment_test_with_label.tsv'

# Alternative test split (dev-test), kept for reference:
# test_file = 'data/blp23_sentiment_dev_test.tsv'

In [None]:
import pandas as pd

# Read every split once just to report how many rows it contains.
train_stat_pd = pd.read_csv(train_file, sep='\t')
validation_stat_pd = pd.read_csv(validation_file, sep='\t')
test_stat_pd = pd.read_csv(test_file_with_label, sep='\t')

for frame_name, frame in (
    ("train_stat_pd", train_stat_pd),
    ("validation_stat_pd", validation_stat_pd),
    ("test_stat_pd", test_stat_pd),
):
    print(f"len({frame_name}):", len(frame))

len(train_stat_pd): 35266
len(validation_stat_pd): 3934
len(test_stat_pd): 6707


### installing required libraries.
 - transformers
 - datasets
 - evaluate
 - accelerate

In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install --upgrade accelerate



#### importing required libraries and setting up logger

In [None]:
import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional
import pandas as pd
import datasets
import evaluate
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
import torch

import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version


# Module-level logger used by the notebook cells below.
logger = logging.getLogger(__name__)

# Send log records to stdout so they appear in the cell output, using a
# "<timestamp> - <level> - <logger name> - <message>" layout.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)

### Defining the training, validation, and test data

In [None]:
# Report how many training examples fall into each sentiment class.
train_data_stat = pd.read_csv(train_file, sep='\t')
label_column = train_data_stat['label']

positive_data = int((label_column == 'Positive').sum())
neutral_data = int((label_column == 'Neutral').sum())
negative_data = int((label_column == 'Negative').sum())

for caption, class_count in (
    ("# Positives:", positive_data),
    ("# Neutrals:", neutral_data),
    ("# Negatives:", negative_data),
):
    print(caption, class_count)

# Positives: 12364
# Neutrals: 7135
# Negatives: 15767


In [None]:
import torch
from torch import nn

### Setting up the training parameters

In [None]:
# Hyper-parameters and run configuration handed to the HuggingFace Trainer.
training_args = TrainingArguments(
    # Was written as `3-05`, which is integer subtraction (== -2), not 3e-05.
    learning_rate=3e-05,
    num_train_epochs=10,
    weight_decay=0.001,
    lr_scheduler_type='linear',
    warmup_ratio=0.001,
    seed=18,
    do_train=True,
    do_eval=True,
    do_predict=True,
    # 32 examples per step x 8 accumulation steps = 256 examples per optimizer update.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    fp16=True,
    output_dir="./BanglaBERT_large/",
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    evaluation_strategy="epoch",
    metric_for_best_model="accuracy",
    overwrite_output_dir=True,
    remove_unused_columns=True,
    # NOTE(review): the logging cell treats local_rank == -1 as "not distributed";
    # confirm that 1 is really intended for a single-process notebook run.
    local_rank= 1,
    load_best_model_at_end=True,
    save_total_limit=2,
    save_strategy="epoch"
)

# Optional caps on the number of examples used per split (None = use everything).
max_train_samples = None
max_eval_samples=None
max_predict_samples=None
# Upper bound on the tokenized sequence length.
max_seq_length = 512
batch_size = 512

In [None]:
# Align the verbosity of this notebook's logger with the datasets and
# transformers library loggers.
hf_logging = transformers.utils.logging
hf_logging.set_verbosity_info()

log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
for library_logging in (datasets.utils.logging, hf_logging):
    library_logging.set_verbosity(log_level)
hf_logging.enable_default_handler()
hf_logging.enable_explicit_format()

# A local_rank of -1 is reported as a non-distributed run.
is_distributed = training_args.local_rank != -1
logger.warning(
    f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
    f" distributed training: {is_distributed}, 16-bits training: {training_args.fp16}"
)
logger.info(f"Training/evaluation parameters {training_args}")

INFO:__main__:Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=True,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=8,
gradient_checkpointing=True,
greater_is_better=True,
group_by_length=False,
half_precision_bac

#### Defining the Model

In [None]:
# Identifier of the pretrained checkpoint that is loaded and fine-tuned below.
model_name = 'csebuetnlp/banglabert_large'

#### setting the random seed

In [None]:
# Seed the random number generators with the seed configured in training_args.
set_seed(training_args.seed)

#### Loading data files

In [None]:
# String label -> integer class id used for training.
l2id = {'Positive': 2, 'Neutral': 1, 'Negative': 0}

# Train and validation splits: encode the string labels as class ids.
train_frame = pd.read_csv(train_file, sep='\t')
train_frame['label'] = train_frame['label'].map(l2id)
train_df = Dataset.from_pandas(train_frame)

validation_frame = pd.read_csv(validation_file, sep='\t')
validation_frame['label'] = validation_frame['label'].map(l2id)
validation_df = Dataset.from_pandas(validation_frame)

# The prediction split is loaded as-is; no label mapping is applied to it.
test_df = Dataset.from_pandas(pd.read_csv(test_file, sep='\t'))

data_files = {"train": train_df, "validation": validation_df, "test": test_df}
for split_name in data_files:
    logger.info(f"loading a local file for {split_name}")
raw_datasets = DatasetDict(data_files)

INFO:__main__:loading a local file for train
INFO:__main__:loading a local file for validation
INFO:__main__:loading a local file for test


##### Extracting number of unique labels

In [None]:
# Distinct class ids seen in the training split, sorted so that their order
# is deterministic across runs.
label_list = sorted(raw_datasets["train"].unique("label"))
num_labels = len(label_list)

### Loading Pretrained Configuration, Tokenizer and Model

In [None]:
# Keyword arguments shared by all three `from_pretrained` calls.
hub_kwargs = {"cache_dir": None, "revision": "main", "use_auth_token": None}

# Model configuration with a classification head sized for our label set.
config = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_labels,
    finetuning_task="text-classification",
    **hub_kwargs,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, **hub_kwargs)

# Load from a TensorFlow checkpoint only when the name looks like one (".ckpt").
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    from_tf=".ckpt" in model_name,
    config=config,
    ignore_mismatched_sizes=False,
    **hub_kwargs,
)

[INFO|configuration_utils.py:715] 2023-09-04 19:40:56,114 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--csebuetnlp--banglabert_large/snapshots/a64fb146d81ec4d7f8838b85e084b0c6a325a22f/config.json
[INFO|configuration_utils.py:775] 2023-09-04 19:40:56,123 >> Model config ElectraConfig {
  "_name_or_path": "csebuetnlp/banglabert_large",
  "_num_labels": 2,
  "amp": true,
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 1024,
  "finetuning_task": "text-classification",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_atten

#### Preprocessing the raw_datasets

In [None]:
# Columns other than the target. The text column is taken from position 1
# (the training split's features are listed as 'id', 'text', 'label').
non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"]
sentence1_key = non_label_column_names[1]

# Padding strategy: pad every example to max_seq_length at tokenization time.
padding = "max_length"

# Some models have set the order of the labels to use, so let's make sure we do use it.
label_to_id = None
if model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id:
    # Some have all caps in their config, some don't.
    label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()}
    if sorted(label_name_to_id) == sorted(label_list):
        label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}
    else:
        # Build one message string: a stray comma here previously turned the
        # second part into a %-format argument and broke the log call.
        logger.warning(
            "Your model seems to have been trained with labels, but they don't match the dataset: "
            f"model labels: {sorted(label_name_to_id)}, dataset labels: {sorted(label_list)}."
            "\nIgnoring the model labels as a result."
        )

if label_to_id is not None:
    model.config.label2id = label_to_id
    model.config.id2label = {label_id: label for label, label_id in config.label2id.items()}

# Clamp the requested sequence length to what the tokenizer/model supports.
requested_max_seq_length = max_seq_length
if requested_max_seq_length > tokenizer.model_max_length:
    logger.warning(
        f"The max_seq_length passed ({requested_max_seq_length}) is larger than the maximum length for the "
        f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
    )
max_seq_length = min(requested_max_seq_length, tokenizer.model_max_length)


def preprocess_function(examples):
    """Tokenize a batch of examples and, if needed, remap their labels.

    Args:
        examples: a batch (dict of column name -> list of values) as passed
            by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the text column, with a remapped "label"
        entry added when `label_to_id` is set and the batch has labels.
    """
    # Tokenize the texts
    result = tokenizer(
        examples[sentence1_key],
        padding=padding,
        max_length=max_seq_length,
        truncation=True,
    )

    # Map labels to IDs; -1 marks an unlabeled example and is passed through.
    if label_to_id is not None and "label" in examples:
        result["label"] = [
            (label_to_id[label] if label != -1 else -1) for label in examples["label"]
        ]
    return result


raw_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    load_from_cache_file=True,
    desc="Running tokenizer on dataset",
)


Running tokenizer on dataset:   0%|          | 0/35266 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/3934 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/6707 [00:00<?, ? examples/s]

#### Finalize the training data for training the model

In [None]:
# Pick the tokenized training split, optionally truncated to its first
# `max_train_samples` rows.
if "train" not in raw_datasets:
    raise ValueError("requires a train dataset")
train_dataset = raw_datasets["train"]
if max_train_samples is not None:
    # Never request more rows than the split actually holds.
    max_train_samples_n = min(len(train_dataset), max_train_samples)
    rows_to_keep = range(max_train_samples_n)
    train_dataset = train_dataset.select(rows_to_keep)

In [None]:
train_dataset

Dataset({
    features: ['id', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 35266
})

#### Finalize the development/evaluation data for evaluating the model

In [None]:
# Pick the tokenized validation split, optionally truncated to its first
# `max_eval_samples` rows.
if "validation" not in raw_datasets:
    raise ValueError("requires a validation dataset")
eval_dataset = raw_datasets["validation"]
if max_eval_samples is not None:
    # Never request more rows than the split actually holds.
    max_eval_samples_n = min(len(eval_dataset), max_eval_samples)
    rows_to_keep = range(max_eval_samples_n)
    eval_dataset = eval_dataset.select(rows_to_keep)

#### Finalize the test data for predicting the unseen test data using the model

In [None]:
# Pick the tokenized test split, optionally truncated to its first
# `max_predict_samples` rows.
# Only the "test" split is read below, so only its presence is checked; the
# previous guard also accepted "test_matched" and then failed with a KeyError.
if "test" not in raw_datasets:
    raise ValueError("requires a test dataset")
predict_dataset = raw_datasets["test"]
if max_predict_samples is not None:
    # Never request more rows than the split actually holds.
    max_predict_samples_n = min(len(predict_dataset), max_predict_samples)
    predict_dataset = predict_dataset.select(range(max_predict_samples_n))

#### Log a few random samples from the training set

In [None]:
# Log three randomly chosen training examples as a sanity check of the
# tokenized data.
sampled_indices = random.sample(range(len(train_dataset)), 3)
for sample_index in sampled_indices:
    logger.info(f"Sample {sample_index} of the training set: {train_dataset[sample_index]}.")

INFO:__main__:Sample 11879 of the training set: {'id': 'sentinob_11474', 'text': '৩০ টাকায় এতো কিছু । মাশাআল্লাহ', 'label': 2, 'input_ids': [2, 2415, 1, 2564, 916, 205, 6908, 415, 19201, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

#### Get the metric function `accuracy`

In [None]:
# Load the `accuracy` metric from the `evaluate` library.
# NOTE(review): `compute_metrics` below computes accuracy directly with numpy
# and does not reference `metric`; confirm whether this object is still needed.
metric = evaluate.load("accuracy")

#### Define `compute_metrics`: it takes an `EvalPrediction` (with `predictions` and `label_ids` fields) and has to return a dictionary mapping metric names (strings) to floats.

In [None]:
def compute_metrics(p: EvalPrediction):
    """Compute classification accuracy for one evaluation pass.

    Args:
        p: object exposing `predictions` (per-class scores, or a tuple whose
            first element holds them) and `label_ids` (the reference class ids).

    Returns:
        A dict with a single key, "accuracy", holding the fraction of
        examples whose highest-scoring class equals the reference label.
    """
    scores = p.predictions
    if isinstance(scores, tuple):
        # Only the first element of a tuple output is used.
        scores = scores[0]
    predicted_ids = np.argmax(scores, axis=1)
    hits = (predicted_ids == p.label_ids).astype(np.float32)
    return {"accuracy": hits.mean().item()}

#### Data Collator

In [None]:
# Alternative collator, kept for reference:
# data_collator = default_data_collator
# Batch collator built from the tokenizer; padded lengths are rounded up to a multiple of 8.
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

#### Initialize our Trainer

In [None]:

# Wire the model, run configuration, train/validation splits, metric
# function, tokenizer and batch collator into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

#### Training our model

In [None]:
# Run fine-tuning and record how many training examples were actually used.
train_result = trainer.train()
metrics = train_result.metrics
if max_train_samples is None:
    # No cap was configured, so the whole training split counts.
    max_train_samples = len(train_dataset)
metrics["train_samples"] = min(max_train_samples, len(train_dataset))

[INFO|trainer.py:750] 2023-09-04 19:41:13,430 >> The following columns in the training set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: text, id. If text, id are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
[INFO|trainer.py:1714] 2023-09-04 19:41:13,450 >> ***** Running training *****
[INFO|trainer.py:1715] 2023-09-04 19:41:13,451 >>   Num examples = 35,266
[INFO|trainer.py:1716] 2023-09-04 19:41:13,452 >>   Num Epochs = 10
[INFO|trainer.py:1717] 2023-09-04 19:41:13,453 >>   Instantaneous batch size per device = 32
[INFO|trainer.py:1720] 2023-09-04 19:41:13,453 >>   Total train batch size (w. parallel, distributed & accumulation) = 256
[INFO|trainer.py:1721] 2023-09-04 19:41:13,455 >>   Gradient Accumulation steps = 8
[INFO|trainer.py:1722] 2023-09-04 19:41:13,455 >>   Total optimization steps = 1,370
[INFO|trainer.py:1723] 2023-09-04 19:41:13,458 >>   Number of trainable par

Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.729222,0.710219
1,No log,0.694453,0.726233
2,No log,0.788647,0.713523
3,0.575800,0.913463,0.70666
4,0.575800,1.032114,0.710727
5,0.575800,1.20249,0.703864
6,0.575800,1.226838,0.704881
7,0.184200,1.283895,0.711998
8,0.184200,1.359632,0.706914
9,0.184200,1.396003,0.709456


[INFO|trainer.py:750] 2023-09-04 19:54:05,400 >> The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: text, id. If text, id are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
[INFO|trainer.py:3119] 2023-09-04 19:54:05,404 >> ***** Running Evaluation *****
[INFO|trainer.py:3121] 2023-09-04 19:54:05,405 >>   Num examples = 3934
[INFO|trainer.py:3124] 2023-09-04 19:54:05,406 >>   Batch size = 64
[INFO|trainer.py:2845] 2023-09-04 19:54:27,922 >> Saving model checkpoint to ./BanglaBERT_large/checkpoint-137
[INFO|configuration_utils.py:460] 2023-09-04 19:54:27,927 >> Configuration saved in ./BanglaBERT_large/checkpoint-137/config.json
[INFO|modeling_utils.py:1953] 2023-09-04 19:54:30,884 >> Model weights saved in ./BanglaBERT_large/checkpoint-137/pytorch_model.bin
[INFO|tokenization_utils_base.py:2235] 2023-09-04 19:54:34,711 >> tokenizer config fi

#### Saving the tokenizer too for easy upload

In [None]:
# Persist the fine-tuned model to the output directory (the log below shows
# the tokenizer files being written alongside it).
trainer.save_model()
# Print and store the training metrics, then save the trainer state.
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

[INFO|trainer.py:2845] 2023-09-04 21:55:01,659 >> Saving model checkpoint to ./BanglaBERT_large/
[INFO|configuration_utils.py:460] 2023-09-04 21:55:01,665 >> Configuration saved in ./BanglaBERT_large/config.json
[INFO|modeling_utils.py:1953] 2023-09-04 21:55:04,890 >> Model weights saved in ./BanglaBERT_large/pytorch_model.bin
[INFO|tokenization_utils_base.py:2235] 2023-09-04 21:55:04,897 >> tokenizer config file saved in ./BanglaBERT_large/tokenizer_config.json
[INFO|tokenization_utils_base.py:2242] 2023-09-04 21:55:04,901 >> Special tokens file saved in ./BanglaBERT_large/special_tokens_map.json


***** train metrics *****
  epoch                    =        9.94
  total_flos               = 304166646GF
  train_loss               =      0.3002
  train_runtime            =  2:13:48.18
  train_samples            =       35266
  train_samples_per_second =      43.928
  train_steps_per_second   =       0.171


#### Evaluating our model on validation/development data

In [None]:
# Score the model on the validation split and record the result.
logger.info("*** Evaluate ***")

metrics = trainer.evaluate(eval_dataset=eval_dataset)

if max_eval_samples is None:
    # No cap was configured, so the whole validation split counts.
    max_eval_samples = len(eval_dataset)
metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

for record_metrics in (trainer.log_metrics, trainer.save_metrics):
    record_metrics("eval", metrics)

INFO:__main__:*** Evaluate ***
[INFO|trainer.py:750] 2023-09-04 21:55:07,847 >> The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: text, id. If text, id are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
[INFO|trainer.py:3119] 2023-09-04 21:55:07,853 >> ***** Running Evaluation *****
[INFO|trainer.py:3121] 2023-09-04 21:55:07,854 >>   Num examples = 3934
[INFO|trainer.py:3124] 2023-09-04 21:55:07,856 >>   Batch size = 64


***** eval metrics *****
  epoch                   =       9.94
  eval_accuracy           =     0.7262
  eval_loss               =     0.6945
  eval_runtime            = 0:00:22.91
  eval_samples            =       3934
  eval_samples_per_second =    171.669
  eval_steps_per_second   =      2.706


### Predicting the test data

In [None]:
# Integer class id -> string label, for writing human-readable predictions.
id2l = {0:'Negative', 1:'Neutral', 2:'Positive'}
logger.info("*** Predict ***")
# Keep the example ids for the output file, then drop the column before
# handing the split to the trainer.
ids = predict_dataset['id']
predict_dataset = predict_dataset.remove_columns("id")
prediction_output = trainer.predict(predict_dataset, metric_key_prefix="predict")
# Highest-scoring class per example.
predictions = np.argmax(prediction_output.predictions, axis=1)
output_predict_file = os.path.join(training_args.output_dir, "predict_results.tsv")
if trainer.is_world_process_zero():
    with open(output_predict_file, "w") as writer:
        logger.info("***** Predict results *****")
        writer.write("id\tlabel\n")
        # One "<id>\t<label>" row per test example, in dataset order.
        writer.writelines(
            f"{ids[row]}\t{id2l[label_list[class_index]]}\n"
            for row, class_index in enumerate(predictions)
        )

INFO:__main__:*** Predict ***
[INFO|trainer.py:750] 2023-09-04 21:55:30,797 >> The following columns in the test set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: text. If text are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
[INFO|trainer.py:3119] 2023-09-04 21:55:30,800 >> ***** Running Prediction *****
[INFO|trainer.py:3121] 2023-09-04 21:55:30,801 >>   Num examples = 6707
[INFO|trainer.py:3124] 2023-09-04 21:55:30,802 >>   Batch size = 64


INFO:__main__:***** Predict results *****


In [None]:
ids[0]

7135

#### Creating the model card

In [None]:
# Metadata recorded in the generated model card: the base checkpoint and the task.
kwargs = dict(finetuned_from=model_name, tasks="text-classification")
trainer.create_model_card(**kwargs)

[INFO|modelcard.py:452] 2023-09-04 21:56:09,740 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.7262328267097473}]}


In [None]:
from sklearn.metrics import f1_score

# Compare the written predictions against the labeled test file.
# NOTE(review): rows are compared positionally, so both files are assumed to
# list the examples in the same order — confirm, or align them on `id`.
gold_tsv = pd.read_csv('data/blp23_sentiment_test_with_label.tsv', sep='\t')
prediction_tsv = pd.read_csv(output_predict_file, sep='\t')

gold_labels = gold_tsv['label']
predicted_labels = prediction_tsv['label']

# Micro-averaged F1 over the three sentiment classes.
f1_score(y_true=gold_labels, y_pred=predicted_labels, average='micro')

0.7153719994036082