# Sentiment Analysis using the emotion dataset with 6 emotions
using HuggingFace Trainer

## Get the emotion dataset

In [1]:
from datasets import load_dataset

ds = load_dataset("emotion")
ds

Using custom data configuration default
Reusing dataset emotion (/home/john/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [2]:
classes={}

for i, mood in enumerate(ds['train'].features['label'].names):
    classes[i]=mood    
classes

{0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}

## Let's see examples of each emotion

In [3]:
mood=0 

for i in range (100):
    if ds['train']['label'][i] ==mood:
        s=ds['train']['text'][i]
        print(f'{classes[mood]}: {s}  ')
        mood+=1
        if mood==6:
            break

sadness: i didnt feel humiliated  
joy: i have been with petronas for years i feel that petronas has performed well and made a huge profit  
love: i feel romantic too  
anger: i think it s the easiest time of year to feel dissatisfied  
fear: i now feel compromised and skeptical of the value of every unit of work i put in  
surprise: i have seen heard and read over the past couple of days i am left feeling impressed by more than a few companies  


## Preprocessing the dataset to make it compatible with model's tokenizer

In [4]:
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_fn(sample):
    return tokenizer(sample['text'], truncation=True)

tokenized_ds = ds.map(tokenize_fn, batched=True)

100%|██████████| 16/16 [00:00<00:00, 41.43ba/s]
100%|██████████| 2/2 [00:00<00:00, 47.48ba/s]
100%|██████████| 2/2 [00:00<00:00, 22.56ba/s]


## Using DataCollator for dynamic padding

### DataCp;;ator provides dynamic padding. Wehn combined with dataloader, the amount of padding to use depends on the longest sentence in the current batch. This only works when using GPU, and pytorch

In [5]:
from torch.utils.data import DataLoader

collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Importing the model

In [6]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(classes), id2label=classes)
model.config.id2label

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classi

{0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}

## Hyper-Parameters

In [7]:
import math 

lr=1e-4
epochs=10
batchSize=128
totalSteps=math.ceil(ds['train'].num_rows/batchSize)*epochs
print(f'training has {totalSteps} steps')

training has 1250 steps


## importing optimizer and learning rate scheduler

In [8]:
from transformers import AdamW
from transformers import get_scheduler

opt=AdamW(model.parameters(), lr=lr)
scheduler = get_scheduler("cosine",optimizer=opt,num_warmup_steps=totalSteps//2 ,num_training_steps=totalSteps)

## Training the model using the Trainer and TrainingArgument

In [9]:
from datasets import load_metric 
import numpy as np 

metric=load_metric('accuracy')

def compute_metrics(samples):
    preds, lbls = samples
    preds = np.argmax(preds, axis=1)    
    return metric.compute(predictions=preds, references=lbls)

In [10]:
from transformers import TrainingArguments, Trainer 

args = TrainingArguments(
    "results",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    # learning_rate=lr,
    per_device_train_batch_size=batchSize,
    per_device_eval_batch_size=batchSize,
    num_train_epochs=epochs,
    weight_decay=0.01,        
    load_best_model_at_end=True,
    logging_dir='logs',            # directory for storing logs
    logging_steps=totalSteps//epochs    
)

In [11]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['validation'],
    tokenizer=tokenizer,
    data_collator=collator,    
    optimizers=(opt, scheduler),
    compute_metrics=compute_metrics
)

In [12]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 16000
  Num Epochs = 10
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 1250
 10%|█         | 125/1250 [01:22<13:32,  1.38it/s]The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 128


{'loss': 1.4407, 'learning_rate': 2e-05, 'epoch': 1.0}


                                                  
 10%|█         | 125/1250 [01:26<13:32,  1.38it/s]Saving model checkpoint to results/checkpoint-125
Configuration saved in results/checkpoint-125/config.json


{'eval_loss': 0.835893988609314, 'eval_accuracy': 0.716, 'eval_runtime': 3.8906, 'eval_samples_per_second': 514.057, 'eval_steps_per_second': 4.112, 'epoch': 1.0}


Model weights saved in results/checkpoint-125/pytorch_model.bin
tokenizer config file saved in results/checkpoint-125/tokenizer_config.json
Special tokens file saved in results/checkpoint-125/special_tokens_map.json
 20%|██        | 250/1250 [02:52<09:58,  1.67it/s]  The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 128


{'loss': 0.4034, 'learning_rate': 4e-05, 'epoch': 2.0}


                                                  
 20%|██        | 250/1250 [02:56<09:58,  1.67it/s]Saving model checkpoint to results/checkpoint-250
Configuration saved in results/checkpoint-250/config.json


{'eval_loss': 0.1958991438150406, 'eval_accuracy': 0.925, 'eval_runtime': 3.5228, 'eval_samples_per_second': 567.723, 'eval_steps_per_second': 4.542, 'epoch': 2.0}


Model weights saved in results/checkpoint-250/pytorch_model.bin
tokenizer config file saved in results/checkpoint-250/tokenizer_config.json
Special tokens file saved in results/checkpoint-250/special_tokens_map.json
 30%|███       | 375/1250 [04:19<09:02,  1.61it/s]  The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 128


{'loss': 0.1672, 'learning_rate': 6e-05, 'epoch': 3.0}


                                                  
 30%|███       | 375/1250 [04:23<09:02,  1.61it/s]Saving model checkpoint to results/checkpoint-375
Configuration saved in results/checkpoint-375/config.json


{'eval_loss': 0.16678932309150696, 'eval_accuracy': 0.936, 'eval_runtime': 3.474, 'eval_samples_per_second': 575.704, 'eval_steps_per_second': 4.606, 'epoch': 3.0}


Model weights saved in results/checkpoint-375/pytorch_model.bin
tokenizer config file saved in results/checkpoint-375/tokenizer_config.json
Special tokens file saved in results/checkpoint-375/special_tokens_map.json
 40%|████      | 500/1250 [05:46<07:42,  1.62it/s]  The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 128


{'loss': 0.1274, 'learning_rate': 8e-05, 'epoch': 4.0}


                                                  
 40%|████      | 500/1250 [05:49<07:42,  1.62it/s]Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json


{'eval_loss': 0.15785586833953857, 'eval_accuracy': 0.933, 'eval_runtime': 3.5056, 'eval_samples_per_second': 570.522, 'eval_steps_per_second': 4.564, 'epoch': 4.0}


Model weights saved in results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in results/checkpoint-500/tokenizer_config.json
Special tokens file saved in results/checkpoint-500/special_tokens_map.json
 50%|█████     | 625/1250 [07:12<06:03,  1.72it/s]The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 128


{'loss': 0.1139, 'learning_rate': 0.0001, 'epoch': 5.0}


                                                  
 50%|█████     | 625/1250 [07:16<06:03,  1.72it/s]Saving model checkpoint to results/checkpoint-625
Configuration saved in results/checkpoint-625/config.json


{'eval_loss': 0.1627427637577057, 'eval_accuracy': 0.9325, 'eval_runtime': 3.6601, 'eval_samples_per_second': 546.439, 'eval_steps_per_second': 4.372, 'epoch': 5.0}


Model weights saved in results/checkpoint-625/pytorch_model.bin
tokenizer config file saved in results/checkpoint-625/tokenizer_config.json
Special tokens file saved in results/checkpoint-625/special_tokens_map.json
 60%|██████    | 750/1250 [08:41<05:26,  1.53it/s]The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 128


{'loss': 0.0929, 'learning_rate': 9.045084971874738e-05, 'epoch': 6.0}


                                                  
 60%|██████    | 750/1250 [08:45<05:26,  1.53it/s]Saving model checkpoint to results/checkpoint-750
Configuration saved in results/checkpoint-750/config.json


{'eval_loss': 0.16601631045341492, 'eval_accuracy': 0.9335, 'eval_runtime': 3.471, 'eval_samples_per_second': 576.205, 'eval_steps_per_second': 4.61, 'epoch': 6.0}


Model weights saved in results/checkpoint-750/pytorch_model.bin
tokenizer config file saved in results/checkpoint-750/tokenizer_config.json
Special tokens file saved in results/checkpoint-750/special_tokens_map.json
 70%|███████   | 875/1250 [10:08<03:52,  1.62it/s]The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 128


{'loss': 0.0651, 'learning_rate': 6.545084971874738e-05, 'epoch': 7.0}


                                                  
 70%|███████   | 875/1250 [10:12<03:52,  1.62it/s]Saving model checkpoint to results/checkpoint-875
Configuration saved in results/checkpoint-875/config.json


{'eval_loss': 0.1848268359899521, 'eval_accuracy': 0.9345, 'eval_runtime': 3.4525, 'eval_samples_per_second': 579.296, 'eval_steps_per_second': 4.634, 'epoch': 7.0}


Model weights saved in results/checkpoint-875/pytorch_model.bin
tokenizer config file saved in results/checkpoint-875/tokenizer_config.json
Special tokens file saved in results/checkpoint-875/special_tokens_map.json
 80%|████████  | 1000/1250 [11:36<02:21,  1.76it/s]The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 128


{'loss': 0.0376, 'learning_rate': 3.4549150281252636e-05, 'epoch': 8.0}


                                                   
 80%|████████  | 1000/1250 [11:39<02:21,  1.76it/s]Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json


{'eval_loss': 0.23224297165870667, 'eval_accuracy': 0.9375, 'eval_runtime': 3.5348, 'eval_samples_per_second': 565.806, 'eval_steps_per_second': 4.526, 'epoch': 8.0}


Model weights saved in results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in results/checkpoint-1000/special_tokens_map.json
 90%|█████████ | 1125/1250 [13:02<01:11,  1.75it/s]The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 128


{'loss': 0.0196, 'learning_rate': 9.549150281252633e-06, 'epoch': 9.0}


                                                   
 90%|█████████ | 1125/1250 [13:06<01:11,  1.75it/s]Saving model checkpoint to results/checkpoint-1125
Configuration saved in results/checkpoint-1125/config.json


{'eval_loss': 0.228434756398201, 'eval_accuracy': 0.9375, 'eval_runtime': 3.4866, 'eval_samples_per_second': 573.632, 'eval_steps_per_second': 4.589, 'epoch': 9.0}


Model weights saved in results/checkpoint-1125/pytorch_model.bin
tokenizer config file saved in results/checkpoint-1125/tokenizer_config.json
Special tokens file saved in results/checkpoint-1125/special_tokens_map.json
100%|██████████| 1250/1250 [14:30<00:00,  1.58it/s]The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 128


{'loss': 0.01, 'learning_rate': 0.0, 'epoch': 10.0}


                                                   
100%|██████████| 1250/1250 [14:34<00:00,  1.58it/s]Saving model checkpoint to results/checkpoint-1250
Configuration saved in results/checkpoint-1250/config.json


{'eval_loss': 0.23848144710063934, 'eval_accuracy': 0.94, 'eval_runtime': 3.5186, 'eval_samples_per_second': 568.406, 'eval_steps_per_second': 4.547, 'epoch': 10.0}


Model weights saved in results/checkpoint-1250/pytorch_model.bin
tokenizer config file saved in results/checkpoint-1250/tokenizer_config.json
Special tokens file saved in results/checkpoint-1250/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from results/checkpoint-500 (score: 0.15785586833953857).
100%|██████████| 1250/1250 [14:43<00:00,  1.41it/s]

{'train_runtime': 883.7001, 'train_samples_per_second': 181.057, 'train_steps_per_second': 1.415, 'train_loss': 0.24777627553939818, 'epoch': 10.0}





TrainOutput(global_step=1250, training_loss=0.24777627553939818, metrics={'train_runtime': 883.7001, 'train_samples_per_second': 181.057, 'train_steps_per_second': 1.415, 'train_loss': 0.24777627553939818, 'epoch': 10.0})

In [13]:
# use trainer to save the training session if you want to resume training later 
# trainer.save_model('trained')
# calling this will resume training 
# # trainer.train('trained')

## Evaluating the trained model

In [14]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 128
100%|██████████| 16/16 [00:03<00:00,  4.90it/s]


{'eval_loss': 0.15785586833953857,
 'eval_accuracy': 0.933,
 'eval_runtime': 3.4756,
 'eval_samples_per_second': 575.443,
 'eval_steps_per_second': 4.604,
 'epoch': 10.0}

## Hyper Parameters tuning using optuna

In [None]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(classes) , id2label=classes)

In [None]:
trainer = Trainer(    
    model_init=model_init,
    args=args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['validation'],
    tokenizer=tokenizer,
    data_collator=collator,    
    compute_metrics=compute_metrics
)

## Start the Hyperparameters search
### Trainer will automatically use optuna to do the tuning

In [None]:
best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")

## Tuning result

In [None]:
best_run

## Set the tuned parameters iunto trainer and train again

In [None]:
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

trainer.train()

## Saving the trained model

In [15]:
dir="saved-model"

model.save_pretrained(dir)

Configuration saved in saved-model/config.json
Model weights saved in saved-model/pytorch_model.bin


## Inferencing

### load the trained model

In [16]:
from transformers import AutoModelForSequenceClassification
model2=AutoModelForSequenceClassification.from_pretrained(dir)

loading configuration file saved-model/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "sadness",
    "1": "joy",
    "2": "love",
    "3": "anger",
    "4": "fear",
    "5": "surprise"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.9.2",
  "vocab_size": 30522
}

loading weights file saved-model/pytorch_model.

In [17]:
import torch 
device=torch.device('cuda' if torch.cuda.is_available() else "cpu")
model2.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [18]:
def getMood(samples):
    input=tokenizer(samples,padding=True, truncation=True, return_tensors="pt" ).to(device)
    out=model2(**input)
    res=torch.argmax(out.logits , dim=1)    
    return res.cpu()

In [19]:
str1="i am really annoyed"
str2="my heart yearns for her return"
str3="my heart is torn apart"
str4="it's a great sunday today"
str5="i'm left speechless"
str6="i stepped into the unknown"

samples=[str1, str2, str3, str4, str5,str6]
res=getMood(samples)

for i, r in enumerate(res):
    print(f'{samples[i]}: {model2.config.id2label[r.item() ]} ')

i am really annoyed: anger 
my heart yearns for her return: joy 
my heart is torn apart: sadness 
it's a great sunday today: joy 
i'm left speechless: fear 
i stepped into the unknown: fear 
