# Sentiment Analysis using the emotion dataset with 6 emotions
using HuggingFace Trainer

## Get the emotion dataset

In [None]:
from datasets import load_dataset

# Load the "emotion" dataset through `load_dataset`.
ds = load_dataset("emotion")
# Bare expression so the notebook displays the loaded object.
ds

In [None]:
# Lookup table: integer label id -> emotion name, taken from the label
# feature of the training split.
label_names = ds['train'].features['label'].names
classes = dict(enumerate(label_names))
classes

## Let's see examples of each emotion

In [None]:
# Show one example sentence for every emotion class.
#
# The two columns are read once and walked in lockstep instead of indexing
# ds['train'][...] again on every loop step. The scan continues until every
# class has been seen, rather than stopping at row 100 and only finding the
# classes if they happen to appear in label order.
first_example = {}
for label, text in zip(ds['train']['label'], ds['train']['text']):
    if label not in first_example:
        first_example[label] = text
        if len(first_example) == len(classes):
            break

# Print in label order, one line per emotion.
for mood in sorted(first_example):
    print(f'{classes[mood]}: {first_example[mood]}  ')

## Preprocessing the dataset to make it compatible with model's tokenizer

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Tokenizer that belongs to the pretrained checkpoint used throughout the notebook.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_fn(sample):
    """Tokenize the ``text`` field of ``sample`` with truncation enabled.

    No padding is requested here.
    """
    texts = sample['text']
    encoded = tokenizer(texts, truncation=True)
    return encoded


# Run the tokenizer over the dataset in batches.
tokenized_ds = ds.map(tokenize_fn, batched=True)

## Using DataCollator for dynamic padding

### DataCollator provides dynamic padding. When combined with a DataLoader, the amount of padding depends on the longest sentence in the current batch. This works well on CPU and GPU (shown here with PyTorch); TPUs prefer fixed-size inputs, so dynamic padding is not recommended there.

In [None]:
from torch.utils.data import DataLoader

# Padding collator built from the tokenizer; handed to the Trainer as `data_collator`.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Importing the model

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a classification head sized for the
# emotion classes. Both label maps are passed: with only `id2label`, the
# config's `label2id` is not derived from it and keeps generic "LABEL_<n>"
# names, so the two mappings stored with the model would disagree.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(classes),
    id2label=classes,
    label2id={name: idx for idx, name in classes.items()},
)
# Bare expression so the notebook displays the id -> name mapping.
model.config.id2label

## Hyper-Parameters

In [None]:
import math 

# Optimisation settings used by the optimizer, scheduler and TrainingArguments.
lr = 1e-4
epochs = 10
batchSize = 128

# Total steps = batches per epoch (a final partial batch counts) times epochs.
stepsPerEpoch = math.ceil(ds['train'].num_rows / batchSize)
totalSteps = stepsPerEpoch * epochs
print(f'training has {totalSteps} steps')

## importing optimizer and learning rate scheduler

In [None]:
# The AdamW class shipped with transformers is deprecated; the PyTorch
# implementation is the supported replacement.
from torch.optim import AdamW
from transformers import get_scheduler

# The Trainer does not apply TrainingArguments.weight_decay to an optimizer
# that is handed to it ready-made, so the decay is set explicitly here.
opt = AdamW(model.parameters(), lr=lr, weight_decay=0.01)
# Cosine schedule whose warm-up phase covers the first half of all steps.
scheduler = get_scheduler(
    "cosine",
    optimizer=opt,
    num_warmup_steps=totalSteps // 2,
    num_training_steps=totalSteps,
)

## Training the model using the Trainer and TrainingArgument

In [None]:
from datasets import load_metric 
import numpy as np 

# Load the 'accuracy' metric object.
metric=load_metric('accuracy')

def compute_metrics(samples):
    """Return the classification accuracy for a set of predictions.

    Accuracy is computed directly with NumPy, so the function does not rely
    on a separately loaded metric object being present in the session.

    Args:
        samples: A ``(logits, labels)`` pair. ``logits`` has one row per
            example and one column per class; ``labels`` holds the true
            class index of each example.

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of examples
        whose arg-max class equals the label.
    """
    logits, lbls = samples
    preds = np.argmax(logits, axis=1)
    return {"accuracy": float(np.mean(preds == np.asarray(lbls)))}

In [None]:
from transformers import TrainingArguments, Trainer 

# Training configuration: evaluate and checkpoint once per epoch so the best
# checkpoint can be reloaded when training finishes.
args = TrainingArguments(
    output_dir="results",
    # --- schedule / batch sizes ---
    num_train_epochs=epochs,
    per_device_train_batch_size=batchSize,
    per_device_eval_batch_size=batchSize,
    # learning_rate=lr,
    weight_decay=0.01,
    # --- evaluation and checkpointing ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # --- logging ---
    logging_dir="logs",  # directory for storing logs
    logging_steps=totalSteps // epochs,  # one epoch's worth of steps
)

In [None]:
# Trainer for the first fine-tuning run; it uses the optimizer and scheduler
# built earlier instead of creating its own.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collator,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(opt, scheduler),
)

In [None]:
# Run fine-tuning with the Trainer built in the previous cell.
trainer.train()

In [None]:
# Use the trainer to save the model if you want to continue training later:
# trainer.save_model('trained')
# Calling this will resume training from the saved directory:
# trainer.train('trained')

## Evaluating the trained model

In [None]:
# Evaluate on the eval_dataset given to the Trainer (the validation split).
trainer.evaluate()

## Hyper Parameters tuning using optuna

In [None]:
def model_init():
    """Return a freshly loaded, untrained-head classifier from ``checkpoint``.

    Both ``id2label`` and its inverse ``label2id`` are supplied so the two
    label maps stored in the model config agree with each other; passing
    only ``id2label`` leaves ``label2id`` at generic "LABEL_<n>" names.

    Returns:
        A new sequence-classification model with one output per entry in
        ``classes``.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=len(classes),
        id2label=classes,
        label2id={name: idx for idx, name in classes.items()},
    )

In [None]:
# Second Trainer, used for the hyperparameter search: it receives `model_init`
# rather than a model instance, so it can build a fresh model itself, and it
# is given no custom optimizer/scheduler.
trainer = Trainer(
    args=args,
    model_init=model_init,
    data_collator=collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['validation'],
    compute_metrics=compute_metrics,
)

## Start the Hyperparameters search
### Trainer will automatically use optuna to do the tuning

In [None]:
# Pin the search backend and the objective explicitly. The default objective
# sums every metric reported by evaluation, whereas the quantity to maximise
# here is the validation accuracy ("accuracy" from compute_metrics, reported
# by the Trainer under the "eval_" prefix).
best_run = trainer.hyperparameter_search(
    n_trials=10,
    direction="maximize",
    backend="optuna",
    compute_objective=lambda metrics: metrics["eval_accuracy"],
)

## Tuning result

In [None]:
# Display the result returned by the hyperparameter search.
best_run

## Set the tuned parameters into the trainer and train again

In [None]:
# Copy every tuned hyperparameter onto the trainer's arguments, then retrain.
tuned = best_run.hyperparameters
for name in tuned:
    setattr(trainer.args, name, tuned[name])

trainer.train()

## Saving the trained model

In [None]:
# Save the model held by the trainer, i.e. the one just retrained with the
# tuned hyperparameters. The module-level `model` is the network from the
# first training run; this trainer builds its own model through `model_init`,
# so saving `model` would not store the tuned result.
# The path variable is named `save_dir` so it does not shadow the builtin `dir`.
save_dir = "saved-model"

trainer.save_model(save_dir)

## Inferencing

### load the trained model

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Tokenizer of the base checkpoint, and the classifier read back from the "saved-model" directory.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model2=AutoModelForSequenceClassification.from_pretrained("saved-model")

In [2]:
import torch 
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device("cpu")
# Last expression of the cell, so the notebook echoes the moved model.
model2.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [3]:
def getMood(samples):
    """Predict the emotion class index for each input text.

    Args:
        samples: List of strings to classify.

    Returns:
        A CPU tensor holding the arg-max class index for every input, in
        input order.
    """
    # Named `encoded` rather than `input`, which would shadow the builtin.
    encoded = tokenizer(
        samples, padding=True, truncation=True, return_tensors="pt"
    ).to(device)
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        out = model2(**encoded)
    return torch.argmax(out.logits, dim=1).cpu()

In [4]:
# One hand-written sentence per emotion to sanity-check the model.
str1 = "i am really annoyed"
str2 = "my heart yearns for her return"
str3 = "my heart is torn apart"
str4 = "it's a great sunday today"
str5 = "i'm left speechless"
str6 = "i stepped into the unknown"

samples = [str1, str2, str3, str4, str5, str6]
res = getMood(samples)

# Pair each sentence with its predicted class id and print the label name.
for text, pred in zip(samples, res):
    label = model2.config.id2label[pred.item()]
    print(f'{text}: {label} ')

i am really annoyed: anger 
my heart yearns for her return: love 
my heart is torn apart: sadness 
it's a great sunday today: joy 
i'm left speechless: surprise 
i stepped into the unknown: fear 
