# Sentiment Analysis using the emotion dataset with 6 emotions
### Native PyTorch

In [None]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint used for the tokenizer (and, later, the model).
chkpt="distilbert-base-uncased"
# Directory the training loop writes the best model to via save_pretrained.
saveModeldir="saved-model"

# Load the tokenizer that matches the checkpoint.
tokenizer=AutoTokenizer.from_pretrained(chkpt)

## Get the emotion dataset

In [None]:
from datasets import load_dataset

# Load the "emotion" dataset; later cells read its 'train', 'validation' and 'test' splits.
ds = load_dataset("emotion")

In [None]:
# Map each integer label id to its emotion name, in the order the dataset
# declares them.
classes = dict(enumerate(ds['train'].features['label'].names))
classes

## Let's see examples of each emotion

In [None]:
# Print one example sentence per emotion, looking for the classes in id order
# within the first 100 training rows.
# Slice once: indexing ds['train']['label'][i] inside the loop would re-read
# the whole column on every iteration.
head = ds['train'][:100]
mood = 0

for label, text in zip(head['label'], head['text']):
    if label != mood:
        continue
    print(f'{classes[mood]}: {text}  ')
    mood += 1
    # Stop once every class has been shown (no hard-coded class count).
    if mood == len(classes):
        break

## Preprocessing the dataset to make it compatible with the model's tokenizer

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Same checkpoint string as the one used at the top of the notebook; this
# re-creates the tokenizer and rebinds the `tokenizer` name.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_fn(sample):
    """Tokenize the ``'text'`` field of *sample* with the notebook's tokenizer.

    Truncation is enabled; no padding argument is passed here.
    """
    texts = sample['text']
    return tokenizer(texts, truncation=True)

# Tokenize every split in batches, then drop the raw 'text' column (the model
# is not expecting it) and rename 'label' to 'labels', the name the model expects.
tokenized_ds = (
    ds.map(tokenize_fn, batched=True)
    .remove_columns('text')
    .rename_column('label', 'labels')
)
tokenized_ds
tokenized_ds['train'].column_names

## Using DataCollator with DataLoader

### DataCollator provides dynamic padding. When combined with a DataLoader, the amount of padding depends on the longest sentence in the current batch. This works on CPU and GPU with PyTorch; it is less suitable for accelerators such as TPUs, which prefer fixed input shapes.

In [None]:
from torch.utils.data import DataLoader

# Number of samples per batch for all three loaders.
batchSize = 128

# Pads each batch to the length of its longest sequence.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Only the training loader shuffles.
train_dl = DataLoader(
    tokenized_ds["train"], shuffle=True, batch_size=batchSize, collate_fn=collator
)
eval_dl = DataLoader(
    tokenized_ds["validation"], batch_size=batchSize, collate_fn=collator
)
test_dl = DataLoader(
    tokenized_ds["test"], batch_size=batchSize, collate_fn=collator
)

# Report how many batches each loader yields.
for split_name, loader in (
    ('train', train_dl),
    ('validation', eval_dl),
    ('test', test_dl),
):
    print(f'{split_name} dl has {len(loader)}  batches')

In [None]:
# Pull one collated batch from the training loader and show each tensor's shape.
batch=next(iter(train_dl))
{k: v.shape for k, v in batch.items()}

### Notice how the 2nd dimension of the batch changes between batches? That's dynamic padding, thanks to the data collator.

## Importing the model

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a classification head sized to the number of
# emotion classes, and pass the id -> name mapping through to the model config.
model = AutoModelForSequenceClassification.from_pretrained(chkpt, num_labels=len(classes), id2label=classes)
model.config.id2label

In [None]:
# Try a single batch first to check that the model can process one.
# The batch includes 'labels', so the printed output carries a loss as well
# as the logits.
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

## Hyper-Parameters

In [None]:
# Learning rate handed to the optimizer.
lr= 8e-5
# Number of passes over the training set; also used to size the LR schedule.
epochs=10

## Importing the optimizer and learning rate scheduler

In [None]:
# transformers' own AdamW is deprecated in favour of the PyTorch optimizer,
# so import the PyTorch implementation instead.
from torch.optim import AdamW
from transformers import get_scheduler

# eps and weight_decay are set explicitly to the defaults of the old
# transformers.AdamW so the optimisation behaviour stays the same.
opt=AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)
# Linear schedule: warm up over the first 4 epochs' worth of steps, then decay
# until the last training step (len(train_dl) optimizer steps per epoch).
scheduler = get_scheduler("linear", optimizer=opt,num_warmup_steps=len(train_dl)*4,num_training_steps= len(train_dl)*epochs)

## Setup device to use GPU

In [None]:
import torch 
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device 
# Move the model onto the selected device.
model.to(device=device)

## Setup training loop

In [None]:
def getAccuracy(outs, batch):
    """Count the predictions in a batch that match the labels.

    Despite the name, this returns a count of correct predictions, not a
    ratio; callers divide by the number of samples themselves.

    Args:
        outs: Model output exposing a ``logits`` tensor; the class axis is
            dimension 1.
        batch: Mapping that holds the ground-truth tensor under ``'labels'``.

    Returns:
        The number of correct predictions as a plain Python ``int``.
    """
    predicted = outs.logits.argmax(dim=1)
    hits = (predicted == batch['labels']).sum()
    return hits.item()

In [None]:
from tqdm import tqdm 

def train(optim, sched=None):
    """Run one training epoch of the global ``model`` over ``train_dl``.

    Args:
        optim: Optimizer used to update the model's parameters.
        sched: Optional learning-rate scheduler; when given it is stepped
            once per batch, right after the optimizer.

    Returns:
        A ``(mean_loss, correct)`` tuple: the per-sample average loss over
        the epoch and the number of correctly classified samples.
    """
    losses=0.
    accs=0
    seen=0
    model.train()
    # Make sure autograd is on regardless of the global state left by earlier code.
    torch.set_grad_enabled(True)

    for batch in tqdm(train_dl):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        n = len(batch['labels'])
        # Weight the batch loss by batch size so the final average is per sample.
        losses+=(loss.item() * n)
        accs+=getAccuracy(outputs, batch)
        seen+=n
        loss.backward()
        optim.step()
        if sched is not None:
            sched.step()
        # Clear gradients on the optimizer that was passed in, not a global one.
        optim.zero_grad()
    # Average over the samples actually processed instead of a hard-wired split size.
    return losses/max(seen, 1) , accs

In [None]:
from tqdm import tqdm 

def evaluate(ldr, trainedModel):
    """Evaluate ``trainedModel`` on every batch of ``ldr`` without gradients.

    Args:
        ldr: Iterable of batches (dicts of tensors) that include ``'labels'``.
        trainedModel: Model to evaluate; it is switched to eval mode.

    Returns:
        A ``(mean_loss, correct)`` tuple: the per-sample average loss over
        the loader and the number of correctly classified samples.
    """
    losses=0.
    accs=0
    seen=0
    # Put the model being evaluated (not a global one) into eval mode.
    trainedModel.eval()

    # Disable autograd only for the duration of the evaluation.
    with torch.no_grad():
        for batch in tqdm(ldr):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = trainedModel(**batch)
            loss = outputs.loss.item()
            n = len(batch['labels'])
            losses+=(loss*n)
            accs+=getAccuracy(outputs, batch)
            seen+=n
    # Average over the samples in this loader, whichever split it wraps.
    return losses/max(seen, 1) , accs

In [None]:
import IPython.display as display 


def showResults(TLosses, TAccs, VLosses, VAccs):
    """Clear the cell output and print a table with one row per epoch.

    Args:
        TLosses: Training losses, one per epoch.
        TAccs: Training accuracies as fractions, printed as percentages.
        VLosses: Validation losses, one per epoch.
        VAccs: Validation accuracies as fractions, printed as percentages.
    """
    display.clear_output(wait=True)
    print('epoch\tTLoss\t\t TAcc\t\t ValLoss\t ValAcc\n')
    rows = zip(TLosses, TAccs, VLosses, VAccs)
    # Epoch numbers shown to the user start at 1.
    for epoch_no, (tl, ta, vl, va) in enumerate(rows, start=1):
        print(f'{epoch_no}\t {tl:.3f}\t\t {ta*100.:.3f}%\t {vl:.3f}\t\t {va*100.:.3f}% ')

In [None]:
from tqdm.auto import tqdm

# Per-epoch history; passed to showResults below and read by the plotting cell.
TrainLosses=[]
TrainAccs=[]
valLosses=[]
valAccs=[]
# Best validation accuracy seen so far and the (0-based) epoch that produced it.
curValAcc=0.0 
curEp=-1

# Split sizes used to turn correct-prediction counts into accuracies.
totalTrain=len(ds['train'])
totalVal=len(ds['validation'] )

for ep  in range(epochs):
    print(f'epoch: {ep+1} / {epochs}  ')
    print('training...')
    # train returns (mean loss, number of correct predictions).
    loss, accs=train(opt, scheduler)
    TrainLosses.append(loss)
    acc=accs/totalTrain 
    TrainAccs.append(acc)
    
    #test validation set
    print('evaluating...')
    loss, accs=evaluate(eval_dl, model)
    acc=accs/totalVal
    valLosses.append(loss)
    valAccs.append(acc) 
    # Save a checkpoint only when validation accuracy strictly improves, so
    # the directory always holds the best model seen so far.
    if curValAcc < acc:   
        curValAcc = acc
        curEp=ep 
        model.save_pretrained(saveModeldir)
    # Redraw the full results table after every epoch.
    showResults(TrainLosses, TrainAccs, valLosses, valAccs)

## Let's plot the losses and accuracies

In [None]:
import matplotlib.pyplot as plt 
# One figure with two side-by-side panels: losses on the left, accuracies on
# the right, each showing the training and validation curves.
plt.figure(figsize=(20, 20))
panels = (
    ('Losses', TrainLosses, valLosses),
    ('Accuracies', TrainAccs, valAccs),
)
for panel_no, (panel_title, train_curve, val_curve) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.title(panel_title)
    plt.plot(train_curve, label='train')
    plt.plot(val_curve, label='val')
    plt.legend()
plt.show()

## Load the best performing model for inference

In [1]:
import torch 
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Rebuild the tokenizer from the base checkpoint and load the model that the
# training loop saved into the "saved-model" directory.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model2=AutoModelForSequenceClassification.from_pretrained("saved-model")
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu' )
# Move the loaded model onto the selected device.
model2.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

## Evaluate with test dataset

In [None]:
# evaluate returns the number of correct predictions as its second value;
# divide by the test-set size to turn it into an accuracy.
loss, acc=evaluate(test_dl, model2)
acc=acc/len(ds['test'])
print(f'test accuracy={acc*100.:.3f} %')

## Inferencing

In [3]:
def getMood(samples):
    """Predict an emotion class id for each text in *samples*.

    Args:
        samples: Texts to classify (this notebook passes a list of strings).

    Returns:
        A CPU tensor holding one predicted class index per input text.
    """
    # Named `encoded` rather than `input` so the builtin is not shadowed.
    encoded=tokenizer(samples,padding=True, truncation=True, return_tensors="pt" ).to(device)
    # Inference only: do not build an autograd graph.
    with torch.no_grad():
        out=model2(**encoded)
    res=torch.argmax(out.logits , dim=1)
    return res.cpu()

In [4]:
# Sample sentences to classify.
str1="i am really annoyed"
str2="my heart yearns for her return"
str3="my heart is torn apart"
str4="it's a great sunday today"
str5="i'm left speechless"
str6="i stepped into the unknown"

strs=[str1, str2, str3, str4, str5, str6]

# One predicted class id per sentence, in the same order as `strs`.
out=getMood(strs)

# Pair each sentence with its prediction and print the label name.
for sentence, pred in zip(strs, out):
    print(f'{sentence}: {model2.config.id2label[pred.item()]}')

i am really annoyed: anger
my heart yearns for her return: love
my heart is torn apart: sadness
it's a great sunday today: joy
i'm left speechless: surprise
i stepped into the unknown: fear
