#Sentiment Classification

In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pwd

In [None]:
!ls
%cd drive/MyDrive/splore_project
!pwd
!ls

In [3]:
import pandas as pd
import numpy as np
import sklearn.preprocessing as skp
from sklearn.model_selection import train_test_split

In [4]:
dataset = pd.read_csv("topical_chat.csv")
dataset

Unnamed: 0,conversation_id,message,sentiment
0,1,Are you a fan of Google or Microsoft?,Curious to dive deeper
1,1,Both are excellent technology they are helpful...,Curious to dive deeper
2,1,"I'm not a huge fan of Google, but I use it a...",Curious to dive deeper
3,1,Google provides online related services and p...,Curious to dive deeper
4,1,"Yeah, their services are good. I'm just not a...",Curious to dive deeper
...,...,...,...
188373,8628,"Wow, it does not seem like that long. Since I...",Surprised
188374,8628,"I havent seen that episode, I might google it...",Curious to dive deeper
188375,8628,I don't think I have either. That's an insane ...,Curious to dive deeper
188376,8628,"I did, my little brother used to love Thomas ...",Happy


###Remove rows having NULL message

In [5]:
dataset.drop(dataset[dataset['message'].isnull()].index , inplace=True)
dataset

Unnamed: 0,conversation_id,message,sentiment
0,1,Are you a fan of Google or Microsoft?,Curious to dive deeper
1,1,Both are excellent technology they are helpful...,Curious to dive deeper
2,1,"I'm not a huge fan of Google, but I use it a...",Curious to dive deeper
3,1,Google provides online related services and p...,Curious to dive deeper
4,1,"Yeah, their services are good. I'm just not a...",Curious to dive deeper
...,...,...,...
188373,8628,"Wow, it does not seem like that long. Since I...",Surprised
188374,8628,"I havent seen that episode, I might google it...",Curious to dive deeper
188375,8628,I don't think I have either. That's an insane ...,Curious to dive deeper
188376,8628,"I did, my little brother used to love Thomas ...",Happy


###Get unique classes

In [6]:
sentiment_types = dataset.sentiment.unique()
sentiment_types

array(['Curious to dive deeper', 'Happy', 'Neutral', 'Surprised',
       'Disgusted', 'Sad', 'Fearful', 'Angry'], dtype=object)

###Integer encoding of classnames

In [7]:
label_encoder = skp.LabelEncoder()
integer_encoded = label_encoder.fit_transform(dataset.sentiment)
print(integer_encoded)
print(len(integer_encoded))

[1 1 1 ... 1 4 5]
188373


###Check which integer is assigned to which class

In [8]:
sentiment_df = pd.DataFrame(sentiment_types, columns=['sentiment_types'])
sentiment_df['sentiment_integer'] = label_encoder.fit_transform(sentiment_df['sentiment_types'])
sentiment_df

Unnamed: 0,sentiment_types,sentiment_integer
0,Curious to dive deeper,1
1,Happy,4
2,Neutral,5
3,Surprised,7
4,Disgusted,2
5,Sad,6
6,Fearful,3
7,Angry,0


###Split dataset into train and eval sets

In [9]:
X_train, X_test, y_train, y_test = train_test_split(dataset[['message']].to_numpy().flatten().tolist(), integer_encoded, test_size=0.10, random_state=42)
print(len(X_train))
print(y_train.shape)
print(len(X_test))
print(y_test.shape)

169535
(169535,)
18838
(18838,)


###Using HuggingFace transformers

In [None]:
!pip install transformers huggingface_hub
!pip install accelerate -U

In [11]:
import transformers
from transformers import DistilBertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from transformers import TrainingArguments, Trainer, AdamW
from torch.utils.data import DataLoader
import torch, gc
import torch.nn as nn
torch.cuda.is_available()

True

###Using DistilRoberta model as it is an approximation to BERT with lesser parameters, hence faster to train

In [12]:
class My_DistilRoberta_Model(nn.Module):
   def __init__(self, model_name, num_class):
       super(My_DistilRoberta_Model, self).__init__()
       self.bert = AutoModelForSequenceClassification.from_pretrained(model_name)
       self.bert.classifier.out_proj = nn.Linear(self.bert.config.hidden_size, num_class) ##changed last layer to output 8 classes
       self.sigmoid = nn.Sigmoid()

   def forward(self, input, mask):
       output = self.bert(input_ids=input, attention_mask=mask, return_dict=False)
       out = self.sigmoid(output[0])
       return out


class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])


###Using DistilRoberta pre-trained on emotion dataset

In [None]:
model_name = "j-hartmann/emotion-english-distilroberta-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = My_DistilRoberta_Model(model_name, len(sentiment_df))
model.cuda()

###Tokenizing sentence for DistilRoberta

In [14]:
tok_X_train = tokenizer(X_train, truncation=True, padding=True)
tok_X_test = tokenizer(X_test, truncation=True, padding=True)

In [15]:
train_dataset = Dataset(tok_X_train, y_train)
val_dataset = Dataset(tok_X_test, y_test)

###Fine-tuning model on our dataset

In [None]:
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset)

optim = AdamW(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

model.train()
prev_total_loss_val = 0

for epoch in range(3):
    total_acc_train = 0
    total_loss_train = 0
    b=0
    gc.collect()
    torch.cuda.empty_cache()

    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        labels = batch['labels'].cuda()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        total_loss_train += loss.item()
        acc = (outputs.argmax(dim=1) == labels).sum().item()
        total_acc_train += acc
        loss.backward()
        optim.step()
        if b % 500 == 0:
          print("Batch "+str(b)+" Loss: "+str(loss.item()))
          torch.save(model.state_dict(), "./saved_models/emotion_model1")
        b+=1

    total_acc_val = 0
    total_loss_val = 0
    with torch.no_grad():
      for batch in val_loader:
          input_ids = batch['input_ids'].cuda()
          attention_mask = batch['attention_mask'].cuda()
          labels = batch['labels'].cuda()
          outputs = model(input_ids, attention_mask)
          loss = criterion(outputs, labels)
          total_loss_val += loss.item()
          acc = (outputs.argmax(dim=1) == labels).sum().item()
          total_acc_val += acc
      if total_loss_val < prev_total_loss_val:
          torch.save(model.state_dict(), "./saved_models/best_emotion_model1")
          prev_total_loss_val = total_loss_val

    print(
      f'Epochs: {epoch} | Train Loss: {total_loss_train / len(y_train): .3f} \
      | Train Accuracy: {total_acc_train / len(y_train): .3f} \
      | Val Loss: {total_loss_val / len(y_test): .3f} \
      | Val Accuracy: {total_acc_val / len(y_test): .3f}')



Batch 0 Loss: 2.098695993423462
Batch 500 Loss: 1.710263729095459
Batch 1000 Loss: 1.6171774864196777
Epochs: 0 | Train Loss:  0.013       | Train Accuracy:  0.459       | Val Loss:  1.642       | Val Accuracy:  0.477
Batch 0 Loss: 1.6411569118499756
Batch 500 Loss: 1.6516953706741333
Batch 1000 Loss: 1.6721714735031128
Epochs: 1 | Train Loss:  0.013       | Train Accuracy:  0.456       | Val Loss:  1.638       | Val Accuracy:  0.435
Batch 0 Loss: 1.6371641159057617
Batch 500 Loss: 1.5777541399002075
Batch 1000 Loss: 1.6324611902236938
Epochs: 2 | Train Loss:  0.013       | Train Accuracy:  0.460       | Val Loss:  1.636       | Val Accuracy:  0.441


###Calculating Precision, Recall & F1 scores on Val data

In [16]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [43]:
test_model = My_DistilRoberta_Model(model_name, len(sentiment_df))
test_model.load_state_dict(torch.load("./saved_models/emotion_model1"), strict=False)
test_model.cuda()

val_loader = DataLoader(val_dataset, batch_size=1)
all_preds = []

with torch.no_grad():
    i = 0
    total_acc_val = 0
    for batch in val_loader:
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        labels = batch['labels'].cuda()
        outputs = test_model(input_ids, attention_mask)
        preds = outputs.argmax(dim=1)
        all_preds.append(preds.item())
        acc = (preds == labels).sum().item()
        total_acc_val += acc
        if i>=100:
            break
        i+=1
    i+=1
    print(f' Accuracy: {total_acc_val / i: .3f} \
      | Recall: {recall_score(y_test[:i], all_preds, average="macro"): .3f} \
      | Precision: {precision_score(y_test[:i], all_preds, average="macro"): .3f} \
      | F1 Score: {f1_score(y_test[:i], all_preds, average="macro"): .3f} ')

 Accuracy:  0.455       | Recall:  0.197       | Precision:  0.243       | F1 Score:  0.184 


  _warn_prf(average, modifier, msg_start, len(result))
