In [97]:
import json
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Model, GPT2PreTrainedModel, GPT2Config
import pickle
import wandb
from tqdm import tqdm
from sklearn.metrics import classification_report

In [98]:
# Source: https://github.com/LCS2-IIITD/Emotion-Flip-Reasoning/blob/main/Dataloaders/nlp_utils.py
import string
import nltk
import re

numbers = {
    "0":"zero",
    "1":"one",
    "2":"two",
    "3":"three",
    "4":"four",
    "5":"five",
    "6":"six",
    "7":"seven",
    "8":"eight",
    "9":"nine"
}

def remove_puntuations(txt):
    punct = set(string.punctuation)
    txt = " ".join(txt.split("."))
    txt = " ".join(txt.split("!"))
    txt = " ".join(txt.split("?"))
    txt = " ".join(txt.split(":"))
    txt = " ".join(txt.split(";"))
    
    txt = "".join(ch for ch in txt if ch not in punct)
    return txt

def number_to_words(txt):
    for k in numbers.keys():
        txt = txt.replace(k,numbers[k]+" ")
    return txt

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'_',' ',text)
    text = number_to_words(text)
    text = remove_puntuations(text)
    text = ''.join([i if ord(i) < 128 else '' for i in text])
    text = ' '.join(text.split())
    return text

In [99]:
train_data = json.load(open('../../Dataset/ERC_conversational_level/train_conversation_level.json'))
val_data = json.load(open('../../Dataset/ERC_conversational_level/val_conversation_level.json'))

In [100]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [101]:
emotion2int = {
    'anger': 0,
    'joy': 1,
    'fear': 2,
    'disgust': 3,
    'neutral': 4,
    'surprise': 5,
    'sadness': 6
}

In [102]:
utterance2vec = pickle.load(open('../../Dataset/Embeddings/sentence_transformer_utterance2vec_768.pkl', 'rb'))

In [103]:
MAX_CONV_LEN = 35
# Defined index 7 for padding
class ERC_Dataset_Conv_Level(Dataset):
    def __init__(self, data, utterance2vec):
        self.data = data
        self.utterance2vec = utterance2vec

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        conversation = self.data[idx]['conversation']
        texts = [utterance['text'] for utterance in conversation]
        emotions = [emotion2int[utterance['emotion']] for utterance in conversation]
        text_embeddings = [torch.from_numpy(self.utterance2vec[preprocess_text(text)]) for text in texts]
                
        if(len(text_embeddings)<MAX_CONV_LEN):
            num_pads = MAX_CONV_LEN - len(text_embeddings)
            attention_mask = [1]*len(text_embeddings) + [0]*num_pads
            text_embeddings = text_embeddings + [torch.zeros(768)]*num_pads
            emotions = emotions + [7]*num_pads # 7 is the index for padding
        else:
            text_embeddings = text_embeddings[len(text_embeddings)-MAX_CONV_LEN:]
            attention_mask = [1]*MAX_CONV_LEN
            emotions = emotions[len(emotions)-MAX_CONV_LEN:]

        text_embeddings = torch.stack(text_embeddings)
        attention_mask = torch.tensor(attention_mask)
        emotions = torch.tensor(emotions)
        return {
            'text_embeddings': text_embeddings,
            'attention_mask': attention_mask,
            'emotions': emotions
        }

In [104]:
train_dataset = ERC_Dataset_Conv_Level(train_data, utterance2vec)
val_dataset = ERC_Dataset_Conv_Level(val_data, utterance2vec)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [105]:
class ERC_GPT2(GPT2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.GPT2 = GPT2Model(config)
        self.classifier = nn.Linear(config.n_embd, self.num_labels)

        self.post_init()

    def forward(self, inputs_embeds, attention_mask, labels=None):
        outputs = self.GPT2(inputs_embeds=inputs_embeds, attention_mask=attention_mask)
        outputs = outputs.last_hidden_state.reshape(-1, 768)
        attention_mask, labels = attention_mask.reshape(-1), labels.reshape(-1)
        outputs = [outputs[i] for i in range(len(attention_mask)) if attention_mask[i] == 1]
        labels = [labels[i] for i in range(len(attention_mask)) if attention_mask[i] == 1]
        labels = torch.tensor(labels).cpu()
        outputs = torch.stack(outputs)
        logits = self.classifier(outputs).cpu()
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)
        return {
            'loss': loss,
            'logits': logits,
            'labels': labels    
        }

In [106]:
config = GPT2Config.from_pretrained('gpt2', num_labels=7)
model = ERC_GPT2.from_pretrained('gpt2', config=config)

Some weights of ERC_GPT2 were not initialized from the model checkpoint at gpt2 and are newly initialized: ['GPT2.h.0.attn.c_attn.bias', 'GPT2.h.0.attn.c_attn.weight', 'GPT2.h.0.attn.c_proj.bias', 'GPT2.h.0.attn.c_proj.weight', 'GPT2.h.0.ln_1.bias', 'GPT2.h.0.ln_1.weight', 'GPT2.h.0.ln_2.bias', 'GPT2.h.0.ln_2.weight', 'GPT2.h.0.mlp.c_fc.bias', 'GPT2.h.0.mlp.c_fc.weight', 'GPT2.h.0.mlp.c_proj.bias', 'GPT2.h.0.mlp.c_proj.weight', 'GPT2.h.1.attn.c_attn.bias', 'GPT2.h.1.attn.c_attn.weight', 'GPT2.h.1.attn.c_proj.bias', 'GPT2.h.1.attn.c_proj.weight', 'GPT2.h.1.ln_1.bias', 'GPT2.h.1.ln_1.weight', 'GPT2.h.1.ln_2.bias', 'GPT2.h.1.ln_2.weight', 'GPT2.h.1.mlp.c_fc.bias', 'GPT2.h.1.mlp.c_fc.weight', 'GPT2.h.1.mlp.c_proj.bias', 'GPT2.h.1.mlp.c_proj.weight', 'GPT2.h.10.attn.c_attn.bias', 'GPT2.h.10.attn.c_attn.weight', 'GPT2.h.10.attn.c_proj.bias', 'GPT2.h.10.attn.c_proj.weight', 'GPT2.h.10.ln_1.bias', 'GPT2.h.10.ln_1.weight', 'GPT2.h.10.ln_2.bias', 'GPT2.h.10.ln_2.weight', 'GPT2.h.10.mlp.c_fc.bias

In [107]:
epochs = 5
optimizer = AdamW(model.parameters(), lr=1e-5)

In [108]:
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# secret_value_0 = user_secrets.get_secret("wandb_login_key")
# wandb.login(key=secret_value_0)

In [109]:
# wandb.init(project='TECPEC', name='GPT2_Conv_Level', config={
#     'Embedding': 'Sentence-Transformer',
#     'Level': 'Conversation Level',
#     'Epochs': epochs,
#     'Optimizer': 'AdamW',
#     'Learning Rate': 1e-5,
#     'Batch Size': 16
# })

In [110]:
for epoch in range(epochs):
    model.train()
    train_pred, train_true, train_loss = [], [], 0.0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        text_embeddings, attention_mask, emotions = batch['text_embeddings'].to(device), batch['attention_mask'].to(device), batch['emotions'].to(device)
        outputs = model(inputs_embeds=text_embeddings, attention_mask=attention_mask, labels=emotions)
        loss = outputs['loss']
        loss.backward()
        optimizer.step()
        train_pred.extend(torch.argmax(outputs['logits'], 1).tolist())
        train_true.extend(outputs['labels'].tolist())
        train_loss += loss.item()
    train_loss /= len(train_loader) 
    model.eval()
    val_pred, val_true, val_loss = [], [], 0.0
    with torch.no_grad():
        for batch in tqdm(val_loader):
            text_embeddings, attention_mask, emotions = batch['text_embeddings'].to(device), batch['attention_mask'].to(device), batch['emotions'].to(device)
            outputs = model(inputs_embeds=text_embeddings, attention_mask=attention_mask, labels=emotions)
            loss = outputs['loss']
            val_pred.extend(torch.argmax(outputs['logits'], 1).tolist())
            val_true.extend(outputs['labels'].tolist())
            val_loss += loss.item()
            
    val_loss /= len(val_loader)
    train_report = classification_report(train_true, train_pred, target_names=emotion2int.keys(), zero_division=0)
    val_report = classification_report(val_true, val_pred, target_names=emotion2int.keys(), zero_division=0)

    train_report_dict = classification_report(train_true, train_pred, target_names=emotion2int.keys(), output_dict=True, zero_division=0)
    val_report_dict = classification_report(val_true, val_pred, target_names=emotion2int.keys(), output_dict=True, zero_division=0)
    # wandb.log({
    #     'train_loss': train_loss,
    #     'val_loss': val_loss,
    #     'train_accuracy': train_report_dict['accuracy'],
    #     'val_accuracy': val_report_dict['accuracy'],
    #     'Macro train_f1': train_report_dict['macro avg']['f1-score'],
    #     'Macro val_f1': val_report_dict['macro avg']['f1-score'],
    #     'Weighted train_f1': train_report_dict['weighted avg']['f1-score'],
    #     'Weighted val_f1': val_report_dict['weighted avg']['f1-score'],
    # })
    print(f"Epoch: {epoch+1}, Train Loss: {train_loss}, Val Loss: {val_loss}")
    print(f"Train Report: \n{train_report}")
    print(f"Val Report: \n{val_report}")


  4%|▍         | 3/78 [00:12<05:06,  4.08s/it]


KeyboardInterrupt: 