In [None]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification,AutoConfig,DataCollatorWithPadding,AdamW,get_scheduler
from transformers.utils import ModelOutput
import torch
from torch import nn
import torch.nn.functional as F
from dataclasses import dataclass
from torch.utils.data import DataLoader
from datasets import Dataset,DatasetDict
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import classification_report
from typing import Optional, Tuple
import random

In [None]:
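# Reproducibility: uncomment the three lines below to seed the Python, NumPy
# and PyTorch random number generators before running the rest of the notebook.
# random.seed(46)
# np.random.seed(46)
# torch.manual_seed(46)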

In [None]:
# hyper params
class Hyper_params:
    """Plain container holding every tunable setting used by this notebook.

    All values are assigned as instance attributes in ``__init__``; a single
    shared instance is created right after the class as ``hypams``.
    """

    def __init__(self):
        # --- tokenisation / batching ---
        self.max_length = 128
        self.batch_size = 8

        # --- optimisation ---
        self.learning_rate = 2e-5
        self.num_epochs = 3

        # --- task-specific layers ---
        self.aggression_n_hidden = 2
        self.offense_n_hidden = 3
        self.dropout_prob = 0.15

        # weight of the aggression loss in the combined objective
        self.w_a = 0.4

        # use the GPU when one is visible to torch, otherwise the CPU
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)


hypams = Hyper_params()

In [None]:
@dataclass
class Custom_Model_ouputs(ModelOutput):
    """Output container for the two-task (aggression / offense) model.

    Every field defaults to ``None``.
    """

    # combined loss over both tasks
    loss: Optional[torch.FloatTensor] = None
    # loss of the aggression task alone
    loss_a: Optional[torch.FloatTensor] = None
    # loss of the offense task alone
    loss_o: Optional[torch.FloatTensor] = None
    # raw (un-normalised) scores of the aggression classifier
    logits_a: Optional[torch.FloatTensor] = None
    # raw (un-normalised) scores of the offense classifier
    logits_o: Optional[torch.FloatTensor] = None
    # hidden states forwarded from the underlying pretrained model's output
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    # attention tensors forwarded from the underlying pretrained model's output
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


In [None]:
# Pretrained checkpoint used for both the tokenizer (below) and the model.
# Exactly one `model_name` assignment should be active; the commented lines
# are alternative checkpoints.
# model_name="l3cube-pune/hing-roberta"
# model_name='FacebookAI/xlm-roberta-base'
# model_name='FacebookAI/roberta-base'
model_name='google-bert/bert-base-uncased'

# Locations of the train / test / validation CSV files. The commented groups
# are alternative splits; keep exactly one group of three paths active.
# NOTE(review): these are absolute, machine-specific paths — the notebook will
# not run elsewhere without editing them; consider a configurable data dir.
# train_path="/home/bharat/bharath/LBP/original_split/train.csv"
# test_path="/home/bharat/bharath/LBP/original_split/test.csv"
# val_path="/home/bharat/bharath/LBP/original_split/validation.csv"


# train_path="/home/bharat/bharath/LBP/same_split/cs_train.csv"
# test_path="/home/bharat/bharath/LBP/same_split/cs_test.csv"
# val_path="/home/bharat/bharath/LBP/same_split/cs_validation.csv"

train_path="/home/bharat/bharath/LBP/same_split/ms_train.csv"
test_path="/home/bharat/bharath/LBP/same_split/ms_test.csv"
val_path="/home/bharat/bharath/LBP/same_split/ms_validation.csv"
# Tokenizer matching the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [None]:
# Read the three CSV splits (UTF-8) in a fixed order: train, test, validation.
train_set, test_set, val_set = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (train_path, test_path, val_path)
)

# Uncomment to smoke-test the notebook on the first 10 rows of each split.
# train_set=train_set.head(10)
# test_set=test_set.head(10)
# val_set=val_set.head(10)
print(train_set.shape, test_set.shape, val_set.shape)
train_set.head()

In [None]:
# Keep only the text column and the two label columns of every split, and
# wrap each resulting frame in a `Dataset`.
train_ds, test_ds, val_ds = (
    Dataset.from_pandas(frame[['tweet', 'aggression', 'offense']])
    for frame in (train_set, test_set, val_set)
)

# Bundle the splits under the keys used by the rest of the notebook.
data = DatasetDict({'train': train_ds, 'val': val_ds, 'test': test_ds})
data

In [None]:
def tokenize(batch):
    """Tokenize the ``tweet`` entries of ``batch`` with the notebook tokenizer.

    Sequences are truncated and padded to ``hypams.max_length`` and the
    tokenizer is asked for PyTorch tensors.
    """
    encode_kwargs = dict(
        truncation=True,
        max_length=hypams.max_length,
        padding='max_length',
        return_tensors='pt',
    )
    return tokenizer(batch['tweet'], **encode_kwargs)


# Apply the tokenizer to every split, one batch of rows at a time.
tokenized_ds = data.map(tokenize, batched=True)
tokenized_ds

In [None]:
# Return only the model inputs and the two label columns, as torch tensors.
tokenized_ds.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'aggression', 'offense'],
)
# Collator used by every DataLoader in this notebook.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_ds

In [None]:
class ClassificationHead(nn.Module):
    """Two stacked linear projections applied to the first token of a sequence.

    Args:
        num_labels: number of output classes (size of the returned logits).
        inp_size: size of the last dimension of the incoming features.
        hidden_size: width of the intermediate projection. Defaults to 768,
            the value that was previously hard-coded.
        dropout_prob: probability for the ``dropout`` sub-module. When ``None``
            (the default) the notebook-wide ``hypams.dropout_prob`` is used.
    """

    def __init__(self, num_labels, inp_size, hidden_size=768, dropout_prob=None):
        super().__init__()
        if dropout_prob is None:
            dropout_prob = hypams.dropout_prob
        self.dense = nn.Linear(inp_size, hidden_size)
        # Registered as a sub-module, but forward() currently does not apply it.
        self.dropout = nn.Dropout(dropout_prob)
        self.out_proj = nn.Linear(hidden_size, num_labels)

    def forward(self, features):
        """Return the logits computed from position 0 of ``features``.

        ``features`` is indexed as ``features[:, 0, :]``, so it must have
        three dimensions; the result has shape
        ``(features.shape[0], num_labels)``.
        """
        first_token = features[:, 0, :]
        # Dropout and tanh are intentionally left out of this path: the two
        # projections are applied back to back with no non-linearity between.
        hidden = self.dense(first_token)
        return self.out_proj(hidden)


In [None]:

class WeightedCrossEntropyLoss(nn.Module):
    """Cross-entropy loss whose class weights are recomputed from each batch.

    For a batch with ``N`` targets, class ``c`` receives the weight
    ``N / (num_classes * count_c)``, where ``count_c`` is the number of times
    ``c`` occurs in ``target``; rarer classes therefore weigh more.

    Args:
        num_classes: number of classes; also the minimum length of the
            per-class count vector.
    """

    def __init__(self, num_classes):
        super(WeightedCrossEntropyLoss, self).__init__()
        self.num_classes = num_classes

    def forward(self, input, target):
        """Return the weighted cross-entropy of ``input`` logits vs ``target``.

        Args:
            input: logits, passed unchanged to ``F.cross_entropy``.
            target: 1-D tensor of non-negative integer class indices (it is
                fed to ``torch.bincount``).
        """
        # Per-class frequency inside this batch.
        class_counts = torch.bincount(target, minlength=self.num_classes).float()
        total_samples = class_counts.sum()

        # A class missing from the batch has a count of 0, which would make its
        # weight infinite. Clamping to 1 keeps the weight vector finite; the
        # weights of the classes that are present are unaffected.
        safe_counts = class_counts.clamp(min=1)
        class_weights = total_samples / (self.num_classes * safe_counts)

        return F.cross_entropy(input, target, weight=class_weights)


In [None]:
class task_specific_hidden_layers(nn.Module):
    """Stack of ``n_hidden`` linear layers used as a task-specific block.

    Every layer except the last is followed by layer normalisation and
    dropout; the output of the last layer is returned as is. With
    ``n_hidden == 0`` the block is a pass-through.

    Args:
        n_hidden: number of linear layers.
        hin_size: feature size expected by the first layer.
        hout_size: feature size produced by every layer (and expected by every
            layer after the first).
        dropout_prob: dropout probability. When ``None`` (the default) the
            notebook-wide ``hypams.dropout_prob`` is used.
    """

    def __init__(self, n_hidden, hin_size=768, hout_size=768, dropout_prob=None):
        super(task_specific_hidden_layers, self).__init__()
        if dropout_prob is None:
            dropout_prob = hypams.dropout_prob
        self.dropout = nn.Dropout(dropout_prob)
        # NOTE: despite its name this attribute is a LayerNorm, not a ReLU. The
        # name is kept so existing state_dict keys stay valid. It normalises
        # layer *outputs*, hence it is sized with hout_size.
        self.relu = nn.LayerNorm(hout_size)
        # Only the first layer sees hin_size; later layers consume the previous
        # layer's hout_size output (previously every layer was hin -> hout,
        # which broke as soon as the two sizes differed).
        self.hidden = nn.ModuleList([
            nn.Linear(hin_size if index == 0 else hout_size, hout_size)
            for index in range(n_hidden)
        ])

    def forward(self, input):
        """Run ``input`` through the stack and return the last layer's output."""
        if len(self.hidden) == 0:
            # No layers configured: behave as the identity instead of failing
            # on ``self.hidden[-1]``.
            return input

        for layer in self.hidden[:-1]:
            input = layer(input)
            input = self.relu(input)
            input = self.dropout(input)

        return self.hidden[-1](input)
        

In [None]:
class Hingroberta_MTL (nn.Module):
    """Multi-task classifier: one pretrained model shared by two heads.

    The pretrained model selected by ``model_name`` produces hidden states;
    an aggression head and an offense head each read the first-token vector of
    the last hidden state. The offense head additionally goes through
    task-specific layers with a residual connection.

    Args:
        n_a_labels: number of aggression classes.
        n_o_labels: number of offense classes.
    """

    def __init__(self,n_a_labels=3,n_o_labels=2):

        super(Hingroberta_MTL,self).__init__()
        # Informational attribute; follows the notebook-wide device instead of
        # a hard-coded 'cuda' so it is also correct on CPU-only machines.
        self.device=hypams.device
        self.n_a_labels=n_a_labels
        self.n_o_labels=n_o_labels
        config=AutoConfig.from_pretrained(
            model_name,
            hidden_dropout_prob=hypams.dropout_prob,
            # both are needed because forward() reads outputs.hidden_states and
            # forwards outputs.attentions to the caller
            output_attentions=True,
            output_hidden_states=True,
        )
        self.model=AutoModelForSequenceClassification.from_pretrained(model_name,config=config)
        # Size the heads from the checkpoint instead of assuming 768; falls
        # back to the former constant if the config does not expose it.
        hidden_size=getattr(config,"hidden_size",768)
        self.classifier_a=ClassificationHead(n_a_labels,hidden_size)
        self.classifier_o=ClassificationHead(n_o_labels,hidden_size)
        # Instantiated (and therefore part of the parameters / state_dict) but
        # not applied in forward(): the aggression head reads the shared
        # representation directly.
        self.aggression_task=task_specific_hidden_layers(
            n_hidden=hypams.aggression_n_hidden,hin_size=hidden_size,hout_size=hidden_size)
        self.offensize_task=task_specific_hidden_layers(
            n_hidden=hypams.offense_n_hidden,hin_size=hidden_size,hout_size=hidden_size)


    def forward(self,input_ids=None,attention_mask=None,aggression=None,offense=None):
        """Compute logits for both tasks and, when labels are given, the losses.

        Args:
            input_ids, attention_mask: forwarded to the pretrained model.
            aggression: optional aggression labels.
            offense: optional offense labels.

        Returns:
            ``Custom_Model_ouputs``. ``loss_a`` / ``loss_o`` are set for each
            task whose labels were supplied; ``loss`` (the ``hypams.w_a``
            weighted sum) is set only when both were. Missing losses are
            ``None``, so the model can be called without labels for inference.
        """
        outputs=self.model(input_ids=input_ids,attention_mask=attention_mask)
        # last entry of the hidden-state tuple returned by the pretrained model
        sequence_output=outputs.hidden_states[-1]
        sequence_output_o=self.offensize_task(sequence_output)
        logits_a=self.classifier_a(sequence_output)
        # residual connection around the offense-specific layers
        logits_o=self.classifier_o(sequence_output_o+sequence_output)

        # Initialise all three so the return statement below never touches an
        # unbound name when labels are omitted.
        loss=None
        loss_a=None
        loss_o=None

        if aggression is not None:
            loss_fct_a=WeightedCrossEntropyLoss(self.n_a_labels)
            loss_a=loss_fct_a(logits_a.view(-1,self.n_a_labels),aggression.view(-1))

        if offense is not None:
            loss_fct_o=WeightedCrossEntropyLoss(self.n_o_labels)
            loss_o=loss_fct_o(logits_o.view(-1,self.n_o_labels),offense.view(-1))

        if loss_a is not None and loss_o is not None:
            loss=(hypams.w_a)*loss_a+(1-hypams.w_a)*loss_o

        return Custom_Model_ouputs(
            loss=loss,
            loss_a=loss_a,
            loss_o=loss_o,
            logits_a=logits_a,
            logits_o=logits_o,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions
        )
    

In [None]:
# Build the multi-task model with its default label counts, then move its
# parameters to the device selected in the hyper-parameters.
model = Hingroberta_MTL()
model = model.to(hypams.device)


In [None]:
# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(
    dataset=tokenized_ds['train'],
    batch_size=hypams.batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

# Validation batches keep the dataset order.
eval_dataloader = DataLoader(
    dataset=tokenized_ds['val'],
    batch_size=hypams.batch_size,
    collate_fn=data_collator,
)

# Number of training batches per epoch.
len(train_dataloader)

In [None]:
# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated and its
# deprecation notice points to `torch.optim.AdamW` as the replacement.
# eps=1e-6 is passed explicitly because that was the default of the
# transformers implementation, whereas torch defaults to 1e-8.
optimizer=torch.optim.AdamW(
    model.parameters(),
    lr=hypams.learning_rate,
    eps=1e-6,
    weight_decay=0.5,
)

# One scheduler step is taken per training batch, so the linear decay spans
# every batch of every epoch.
num_training_steps=hypams.num_epochs*len(train_dataloader)
lr_scheduler=get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [None]:

# One progress bar per phase, each sized for all epochs together.
progress_bar_train =tqdm(range(num_training_steps))
progress_bar_eval=tqdm(range(hypams.num_epochs*len(eval_dataloader)))

for epoch in range(hypams.num_epochs):

    # ---- training pass ----
    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(hypams.device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        # clear gradients so they do not accumulate into the next batch
        optimizer.zero_grad()
        progress_bar_train.update(1)

    # ---- validation pass ----
    model.eval()
    eval_loss=[]
    eval_loss_a=[]
    eval_loss_o=[]
    all_preds_a = []
    all_labels_a = []
    all_preds_o = []
    all_labels_o = []
    for batch in eval_dataloader:
        batch = {k: v.to(hypams.device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        # .item() copies the scalar to the host as a Python float
        eval_loss.append(outputs.loss.item())
        eval_loss_a.append(outputs.loss_a.item())
        eval_loss_o.append(outputs.loss_o.item())
        pred_a = torch.argmax(outputs.logits_a, dim=-1)
        pred_o = torch.argmax(outputs.logits_o, dim=-1)
        # Collecting predictions and labels for each batch
        all_preds_a.extend(pred_a.cpu().numpy())
        all_labels_a.extend(batch['aggression'].cpu().numpy())
        all_preds_o.extend(pred_o.cpu().numpy())
        all_labels_o.extend(batch['offense'].cpu().numpy())
        progress_bar_eval.update(1)

    # The per-batch validation losses were collected but never shown; report
    # their means so each epoch's loss is visible next to the reports.
    print(
        f"epoch {epoch + 1}/{hypams.num_epochs} - "
        f"val loss: {np.mean(eval_loss):.4f} "
        f"(aggression: {np.mean(eval_loss_a):.4f}, offense: {np.mean(eval_loss_o):.4f})"
    )
    print(classification_report(all_labels_a, all_preds_a))
    print(classification_report(all_labels_o, all_preds_o))
    


In [None]:
# Final evaluation on the held-out test split.
model.eval()
test_dataloader = DataLoader(
    dataset=tokenized_ds['test'],
    batch_size=hypams.batch_size,
    collate_fn=data_collator,
)
progress_bar_test = tqdm(range(len(test_dataloader)))

# Predictions and gold labels, accumulated over all test batches.
all_preds_a, all_labels_a = [], []
all_preds_o, all_labels_o = [], []

for batch in test_dataloader:
    batch = {name: tensor.to(hypams.device) for name, tensor in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits_a, logits_o = outputs.logits_a, outputs.logits_o
    # predicted class = index of the largest logit
    pred_a = logits_a.argmax(dim=-1)
    pred_o = logits_o.argmax(dim=-1)

    # aggression task
    all_preds_a.extend(pred_a.cpu().numpy())
    all_labels_a.extend(batch['aggression'].cpu().numpy())
    # offense task
    all_preds_o.extend(pred_o.cpu().numpy())
    all_labels_o.extend(batch['offense'].cpu().numpy())
    progress_bar_test.update(1)


# One report per task: aggression first, then offense.
print(classification_report(all_labels_a, all_preds_a))
print(classification_report(all_labels_o, all_preds_o))
