<h1>Mulitclass classification fine tuned bert</h1>

In [1]:
# libraries

import pandas as pd
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import Trainer, TrainingArguments
from transformers import BertTokenizer, BertForSequenceClassification, BertModel, AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl
import torchmetrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

# import + preprocess the data
def preprocessing(df):     
    # 0: comment is not hateful
    # 1: comment is hateful (target_race)
    # 2: comment is hateful (target_religion)
    # 3: comment is hateful (target_origin)
    # 4: comment is hateful (target_gender)
    # 5: comment is hateful (target_sexuality)
    # 6: comment is hateful (target_age)
    # 7: comment is hateful (target_disability)
    l = []
    for i in range(len(df)):
        list_class = [0] * 8
        list_class[int(df['label'].iloc[i])] = 1
        l.append(list_class)
    
    df['Class'] = l
    
    return df

def create_class(i):
    """
    Creating a list of lenght 8 with only zeros except from a one at index i
    """
    l = [0] * 8
    l[i]=1
    return l



# Custome the data for our need
class HateSpeechData(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __getitem__(self, index):
        item = {key:torch.tensor(value[index]) for key, value in self.X.items()}
        item["labels"] = torch.tensor(self.y[index], dtype=torch.float)
        return item
    
    def __len__(self):
        return len(self.X["input_ids"])
    

# Dataloader
def dataloader(df, val_frac, test_frac, batch_size, max_lenght=None):

    if max_lenght is None:
        X = list(df['text'])
        y = list(df['Class'])
    else:
        X = list(df['text'][:MAXIMUM])
        y = list(df['Class'][:MAXIMUM])

    # split the data
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_frac, stratify=y)
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=test_frac, stratify=y_train)

    # initialize the tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Tokenize the comment text --> create an object that has free keys : input_ids, attention_mask, 
    X_train_tokenize = tokenizer(X_train, padding=True, truncation=True, max_length=512)
    X_val_tokenize = tokenizer(X_val, padding=True, truncation=True, max_length=512)  
    X_test_tokenize = tokenizer(X_test, padding=True, truncation=True, max_length=512)
    
    train_dataset = HateSpeechData(X_train_tokenize, y_train)
    val_dataset = HateSpeechData(X_val_tokenize, y_val)
    test_dataset = HateSpeechData(X_test_tokenize, y_test)

    trainloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    validloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    testloader = DataLoader(test_dataset, batch_size=batch_size, shuffle = True, num_workers=0)

    return trainloader, validloader, testloader

    
#Create the BERT model we will use to fine tuned
class BERTForFineTuning(torch.nn.Module):
    def __init__(self):
        super(BERTForFineTuning, self).__init__()
        # first layer is the bert
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        # apply a dropout
        self.l2 = torch.nn.Dropout(0.3)
        # feature bert input is 768 and we want the prediction on the 8 class
        self.l3 = torch.nn.Linear(768, 8)
    
    def forward(self, ids, mask, token_type_ids):

        output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output_2 = self.l2(output_1[1])
        output = self.l3(output_2)
        return output
    

def loss_fn(outputs, targets):
    #print(len(targets))
    #print(len(outputs))
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

def train(nm_epoch, training_loader):
    
    model = BERTForFineTuning()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

    # set the model to training mode
    model.train()
    for epoch in range(nm_epoch):
        running_loss = 0.0
        for i, data in enumerate(training_loader, 0):
            ids = data['input_ids'].to(device, dtype = torch.long)
            attention_mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            labels = data['labels'].to(device, dtype = torch.float)

            # initialize the optimizer
            optimizer.zero_grad()
            #forward inputs
            outputs = model.forward(ids, attention_mask, token_type_ids)
            # define the loss
            loss = loss_fn(outputs, labels)
            # backpropagate
            loss.backward()
            # print("Capturing:", torch.cuda.is_current_stream_capturing())
            optimizer.step()
            # add the loss to the running loss
            running_loss+=loss.item()

            if i % 2000 == 1999:    # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                    (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
    
    try:
        torch.save(model.state_dict(), 'fine_tuned_bert.pt')
        print('Model has been saved !')
    except:
        print('The model has already been saved!')

    return model

def validation(epoch, testing_loader):
    model = BERTForFineTuning()
    model.load_state_dict(torch.load('fine_tuned_bert.pt'))
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    model.eval()

    fin_targets=[]
    fin_outputs=[]
    
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['labels'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df.head()

Unnamed: 0,comment_id,text,hate_speech_score,label,Class
0,1,White people are trash.,0.46,0,"[1, 0, 0, 0, 0, 0, 0, 0]"
1,2,"You've caught the big gay, it's known to be ai...",0.03,0,"[1, 0, 0, 0, 0, 0, 0, 0]"
2,3,"I'm not saying you said that, I'm just saying ...",-1.29,0,"[1, 0, 0, 0, 0, 0, 0, 0]"
3,4,Donald Trump. Yeet myself off a building onto ...,-0.24,0,"[1, 0, 0, 0, 0, 0, 0, 0]"
4,5,Fabrice Fabrice is ostensibly black or black/l...,-2.84,0,"[1, 0, 0, 0, 0, 0, 0, 0]"


In [2]:

df = pd.read_csv('../data/hate_speech_preprocessed.csv')
df = preprocessing(df)
trainloader, validloader, testloader = dataloader(df, val_frac=0.2, test_frac=0.3, batch_size=8)


In [3]:
m = train(2,trainloader )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[1,  2000] loss: 0.180
[2,  2000] loss: 0.118
Model has been saved !


In [3]:
for epoch in range(2):
    outputs, targets = validation(epoch, validloader)
    outputs = np.array(outputs) >= 0.5
    accuracy = accuracy_score(targets, outputs)
    f1_score_micro = f1_score(targets, outputs, average='micro')
    f1_score_macro = f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Accuracy Score = 0.805762669025654
F1 Score (Micro) = 0.836569790173335
F1 Score (Macro) = 0.5331431122474659


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Accuracy Score = 0.805762669025654
F1 Score (Micro) = 0.836569790173335
F1 Score (Macro) = 0.5331431122474659
