<h1>Mulitclass classification fine tuned bert</h1>

In [5]:
# libraries

import pandas as pd
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import Trainer, TrainingArguments
from transformers import BertTokenizer, BertForSequenceClassification, BertModel, AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl
import torchmetrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

# import + preprocess the data
def preprocessing(df):     
    # 0: comment is not hateful
    # 1: comment is hateful (target_race)
    # 2: comment is hateful (target_religion)
    # 3: comment is hateful (target_origin)
    # 4: comment is hateful (target_gender)
    # 5: comment is hateful (target_sexuality)
    # 6: comment is hateful (target_age)
    # 7: comment is hateful (target_disability)
    l = []
    for i in range(len(df)):
        list_class = [0] * 8
        list_class[int(df['label'].iloc[i])] = 1
        l.append(list_class)
    
    df['Class'] = l
    
    return df

def create_class(i):
    """
    Creating a list of lenght 8 with only zeros except from a one at index i
    """
    l = [0] * 8
    l[i]=1
    return l



# Custome the data for our need
class HateSpeechData(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __getitem__(self, index):
        item = {key:torch.tensor(value[index]) for key, value in self.X.items()}
        item["labels"] = torch.tensor(self.y[index], dtype=torch.float)
        return item
    
    def __len__(self):
        return len(self.X["input_ids"])
    

# Dataloader
def dataloader(df, val_frac, test_frac, batch_size, max_lenght=None):

    if max_lenght is None:
        X = list(df['text'])
        y = list(df['Class'])
    else:
        X = list(df['text'][:max_lenght])
        y = list(df['Class'][:max_lenght])

    # split the data
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_frac, stratify=y)
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=test_frac, stratify=y_train)

    # initialize the tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Tokenize the comment text --> create an object that has free keys : input_ids, attention_mask, 
    X_train_tokenize = tokenizer(X_train, padding=True, truncation=True, max_length=512)
    X_val_tokenize = tokenizer(X_val, padding=True, truncation=True, max_length=512)  
    X_test_tokenize = tokenizer(X_test, padding=True, truncation=True, max_length=512)
    
    train_dataset = HateSpeechData(X_train_tokenize, y_train)
    val_dataset = HateSpeechData(X_val_tokenize, y_val)
    test_dataset = HateSpeechData(X_test_tokenize, y_test)

    trainloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    validloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    testloader = DataLoader(test_dataset, batch_size=batch_size, shuffle = True, num_workers=0)

    return trainloader, validloader, testloader

    
#Create the BERT model we will use to fine tuned
class BERTForFineTuning(torch.nn.Module):
    def __init__(self):
        super(BERTForFineTuning, self).__init__()
        # first layer is the bert
        self.l1 = transformers.BertModel.from_pretrained('bert-base-cased', output_hidden_states = True)
        # apply a dropout
        self.l2 = torch.nn.Dropout(0.3)
        # feature bert input is 768 and we want the prediction on the 8 class
        self.l3 = torch.nn.Linear(768, 8)
    
    def forward(self, ids, mask, token_type_ids):
        outputs = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        output_2 = self.l2(outputs.last_hidden_state)
        output = self.l3(output_2)
        return outputs.hidden_states, output

class BERTForFineTuningtWithPooling(torch.nn.Module):
    def __init__(self):
        super(BERTForFineTuningtWithPooling, self).__init__()
        # first layer is the bert
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
        # apply a dropout
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 8)
    
    def forward(self, ids, mask, token_type_ids):
        outputs = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        pooled_output = torch.mean(outputs.last_hidden_state, dim=1)
        output_2 = self.l2(pooled_output)
        output = self.l3(output_2)
        return outputs.hidden_states, output
    

def loss_fn(outputs, targets):
    #include weights
    # weights = 1 / (torch.sqrt(torch.unique(torch.tensor(all_labels), return_counts = True)[1]))
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

def train(nm_epoch, training_loader):
    
    model = BERTForFineTuningtWithPooling()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

    # set the model to training mode
    model.train()


    accuracy=0.0
    f1_score_micro=0.0
    f1_score_micro=0.0
    
    for epoch in range(nm_epoch):
        running_loss = 0.0
        fin_targets=[]
        fin_outputs=[]
        for i, data in enumerate(training_loader, 0):
            ids = data['input_ids'].to(device, dtype = torch.long)
            attention_mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            labels = data['labels'].to(device, dtype = torch.float)

            # initialize the optimizer
            optimizer.zero_grad()
            #forward inputs
            _, output = model.forward(ids, attention_mask, token_type_ids)
            # define the loss
            loss = loss_fn(output, labels)
            # backpropagate
            loss.backward()
            # print("Capturing:", torch.cuda.is_current_stream_capturing())
            optimizer.step()
            # add the loss to the running loss
            running_loss+=loss.item()

            fin_targets.extend(labels.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(output).cpu().detach().numpy().tolist())
            

            if i % 2000 == 1999:    # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                    (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
        
        fin_outputs = np.array(fin_outputs) >= 0.5
        accuracy = accuracy_score(fin_targets, fin_outputs)
        f1_score_micro = f1_score(fin_targets, fin_outputs, average='micro')
        f1_score_macro = f1_score(fin_targets, fin_outputs, average='macro')
    
    try:
        torch.save(model.state_dict(), f'fine_tuned_bert{nm_epoch}.pt')
        print('Model has been saved !')
    except:
        print('The model has already been saved!')

    return model, accuracy, f1_score_micro, f1_score_macro

def validation(validation_loader, model_name):
    model = BERTForFineTuningtWithPooling()
    model.load_state_dict(torch.load(model_name))
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    model.eval()

    fin_targets=[]
    fin_outputs=[]
    
    with torch.no_grad():
        for _, data in enumerate(validation_loader, 0):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['labels'].to(device, dtype = torch.float)
            _,output = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(output).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets


def model_selection(num_epochs, training_loader, validation_loader):
    
    report = []

    for epoch in range(1,num_epochs):
        
        # dict to report performance
        metrics = {
            'num_epochs' : epoch,
            'Training_Accuracy' : 0.0,
            'Training_f1_micro': 0.0,
            'Training_f1_macro': 0.0,
            'Validation_Accuracy' : 0.0,
            'Validation_f1_micro': 0.0,
            'Validation_f1_macro': 0.0,
        }
    
        print(f'Epoch{epoch}: Start Training...')
        model, accuracy_train,f1_score_micro_train, f1_score_macro_train = train(epoch, training_loader)
        print(f'Epoch{epoch}: Traning is done...')
        print(f'Epoch{epoch}: Start Validation...')
        model_name = f'fine_tuned_bert{epoch}.pt'
        outputs, targets = validation(validation_loader, model_name)
        print(f'Epoch{epoch}: Validation is done...')
        outputs = np.array(outputs) >= 0.5
        accuracy_valid = accuracy_score(targets, outputs)
        f1_score_micro_valid = f1_score(targets, outputs, average='micro')
        f1_score_macro_valid = f1_score(targets, outputs, average='macro')
        # print results
        metrics['Training_Accuracy'] = accuracy_train
        metrics['Training_f1_micro'] = f1_score_micro_train
        metrics['Training_f1_macro'] = f1_score_macro_train
        metrics['Validation_Accuracy'] = accuracy_valid
        metrics['Validation_f1_micro'] = f1_score_micro_valid
        metrics['Validation_f1_macro'] = f1_score_macro_valid

        report.append(metrics)
    
    return report


In [2]:

df = pd.read_csv('../data/hate_speech_preprocessed.csv')
df = preprocessing(df)
trainloader, validloader, testloader = dataloader(df, val_frac=0.2, test_frac=0.3, batch_size=4)


In [40]:
len(trainloader.dataset)

22156

In [None]:
report = model_selection(2, trainloader, validloader)

In [3]:
m = train(1,trainloader)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[1,  2000] loss: 0.180
[2,  2000] loss: 0.118
Model has been saved !


In [3]:
for epoch in range(2):
    outputs, targets = validation(epoch, validloader)
    outputs = np.array(outputs) >= 0.5
    accuracy = accuracy_score(targets, outputs)
    f1_score_micro = f1_score(targets, outputs, average='micro')
    f1_score_macro = f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Accuracy Score = 0.805762669025654
F1 Score (Micro) = 0.836569790173335
F1 Score (Macro) = 0.5331431122474659


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Accuracy Score = 0.805762669025654
F1 Score (Micro) = 0.836569790173335
F1 Score (Macro) = 0.5331431122474659


In [3]:
class BERTForFineTuningTestWithPooling(torch.nn.Module):
    def __init__(self):
        super(BERTForFineTuningTestWithPooling, self).__init__()
        # first layer is the bert
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
        # apply a dropout
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 8)
    
    def forward(self, ids, mask, token_type_ids):
        outputs = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        pooled_output = torch.mean(outputs.last_hidden_state, dim=1)
        output_2 = self.l2(pooled_output)
        output = self.l3(output_2)
        return outputs.hidden_states, output

In [None]:
class BERTForFineTuningTest(torch.nn.Module):
    def __init__(self):
        super(BERTForFineTuningTest, self).__init__()
        # first layer is the bert
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
        # apply a dropout
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 8)
    
    def forward(self, ids, mask, token_type_ids):
        outputs = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        output_2 = self.l2(outputs[1])
        output = self.l3(output_2)
        print(output.size())
        return outputs.hidden_states, output

In [46]:
#Create the BERT model we will use to fine tuned
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        # first layer is the bert
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        # apply a dropout
        self.l2 = torch.nn.Dropout(0.3)
        # feature bert input is 768 and we want the prediction on the 8 class
        self.l3 = torch.nn.Linear(768, 8)
    
    def forward(self, ids, mask, token_type_ids):
        output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output_2 = self.l2(output_1[1])
        output = self.l3(output_2)
        return output

In [62]:
test = BERTForFineTuningTestWithPooling()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
test.to(device)
with torch.no_grad():
    for _, data in enumerate(validloader, 0):
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['labels'].to(device, dtype = torch.float)
        hiddent_states,output = test(ids, mask, token_type_ids)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [64]:
output

tensor([[ 0.1555,  0.0198,  0.0561, -0.2436, -0.2888, -0.0205, -0.3739, -0.2089]],
       device='cuda:0')

In [27]:
test1 = BERTClass()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
test1.to(device)
with torch.no_grad():
    for _, data in enumerate(validloader, 0):
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['labels'].to(device, dtype = torch.float)
        output1 = test1(ids, mask, token_type_ids)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [28]:
output1

tensor([[ 0.1759,  0.2314,  0.1860, -0.9042,  0.1226,  0.6073, -0.5687,  0.3000]],
       device='cuda:0')

In [5]:
output.size()

torch.Size([1, 168, 8])