<h1>Multiclass classification with fine-tuned BERT</h1>

In [None]:
import pandas as pd
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import Trainer, TrainingArguments
from transformers import BertTokenizer, BertForSequenceClassification, BertModel, AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl
import torchmetrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

# Define the BERT model that we will fine-tune
class BERTForFineTuning(torch.nn.Module):
    """BERT encoder followed by dropout and a linear 8-way classification head.

    The sequence representation is the final-layer hidden state at position 0
    (the ``[CLS]`` token when inputs come from ``BertTokenizer``), so
    ``forward`` produces one logit vector per input sequence.
    """

    def __init__(self):
        super().__init__()
        # Encoder. The uncased checkpoint is used so that the embedding table
        # matches the `bert-base-uncased` tokenizer used by the data-loading
        # helpers in this notebook (a cased model has a different vocabulary).
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
        # Dropout applied to the sequence representation before the head.
        self.l2 = torch.nn.Dropout(0.3)
        # BERT-base hidden size is 768; the head predicts the 8 classes.
        self.l3 = torch.nn.Linear(768, 8)

    def forward(self, ids, mask, token_type_ids):
        """Run the encoder and classification head.

        Args:
            ids: token-id tensor passed to BERT as ``input_ids``.
            mask: attention mask passed to BERT as ``attention_mask``.
            token_type_ids: segment ids passed to BERT.

        Returns:
            ``(hidden_states, logits)`` where ``hidden_states`` is the tuple of
            per-layer hidden states returned by BERT and ``logits`` has one row
            of 8 class scores per input sequence.
        """
        outputs = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Reduce (batch, seq_len, 768) to (batch, 768) by keeping only the
        # first position; applying the head to every token would yield
        # (batch, seq_len, 8) logits that cannot be compared with (batch, 8)
        # labels.
        cls_state = outputs.last_hidden_state[:, 0]
        logits = self.l3(self.l2(cls_state))
        return outputs.hidden_states, logits

# Dataloader
def dataloader(df, val_frac, test_frac, batch_size, max_lenght=None, random_state=None):
    """Split ``df`` into train/validation/test sets and wrap each in a DataLoader.

    Args:
        df: DataFrame with a ``text`` column and a ``Class`` column.
        val_frac: fraction of the (possibly truncated) data held out for
            validation.
        test_frac: fraction of the data *remaining after the validation split*
            that is held out for testing (it is not a fraction of the full
            dataset).
        batch_size: batch size used by all three loaders.
        max_lenght: if given, only the first ``max_lenght`` rows are used.
        random_state: optional seed forwarded to both ``train_test_split``
            calls so the split can be reproduced. ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        ``(trainloader, validloader, testloader)``. Only the training loader
        is shuffled.

    Note:
        ``HateSpeechData`` is defined in a later cell of this notebook and
        must exist when this function is called.
    """
    # Slicing with `[:None]` returns the whole column, so one code path
    # covers both the truncated and the full case.
    X = list(df['text'][:max_lenght])
    y = list(df['Class'][:max_lenght])

    # split the data (stratified on the labels at both stages)
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=val_frac, stratify=y, random_state=random_state)
    X_train, X_test, y_train, y_test = train_test_split(
        X_train, y_train, test_size=test_frac, stratify=y_train, random_state=random_state)

    # initialize the tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def _make_loader(texts, labels, shuffle):
        # Tokenize the comment text: the result exposes input_ids,
        # token_type_ids and attention_mask, padded to the longest sequence in
        # this split and truncated to 512 tokens.
        encodings = tokenizer(texts, padding=True, truncation=True, max_length=512)
        dataset = HateSpeechData(encodings, labels)
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=0)

    trainloader = _make_loader(X_train, y_train, shuffle=True)
    # Evaluation loaders keep a fixed order so predictions line up with the
    # split and results are repeatable across runs.
    validloader = _make_loader(X_val, y_val, shuffle=False)
    testloader = _make_loader(X_test, y_test, shuffle=False)

    return trainloader, validloader, testloader

In [7]:
# libraries

import pandas as pd
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import Trainer, TrainingArguments
from transformers import BertTokenizer, BertForSequenceClassification, BertModel, AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl
import torchmetrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from tokenize_B import tokenize_BERT

# import + preprocess the data
def preprocessing(df):
    """Add a one-hot ``Class`` column built from the integer ``label`` column.

    Label meaning:
        0: comment is not hateful
        1: comment is hateful (target_race)
        2: comment is hateful (target_religion)
        3: comment is hateful (target_origin)
        4: comment is hateful (target_gender)
        5: comment is hateful (target_sexuality)
        6: comment is hateful (target_age)
        7: comment is hateful (target_disability)

    The column is written onto ``df`` itself, and the same object is returned.
    """
    def _encode(label_value):
        # Length-8 vector with a single 1 at the label's index.
        row = [0] * 8
        row[int(label_value)] = 1
        return row

    df['Class'] = [_encode(df['label'].iloc[position]) for position in range(len(df))]
    return df

def create_class(i):
    """
    Return a list of length 8 that is all zeros except for a 1 at index i.
    """
    one_hot = [0 for _ in range(8)]
    one_hot[i] = 1
    return one_hot

def get_class(output):
    """Turn each score vector in ``output`` into a length-8 one-hot list.

    For every element of ``output`` the position of its maximum (via
    ``np.argmax``) is set to ``1.0``; every other entry is ``0``.
    """
    def _to_one_hot(scores):
        row = [0] * 8
        row[np.argmax(scores)] = 1.0
        return row

    return [_to_one_hot(scores) for scores in output]

# Custom Dataset tailored to our needs: tokenized inputs plus their labels
class HateSpeechData(Dataset):
    """Dataset pairing tokenizer output ``X`` with per-example labels ``y``.

    ``X`` is a mapping from field name to a per-example sequence (it must
    contain ``"input_ids"``, which defines the dataset length); ``y`` is
    indexable by example position.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X["input_ids"])

    def __getitem__(self, index):
        # One tensor per tokenizer field, plus the label vector as floats.
        sample = {}
        for field, column in self.X.items():
            sample[field] = torch.tensor(column[index])
        sample["labels"] = torch.tensor(self.y[index], dtype=torch.float)
        return sample
    

# Dataloader
def data_loader(df,batch_size, max_lenght=None, shuffle=True):
    """Tokenize ``df['text']`` and wrap it, with ``df['Class']``, in a DataLoader.

    Args:
        df: DataFrame with a ``text`` column and a ``Class`` column.
        batch_size: batch size of the returned loader.
        max_lenght: if given, only the first ``max_lenght`` rows are used.
        shuffle: whether the loader reshuffles every epoch. Defaults to
            ``True`` (the previous behaviour); pass ``False`` for validation
            and test data so the evaluation order is fixed.

    Returns:
        A ``DataLoader`` over a ``HateSpeechData`` dataset.
    """
    # Slicing with `[:None]` returns the whole column, so one code path
    # covers both the truncated and the full case.
    X = list(df['text'][:max_lenght])
    y = list(df['Class'][:max_lenght])

    # initialize the tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Tokenize the comment text: the result exposes input_ids, token_type_ids
    # and attention_mask, padded to the longest sequence and truncated to 512
    # tokens.
    X_tokenize = tokenizer(X, padding=True, truncation=True, max_length=512)

    dataset = HateSpeechData(X_tokenize, y)

    dataloader_loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=0)

    return dataloader_loader
    

class BERTForFineTuningtWithPooling(torch.nn.Module):
    """BERT encoder with mean pooling, dropout and a linear 8-way head."""

    def __init__(self):
        super().__init__()
        # first layer is the bert encoder (all hidden states are returned)
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
        # apply a dropout to the pooled representation
        self.l2 = torch.nn.Dropout(0.3)
        # BERT-base hidden size is 768; the head predicts the 8 classes
        self.l3 = torch.nn.Linear(768, 8)

    def forward(self, ids, mask, token_type_ids=None):
        """Run the encoder, mean-pool over the sequence and classify.

        Args:
            ids: token-id tensor passed to BERT as ``input_ids``.
            mask: attention mask passed to BERT as ``attention_mask``.
            token_type_ids: optional segment ids forwarded to BERT. It is
                accepted because the training and validation loops in this
                notebook pass it as a third positional argument; without the
                parameter those calls raise ``TypeError``.

        Returns:
            ``(hidden_states, logits)`` where ``hidden_states`` is the tuple of
            per-layer hidden states returned by BERT and ``logits`` has one row
            of 8 class scores per input sequence.
        """
        outputs = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # NOTE(review): this is an unmasked mean over dim=1, so padded
        # positions contribute to the pooled vector. Left unchanged so that
        # existing checkpoints keep producing the same outputs.
        pooled_output = torch.mean(outputs.last_hidden_state, dim=1)
        output_2 = self.l2(pooled_output)
        output = self.l3(output_2)
        return outputs.hidden_states, output
    
# train_data, val_data, test_data = tokenize_BERT()

# # import the data
df_train = pd.read_csv('../data/train_data.csv')
all_labels = df_train['label']
# Per-class weight = 1 / sqrt(number of training rows with that label), used
# as `pos_weight` by `loss_fn`. The tensor is placed on the GPU when one is
# available and on the CPU otherwise, matching the device selection used by
# the training and validation functions (a hard-coded 'cuda' fails on
# CPU-only machines).
# NOTE(review): torch.unique only reports labels that actually occur, so the
# vector has 8 entries only if every class appears in the training data.
_label_counts = torch.unique(torch.tensor(all_labels.to_numpy()), return_counts=True)[1]
WEIGHTS = (1 / torch.sqrt(_label_counts.float())).to('cuda' if torch.cuda.is_available() else 'cpu')
# df_train = preprocessing(df_train)

# df_test = pd.read_csv('../data/test_data.csv')
# df_test = preprocessing(df_test)

# df_valid = pd.read_csv('../data/val_data.csv')
# df_valid = preprocessing(df_valid)

# trainloader= data_loader(df_train, batch_size=4)
# validloader= data_loader(df_valid, batch_size=4)
# testloader= data_loader(df_test, batch_size=4)


def loss_fn(outputs, targets):
    """Binary cross-entropy with logits, weighted per class by ``WEIGHTS``.

    ``outputs`` are raw logits and ``targets`` the label vectors; the
    module-level ``WEIGHTS`` tensor is used as ``pos_weight``.
    """
    criterion = torch.nn.BCEWithLogitsLoss(pos_weight=WEIGHTS)
    return criterion(outputs, targets)

def validation(validation_loader, model_name):
    """Load a saved checkpoint and run it over ``validation_loader``.

    Args:
        validation_loader: iterable of batches with ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``labels`` entries.
        model_name: path of a state dict saved with ``torch.save`` for
            ``BERTForFineTuningtWithPooling``.

    Returns:
        ``(fin_outputs, fin_targets)``: per-example sigmoid probabilities and
        the matching label vectors, both as nested Python lists.

    NOTE(review): a later cell redefines ``validation`` with a different
    signature (it takes a model instance and also returns the mean loss).
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = BERTForFineTuningtWithPooling()
    # map_location lets a checkpoint that was saved from a GPU run be loaded
    # on a CPU-only machine instead of failing during deserialization.
    model.load_state_dict(torch.load(model_name, map_location=device))
    model.to(device)
    model.eval()

    fin_targets=[]
    fin_outputs=[]

    with torch.no_grad():
        for data in validation_loader:
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['labels'].to(device, dtype = torch.float)
            _,output = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            # sigmoid turns the logits into per-class probabilities
            fin_outputs.extend(torch.sigmoid(output).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

def training_model(nb_epochs, train_dataloader, val_dataloader, patience):
    """
    Fine-tune ``BERTForFineTuningtWithPooling`` with early stopping.

    After every epoch the model is evaluated on ``val_dataloader``; the
    weights with the lowest validation loss are written to
    ``'Fine_Tuned_Bert.pt'`` once training ends. Training stops early after
    ``patience`` consecutive epochs without a new best validation loss.

    Args:
        nb_epochs: maximum number of epochs.
        train_dataloader: batches with ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels`` entries.
        val_dataloader: validation batches in the same format.
        patience: number of non-improving epochs tolerated before stopping.

    NOTE(review): ``validation`` is called with ``model=`` and is expected to
    return three values, which matches the definition in the later cell of
    this notebook, not the ``validation(validation_loader, model_name)``
    defined just above.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = BERTForFineTuningtWithPooling()
    model.to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)
    best_val_loss = np.inf

    def snapshot_weights():
        # state_dict() hands back tensors that share storage with the live
        # parameters, so keeping the dict itself would let later optimizer
        # steps overwrite the "best" weights. Copy each tensor (to CPU, to
        # avoid holding a second model on the GPU).
        return {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}

    # Both are initialised up front so they are never unbound, e.g. when
    # nb_epochs is 0 or the first validation loss is NaN.
    pat = 0
    dict_model = snapshot_weights()

    for epoch in range(nb_epochs):

        model.train()
        running_loss = 0.0

        for i, data in enumerate(train_dataloader, 0):

            ids = data['input_ids'].to(device, dtype = torch.long)
            attention_mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            labels = data['labels'].to(device, dtype = torch.float)

            # reset the gradients accumulated by the previous step
            optimizer.zero_grad()
            # forward inputs (call the module, not .forward, so hooks run)
            _, output = model(ids, attention_mask, token_type_ids)
            # define the loss
            loss = loss_fn(output, labels)
            # backpropagate
            loss.backward()
            optimizer.step()
            # add the loss to the running loss
            batch_loss = loss.item()
            running_loss += batch_loss

            print('\rEpoch: {}\tbatch: {}\tLoss =  {:.3f}'.format(epoch, i, batch_loss), end="")

        # mean training loss per batch (guarded against an empty loader)
        running_loss = running_loss / max(len(train_dataloader), 1)

        # validation
        model.eval()
        with torch.no_grad():
            outputs, targets, val_loss = validation(validation_loader=val_dataloader, model= model)
            outputs = get_class(outputs)
            outputs = np.array(outputs)
            val_accuracy = accuracy_score(targets, outputs)
            val_f1_score_micro = f1_score(targets, outputs, average='micro')
            val_f1_score_macro = f1_score(targets, outputs, average='macro')
            print(f"Epoch {epoch+1}: train CE loss = {running_loss}", 
                  f"|| Valid: CE loss = {val_loss}   acc = {val_accuracy}   macro-F1 = {val_f1_score_macro}    micro-F1 = {val_f1_score_micro}")

        # early-stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            dict_model = snapshot_weights()
            pat = 0
        else:
            pat += 1
            print("pat ", pat)
            if pat >= patience:
                print("Early Stopping: Validation Loss did not decrease for", patience, "epochs.")
                break

        print("\n")
    torch.save(dict_model, 'Fine_Tuned_Bert.pt')


In [33]:
# tokenize_BERT is a project-local helper; its three return values are
# unpacked here as the train / validation / test data.
# NOTE(review): this import repeats the one already in the imports cell.
from tokenize_B import tokenize_BERT
train_data, val_data, test_data = tokenize_BERT()

In [34]:
# Concatenate element 3 of each split (preprocessing_1 treats index 3 as the
# labels). NOTE(review): `new` is not used by any later cell in this notebook.
new = torch.cat((train_data[3], val_data[3], test_data[3]))

5918

In [30]:
def preprocessing_1(tuple):
    """Return a copy of the 4-tuple with element 3 replaced by one-hot labels.

    Element 3 holds integer class labels, with this meaning:
        0: comment is not hateful
        1: comment is hateful (target_race)
        2: comment is hateful (target_religion)
        3: comment is hateful (target_origin)
        4: comment is hateful (target_gender)
        5: comment is hateful (target_sexuality)
        6: comment is hateful (target_age)
        7: comment is hateful (target_disability)

    Elements 0-2 are passed through untouched; the new element 3 is a tensor
    with one length-8 one-hot row per label.
    """
    labels = tuple[3]

    def _encode(position):
        # Length-8 vector with a single 1 at the label's index.
        row = [0] * 8
        row[int(labels[position])] = 1
        return row

    encoded = torch.tensor([_encode(position) for position in range(len(labels))])
    return (tuple[0], tuple[1], tuple[2], encoded)

In [31]:
# One-hot encode the labels (element 3) of the training tuple.
traindata = preprocessing_1(train_data)

tensor([[1, 0, 0,  ..., 0, 0, 0],
        [0, 1, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 1, 0,  ..., 0, 0, 0],
        [0, 1, 0,  ..., 0, 0, 0]])

In [2]:
def validation(validation_loader, model):
    """Evaluate ``model`` on every batch of ``validation_loader``.

    The model is moved to the GPU when one is available (CPU otherwise) and is
    run without gradient tracking. The function does not change the model's
    train/eval mode.

    Returns:
        ``(outputs, targets, mean_loss)``: per-example sigmoid probabilities,
        the matching label vectors (both nested Python lists), and the
        ``loss_fn`` value averaged over the number of batches.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    def _as_rows(tensor):
        # Tensor -> nested Python list on the host.
        return tensor.cpu().detach().numpy().tolist()

    collected_outputs, collected_targets = [], []
    loss_total = 0.0

    with torch.no_grad():
        for batch in validation_loader:
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['labels'].to(device, dtype=torch.float)

            _, logits = model(ids, mask, token_type_ids)
            batch_loss = loss_fn(logits, targets)

            collected_targets.extend(_as_rows(targets))
            collected_outputs.extend(_as_rows(torch.sigmoid(logits)))
            # accumulate the per-batch loss
            loss_total += batch_loss.item()

    mean_loss = loss_total / len(validation_loader)
    return collected_outputs, collected_targets, mean_loss

def training_model(nb_epochs, train_dataloader, val_dataloader, patience):
    """
    Fine-tune ``BERTForFineTuningtWithPooling`` with early stopping.

    After every epoch the model is evaluated on ``val_dataloader``; the
    weights with the lowest validation loss are written to
    ``'Fine_Tuned_Bert.pt'`` once training ends. Training stops early after
    ``patience`` consecutive epochs without a new best validation loss.

    Args:
        nb_epochs: maximum number of epochs.
        train_dataloader: batches with ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels`` entries.
        val_dataloader: validation batches in the same format.
        patience: number of non-improving epochs tolerated before stopping.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = BERTForFineTuningtWithPooling()
    model.to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)
    best_val_loss = np.inf

    def snapshot_weights():
        # state_dict() hands back tensors that share storage with the live
        # parameters, so keeping the dict itself would let later optimizer
        # steps overwrite the "best" weights. Copy each tensor (to CPU, to
        # avoid holding a second model on the GPU).
        return {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}

    # Both are initialised up front so they are never unbound, e.g. when
    # nb_epochs is 0 or the first validation loss is NaN.
    pat = 0
    dict_model = snapshot_weights()

    for epoch in range(nb_epochs):

        model.train()
        running_loss = 0.0

        for i, data in enumerate(train_dataloader, 0):

            ids = data['input_ids'].to(device, dtype = torch.long)
            attention_mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            labels = data['labels'].to(device, dtype = torch.float)

            # reset the gradients accumulated by the previous step
            optimizer.zero_grad()
            # forward inputs (call the module, not .forward, so hooks run)
            _, output = model(ids, attention_mask, token_type_ids)
            # define the loss
            loss = loss_fn(output, labels)
            # backpropagate
            loss.backward()
            optimizer.step()
            # add the loss to the running loss
            batch_loss = loss.item()
            running_loss += batch_loss

            print('\rEpoch: {}\tbatch: {}\tLoss =  {:.3f}'.format(epoch, i, batch_loss), end="")

        # mean training loss per batch (guarded against an empty loader)
        running_loss = running_loss / max(len(train_dataloader), 1)

        print("\n")
        # validation
        model.eval()
        with torch.no_grad():
            outputs, targets, val_loss = validation(validation_loader=val_dataloader, model= model)
            outputs = get_class(outputs)
            outputs = np.array(outputs)
            val_accuracy = accuracy_score(targets, outputs)
            val_f1_score_micro = f1_score(targets, outputs, average='micro')
            val_f1_score_macro = f1_score(targets, outputs, average='macro')
            print(f"Epoch {epoch+1}: train CE loss = {running_loss}", 
                  f"|| Valid: CE loss = {val_loss}   acc = {val_accuracy}   macro-F1 = {val_f1_score_macro}    micro-F1 = {val_f1_score_micro}")

        # early-stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            dict_model = snapshot_weights()
            pat = 0
        else:
            pat += 1
            print("pat ", pat)
            if pat >= patience:
                print("Early Stopping: Validation Loss did not decrease for", patience, "epochs.")
                break

        print("\n")
    torch.save(dict_model, 'Fine_Tuned_Bert.pt')

In [3]:
# Train for at most 15 epochs with an early-stopping patience of 3.
# NOTE(review): `trainloader` and `validloader` are only created in
# commented-out code earlier in this notebook; they must be defined before
# this cell runs on a fresh kernel.
training_model(15, trainloader, validloader, 3)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: 0	batch: 9549	Loss =  0.007

Epoch 1: train CE loss = 0.013036077102397554 || Valid: CE loss = 0.007797628375654411   acc = 0.6456925675675675   macro-F1 = 0.488740040648217    micro-F1 = 0.6456925675675675


Epoch: 1	batch: 9549	Loss =  0.005

Epoch 2: train CE loss = 0.007602591407631587 || Valid: CE loss = 0.00873977635637857   acc = 0.7432432432432432   macro-F1 = 0.5283525848603585    micro-F1 = 0.7432432432432431
pat  1


Epoch: 2	batch: 9549	Loss =  0.003

Epoch 3: train CE loss = 0.005819182833571338 || Valid: CE loss = 0.014366051187067028   acc = 0.6980574324324325   macro-F1 = 0.5005929273505167    micro-F1 = 0.6980574324324325
pat  2


Epoch: 3	batch: 9549	Loss =  0.003

Epoch 4: train CE loss = 0.004438413083900731 || Valid: CE loss = 0.021176147788193357   acc = 0.7027027027027027   macro-F1 = 0.4870425113251453    micro-F1 = 0.7027027027027027
pat  3
Early Stopping: Validation Loss did not decrease for 3 epochs.
