# Fine Tuning Transformer for MultiLabel Text Classification

In [58]:
# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import RobertaTokenizer, RobertaForMaskedLM
import torch.nn as nn
from transformers import RobertaForSequenceClassification

In [2]:
# # Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
cols_to_check = ['processed_text','toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [4]:
train_df = pd.read_csv("/kaggle/input/toxic-comment-classification-labeled/train_toxic_dataset.csv")
train_df = train_df.dropna(subset=cols_to_check, how='any')
train_df['list'] = [[int(float(val)) for val in sublist] for sublist in train_df[train_df.columns[-6:]].values.tolist()]

test_df = pd.read_csv("/kaggle/input/toxic-comment-classification-labeled/test_toxic_dataset.csv")
test_df = test_df.dropna(subset=cols_to_check, how='any')
test_df['list'] = [[int(float(val)) for val in sublist] for sublist in test_df[test_df.columns[-6:]].values.tolist()]

In [5]:
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [6]:
train_dataset = train_df[['processed_text', 'list']].copy()
test_dataset = test_df[['processed_text', 'list']].copy()

In [7]:
train_dataset.head()

Unnamed: 0,processed_text,list
0,feature vc tim guleris views generative ai ive...,"[0, 0, 0, 0, 0, 0]"
1,connectingthedots open ai founder sam altman s...,"[0, 0, 0, 0, 0, 0]"
2,fab data thrilled see worldbank highlighting p...,"[0, 0, 0, 0, 0, 0]"
3,gpt4 week already changing lives ape use gpt4 ...,"[0, 0, 1, 0, 0, 0]"
4,rising popularity chatgpt organizations may fi...,"[0, 0, 0, 0, 0, 0]"


In [8]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [9]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.processed_text
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [10]:
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

In [11]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [12]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=6).to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




In [13]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [14]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [15]:
def train(epoch):
    model.train()
    for i,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)
        logits = outputs[0]

        optimizer.zero_grad()
        loss = loss_fn(logits, targets)
        if i%500==0:
            print(f'Epoch: {epoch}, Step: {i}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [16]:
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Step: 0, Loss:  0.6875798106193542
Epoch: 0, Step: 500, Loss:  0.24165257811546326
Epoch: 0, Step: 1000, Loss:  0.03252239525318146


In [17]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            outputs = outputs[0]
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [18]:
from sklearn.metrics import precision_score, recall_score

outputs, targets = validation(epoch)
outputs = np.array(outputs) >= 0.5

In [19]:
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
precision = precision_score(targets, outputs, average='micro')
recall = recall_score(targets, outputs, average='micro')
print(f"Accuracy Score = {accuracy}")
print(f"Precision Score (Macro) = {precision}")
print(f"Recall (Macro) = {recall}")
print(f"F1 Score (Micro) = {f1_score_micro}")

Accuracy Score = 0.8911968348170128
Precision Score (Macro) = 0.8637873754152824
Recall (Macro) = 0.7323943661971831
F1 Score (Micro) = 0.7926829268292683


In [20]:
from sklearn.metrics import classification_report

report = classification_report(targets, outputs, target_names=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])
print(report)

               precision    recall  f1-score   support

        toxic       0.94      0.67      0.78       221
 severe_toxic       0.00      0.00      0.00        22
      obscene       0.75      0.85      0.80       230
       threat       0.00      0.00      0.00        25
       insult       0.97      0.86      0.91       177
identity_hate       0.96      0.71      0.82        35

    micro avg       0.86      0.73      0.79       710
    macro avg       0.60      0.52      0.55       710
 weighted avg       0.82      0.73      0.77       710
  samples avg       0.18      0.18      0.18       710



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
from sklearn.metrics import multilabel_confusion_matrix
confusion_matrix = multilabel_confusion_matrix(targets, outputs)
class_accuracies = np.sum(confusion_matrix[:, 1:, 1]) / np.sum(confusion_matrix, axis=(0, 2))
print(class_accuracies)

[0.04552618 0.73239437]


In [41]:
from sklearn.metrics import multilabel_confusion_matrix


# Get the confusion matrix
conf_matrix = multilabel_confusion_matrix(targets, outputs)

In [56]:
from sklearn.metrics import accuracy_score


# Calculate accuracy for each label
label_accuracies = []
for i in range(len(targets[0])):
    y_true_label = [sample[i] for sample in targets]
    y_pred_label = [sample[i] for sample in outputs]
    label_accuracy = accuracy_score(y_true_label, y_pred_label)
    label_accuracies.append(label_accuracy)

print("Accuracy for each label:")
for label, accuracy in enumerate(label_accuracies):
    print(f"Label {label}: {accuracy}")


Accuracy for each label:
Label 0: 0.9584569732937686
Label 1: 0.9891196834817013
Label 2: 0.9500494559841741
Label 3: 0.9876360039564788
Label 4: 0.9856577645895154
Label 5: 0.9945598417408507


In [59]:
accuracy = metrics.accuracy_score(targets, outputs)
accuracy

0.8911968348170128