In [None]:
#https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb

In [33]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DebertaV2Tokenizer
from transformers import DebertaV2Model
import tqdm

In [34]:
#Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
df = pd.read_csv("", encoding='latin-1')
df = pd.DataFrame(df)
df = df.iloc[: , 1:]

np.random.seed(1337)

#Creates the dataframe
df['list'] = df[df.columns[2:]].values.tolist()
new_df = df[['CASEID', 'comment_text', 'list']].copy()

#Applies float to list
new_df['list'] = new_df['list'].apply(lambda x: [float(i) for i in x])
new_df.head(10)

In [36]:
#Defining some key variables that will be used later on in the training
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8 #8
VALID_BATCH_SIZE = 4 #4
EPOCHS = 15
LEARNING_RATE = 1e-05

#Defining Tokenizer

tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-base")

In [37]:
#Defining CustomDataset class
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        self.targets = self.data.list
        self.CASEID = self.data.CASEID
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'caseid': self.CASEID[index],
            'text': comment_text,
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [38]:
#Creating the dataset and dataloader for the neural network

train_size = 0.8
train_dataset=new_df.sample(frac=train_size,random_state=200)
test_dataset=new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (13987, 3)
TRAIN Dataset: (11190, 3)
TEST Dataset: (2797, 3)


In [39]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [58]:
#Creating the customized model by adding dropout.

from torch.nn import functional as F
import torch.nn as nn

# class BERTClass(torch.nn.Module):
#     def __init__(self):
#         super(BERTClass, self).__init__()
#         self.l1 = transformers.RobertaModel.from_pretrained('roberta-base')
#         self.l2 = torch.nn.Dropout(0.2)
#         self.l3 = torch.nn.Linear(768, 13)  # Num of labels is 13
    
#     def forward(self, ids, mask, token_type_ids):
#         _, output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
#         output_2 = self.l2(output_1)
#         output = self.l3(output_2)
#         return output

class DEBERTAClass(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.l1 = DebertaV2Model.from_pretrained('microsoft/deberta-v3-base')
        self.dropout = nn.Dropout(0.3)
        self.layer_norm = nn.LayerNorm(self.l1.config.hidden_size)
        self.l2 = nn.Linear(self.l1.config.hidden_size, num_classes)

    def forward(self, ids, mask, token_type_ids):
        outputs = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        last_hidden_state = outputs[0]  # Get the last hidden state
        
        # Pooling: Use the [CLS] token representation (first token)
        pooled_output = last_hidden_state[:, 0, :]
        
        output_2 = self.dropout(pooled_output)
        output_3 = self.layer_norm(output_2)
        output = self.l2(output_3)
        return output    
    
    
model = DEBERTAClass(13)
model.to(device)

DEBERTAClass(
  (l1): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
              (dropo

In [59]:
#Loss function
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets) #This is binary cross entropy in keras I believe

In [60]:
#Optimizer
optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)

In [61]:
#Defined training function
def train(epoch):
    model.train()
    for batch in tqdm.tqdm(training_loader, desc=f"Epoch {epoch}"):
        # Unpack the batch
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)
        
        outputs = model(ids, mask, token_type_ids)
        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        
        loss.backward()
        optimizer.step()

    print(f'Epoch: {epoch}, Loss: {loss.item()}')

In [62]:
#Training Function
for epoch in range(EPOCHS):
    train(epoch)

Epoch 0: 100%|██████████| 1399/1399 [03:04<00:00,  7.58it/s]


Epoch: 0, Loss: 0.10601064562797546


Epoch 1: 100%|██████████| 1399/1399 [03:05<00:00,  7.56it/s]


Epoch: 1, Loss: 0.04568728432059288


Epoch 2: 100%|██████████| 1399/1399 [03:04<00:00,  7.60it/s]


Epoch: 2, Loss: 0.020508555695414543


Epoch 3: 100%|██████████| 1399/1399 [03:04<00:00,  7.60it/s]


Epoch: 3, Loss: 0.0027236351743340492


Epoch 4: 100%|██████████| 1399/1399 [03:04<00:00,  7.60it/s]


Epoch: 4, Loss: 0.0027477932162582874


Epoch 5: 100%|██████████| 1399/1399 [03:03<00:00,  7.60it/s]


Epoch: 5, Loss: 0.06260765343904495


Epoch 6: 100%|██████████| 1399/1399 [03:03<00:00,  7.61it/s]


Epoch: 6, Loss: 0.0037785936146974564


Epoch 7: 100%|██████████| 1399/1399 [03:03<00:00,  7.61it/s]


Epoch: 7, Loss: 0.008677409961819649


Epoch 8: 100%|██████████| 1399/1399 [03:03<00:00,  7.61it/s]


Epoch: 8, Loss: 0.008691200986504555


Epoch 9: 100%|██████████| 1399/1399 [03:04<00:00,  7.60it/s]


Epoch: 9, Loss: 0.05751186981797218


Epoch 10: 100%|██████████| 1399/1399 [03:03<00:00,  7.61it/s]


Epoch: 10, Loss: 0.006522975862026215


Epoch 11: 100%|██████████| 1399/1399 [03:03<00:00,  7.61it/s]


Epoch: 11, Loss: 0.0009502205066382885


Epoch 12: 100%|██████████| 1399/1399 [03:04<00:00,  7.60it/s]


Epoch: 12, Loss: 0.0479586236178875


Epoch 13: 100%|██████████| 1399/1399 [03:04<00:00,  7.59it/s]


Epoch: 13, Loss: 0.0043518138118088245


Epoch 14: 100%|██████████| 1399/1399 [03:03<00:00,  7.61it/s]

Epoch: 14, Loss: 0.0012626174138858914





In [63]:
#Validation Function
def validation(epoch):
    model.eval()
    fin_targets = []
    fin_outputs = []
    texts = []
    caseid = []
    
    with torch.no_grad():
        for batch in tqdm.tqdm(testing_loader, desc=f"Validation Epoch {epoch}"):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)
            
            outputs = model(ids, mask, token_type_ids)
            
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            texts.extend(batch['text'])  # Extracting the text column
            caseid.extend(batch['caseid'])
    
    return fin_outputs, fin_targets, texts, caseid

In [69]:
#Metrics function
for epoch in range(1):
    outputs, targets, texts, caseid = validation(epoch)
    outputs = np.array(outputs) >= 0.65                                      #This can be tuned
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    #print("First three items of texts:", texts[:3])
    #print("First three items of fin_outputs:", outputs[:3])
    #print("First three items of fin_targets:", targets[:3])
    
    #Create a dictionary with column names as keys and lists as values
    final_df = {
        'Outputs': outputs,
        'Targets': targets,
        'Texts': texts,
        'Caseid': caseid
    }

Validation Epoch 0: 100%|██████████| 700/700 [00:14<00:00, 48.65it/s]

Accuracy Score = 0.8902395423668216
F1 Score (Micro) = 0.9174342656171497
F1 Score (Macro) = 0.7432268256014265





In [65]:
#Creating final output
final_case = final_df['Caseid']
final_case = pd.DataFrame(final_case, columns = ["Caseid"])

final_text = final_df['Texts']
final_text = pd.DataFrame(final_text, columns = ["Text"])

final_output = final_df['Outputs']
final_output = pd.DataFrame(final_output, columns = ["Freedom and Rights", "E Plurabus Unum", "Representation of the People", "Popular Will and Equality", "National Identity and Heritage", "Not a Democracy a Republic", 
                "Flawed Democracy", "Institution and Constitution", "Unclassified/Other", "Don't Know", "Nothing/Disaffected", "Nothing More to Add", "NA"])
final_output.replace({True: 1, False: 0}, inplace=True)

final_df = final_text.join(final_output)
final_df = final_case.join(final_df)

In [None]:
final_df.head(10)

In [67]:
from sklearn.metrics import multilabel_confusion_matrix

true_np = targets
pred_np = outputs

#Calculate confusion matrix
conf_matrix = multilabel_confusion_matrix(true_np, pred_np)

#Define label titles
label_titles = ["Freedom and Rights", "E Plurabus Unum", "Representation of the People", "Popular Will and Equality", "National Identity and Heritage", "Not a Democracy a Republic", 
                "Flawed Democracy", "Institution and Constitution", "Unclassified/Other", "Don't Know", "Nothing/Disaffected", "Nothing More to Add", "NA"]

#Create a dictionary to store confusion matrices with titles
conf_matrix_dict = {}
for i, title in enumerate(label_titles):
    conf_matrix_dict[title] = conf_matrix[i]

#Print confusion matrices with titles
for title, matrix in conf_matrix_dict.items():
    print(f"Confusion matrix for {title}:")
    print(matrix)

Confusion matrix for Freedom and Rights:
[[2214   37]
 [  28  518]]
Confusion matrix for E Plurabus Unum:
[[2775    7]
 [  10    5]]
Confusion matrix for Representation of the People:
[[2704   26]
 [  20   47]]
Confusion matrix for Popular Will and Equality:
[[2219   47]
 [  27  504]]
Confusion matrix for National Identity and Heritage:
[[2788    4]
 [   4    1]]
Confusion matrix for Not a Democracy a Republic:
[[2761    2]
 [   0   34]]
Confusion matrix for Flawed Democracy:
[[2710   21]
 [  19   47]]
Confusion matrix for Institution and Constitution:
[[2684   21]
 [  32   60]]
Confusion matrix for Unclassified/Other:
[[2594   28]
 [  59  116]]
Confusion matrix for Don't Know:
[[2748    4]
 [   2   43]]
Confusion matrix for Nothing/Disaffected:
[[2720   32]
 [  10   35]]
Confusion matrix for Nothing More to Add:
[[2592   13]
 [  24  168]]
Confusion matrix for NA:
[[1620   16]
 [   0 1161]]
