In [None]:
import tqdm
import pandas as pd
import torch
import torch.nn as nn
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from torch.optim import AdamW
from transformers import DebertaV2Model
from sklearn.model_selection import train_test_split
import random

random.seed(1337)

device = 'cuda' if cuda.is_available() else 'cpu'

In [2]:
### Preprocessing Data

df = pd.read_csv("roberta_from_scratch/motn_data.csv", encoding = 'latin-1')
df = df.iloc[: , 1:]

# Creates the dataframe
df['list'] = df[df.columns[2:]].values.tolist()
new_df = df[['CASEID', 'comment_text', 'list']].copy()

# Applies float to list
new_df['list'] = new_df['list'].apply(lambda x: [float(i) for i in x])
new_df.head(10)

Unnamed: 0,CASEID,comment_text,list
0,2061638667,__NA__,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2056600635,__NA__,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2058253621,Free nation where citizens elect their represe...,"[1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,2058997303,__NA__,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,2058184341,__NA__,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,2057930711,__NA__,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,2058524165,__NA__,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,2057837907,__NA__,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,2058736151,__NA__,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,2057900787,__NA__,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [3]:
### Define CustomDataset Class

class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len = 512):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        self.targets = self.data.list
        self.CASEID = self.data.CASEID
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens = True,
            max_length = self.max_len,
            padding = 'max_length',
            truncation = True,
            return_token_type_ids = True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'caseid': self.CASEID[index],
            'text': comment_text,
            'ids': torch.tensor(ids, dtype = torch.long),
            'mask': torch.tensor(mask, dtype = torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype = torch.long),
            'targets': torch.tensor(self.targets[index], dtype = torch.float)
        }

In [4]:
### Split into Train and Validation

# Split the dataset into train and test
train_val_size = 0.8
train_dataset, val_dataset = train_test_split(new_df, test_size = 1 - train_val_size, random_state = 1337, shuffle = True)

train_dataset = train_dataset.reset_index(drop = True)
val_dataset = val_dataset.reset_index(drop = True)

print("Full Dataset: {}".format(new_df.shape))
print("Train Dataset: {}".format(train_dataset.shape))
print("Validation Dataset: {}".format(val_dataset.shape))

# Defining Tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base") # sentencepiece won't install, so not using deberta tokenizer

# Create the datasets
batch_size = 8
train_set = CustomDataset(train_dataset, tokenizer)
val_set = CustomDataset(val_dataset, tokenizer)

params = {'batch_size': batch_size,
                'shuffle': True,
                'num_workers': 0
                }

train_loader = DataLoader(train_set, **params)
val_loader = DataLoader(val_set, **params)

Full Dataset: (13987, 3)
Train Dataset: (11189, 3)
Validation Dataset: (2798, 3)


In [5]:
### Create Customized Model with Dropout

class DEBERTAClass(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.l1 = DebertaV2Model.from_pretrained('microsoft/deberta-v3-base')
        self.dropout = nn.Dropout(0.2)
        self.l2 = nn.Linear(self.l1.config.hidden_size, num_classes)

    def forward(self, ids, mask, token_type_ids):
        outputs = self.l1(ids, attention_mask=mask, token_type_ids = token_type_ids)
        last_hidden_state = outputs[0]  # Get the last hidden state
        
        # Pooling: Use the [CLS] token representation (first token)
        pooled_output = last_hidden_state[:, 0, :]
        
        output_2 = self.dropout(pooled_output)
        output = self.l2(output_2)
        return output    
    
model = DEBERTAClass(13)
model.to(device)

DEBERTAClass(
  (l1): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, ele

In [6]:
### Defining the Training Function

def train_model(model, train_dataset, val_dataset, num_epochs = 50, batch_size = 16, learning_rate = 1e-5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True, drop_last = True)
    val_dataloader = DataLoader(val_dataset, batch_size = batch_size, drop_last = True)
    
    optimizer = AdamW(model.parameters(), lr = learning_rate)
    
    loss_fn = nn.BCEWithLogitsLoss()  # Combines sigmoid + BCE, for multi-label
    
    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(num_epochs):
        model.train()
        
        total_train_loss = 0

        # Training
        for batch in tqdm.tqdm(train_dataloader, desc = f"Epoch {epoch + 1} - Training"):
            ids = batch['ids'].to(device)
            mask = batch['mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['targets'].to(device)
            
            optimizer.zero_grad()
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()
            
        train_loss = total_train_loss / len(train_dataloader)
        
        model.eval()
        total_val_loss = 0
        total_exact_matches = 0
        total_samples = 0
        total_jaccard = 0
        all_preds = []
        all_labels = []
        
        # Validation
        with torch.no_grad():
            for batch in tqdm.tqdm(val_dataloader, desc = f"Epoch {epoch + 1} - Validation"):
                ids = batch['ids'].to(device)
                mask = batch['mask'].to(device)
                token_type_ids = batch['token_type_ids'].to(device)
                labels = batch['targets'].to(device)
                outputs = model(ids, mask, token_type_ids)
                
                loss = loss_fn(outputs, labels)
                total_val_loss += loss.item()
                preds = torch.sigmoid(outputs) > 0.5
                
                # Flatten tensors before appending to handle batch dimension properly
                all_preds.append(preds.cpu().view(-1, preds.size(-1)))
                all_labels.append(labels.cpu().view(-1, labels.size(-1)))
                
                # 2. Exact accuracy
                total_exact_matches += torch.sum(torch.all(preds == labels, dim=1)).item()
                total_samples += labels.size(0)
                
                # 3. Jaccard Index (intersection over union)
                preds_bool = preds.bool()
                labels_bool = labels.bool()
                intersection = torch.sum(preds_bool & labels_bool, dim=1).float()
                union = torch.sum(preds_bool | labels_bool, dim=1).float()
                batch_jaccard = torch.mean(intersection / (union + 1e-8)).item()
                total_jaccard += batch_jaccard
        
        # Calculate metrics val_loss, exact_accuracy, jaccard_accuracy
        val_loss = total_val_loss / len(val_dataloader)
        exact_accuracy = total_exact_matches / total_samples
        jaccard_accuracy = total_jaccard / len(val_dataloader)
        
        # Calculate micro F1
        all_preds = torch.cat(all_preds, dim=0)
        all_labels = torch.cat(all_labels, dim=0)
        all_preds_bool = all_preds.bool()
        all_labels_bool = all_labels.bool()
        
        micro_tp = torch.sum(all_preds_bool & all_labels_bool).float()
        micro_fp = torch.sum(all_preds_bool & ~all_labels_bool).float()
        micro_fn = torch.sum(~all_preds_bool & all_labels_bool).float()
        
        micro_precision = micro_tp / (micro_tp + micro_fp + 1e-8)
        micro_recall = micro_tp / (micro_tp + micro_fn + 1e-8)
        micro_f1 = 2 * micro_precision * micro_recall / (micro_precision + micro_recall + 1e-8)
        
        print(
            f"Epoch {epoch + 1} | train_loss: {train_loss:.4f} | val_loss: {val_loss:.4f} | "
            f"micro_f1: {micro_f1:.4f} | exact_acc: {exact_accuracy:.4f} | jaccard: {jaccard_accuracy:.4f}"
        )

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), "deberta_model.pth")
        else:
            patience_counter += 1
            if patience_counter >= 3: # Stop if validation loss does not improve for 3 epochs
                print(f"Early stopping at epoch {epoch + 1}")
                break
    
    return model

In [None]:
### Train

batch_size = 8
epochs = 10
learning_rate = 1e-05

trained_model = train_model(
    model = model,
    train_dataset = train_set,
    val_dataset = val_set,
    num_epochs = epochs,
    batch_size = batch_size,
    learning_rate = learning_rate
)

Epoch 1 - Training: 100%|██████████| 1398/1398 [06:50<00:00,  3.41it/s]
Epoch 1 - Validation: 100%|██████████| 349/349 [00:42<00:00,  8.19it/s]


Epoch 1 | train_loss: 0.1489 | val_loss: 0.1124 | micro_f1: 0.7359 | exact_acc: 0.6938 | jaccard: 0.7199


Epoch 2 - Training:   1%|▏         | 20/1398 [00:08<09:32,  2.41it/s]


KeyboardInterrupt: 