In [2]:
import tqdm
import pandas as pd
import torch
import torch.nn as nn
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from torch.optim import AdamW
from transformers import RobertaModel
from sklearn.model_selection import train_test_split
import random

random.seed(1337)

device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
### Preprocessing Data

df = pd.read_csv("roberta_from_scratch/motn_data.csv", encoding = 'latin-1')
df = df.iloc[: , 1:]

# Remove duplicates (excluding CASEID)
columns_to_check = ['comment_text'] + [col for col in df.columns if col not in ['CASEID', 'comment_text']]
df = df.drop_duplicates(subset = columns_to_check)

df['list'] = df[df.columns[2:]].values.tolist()
new_df = df[['CASEID', 'comment_text', 'list']].copy()

new_df['list'] = new_df['list'].apply(lambda x: [float(i) for i in x])

In [4]:
### Define CustomDataset Class

class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len = 512):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        self.targets = self.data.list
        self.CASEID = self.data.CASEID
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens = True,
            max_length = self.max_len,
            padding = 'max_length',
            truncation = True,
            return_token_type_ids = True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'caseid': self.CASEID[index],
            'text': comment_text,
            'ids': torch.tensor(ids, dtype = torch.long),
            'mask': torch.tensor(mask, dtype = torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype = torch.long),
            'targets': torch.tensor(self.targets[index], dtype = torch.float)
        }

In [5]:
### Split into Train and Validation

# Split the dataset into train and test
train_val_size = 0.8
train_dataset, val_dataset = train_test_split(new_df, test_size = 1 - train_val_size, random_state = 1337, shuffle = True)

train_dataset = train_dataset.reset_index(drop = True)
val_dataset = val_dataset.reset_index(drop = True)

print("Full Dataset: {}".format(new_df.shape))
print("Train Dataset: {}".format(train_dataset.shape))
print("Validation Dataset: {}".format(val_dataset.shape))

# Defining Tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base") # sentencepiece won't install, so not using deberta tokenizer

# Create the datasets
batch_size = 8
train_set = CustomDataset(train_dataset, tokenizer)
val_set = CustomDataset(val_dataset, tokenizer)

params = {'batch_size': batch_size,
                'shuffle': True,
                'num_workers': 0
                }

train_loader = DataLoader(train_set, **params)
val_loader = DataLoader(val_set, **params)

Full Dataset: (5327, 3)
Train Dataset: (4261, 3)
Validation Dataset: (1066, 3)


In [None]:
### Create Customized Model

class RoBERTaClass(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, num_classes)
        
    def forward(self, ids, mask, token_type_ids = None):
        outputs = self.roberta(ids, attention_mask = mask)
        pooled_output = outputs.pooler_output # Use the pooled output (CLS token representation)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [None]:
### Defining the Training Function

def train_model(model, train_dataset, val_dataset, num_epochs = 10, batch_size = 8, learning_rate = 1e-5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True, drop_last = True)
    val_dataloader = DataLoader(val_dataset, batch_size = batch_size, drop_last = True)
    
    optimizer = AdamW(model.parameters(), lr = learning_rate)
    
    loss_fn = nn.BCEWithLogitsLoss()  # Combines sigmoid + BCE, for multi-label

    # Learning rate scheduler 
    lr_scheduler = torch.optim.lr_scheduler.StepLR( 
        optimizer,
        step_size = 4, # Every 4 epochs, learning rate is reduced
        gamma = 0.3 # Multiplicative factor of learning rate decay  
    )

    for epoch in range(num_epochs):
        model.train()
        
        total_train_loss = 0

        # Training
        for batch in tqdm.tqdm(train_dataloader, desc = f"Epoch {epoch + 1} - Training"):
            ids = batch['ids'].to(device)
            mask = batch['mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['targets'].to(device)
            
            optimizer.zero_grad()
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()
            
        train_loss = total_train_loss / len(train_dataloader)
        
        model.eval()
        total_val_loss = 0
        total_exact_matches = 0
        total_samples = 0
        total_jaccard = 0
        all_preds = []
        all_labels = []
        
        # Validation
        with torch.no_grad():
            for batch in tqdm.tqdm(val_dataloader, desc = f"Epoch {epoch + 1} - Validation"):
                ids = batch['ids'].to(device)
                mask = batch['mask'].to(device)
                token_type_ids = batch['token_type_ids'].to(device)
                labels = batch['targets'].to(device)
                outputs = model(ids, mask, token_type_ids)
                
                loss = loss_fn(outputs, labels)
                total_val_loss += loss.item()
                preds = torch.sigmoid(outputs) > 0.5
                
                all_preds.append(preds.cpu().view(-1, preds.size(-1)))
                all_labels.append(labels.cpu().view(-1, labels.size(-1)))
                
                # 1. Exact accuracy
                total_exact_matches += torch.sum(torch.all(preds == labels, dim=1)).item()
                total_samples += labels.size(0)
                
                # 2. Jaccard Index (intersection over union)
                preds_bool = preds.bool()
                labels_bool = labels.bool()
                intersection = torch.sum(preds_bool & labels_bool, dim=1).float()
                union = torch.sum(preds_bool | labels_bool, dim=1).float()
                batch_jaccard = torch.mean(intersection / (union + 1e-8)).item()
                total_jaccard += batch_jaccard
        
        # Calculate metrics val_loss, exact_accuracy, jaccard_accuracy
        val_loss = total_val_loss / len(val_dataloader)
        exact_accuracy = total_exact_matches / total_samples
        jaccard_accuracy = total_jaccard / len(val_dataloader)
        
        # 3. Calculate micro F1
        all_preds = torch.cat(all_preds, dim=0)
        all_labels = torch.cat(all_labels, dim=0)
        all_preds_bool = all_preds.bool()
        all_labels_bool = all_labels.bool()
        
        micro_tp = torch.sum(all_preds_bool & all_labels_bool).float()
        micro_fp = torch.sum(all_preds_bool & ~all_labels_bool).float()
        micro_fn = torch.sum(~all_preds_bool & all_labels_bool).float()
        
        micro_precision = micro_tp / (micro_tp + micro_fp + 1e-8)
        micro_recall = micro_tp / (micro_tp + micro_fn + 1e-8)
        micro_f1 = 2 * micro_precision * micro_recall / (micro_precision + micro_recall + 1e-8)
        
        # Cache learning rate for logging
        current_lr = optimizer.param_groups[0]['lr']
        
        print(
            f"Epoch {epoch + 1} | train_loss: {train_loss:.4f} | val_loss: {val_loss:.4f} | "
            f"micro_f1: {micro_f1:.4f} | exact_acc: {exact_accuracy:.4f} | jaccard: {jaccard_accuracy:.4f} | "
            f"lr: {current_lr:.2e}"
        )
        
        lr_scheduler.step()
    
    return model

In [11]:
### Train

batch_size = 8
epochs = 10
learning_rate = 1e-05

model = RoBERTaClass(13) # Number of classes

trained_model = train_model(
    model = model,
    train_dataset = train_set,
    val_dataset = val_set,
    num_epochs = epochs,
    batch_size = batch_size,
    learning_rate = learning_rate
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 - Training: 100%|██████████| 532/532 [01:44<00:00,  5.08it/s]
Epoch 1 - Validation: 100%|██████████| 133/133 [00:07<00:00, 16.64it/s]


Epoch 1 | train_loss: 0.2172 | val_loss: 0.1485 | micro_f1: 0.6748 | exact_acc: 0.5197 | jaccard: 0.5710 | lr: 1.00e-05


Epoch 2 - Training: 100%|██████████| 532/532 [01:38<00:00,  5.42it/s]
Epoch 2 - Validation: 100%|██████████| 133/133 [00:07<00:00, 17.31it/s]


Epoch 2 | train_loss: 0.1288 | val_loss: 0.1145 | micro_f1: 0.7542 | exact_acc: 0.6118 | jaccard: 0.6659 | lr: 1.00e-05


Epoch 3 - Training: 100%|██████████| 532/532 [01:32<00:00,  5.77it/s]
Epoch 3 - Validation: 100%|██████████| 133/133 [00:06<00:00, 19.54it/s]


Epoch 3 | train_loss: 0.0958 | val_loss: 0.1057 | micro_f1: 0.7866 | exact_acc: 0.6814 | jaccard: 0.7331 | lr: 1.00e-05


Epoch 4 - Training: 100%|██████████| 532/532 [01:28<00:00,  6.03it/s]
Epoch 4 - Validation: 100%|██████████| 133/133 [00:06<00:00, 19.75it/s]


Epoch 4 | train_loss: 0.0741 | val_loss: 0.0991 | micro_f1: 0.7955 | exact_acc: 0.6983 | jaccard: 0.7450 | lr: 1.00e-05


Epoch 5 - Training: 100%|██████████| 532/532 [01:26<00:00,  6.17it/s]
Epoch 5 - Validation: 100%|██████████| 133/133 [00:06<00:00, 19.14it/s]


Epoch 5 | train_loss: 0.0554 | val_loss: 0.1043 | micro_f1: 0.7963 | exact_acc: 0.6955 | jaccard: 0.7484 | lr: 3.00e-06


Epoch 6 - Training: 100%|██████████| 532/532 [01:26<00:00,  6.15it/s]
Epoch 6 - Validation: 100%|██████████| 133/133 [00:06<00:00, 19.47it/s]


Epoch 6 | train_loss: 0.0491 | val_loss: 0.1010 | micro_f1: 0.7931 | exact_acc: 0.6964 | jaccard: 0.7498 | lr: 3.00e-06


Epoch 7 - Training: 100%|██████████| 532/532 [01:24<00:00,  6.29it/s]
Epoch 7 - Validation: 100%|██████████| 133/133 [00:06<00:00, 21.64it/s]


Epoch 7 | train_loss: 0.0443 | val_loss: 0.1032 | micro_f1: 0.7901 | exact_acc: 0.6945 | jaccard: 0.7462 | lr: 3.00e-06


Epoch 8 - Training: 100%|██████████| 532/532 [01:21<00:00,  6.49it/s]
Epoch 8 - Validation: 100%|██████████| 133/133 [00:06<00:00, 19.62it/s]


Epoch 8 | train_loss: 0.0396 | val_loss: 0.1060 | micro_f1: 0.7911 | exact_acc: 0.6945 | jaccard: 0.7507 | lr: 3.00e-06


Epoch 9 - Training: 100%|██████████| 532/532 [01:25<00:00,  6.23it/s]
Epoch 9 - Validation: 100%|██████████| 133/133 [00:07<00:00, 18.89it/s]


Epoch 9 | train_loss: 0.0354 | val_loss: 0.1081 | micro_f1: 0.7863 | exact_acc: 0.6889 | jaccard: 0.7474 | lr: 9.00e-07


Epoch 10 - Training: 100%|██████████| 532/532 [01:24<00:00,  6.27it/s]
Epoch 10 - Validation: 100%|██████████| 133/133 [00:06<00:00, 21.15it/s]

Epoch 10 | train_loss: 0.0339 | val_loss: 0.1072 | micro_f1: 0.7801 | exact_acc: 0.6814 | jaccard: 0.7404 | lr: 9.00e-07



