In [11]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DataCollatorWithPadding

In [12]:
class TextDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.texts = df['sentence'].tolist()
        self.labels = df['label'].tolist()
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        enc = self.tokenizer(text, return_tensors="pt", truncation=True)
        return {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [13]:
def train_epoch(model, dataloader, optimizer, device):
    model.train()
    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

def eval_epoch(model, dataloader, device):
    model.eval()
    losses = []
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            losses.append(loss.item())
            preds = outputs.logits.argmax(dim=1).cpu().numpy()
            labels = batch['labels'].cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels)
    acc = accuracy_score(all_labels, all_preds)
    return sum(losses) / len(losses), acc

In [14]:
def fine_tune(model_name, train_df, val_df, epochs=4, batch_size=16, lr=2e-5):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    train_dataset = TextDataset(train_df, tokenizer)
    val_dataset = TextDataset(val_df, tokenizer)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=data_collator)

    optimizer = AdamW(model.parameters(), lr=lr)

    best_loss = float('inf')
    for epoch in range(1, epochs + 1):
        train_epoch(model, train_loader, optimizer, device)
        val_loss, val_acc = eval_epoch(model, val_loader, device)
        print(f"Epoch: {epoch} Val loss: {val_loss:.3f} Val acc: {val_acc:.3f}")
        if val_loss < best_loss:
            best_loss = val_loss
            model.save_pretrained(f"{model_name}_ft")
            tokenizer.save_pretrained(f"{model_name}_ft")

In [15]:
data = pd.read_csv("../1-dataset/VUAMC_sentences_labeled.csv")
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)
models = ["bert-base-uncased", "roberta-base"]

for model_name in models:
    print(f"{model_name}...")
    fine_tune(model_name, train_data, val_data)
    print()

bert-base-uncased...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 1 Val loss: 0.303 Val acc: 0.873
Epoch: 2 Val loss: 0.373 Val acc: 0.856
Epoch: 3 Val loss: 0.365 Val acc: 0.879
Epoch: 4 Val loss: 0.442 Val acc: 0.870

roberta-base...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 1 Val loss: 0.300 Val acc: 0.870
Epoch: 2 Val loss: 0.420 Val acc: 0.839
Epoch: 3 Val loss: 0.355 Val acc: 0.874
Epoch: 4 Val loss: 0.404 Val acc: 0.859

