In [1]:
import os
import torch
from torch.utils.data import DataLoader
from transformers import AutoModel
from models.ModelRetriever import get_full_classification_model, get_classification_head_model, get_adapters_model, get_lora_model
import json
import time
import pickle
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_from_disk
from torch import nn


In [7]:
# --- Backbone checkpoint and output locations ---
MODEL_NAME = 'distilbert-base-uncased'
SAVE_MODELS_DIR = './models/fine_tuned_models'   # final state_dict files (<name>.pt)
METRICS_DIR = './training/training_metrics'      # per-run metrics pickle + run_config.json
TRAINER_DIR = './training/trainer_metrics'       # not written to by the cells shown here

# --- Reproducibility ---
SEED = 42
# Seed torch's global RNG so torch-driven randomness (e.g. DataLoader
# shuffling) repeats on Restart Kernel -> Run All.
torch.manual_seed(SEED)

# --- Training hyperparameters ---
LR = 2e-4
EPOCHS = 10
NUM_LABELS = 2
MAX_LENGTH = 512   # only recorded in run_config.json in this notebook
BATCH_SIZE = 32
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"



In [8]:
# Root folder holding every saved dataset split
DATASETS_PATH = './datasets/processed_data/pytorch_datasets/'

# size -> split -> sub-directory under DATASETS_PATH.
# Every size points at the same 'test_dataset' directory for its test split.
DATASETS_PATHS = {
    size_key: {
        'train': train_dir,
        'validation': validation_dir,
        'test': 'test_dataset',
    }
    for size_key, train_dir, validation_dir in (
        ('small', 'train_dataset_small', 'validation_dataset_small'),
        ('medium', 'train_dataset_medium', 'validation_dataset_medium'),
        ('full', 'full_train', 'full_validation'),
    )
}

In [9]:
def accuracy(preds, labels):
    """
    Accuracy of `preds` against `labels`.

    Note the argument order: predictions come first here, whereas
    sklearn's accuracy_score takes the ground truth first.
    """
    return accuracy_score(y_true=labels, y_pred=preds)
def f1_weighted(preds, labels, num_labels):
    """
    Weighted-average F1 of `preds` against `labels`.

    The label set is fixed to 0..num_labels-1 and passed to sklearn
    explicitly rather than being inferred from the data.
    """
    label_ids = list(range(num_labels))
    return f1_score(y_true=labels, y_pred=preds, labels=label_ids, average="weighted")

In [10]:
class TrainingLogger:
    """
    Collects wall-clock timings (per epoch and for the whole run) and,
    when CUDA is available, the GPU memory allocated on device 0 at the
    moment each epoch is logged.
    """
    def __init__(self):
        self.epoch_times = []             # [{"epoch": int, "time_seconds": float}, ...]
        self.memory_log = []              # [{"epoch": int, "gpu_memory_used_mb": int}, ...]
        self.start_time = None            # set by start_training()
        self.total_training_time = None   # set by end_training()

    def start_training(self):
        """Record the wall-clock start of the run."""
        self.start_time = time.time()

    def end_training(self):
        """Store the seconds elapsed since start_training() was called."""
        now = time.time()
        self.total_training_time = now - self.start_time

    def log_epoch(self, epoch, epoch_start_time):
        """
        Append timing (and GPU memory, if CUDA is available) for one epoch.

        `epoch` is the 0-based loop index; it is stored 1-based.
        `epoch_start_time` is a time.time() stamp taken when the epoch began.
        """
        epoch_number = epoch + 1
        elapsed = time.time() - epoch_start_time
        self.epoch_times.append({"epoch": epoch_number, "time_seconds": elapsed})

        # No memory entry is recorded on CPU-only machines
        if not torch.cuda.is_available():
            return

        allocated_bytes = torch.cuda.memory_allocated(0)
        allocated_mb = allocated_bytes // (1024 ** 2)
        self.memory_log.append({"epoch": epoch_number, "gpu_memory_used_mb": int(allocated_mb)})

In [11]:
class ModelTrainer:
    """
    Runs a train/validate loop for a classification model and pickles the
    collected metrics.

    The wrapped model is called as ``model(input_ids, attn_mask=attention_mask)``
    and its output is handed straight to the loss function as logits.
    Batches must be mappings with "input_ids", "attention_mask" and "label".
    """
    def __init__(self, model: nn.Module, device: str = 'cpu'):
        # The model is moved once here; batches are moved per step.
        self.model = model.to(device)
        self.device = device

    def _move_batch(self, batch):
        """Return (input_ids, attention_mask, labels) from a batch, moved to self.device."""
        return (
            batch["input_ids"].to(self.device),
            batch["attention_mask"].to(self.device),
            batch["label"].to(self.device),
        )

    def _train_one_epoch(self, train_loader, optimizer, loss_fn, grad_clip) -> float:
        """Run one optimisation pass over train_loader; return the mean per-batch loss."""
        self.model.train()
        # Materialise once per epoch: a list can be reused for every batch,
        # unlike a generator, and the set of trainable params is loop-invariant.
        trainable_params = [p for p in self.model.parameters() if p.requires_grad]
        running_loss = 0.0

        for batch in train_loader:
            input_ids, attention_mask, labels = self._move_batch(batch)

            optimizer.zero_grad(set_to_none=True)
            logits = self.model(input_ids, attn_mask=attention_mask)
            loss = loss_fn(logits, labels)
            loss.backward()

            if grad_clip is not None:
                torch.nn.utils.clip_grad_norm_(trainable_params, grad_clip)

            optimizer.step()
            running_loss += loss.item()

        # max(1, ...) avoids a ZeroDivisionError on an empty loader
        return running_loss / max(1, len(train_loader))

    def _evaluate(self, val_loader, loss_fn, num_labels):
        """Score the model on val_loader; return (mean per-batch loss, accuracy, weighted F1)."""
        self.model.eval()
        val_loss = 0.0
        all_preds, all_labels = [], []

        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, labels = self._move_batch(batch)

                logits = self.model(input_ids, attn_mask=attention_mask)
                val_loss += loss_fn(logits, labels).item()

                # Predicted class = index of the largest logit along dim 1
                all_preds.extend(torch.argmax(logits, dim=1).cpu().tolist())
                all_labels.extend(labels.cpu().tolist())

        avg_val_loss = val_loss / max(1, len(val_loader))
        # Both metrics receive the same plain Python lists
        val_acc = accuracy(all_preds, all_labels)
        val_f1 = f1_weighted(all_preds, all_labels, num_labels=num_labels)
        return avg_val_loss, val_acc, val_f1

    def train(
        self,
        train_loader: DataLoader,
        val_loader: DataLoader,
        optimizer: torch.optim.Optimizer,
        loss_fn: nn.Module,
        epochs: int,
        save_dir: str,
        filename: str,
        num_labels: int = 2,
        grad_clip: float = 1.0
    ):
        """
        Train for `epochs` epochs, validating after each one.

        Args:
            train_loader: batches used for optimisation.
            val_loader: batches used for validation after every epoch.
            optimizer: optimizer stepped once per training batch.
            loss_fn: called as loss_fn(logits, labels).
            epochs: number of passes over train_loader.
            save_dir: directory for the metrics pickle (created if missing).
            filename: pickle base name; the file is "<filename>.pkl".
            num_labels: number of classes, forwarded to the F1 computation.
            grad_clip: max gradient norm over trainable parameters;
                pass None to disable clipping.

        Returns:
            dict with per-epoch lists under "train_loss", "val_loss",
            "val_accuracy" and "val_f1".
        """
        os.makedirs(save_dir, exist_ok=True)

        logger = TrainingLogger()
        logger.start_training()

        history = {"train_loss": [], "val_loss": [], "val_accuracy": [], "val_f1": []}

        for epoch in range(epochs):
            epoch_start = time.time()

            avg_train_loss = self._train_one_epoch(train_loader, optimizer, loss_fn, grad_clip)
            history["train_loss"].append(avg_train_loss)

            # ---- validation ----
            avg_val_loss, val_acc, val_f1 = self._evaluate(val_loader, loss_fn, num_labels)
            history["val_loss"].append(avg_val_loss)
            history["val_accuracy"].append(val_acc)
            history["val_f1"].append(val_f1)

            # Logged after validation, so the epoch time covers train + validation
            logger.log_epoch(epoch, epoch_start)

            print(
                f"Epoch {epoch+1}/{epochs} | "
                f"Train Loss: {avg_train_loss:.4f} | "
                f"Val Loss: {avg_val_loss:.4f} | "
                f"Val Acc: {val_acc:.4f} | "
                f"Val F1: {val_f1:.4f}"
            )

        logger.end_training()

        # persist logs
        log_blob = {
            "history": history,
            "epoch_times": logger.epoch_times,
            "gpu_memory_log": logger.memory_log,
            "total_training_time_seconds": logger.total_training_time,
        }

        file_path = os.path.join(save_dir, f"{filename}.pkl")
        with open(file_path, "wb") as f:
            pickle.dump(log_blob, f)

        print(f"Saved metrics to: {file_path}")
        return history

In [12]:
def Train(model, size, name):
    """
    Train `model` on one dataset size, then save its metrics, config and weights.

    Args:
        model: nn.Module to train; only parameters with requires_grad=True
            are handed to the optimizer.
        size: key into DATASETS_PATHS ('small', 'medium' or 'full').
        name: run identifier, used for the metrics sub-directory under
            METRICS_DIR and for the "<name>.pt" weights file.

    Writes METRICS_DIR/<name>/run_config.json, the metrics pickle produced
    by ModelTrainer.train, and SAVE_MODELS_DIR/<name>.pt.
    """
    print(f"\n========== Training {name} ==========")
    save_dir = os.path.join(METRICS_DIR, name)

    split_dirs = DATASETS_PATHS[size]
    train_ds = load_from_disk(os.path.join(DATASETS_PATH, split_dirs['train']))
    val_ds = load_from_disk(os.path.join(DATASETS_PATH, split_dirs['validation']))

    tensor_columns = ['input_ids', 'attention_mask', 'label']
    train_ds.set_format(type='torch', columns=tensor_columns)
    val_ds.set_format(type='torch', columns=tensor_columns)

    # A dedicated seeded generator makes the shuffle order the same for every
    # run, regardless of how much global RNG state earlier cells consumed.
    shuffle_rng = torch.Generator()
    shuffle_rng.manual_seed(SEED)
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, generator=shuffle_rng)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

    # optimizer only on trainable params
    optim = torch.optim.AdamW([p for p in model.parameters() if p.requires_grad], lr=LR)
    loss_fn = nn.CrossEntropyLoss()

    # Create both output locations up front so a missing directory cannot
    # fail the final torch.save after every epoch has already run.
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(SAVE_MODELS_DIR, exist_ok=True)

    # log param counts
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params     = sum(p.numel() for p in model.parameters())
    with open(os.path.join(save_dir, "run_config.json"), "w") as f:
        json.dump({
            "model_name": name,
            "dataset_size": size,
            "epochs": EPOCHS,
            "lr": LR,
            "batch_size": BATCH_SIZE,
            "seed": SEED,
            "num_labels": NUM_LABELS,
            "max_length": MAX_LENGTH,
            "trainable_params": int(trainable_params),
            "total_params": int(total_params),
            "device": DEVICE
        }, f, indent=2)

    # Train model
    trainer = ModelTrainer(model, device=DEVICE)
    trainer.train(
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optim,
        loss_fn=loss_fn,
        epochs=EPOCHS,
        save_dir=save_dir,
        filename=name,
        num_labels=NUM_LABELS,
        grad_clip=1.0
    )

    # Save model weights
    filepath = os.path.join(SAVE_MODELS_DIR, f"{name}.pt")
    torch.save(model.state_dict(), filepath)
    print(f" Saved model weights to {filepath}")



In [None]:
# Full fine-tuning experiment: one run per dataset size.
for size in ('small', 'medium', 'full'):
    name = f"full_fine_tuning_set_{size}"

    # Reload the pretrained checkpoint for every run so each one starts
    # from the same backbone weights.
    base_model = AutoModel.from_pretrained(MODEL_NAME, torch_dtype=torch.float32)
    model = get_full_classification_model(base_model)

    Train(model=model, size=size, name=name)

print("\nAll runs completed.")

In [None]:
# Classification-head experiment: one run per dataset size.
for size in ('small', 'medium', 'full'):
    name = f"head_fine_tuning_set_{size}"

    # Reload the pretrained checkpoint for every run so each one starts
    # from the same backbone weights.
    base_model = AutoModel.from_pretrained(MODEL_NAME, torch_dtype=torch.float32)
    model = get_classification_head_model(base_model)

    Train(model=model, size=size, name=name)

print("\nAll runs completed.")

2025-09-01 00:02:37.984242: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.



Epoch 1/10 | Train Loss: 0.7048 | Val Loss: 0.6888 | Val Acc: 0.6000 | Val F1: 0.6000
Epoch 2/10 | Train Loss: 0.6816 | Val Loss: 0.6885 | Val Acc: 0.5000 | Val F1: 0.4949
Epoch 3/10 | Train Loss: 0.6675 | Val Loss: 0.6884 | Val Acc: 0.5000 | Val F1: 0.4949
Epoch 4/10 | Train Loss: 0.6965 | Val Loss: 0.6883 | Val Acc: 0.5000 | Val F1: 0.4949
Epoch 5/10 | Train Loss: 0.6895 | Val Loss: 0.6875 | Val Acc: 0.6000 | Val F1: 0.6000
Epoch 6/10 | Train Loss: 0.6900 | Val Loss: 0.6868 | Val Acc: 0.6000 | Val F1: 0.6000
Epoch 7/10 | Train Loss: 0.6802 | Val Loss: 0.6864 | Val Acc: 0.6000 | Val F1: 0.6000
Epoch 8/10 | Train Loss: 0.6610 | Val Loss: 0.6859 | Val Acc: 0.5000 | Val F1: 0.4949
Epoch 9/10 | Train Loss: 0.6516 | Val Loss: 0.6857 | Val Acc: 0.6000 | Val F1: 0.5833
Epoch 10/10 | Train Loss: 0.6547 | Val Loss: 0.6853 | Val Acc: 0.6000 | Val F1: 0.5833
Saved metrics to: ./training/training_metrics/full_fine_tuning_set_small/full_fine_tuning_set_small.pkl
 Saved model weights to ./models/f

In [None]:
# Values passed as `adapter_size` to get_adapters_model, one experiment each
ADAPTERS_SIZE = [32, 64, 128]

In [None]:
# Adapter experiment: every adapter size is trained on every dataset size.
for adapter_size in ADAPTERS_SIZE:
    for size in ('small', 'medium', 'full'):
        name = f"adapters_a_size_{adapter_size}_set_{size}"

        # Reload the pretrained checkpoint for every run so each one starts
        # from the same backbone weights.
        base_model = AutoModel.from_pretrained(MODEL_NAME, torch_dtype=torch.float32)
        model = get_adapters_model(base_model, adapter_size=adapter_size)

        Train(model=model, size=size, name=name)

print("\nAll runs completed.")

In [None]:
# (rank, alpha) pairs to sweep: the first three use alpha = r / 2,
# the last three use alpha = 2 * r.
LORA_CONFIGS = [
    {"r": rank, "alpha": scaling}
    for rank, scaling in (
        (4, 2),
        (8, 4),
        (32, 16),
        (4, 8),
        (8, 16),
        (32, 64),
    )
]

In [None]:
# LoRA experiment: every (r, alpha) config is trained on every dataset size.
for cfg in LORA_CONFIGS:
    for size in ('small', 'medium', 'full'):
        r, alpha = cfg['r'], cfg['alpha']
        name = f"lora_r_{r}_alpha_{alpha}_set_{size}"

        # Reload the pretrained checkpoint for every run so each one starts
        # from the same backbone weights.
        base_model = AutoModel.from_pretrained(MODEL_NAME, torch_dtype=torch.float32)
        model = get_lora_model(base_model, rank=r, alpha=alpha)

        Train(model=model, size=size, name=name)

print("\nAll runs completed.")