# LLM Fine-Tuning with Encoder

## Environment & Version Checks

In [1]:
import transformers
print(transformers.__version__)

import sys, os, json, copy
sys.path.append("..")

from datasets import load_dataset
from transformers import TrainingArguments, DataCollatorWithPadding
from src.EncoderTrainer import EncoderTrainer
from peft import LoraConfig, TaskType
import numpy as np
import torch
import time



4.41.2


## Global Configuration

In [2]:
# ===== paths & constants =====
OUTPUT_FOLDER = "../outputs"
DATA_FOLDER = "../data"

MODEL_NAME = "bert-base-uncased"

VAL_JSON  = f"{DATA_FOLDER}/processed/val.json"
TEST_JSON = f"{DATA_FOLDER}/processed/test.json"

MAX_LENGTH = 128
LABELS = ["negative", "neutral", "positive"]
NUM_LABELS = len(LABELS)

LORA_TUNING_DIR = f"{OUTPUT_FOLDER}/lora_tuning/{MODEL_NAME}"
os.makedirs(LORA_TUNING_DIR, exist_ok=True)

## Dataset Loading

In [3]:
dataset = load_dataset(
    "json",
    data_files={
        "train": f"{DATA_FOLDER}/processed/train.json",
        "validation": f"{DATA_FOLDER}/processed/val.json",
        "test": f"{DATA_FOLDER}/processed/test.json",
    }
)


In [4]:
from collections import Counter

# Extract outputs
outputs = dataset['test']['output']

# Count occurrences
label_counts = Counter(outputs)
total = sum(label_counts.values())

counts = np.array([label_counts.get(label, 0) for label in LABELS])

print("Counts:", counts)

inverse_weights = total / (len(LABELS) * np.maximum(counts, 1))

# Normalize weights
inverse_weights = inverse_weights / inverse_weights.sum()

CLASS_WEIGHTS = torch.tensor(inverse_weights, dtype=torch.float)

print("Class Weights (Inverse Normalized):", CLASS_WEIGHTS)

Counts: [ 34 143  65]
Class Weights (Inverse Normalized): tensor([0.5679, 0.1350, 0.2971])


## Prompt Formatting

In [5]:
trainer = EncoderTrainer(
    model_name=MODEL_NAME,
    labels=LABELS,
    load_in_4bit=False,
)

tokenizer = trainer.tokenizer

def format_encoder(example):
    input = example["input"]
    label = LABELS.index(example["output"])  # or map from output

    return {
        "input": input,
        "label": label,
    }

dataset = dataset.map(
    format_encoder,
    batched=False,
    num_proc=1,
    desc="Formatting prompts"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenization & Data Collation

In [6]:
from transformers import DataCollatorWithPadding

def tokenize_encoder(batch):
    enc = tokenizer(
        batch["input"],
        truncation=True,
        max_length=MAX_LENGTH,
        add_special_tokens=True,
    )
    enc["labels"] = int(batch["label"])
    return enc

tokenized_ds = dataset.map(
    tokenize_encoder,
    batched=False,
    remove_columns=dataset["train"].column_names,
    desc="Tokenizing encoder inputs",
)

data_collator = DataCollatorWithPadding(tokenizer)

Tokenizing encoder inputs:   0%|          | 0/242 [00:00<?, ? examples/s]

## Training Configuration

In [7]:
BASE_TRAINING_ARGS = dict(
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,   # effective batch = 8
    num_train_epochs=4,
    lr_scheduler_type="cosine", 
    weight_decay=0.01,
    # warmup_steps=100,
    warmup_ratio=0.1,

    # precision (BF16 ONLY)
    fp16=False,
    bf16=True,

    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,

    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    
    save_total_limit=1,
    report_to="none",
)


## Define the Experiment Function

In [8]:
RESULTS = []

# -------------------------
# Base configuration
# -------------------------
best_cfg = {
    "use_lora": True,              
    "learning_rate": 2e-4,
    "r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "use_class_weights": False,
}

def run_experiment(cfg, stage_name):
    tag = f"{stage_name}_" + "_".join([f"{k}_{v}" for k, v in cfg.items()])
    out_dir = f"{LORA_TUNING_DIR}/{tag}"
    os.makedirs(out_dir, exist_ok=True)

    print(f"\n===== {stage_name} | Running config: {cfg} =====")

    # ----------------------------------
    # Build TrainingArguments
    # ----------------------------------
    training_args = TrainingArguments(
        output_dir=out_dir,
        learning_rate=cfg["learning_rate"],
        **BASE_TRAINING_ARGS,
    )

    # ----------------------------------
    # Initialize Trainer
    # ----------------------------------
    trainer = EncoderTrainer(
        model_name=MODEL_NAME,
        labels=LABELS,
        load_in_4bit=False,
    )

    # ----------------------------------
    # Configure LoRA (if enabled)
    # ----------------------------------
    if cfg.get("use_lora", True):
        trainer.configure_lora(
            r=cfg["r"],
            lora_alpha=cfg["lora_alpha"],
            lora_dropout=cfg["lora_dropout"],
            target_modules=["query", "value"],
        )

    # ----------------------------------
    # Class weights
    # ----------------------------------
    if cfg["use_class_weights"]:
        trainer.class_weights = CLASS_WEIGHTS

    # ----- Train -----
    metrics = trainer.train(
        train_dataset=tokenized_ds["train"],
        eval_dataset=tokenized_ds["validation"],
        training_args=training_args,
        data_collator=data_collator,
        classification_eval_fn=lambda: trainer.evaluate_classification(
            test_path=VAL_JSON,
            labels=LABELS,
            verbose=False,
        )
    )

    # ----------------------------------
    # Save model (FFT + LoRA unified)
    # ----------------------------------
    trainer.save_model(out_dir)
    
    # ----------------------------------
    # Save metrics
    # ----------------------------------
    with open(os.path.join(out_dir, "metrics.json"), "w") as f:
        json.dump(metrics, f, indent=2)

    # ----------------------------------
    # Save experiment metadata (FULL INFO)
    # ----------------------------------
    experiment_metadata = {
        "model": MODEL_NAME,
        "experiment_type": stage_name,
        "learning_rate": cfg.get("learning_rate"),
        "r": cfg.get("r"),
        "lora_alpha": cfg.get("lora_alpha"),
        "lora_dropout": cfg.get("lora_dropout"),
        "use_lora": cfg.get("use_lora"),
        "use_class_weights": cfg.get("use_class_weights"),
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }

    with open(os.path.join(out_dir, "exp_config.json"), "w") as f:
        json.dump(experiment_metadata, f, indent=2)

    # ----------------------------------
    # Store in memory
    # ----------------------------------
    RESULTS.append({
        "stage": stage_name,
        "config": copy.deepcopy(cfg),
        "metrics": metrics,
        "output_dir": out_dir,
    })

    return metrics


## Stage 1 — Full Fine-tuning

In [9]:
full_ft_cfg = {
    "use_lora": False,      # Full FT
    "learning_rate": 2e-4,
    "r": None,
    "lora_alpha": None,
    "lora_dropout": None,
    "use_class_weights": False,
}

lr_candidates = [5e-5, 1e-4, 2e-4, 3e-4, 5e-4, 6e-4]
best_metric = -1
best_lr = None

for lr in lr_candidates:
    cfg = copy.deepcopy(full_ft_cfg)
    cfg["learning_rate"] = lr

    metrics = run_experiment(cfg, f"FFT_LR_{lr}")
    score = metrics["f1"]   # use macro F1 ideally

    if score > best_metric:
        best_metric = score
        best_lr = lr

print("Best FFT LR:", best_lr)


===== FFT_LR_5e-05 | Running config: {'use_lora': False, 'learning_rate': 5e-05, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.739,0.552196
2,0.3295,0.334662
3,0.2892,0.322897
4,0.1894,0.32452



[Classification Metrics]
accuracy: 0.7851
precision: 0.8384
recall: 0.6230
f1: 0.6194
auc_ovr: 0.9108

[Classification Metrics]
accuracy: 0.8802
precision: 0.8491
recall: 0.8967
f1: 0.8698
auc_ovr: 0.9714

[Classification Metrics]
accuracy: 0.9008
precision: 0.8834
recall: 0.8988
f1: 0.8907
auc_ovr: 0.9749

[Classification Metrics]
accuracy: 0.9008
precision: 0.8834
recall: 0.8988
f1: 0.8907
auc_ovr: 0.9751

===== FFT_LR_0.0001 | Running config: {'use_lora': False, 'learning_rate': 0.0001, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.5566,0.425778
2,0.3569,0.370229
3,0.1419,0.454883
4,0.0349,0.446762



[Classification Metrics]
accuracy: 0.8223
precision: 0.8390
recall: 0.7641
f1: 0.7936
auc_ovr: 0.9419

[Classification Metrics]
accuracy: 0.8926
precision: 0.8719
recall: 0.8990
f1: 0.8845
auc_ovr: 0.9667

[Classification Metrics]
accuracy: 0.9050
precision: 0.8944
recall: 0.9083
f1: 0.9011
auc_ovr: 0.9618

[Classification Metrics]
accuracy: 0.9050
precision: 0.8944
recall: 0.9083
f1: 0.9011
auc_ovr: 0.9615

===== FFT_LR_0.0002 | Running config: {'use_lora': False, 'learning_rate': 0.0002, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.8694,0.532286
2,0.4102,0.487692
3,0.3886,0.603577
4,0.1354,0.617184



[Classification Metrics]
accuracy: 0.8264
precision: 0.8842
recall: 0.7425
f1: 0.7911
auc_ovr: 0.9043

[Classification Metrics]
accuracy: 0.8430
precision: 0.8234
recall: 0.8400
f1: 0.8313
auc_ovr: 0.9174

[Classification Metrics]
accuracy: 0.8595
precision: 0.8514
recall: 0.8613
f1: 0.8562
auc_ovr: 0.9342

[Classification Metrics]
accuracy: 0.8595
precision: 0.8514
recall: 0.8613
f1: 0.8562
auc_ovr: 0.9365

===== FFT_LR_0.0003 | Running config: {'use_lora': False, 'learning_rate': 0.0003, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.8002,0.962297
2,0.9227,0.93005
3,0.9389,0.921285
4,0.9743,0.921302



[Classification Metrics]
accuracy: 0.5992
precision: 0.3678
recall: 0.3498
f1: 0.2900
auc_ovr: 0.5871

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.4926

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5035

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5229

===== FFT_LR_0.0005 | Running config: {'use_lora': False, 'learning_rate': 0.0005, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.9906,0.930398
2,0.9243,0.930521
3,0.9455,0.921013
4,0.9792,0.920809



[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.4850

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5145

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.4955

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.4960

===== FFT_LR_0.0006 | Running config: {'use_lora': False, 'learning_rate': 0.0006, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.0056,0.945216
2,0.9379,0.945513
3,0.9634,0.920905
4,0.9789,0.920669



[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5000

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.4696

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5127

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.4718
Best FFT LR: 0.0001


## Stage 2 — LoRA (Tune Learning Rate)

In [10]:
lr_candidates = [5e-5, 1e-4, 2e-4, 3e-4, 5e-4, 6e-4]
best_metric = -1

for lr in lr_candidates:
    cfg = copy.deepcopy(best_cfg)
    cfg["learning_rate"] = lr

    metrics = run_experiment(cfg, f"LORA_LR_{lr}")
    score = metrics["f1"]
    
    if score > best_metric:
        best_metric = score
        best_cfg["learning_rate"] = lr

print("Best LR:", best_cfg["learning_rate"])



===== LORA_LR_5e-05 | Running config: {'use_lora': True, 'learning_rate': 5e-05, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


Epoch,Training Loss,Validation Loss
1,0.949,0.912518
2,0.8656,0.884546
3,0.8801,0.87136
4,0.8972,0.870968



[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.6126





[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.6932





[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.7064





[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.7056





===== LORA_LR_0.0001 | Running config: {'use_lora': True, 'learning_rate': 0.0001, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


Epoch,Training Loss,Validation Loss
1,0.9043,0.805218
2,0.6537,0.675343
3,0.7074,0.651331
4,0.6552,0.651756



[Classification Metrics]
accuracy: 0.6405
precision: 0.5322
recall: 0.4375
f1: 0.4302
auc_ovr: 0.7655





[Classification Metrics]
accuracy: 0.6942
precision: 0.6074
recall: 0.5367
f1: 0.5547
auc_ovr: 0.8250





[Classification Metrics]
accuracy: 0.6901
precision: 0.5832
recall: 0.5176
f1: 0.5311
auc_ovr: 0.8367





[Classification Metrics]
accuracy: 0.6860
precision: 0.5789
recall: 0.5129
f1: 0.5262
auc_ovr: 0.8371





===== LORA_LR_0.0002 | Running config: {'use_lora': True, 'learning_rate': 0.0002, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


Epoch,Training Loss,Validation Loss
1,0.7985,0.676547
2,0.5478,0.548785
3,0.5873,0.500083
4,0.4625,0.496061



[Classification Metrics]
accuracy: 0.6942
precision: 0.4253
recall: 0.4673
f1: 0.4415
auc_ovr: 0.8268





[Classification Metrics]
accuracy: 0.7438
precision: 0.6647
recall: 0.6577
f1: 0.6610
auc_ovr: 0.8892





[Classification Metrics]
accuracy: 0.8182
precision: 0.8328
recall: 0.7209
f1: 0.7545
auc_ovr: 0.9104





[Classification Metrics]
accuracy: 0.8182
precision: 0.8329
recall: 0.7185
f1: 0.7534
auc_ovr: 0.9118





===== LORA_LR_0.0003 | Running config: {'use_lora': True, 'learning_rate': 0.0003, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


Epoch,Training Loss,Validation Loss
1,0.7466,0.62916
2,0.4108,0.359915
3,0.4383,0.328236
4,0.2922,0.32606



[Classification Metrics]
accuracy: 0.7149
precision: 0.7775
recall: 0.5170
f1: 0.5114
auc_ovr: 0.8540





[Classification Metrics]
accuracy: 0.8760
precision: 0.8501
recall: 0.8730
f1: 0.8602
auc_ovr: 0.9529





[Classification Metrics]
accuracy: 0.8843
precision: 0.8980
recall: 0.8681
f1: 0.8820
auc_ovr: 0.9618





[Classification Metrics]
accuracy: 0.8843
precision: 0.8884
recall: 0.8776
f1: 0.8829
auc_ovr: 0.9625





===== LORA_LR_0.0005 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


Epoch,Training Loss,Validation Loss
1,0.6923,0.5658
2,0.3547,0.353987
3,0.3427,0.347639
4,0.1902,0.350815



[Classification Metrics]
accuracy: 0.7397
precision: 0.8024
recall: 0.5474
f1: 0.5218
auc_ovr: 0.9034





[Classification Metrics]
accuracy: 0.8760
precision: 0.8494
recall: 0.8969
f1: 0.8701
auc_ovr: 0.9658





[Classification Metrics]
accuracy: 0.9008
precision: 0.8977
recall: 0.8893
f1: 0.8932
auc_ovr: 0.9687





[Classification Metrics]
accuracy: 0.8967
precision: 0.8919
recall: 0.8966
f1: 0.8941
auc_ovr: 0.9683





===== LORA_LR_0.0006 | Running config: {'use_lora': True, 'learning_rate': 0.0006, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


Epoch,Training Loss,Validation Loss
1,0.6578,0.457296
2,0.3642,0.341068
3,0.3029,0.34453
4,0.1736,0.349418



[Classification Metrics]
accuracy: 0.8017
precision: 0.8265
recall: 0.7644
f1: 0.7908
auc_ovr: 0.9250





[Classification Metrics]
accuracy: 0.8802
precision: 0.8534
recall: 0.8896
f1: 0.8695
auc_ovr: 0.9668





[Classification Metrics]
accuracy: 0.8967
precision: 0.8995
recall: 0.8798
f1: 0.8892
auc_ovr: 0.9728





[Classification Metrics]
accuracy: 0.8926
precision: 0.8879
recall: 0.8775
f1: 0.8826
auc_ovr: 0.9719




Best LR: 0.0005


## Stage 3 — LoRA (Tune Rank)

In [11]:
rank_candidates = [4, 8, 16]
best_metric = -1

for r in rank_candidates:
    cfg = copy.deepcopy(best_cfg)
    cfg["r"] = r
    cfg["lora_alpha"] = 2 * r

    metrics = run_experiment(cfg, f"LORA_RANK_{r}")
    score = metrics["f1"]
    
    if score > best_metric:
        best_metric = score
        best_cfg["r"] = r
        best_cfg["lora_alpha"] = 2 * r

print("Best Rank:", best_cfg["r"])


===== LORA_RANK_4 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 4, 'lora_alpha': 8, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 149,763 || all params: 109,634,310 || trainable%: 0.1366023099885428


Epoch,Training Loss,Validation Loss
1,0.7244,0.607426
2,0.3896,0.3298
3,0.4064,0.321567
4,0.2468,0.326185



[Classification Metrics]
accuracy: 0.6942
precision: 0.4406
recall: 0.5051
f1: 0.4684
auc_ovr: 0.8771





[Classification Metrics]
accuracy: 0.8802
precision: 0.8618
recall: 0.8921
f1: 0.8747
auc_ovr: 0.9672





[Classification Metrics]
accuracy: 0.8760
precision: 0.8658
recall: 0.8658
f1: 0.8658
auc_ovr: 0.9709





[Classification Metrics]
accuracy: 0.8719
precision: 0.8616
recall: 0.8634
f1: 0.8625
auc_ovr: 0.9710





===== LORA_RANK_8 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


Epoch,Training Loss,Validation Loss
1,0.6923,0.5658
2,0.3547,0.353987
3,0.3427,0.347639
4,0.1902,0.350815



[Classification Metrics]
accuracy: 0.7397
precision: 0.8024
recall: 0.5474
f1: 0.5218
auc_ovr: 0.9034





[Classification Metrics]
accuracy: 0.8760
precision: 0.8494
recall: 0.8969
f1: 0.8701
auc_ovr: 0.9658





[Classification Metrics]
accuracy: 0.9008
precision: 0.8977
recall: 0.8893
f1: 0.8932
auc_ovr: 0.9687





[Classification Metrics]
accuracy: 0.8967
precision: 0.8919
recall: 0.8966
f1: 0.8941
auc_ovr: 0.9683





===== LORA_RANK_16 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 592,131 || all params: 110,076,678 || trainable%: 0.5379259355919153


Epoch,Training Loss,Validation Loss
1,0.6265,0.431994
2,0.3891,0.341003
3,0.3188,0.37847
4,0.1468,0.39284



[Classification Metrics]
accuracy: 0.8554
precision: 0.8945
recall: 0.8259
f1: 0.8474
auc_ovr: 0.9395





[Classification Metrics]
accuracy: 0.8802
precision: 0.8618
recall: 0.8921
f1: 0.8747
auc_ovr: 0.9643





[Classification Metrics]
accuracy: 0.8843
precision: 0.8800
recall: 0.8800
f1: 0.8800
auc_ovr: 0.9677





[Classification Metrics]
accuracy: 0.8926
precision: 0.8876
recall: 0.8942
f1: 0.8908
auc_ovr: 0.9665




Best Rank: 8


## Stage 4 — LoRA (Tune Alpha)

In [12]:
alpha_candidates = [
    best_cfg["r"],
    2 * best_cfg["r"],
    4 * best_cfg["r"]
]

best_metric = -1

for alpha in alpha_candidates:
    cfg = copy.deepcopy(best_cfg)
    cfg["lora_alpha"] = alpha

    metrics = run_experiment(cfg, f"LORA_ALPHA_{alpha}")
    score = metrics["f1"]
    
    if score > best_metric:
        best_metric = score
        best_cfg["lora_alpha"] = alpha

print("Best Alpha:", best_cfg["lora_alpha"])



===== LORA_ALPHA_8 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 8, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


Epoch,Training Loss,Validation Loss
1,0.7148,0.597075
2,0.3669,0.347432
3,0.3793,0.3314
4,0.2211,0.333725



[Classification Metrics]
accuracy: 0.7025
precision: 0.4424
recall: 0.5050
f1: 0.4709
auc_ovr: 0.8801





[Classification Metrics]
accuracy: 0.8554
precision: 0.8290
recall: 0.8661
f1: 0.8456
auc_ovr: 0.9666





[Classification Metrics]
accuracy: 0.8802
precision: 0.8695
recall: 0.8777
f1: 0.8735
auc_ovr: 0.9689





[Classification Metrics]
accuracy: 0.8760
precision: 0.8667
recall: 0.8730
f1: 0.8697
auc_ovr: 0.9685





===== LORA_ALPHA_16 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


Epoch,Training Loss,Validation Loss
1,0.6923,0.5658
2,0.3547,0.353987
3,0.3427,0.347639
4,0.1902,0.350815



[Classification Metrics]
accuracy: 0.7397
precision: 0.8024
recall: 0.5474
f1: 0.5218
auc_ovr: 0.9034





[Classification Metrics]
accuracy: 0.8760
precision: 0.8494
recall: 0.8969
f1: 0.8701
auc_ovr: 0.9658





[Classification Metrics]
accuracy: 0.9008
precision: 0.8977
recall: 0.8893
f1: 0.8932
auc_ovr: 0.9687





[Classification Metrics]
accuracy: 0.8967
precision: 0.8919
recall: 0.8966
f1: 0.8941
auc_ovr: 0.9683





===== LORA_ALPHA_32 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 32, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


Epoch,Training Loss,Validation Loss
1,0.6785,0.401744
2,0.3664,0.339935
3,0.3246,0.358124
4,0.1653,0.366351



[Classification Metrics]
accuracy: 0.8388
precision: 0.8332
recall: 0.8498
f1: 0.8411
auc_ovr: 0.9401





[Classification Metrics]
accuracy: 0.8760
precision: 0.8650
recall: 0.8849
f1: 0.8745
auc_ovr: 0.9663





[Classification Metrics]
accuracy: 0.9008
precision: 0.9043
recall: 0.8893
f1: 0.8965
auc_ovr: 0.9676





[Classification Metrics]
accuracy: 0.8967
precision: 0.8913
recall: 0.8894
f1: 0.8903
auc_ovr: 0.9677




Best Alpha: 32


## Stage 5 — LoRA (Tune Dropout)

In [13]:
dropout_candidates = [0.0, 0.05, 0.1]
best_metric = -1

for d in dropout_candidates:
    cfg = copy.deepcopy(best_cfg)
    cfg["lora_dropout"] = d

    metrics = run_experiment(cfg, f"LORA_DROPOUT_{d}")
    score = metrics["f1"]
    
    if score > best_metric:
        best_metric = score
        best_cfg["lora_dropout"] = d

print("Best Dropout:", best_cfg["lora_dropout"])



===== LORA_DROPOUT_0.0 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 32, 'lora_dropout': 0.0, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


Epoch,Training Loss,Validation Loss
1,0.6408,0.455039
2,0.3689,0.310271
3,0.3052,0.349705
4,0.1397,0.360638



[Classification Metrics]
accuracy: 0.8388
precision: 0.8857
recall: 0.7999
f1: 0.8260
auc_ovr: 0.9342





[Classification Metrics]
accuracy: 0.8802
precision: 0.8589
recall: 0.8873
f1: 0.8718
auc_ovr: 0.9660





[Classification Metrics]
accuracy: 0.8926
precision: 0.8750
recall: 0.8966
f1: 0.8850
auc_ovr: 0.9701





[Classification Metrics]
accuracy: 0.8926
precision: 0.8750
recall: 0.8966
f1: 0.8850
auc_ovr: 0.9698





===== LORA_DROPOUT_0.05 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 32, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


Epoch,Training Loss,Validation Loss
1,0.6785,0.401744
2,0.3664,0.339935
3,0.3246,0.358124
4,0.1653,0.366351



[Classification Metrics]
accuracy: 0.8388
precision: 0.8332
recall: 0.8498
f1: 0.8411
auc_ovr: 0.9401





[Classification Metrics]
accuracy: 0.8760
precision: 0.8650
recall: 0.8849
f1: 0.8745
auc_ovr: 0.9663





[Classification Metrics]
accuracy: 0.9008
precision: 0.9043
recall: 0.8893
f1: 0.8965
auc_ovr: 0.9676





[Classification Metrics]
accuracy: 0.8967
precision: 0.8913
recall: 0.8894
f1: 0.8903
auc_ovr: 0.9677





===== LORA_DROPOUT_0.1 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 32, 'lora_dropout': 0.1, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


Epoch,Training Loss,Validation Loss
1,0.6891,0.403569
2,0.3511,0.339317
3,0.317,0.380968
4,0.1628,0.383643



[Classification Metrics]
accuracy: 0.8471
precision: 0.8274
recall: 0.8424
f1: 0.8345
auc_ovr: 0.9398





[Classification Metrics]
accuracy: 0.8760
precision: 0.8561
recall: 0.8826
f1: 0.8680
auc_ovr: 0.9650





[Classification Metrics]
accuracy: 0.8760
precision: 0.8650
recall: 0.8849
f1: 0.8745
auc_ovr: 0.9660





[Classification Metrics]
accuracy: 0.8884
precision: 0.8774
recall: 0.8919
f1: 0.8843
auc_ovr: 0.9651




Best Dropout: 0.05


## Stage 6 — LoRA (Class Weights)

In [14]:
for use_weights in [False, True]:
    cfg = copy.deepcopy(best_cfg)
    cfg["use_class_weights"] = use_weights

    metrics = run_experiment(cfg, f"LORA_CLASS_WEIGHT")
    score = metrics["f1"]

    if score > best_metric:
        best_metric = score
        best_cfg["use_class_weights"] = use_weights

print("Best use_class_weights:", best_cfg["use_class_weights"])



===== LORA_CLASS_WEIGHT | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 32, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


Epoch,Training Loss,Validation Loss
1,0.6785,0.401744
2,0.3664,0.339935
3,0.3246,0.358124
4,0.1653,0.366351



[Classification Metrics]
accuracy: 0.8388
precision: 0.8332
recall: 0.8498
f1: 0.8411
auc_ovr: 0.9401





[Classification Metrics]
accuracy: 0.8760
precision: 0.8650
recall: 0.8849
f1: 0.8745
auc_ovr: 0.9663





[Classification Metrics]
accuracy: 0.9008
precision: 0.9043
recall: 0.8893
f1: 0.8965
auc_ovr: 0.9676





[Classification Metrics]
accuracy: 0.8967
precision: 0.8913
recall: 0.8894
f1: 0.8903
auc_ovr: 0.9677





===== LORA_CLASS_WEIGHT | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 32, 'lora_dropout': 0.05, 'use_class_weights': True} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


Epoch,Training Loss,Validation Loss
1,0.6582,0.526459
2,0.4146,0.363862
3,0.3825,0.37789
4,0.1495,0.386344



[Classification Metrics]
accuracy: 0.8017
precision: 0.8132
recall: 0.7787
f1: 0.7942
auc_ovr: 0.9189





[Classification Metrics]
accuracy: 0.8719
precision: 0.8495
recall: 0.8850
f1: 0.8656
auc_ovr: 0.9686





[Classification Metrics]
accuracy: 0.8926
precision: 0.8862
recall: 0.8966
f1: 0.8913
auc_ovr: 0.9711





[Classification Metrics]
accuracy: 0.8926
precision: 0.8862
recall: 0.8966
f1: 0.8913
auc_ovr: 0.9711




Best use_class_weights: False


In [15]:
print("\n===== FINAL BEST CONFIG =====")
print(best_cfg)



===== FINAL BEST CONFIG =====
{'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 32, 'lora_dropout': 0.05, 'use_class_weights': False}
