# LLM Fine-Tuning with Encoder

## Environment & Version Checks

In [1]:
import transformers
print(transformers.__version__)

import sys, os, json, copy
sys.path.append("..")

from datasets import load_dataset
from transformers import TrainingArguments, DataCollatorWithPadding
from src.EncoderTrainer import EncoderTrainer
from peft import LoraConfig, TaskType
import numpy as np
import torch
import time



4.41.2


## Global Configuration

In [2]:
# ===== paths & constants =====
OUTPUT_FOLDER = "../outputs"
DATA_FOLDER = "../data"

MODEL_NAME = "bert-base-cased"

VAL_JSON  = f"{DATA_FOLDER}/processed/val.json"
TEST_JSON = f"{DATA_FOLDER}/processed/test.json"

MAX_LENGTH = 128
LABELS = ["negative", "neutral", "positive"]
NUM_LABELS = len(LABELS)

LORA_TUNING_DIR = f"{OUTPUT_FOLDER}/lora_tuning/{MODEL_NAME}"
os.makedirs(LORA_TUNING_DIR, exist_ok=True)

## Dataset Loading

In [3]:
dataset = load_dataset(
    "json",
    data_files={
        "train": f"{DATA_FOLDER}/processed/train.json",
        "validation": f"{DATA_FOLDER}/processed/val.json",
        "test": f"{DATA_FOLDER}/processed/test.json",
    }
)


In [4]:
from collections import Counter

# Extract outputs
outputs = dataset['test']['output']

# Count occurrences
label_counts = Counter(outputs)
total = sum(label_counts.values())

counts = np.array([label_counts.get(label, 0) for label in LABELS])

print("Counts:", counts)

inverse_weights = total / (len(LABELS) * np.maximum(counts, 1))

# Normalize weights
inverse_weights = inverse_weights / inverse_weights.sum()

CLASS_WEIGHTS = torch.tensor(inverse_weights, dtype=torch.float)

print("Class Weights (Inverse Normalized):", CLASS_WEIGHTS)

Counts: [ 34 143  65]
Class Weights (Inverse Normalized): tensor([0.5679, 0.1350, 0.2971])


## Prompt Formatting

In [5]:
trainer = EncoderTrainer(
    model_name=MODEL_NAME,
    num_labels=NUM_LABELS,
    load_in_4bit=False,
)

tokenizer = trainer.tokenizer

def format_encoder(example):
    input = example["input"]
    label = LABELS.index(example["output"])  # or map from output

    return {
        "input": input,
        "label": label,
    }

dataset = dataset.map(
    format_encoder,
    batched=False,
    num_proc=1,
    desc="Formatting prompts"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenization & Data Collation

In [6]:
from transformers import DataCollatorWithPadding

def tokenize_encoder(batch):
    enc = tokenizer(
        batch["input"],
        truncation=True,
        max_length=MAX_LENGTH,
    )
    enc["labels"] = int(batch["label"])
    return enc

tokenized_ds = dataset.map(
    tokenize_encoder,
    batched=False,
    remove_columns=dataset["train"].column_names,
    desc="Tokenizing encoder inputs",
)

data_collator = DataCollatorWithPadding(tokenizer)

Tokenizing encoder inputs:   0%|          | 0/242 [00:00<?, ? examples/s]

## Training Configuration

In [7]:
BASE_TRAINING_ARGS = dict(
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,   # effective batch = 8
    num_train_epochs=4,
    lr_scheduler_type="cosine", 
    weight_decay=0.01,
    # warmup_steps=100,
    warmup_ratio=0.1,

    # precision (BF16 ONLY)
    fp16=False,
    bf16=True,

    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,

    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    
    save_total_limit=1,
    report_to="none",
)


## Define the Experiment Function

In [8]:
RESULTS = []

# -------------------------
# Base configuration
# -------------------------
best_cfg = {
    "use_lora": True,              
    "learning_rate": 2e-4,
    "r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "use_class_weights": False,
}

def run_experiment(cfg, stage_name):
    tag = f"{stage_name}_" + "_".join([f"{k}_{v}" for k, v in cfg.items()])
    out_dir = f"{LORA_TUNING_DIR}/{tag}"
    os.makedirs(out_dir, exist_ok=True)

    print(f"\n===== {stage_name} | Running config: {cfg} =====")

    # ----------------------------------
    # Build TrainingArguments
    # ----------------------------------
    training_args = TrainingArguments(
        output_dir=out_dir,
        learning_rate=cfg["learning_rate"],
        **BASE_TRAINING_ARGS,
    )

    # ----------------------------------
    # Initialize Trainer
    # ----------------------------------
    trainer = EncoderTrainer(
        model_name=MODEL_NAME,
        num_labels=NUM_LABELS,
        load_in_4bit=False,
    )

    # ----------------------------------
    # Configure LoRA (if enabled)
    # ----------------------------------
    if cfg.get("use_lora", True):
        trainer.configure_lora(
            r=cfg["r"],
            lora_alpha=cfg["lora_alpha"],
            lora_dropout=cfg["lora_dropout"],
            target_modules=["query", "value"],
        )

    # ----------------------------------
    # Class weights
    # ----------------------------------
    if cfg["use_class_weights"]:
        trainer.class_weights = CLASS_WEIGHTS

    # ----- Train -----
    metrics = trainer.train(
        train_dataset=tokenized_ds["train"],
        eval_dataset=tokenized_ds["validation"],
        training_args=training_args,
        data_collator=data_collator,
        classification_eval_fn=lambda: trainer.evaluate_classification(
            test_path=VAL_JSON,
            labels=LABELS,
            verbose=False,
        )
    )

    # ----------------------------------
    # Save model (FFT + LoRA unified)
    # ----------------------------------
    trainer.save_model(out_dir)
    
    # ----------------------------------
    # Save metrics
    # ----------------------------------
    with open(os.path.join(out_dir, "metrics.json"), "w") as f:
        json.dump(metrics, f, indent=2)

    # ----------------------------------
    # Save experiment metadata (FULL INFO)
    # ----------------------------------
    experiment_metadata = {
        "model": MODEL_NAME,
        "experiment_type": stage_name,
        "learning_rate": cfg.get("learning_rate"),
        "r": cfg.get("r"),
        "lora_alpha": cfg.get("lora_alpha"),
        "lora_dropout": cfg.get("lora_dropout"),
        "use_lora": cfg.get("use_lora"),
        "use_class_weights": cfg.get("use_class_weights"),
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }

    with open(os.path.join(out_dir, "exp_config.json"), "w") as f:
        json.dump(experiment_metadata, f, indent=2)

    # ----------------------------------
    # Store in memory
    # ----------------------------------
    RESULTS.append({
        "stage": stage_name,
        "config": copy.deepcopy(cfg),
        "metrics": metrics,
        "output_dir": out_dir,
    })

    return metrics


## Stage 1 — Full Fine-tuning

In [9]:
full_ft_cfg = {
    "use_lora": False,      # Full FT
    "learning_rate": 2e-4,
    "r": None,
    "lora_alpha": None,
    "lora_dropout": None,
    "use_class_weights": False,
}

lr_candidates = [5e-5, 1e-4, 2e-4, 3e-4, 5e-4, 6e-4]
best_metric = -1
best_lr = None

for lr in lr_candidates:
    cfg = copy.deepcopy(full_ft_cfg)
    cfg["learning_rate"] = lr

    metrics = run_experiment(cfg, f"FFT_LR_{lr}")
    score = metrics["f1"]   # use macro F1 ideally

    if score > best_metric:
        best_metric = score
        best_lr = lr

print("Best FFT LR:", best_lr)


===== FFT_LR_5e-05 | Running config: {'use_lora': False, 'learning_rate': 5e-05, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6892,0.434046
2,0.3084,0.337718
3,0.2276,0.375877
4,0.111,0.378047



[Classification Metrics]
accuracy: 0.8388
precision: 0.8352
recall: 0.7780
f1: 0.8024
auc_ovr: 0.9316

[Classification Metrics]
accuracy: 0.8802
precision: 0.8599
recall: 0.8824
f1: 0.8705
auc_ovr: 0.9684

[Classification Metrics]
accuracy: 0.8967
precision: 0.8853
recall: 0.8894
f1: 0.8872
auc_ovr: 0.9652

[Classification Metrics]
accuracy: 0.8967
precision: 0.8853
recall: 0.8894
f1: 0.8872
auc_ovr: 0.9661

===== FFT_LR_0.0001 | Running config: {'use_lora': False, 'learning_rate': 0.0001, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.5778,0.487635
2,0.3284,0.444894
3,0.2064,0.507875
4,0.0403,0.508687



[Classification Metrics]
accuracy: 0.8347
precision: 0.8416
recall: 0.7805
f1: 0.8063
auc_ovr: 0.9305

[Classification Metrics]
accuracy: 0.8636
precision: 0.8490
recall: 0.8493
f1: 0.8488
auc_ovr: 0.9601

[Classification Metrics]
accuracy: 0.8760
precision: 0.8713
recall: 0.8610
f1: 0.8661
auc_ovr: 0.9625

[Classification Metrics]
accuracy: 0.8802
precision: 0.8691
recall: 0.8634
f1: 0.8661
auc_ovr: 0.9627

===== FFT_LR_0.0002 | Running config: {'use_lora': False, 'learning_rate': 0.0002, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.9659,0.898366
2,0.9144,0.925835
3,0.923,0.883646
4,0.935,0.885264



[Classification Metrics]
accuracy: 0.6240
precision: 0.3753
recall: 0.3851
f1: 0.3499
auc_ovr: 0.5474

[Classification Metrics]
accuracy: 0.6033
precision: 0.5328
recall: 0.3474
f1: 0.2766
auc_ovr: 0.5532

[Classification Metrics]
accuracy: 0.6281
precision: 0.4631
recall: 0.3780
f1: 0.3338
auc_ovr: 0.6660

[Classification Metrics]
accuracy: 0.6240
precision: 0.4558
recall: 0.3733
f1: 0.3261
auc_ovr: 0.6555

===== FFT_LR_0.0003 | Running config: {'use_lora': False, 'learning_rate': 0.0003, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.9713,0.985628
2,0.9299,0.935502
3,0.9387,0.921357
4,0.9813,0.920806



[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5283

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5121

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.4829

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5002

===== FFT_LR_0.0005 | Running config: {'use_lora': False, 'learning_rate': 0.0005, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.9885,1.049261
2,0.9341,0.936647
3,0.947,0.92092
4,0.971,0.92061



[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5762

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5119

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5015

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.4999

===== FFT_LR_0.0006 | Running config: {'use_lora': False, 'learning_rate': 0.0006, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.0049,0.925817
2,0.9433,0.944273
3,0.9517,0.920764
4,0.971,0.92064



[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5000

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5508

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.4922

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.4937
Best FFT LR: 5e-05


## Stage 2 — LoRA (Tune Learning Rate)

In [10]:
lr_candidates = [5e-5, 1e-4, 2e-4, 3e-4, 5e-4, 6e-4]
best_metric = -1

for lr in lr_candidates:
    cfg = copy.deepcopy(best_cfg)
    cfg["learning_rate"] = lr

    metrics = run_experiment(cfg, f"LORA_LR_{lr}")
    score = metrics["f1"]
    
    if score > best_metric:
        best_metric = score
        best_cfg["learning_rate"] = lr

print("Best LR:", best_cfg["learning_rate"])



===== LORA_LR_5e-05 | Running config: {'use_lora': True, 'learning_rate': 5e-05, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 108,609,798 || trainable%: 0.2736576307783944


Epoch,Training Loss,Validation Loss
1,0.9449,0.915038
2,0.8977,0.906873
3,0.9141,0.904184
4,0.9574,0.904107



[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.6395





[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.7198





[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.7368





[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.7377





===== LORA_LR_0.0001 | Running config: {'use_lora': True, 'learning_rate': 0.0001, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 108,609,798 || trainable%: 0.2736576307783944


Epoch,Training Loss,Validation Loss
1,0.9024,0.791728
2,0.6587,0.682431
3,0.7125,0.653896
4,0.6431,0.654049



[Classification Metrics]
accuracy: 0.6612
precision: 0.4253
recall: 0.4202
f1: 0.3934
auc_ovr: 0.7748





[Classification Metrics]
accuracy: 0.6901
precision: 0.4280
recall: 0.4838
f1: 0.4542
auc_ovr: 0.8265





[Classification Metrics]
accuracy: 0.7025
precision: 0.4356
recall: 0.4885
f1: 0.4601
auc_ovr: 0.8355





[Classification Metrics]
accuracy: 0.7025
precision: 0.4356
recall: 0.4885
f1: 0.4601
auc_ovr: 0.8353





===== LORA_LR_0.0002 | Running config: {'use_lora': True, 'learning_rate': 0.0002, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 108,609,798 || trainable%: 0.2736576307783944


Epoch,Training Loss,Validation Loss
1,0.7801,0.709434
2,0.5381,0.593391
3,0.6396,0.549284
4,0.4987,0.54711



[Classification Metrics]
accuracy: 0.6860
precision: 0.4248
recall: 0.4791
f1: 0.4503
auc_ovr: 0.8077





[Classification Metrics]
accuracy: 0.7314
precision: 0.7978
recall: 0.5427
f1: 0.5170
auc_ovr: 0.8686





[Classification Metrics]
accuracy: 0.7893
precision: 0.7746
recall: 0.6879
f1: 0.7150
auc_ovr: 0.8859





[Classification Metrics]
accuracy: 0.7851
precision: 0.7744
recall: 0.6880
f1: 0.7176
auc_ovr: 0.8865





===== LORA_LR_0.0003 | Running config: {'use_lora': True, 'learning_rate': 0.0003, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 108,609,798 || trainable%: 0.2736576307783944


Epoch,Training Loss,Validation Loss
1,0.7186,0.674321
2,0.477,0.463613
3,0.5236,0.338866
4,0.2999,0.336029



[Classification Metrics]
accuracy: 0.6942
precision: 0.4317
recall: 0.4885
f1: 0.4583
auc_ovr: 0.8397





[Classification Metrics]
accuracy: 0.8140
precision: 0.7690
recall: 0.8309
f1: 0.7934
auc_ovr: 0.9334





[Classification Metrics]
accuracy: 0.8926
precision: 0.9060
recall: 0.8608
f1: 0.8812
auc_ovr: 0.9556





[Classification Metrics]
accuracy: 0.8843
precision: 0.8805
recall: 0.8538
f1: 0.8662
auc_ovr: 0.9556





===== LORA_LR_0.0005 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 108,609,798 || trainable%: 0.2736576307783944


Epoch,Training Loss,Validation Loss
1,0.7121,0.645091
2,0.3819,0.351876
3,0.4344,0.308609
4,0.2184,0.31341



[Classification Metrics]
accuracy: 0.7190
precision: 0.4512
recall: 0.5120
f1: 0.4796
auc_ovr: 0.8566





[Classification Metrics]
accuracy: 0.8678
precision: 0.8313
recall: 0.8827
f1: 0.8500
auc_ovr: 0.9604





[Classification Metrics]
accuracy: 0.9008
precision: 0.9160
recall: 0.8774
f1: 0.8951
auc_ovr: 0.9683





[Classification Metrics]
accuracy: 0.9050
precision: 0.9185
recall: 0.8893
f1: 0.9029
auc_ovr: 0.9679





===== LORA_LR_0.0006 | Running config: {'use_lora': True, 'learning_rate': 0.0006, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 108,609,798 || trainable%: 0.2736576307783944


Epoch,Training Loss,Validation Loss
1,0.7047,0.647077
2,0.371,0.336711
3,0.3789,0.306264
4,0.1619,0.31351



[Classification Metrics]
accuracy: 0.7190
precision: 0.4500
recall: 0.5072
f1: 0.4769
auc_ovr: 0.8785





[Classification Metrics]
accuracy: 0.8802
precision: 0.8443
recall: 0.8992
f1: 0.8661
auc_ovr: 0.9660





[Classification Metrics]
accuracy: 0.8967
precision: 0.9017
recall: 0.8846
f1: 0.8928
auc_ovr: 0.9708





[Classification Metrics]
accuracy: 0.8967
precision: 0.9017
recall: 0.8846
f1: 0.8928
auc_ovr: 0.9706




Best LR: 0.0005


## Stage 3 — LoRA (Tune Rank)

In [11]:
rank_candidates = [4, 8, 16]
best_metric = -1

for r in rank_candidates:
    cfg = copy.deepcopy(best_cfg)
    cfg["r"] = r
    cfg["lora_alpha"] = 2 * r

    metrics = run_experiment(cfg, f"LORA_RANK_{r}")
    score = metrics["f1"]
    
    if score > best_metric:
        best_metric = score
        best_cfg["r"] = r
        best_cfg["lora_alpha"] = 2 * r

print("Best Rank:", best_cfg["r"])


===== LORA_RANK_4 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 4, 'lora_alpha': 8, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 149,763 || all params: 108,462,342 || trainable%: 0.13807833874728614


Epoch,Training Loss,Validation Loss
1,0.7107,0.695354
2,0.4692,0.390697
3,0.487,0.32125
4,0.2648,0.319174



[Classification Metrics]
accuracy: 0.7149
precision: 0.4491
recall: 0.5096
f1: 0.4774
auc_ovr: 0.8473





[Classification Metrics]
accuracy: 0.8554
precision: 0.8260
recall: 0.8423
f1: 0.8333
auc_ovr: 0.9435





[Classification Metrics]
accuracy: 0.8967
precision: 0.9110
recall: 0.8655
f1: 0.8860
auc_ovr: 0.9579





[Classification Metrics]
accuracy: 0.8967
precision: 0.9031
recall: 0.8727
f1: 0.8867
auc_ovr: 0.9576





===== LORA_RANK_8 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 108,609,798 || trainable%: 0.2736576307783944


Epoch,Training Loss,Validation Loss
1,0.7121,0.645091
2,0.3819,0.351876
3,0.4344,0.308609
4,0.2184,0.31341



[Classification Metrics]
accuracy: 0.7190
precision: 0.4512
recall: 0.5120
f1: 0.4796
auc_ovr: 0.8566





[Classification Metrics]
accuracy: 0.8678
precision: 0.8313
recall: 0.8827
f1: 0.8500
auc_ovr: 0.9604





[Classification Metrics]
accuracy: 0.9008
precision: 0.9160
recall: 0.8774
f1: 0.8951
auc_ovr: 0.9683





[Classification Metrics]
accuracy: 0.9050
precision: 0.9185
recall: 0.8893
f1: 0.9029
auc_ovr: 0.9679





===== LORA_RANK_16 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 592,131 || all params: 108,904,710 || trainable%: 0.5437147759724993


Epoch,Training Loss,Validation Loss
1,0.7109,0.644298
2,0.3482,0.337277
3,0.3729,0.346087
4,0.1293,0.349638



[Classification Metrics]
accuracy: 0.7231
precision: 0.4499
recall: 0.5025
f1: 0.4740
auc_ovr: 0.8765





[Classification Metrics]
accuracy: 0.8843
precision: 0.8530
recall: 0.8872
f1: 0.8661
auc_ovr: 0.9639





[Classification Metrics]
accuracy: 0.9091
precision: 0.9105
recall: 0.8844
f1: 0.8967
auc_ovr: 0.9657





[Classification Metrics]
accuracy: 0.9050
precision: 0.8989
recall: 0.8821
f1: 0.8900
auc_ovr: 0.9659




Best Rank: 8


## Stage 4 — LoRA (Tune Alpha)

In [12]:
alpha_candidates = [
    best_cfg["r"],
    2 * best_cfg["r"],
    4 * best_cfg["r"]
]

best_metric = -1

for alpha in alpha_candidates:
    cfg = copy.deepcopy(best_cfg)
    cfg["lora_alpha"] = alpha

    metrics = run_experiment(cfg, f"LORA_ALPHA_{alpha}")
    score = metrics["f1"]
    
    if score > best_metric:
        best_metric = score
        best_cfg["lora_alpha"] = alpha

print("Best Alpha:", best_cfg["lora_alpha"])



===== LORA_ALPHA_8 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 8, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 108,609,798 || trainable%: 0.2736576307783944


Epoch,Training Loss,Validation Loss
1,0.7074,0.681818
2,0.4663,0.399463
3,0.4855,0.307575
4,0.2654,0.306407



[Classification Metrics]
accuracy: 0.7107
precision: 0.4451
recall: 0.5049
f1: 0.4730
auc_ovr: 0.8529





[Classification Metrics]
accuracy: 0.8554
precision: 0.8235
recall: 0.8494
f1: 0.8355
auc_ovr: 0.9466





[Classification Metrics]
accuracy: 0.9050
precision: 0.9161
recall: 0.8749
f1: 0.8936
auc_ovr: 0.9601





[Classification Metrics]
accuracy: 0.9008
precision: 0.9135
recall: 0.8774
f1: 0.8941
auc_ovr: 0.9593





===== LORA_ALPHA_16 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 108,609,798 || trainable%: 0.2736576307783944


Epoch,Training Loss,Validation Loss
1,0.7121,0.645091
2,0.3819,0.351876
3,0.4344,0.308609
4,0.2184,0.31341



[Classification Metrics]
accuracy: 0.7190
precision: 0.4512
recall: 0.5120
f1: 0.4796
auc_ovr: 0.8566





[Classification Metrics]
accuracy: 0.8678
precision: 0.8313
recall: 0.8827
f1: 0.8500
auc_ovr: 0.9604





[Classification Metrics]
accuracy: 0.9008
precision: 0.9160
recall: 0.8774
f1: 0.8951
auc_ovr: 0.9683





[Classification Metrics]
accuracy: 0.9050
precision: 0.9185
recall: 0.8893
f1: 0.9029
auc_ovr: 0.9679





===== LORA_ALPHA_32 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 32, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 108,609,798 || trainable%: 0.2736576307783944


Epoch,Training Loss,Validation Loss
1,0.7151,0.642044
2,0.3453,0.372285
3,0.3715,0.335134
4,0.1323,0.340693



[Classification Metrics]
accuracy: 0.7149
precision: 0.4451
recall: 0.5025
f1: 0.4721
auc_ovr: 0.8795





[Classification Metrics]
accuracy: 0.8636
precision: 0.8303
recall: 0.8708
f1: 0.8431
auc_ovr: 0.9597





[Classification Metrics]
accuracy: 0.9050
precision: 0.9112
recall: 0.8893
f1: 0.8994
auc_ovr: 0.9686





[Classification Metrics]
accuracy: 0.9050
precision: 0.9025
recall: 0.8917
f1: 0.8966
auc_ovr: 0.9690




Best Alpha: 16


## Stage 5 — LoRA (Tune Dropout)

In [13]:
dropout_candidates = [0.0, 0.05, 0.1]
best_metric = -1

for d in dropout_candidates:
    cfg = copy.deepcopy(best_cfg)
    cfg["lora_dropout"] = d

    metrics = run_experiment(cfg, f"LORA_DROPOUT_{d}")
    score = metrics["f1"]
    
    if score > best_metric:
        best_metric = score
        best_cfg["lora_dropout"] = d

print("Best Dropout:", best_cfg["lora_dropout"])



===== LORA_DROPOUT_0.0 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.0, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 108,609,798 || trainable%: 0.2736576307783944


Epoch,Training Loss,Validation Loss
1,0.7217,0.656789
2,0.4008,0.314057
3,0.3505,0.307173
4,0.2105,0.315283



[Classification Metrics]
accuracy: 0.7107
precision: 0.4437
recall: 0.5002
f1: 0.4702
auc_ovr: 0.8687





[Classification Metrics]
accuracy: 0.8678
precision: 0.8392
recall: 0.8755
f1: 0.8546
auc_ovr: 0.9635





[Classification Metrics]
accuracy: 0.9008
precision: 0.8954
recall: 0.8893
f1: 0.8922
auc_ovr: 0.9678





[Classification Metrics]
accuracy: 0.8967
precision: 0.8849
recall: 0.8870
f1: 0.8857
auc_ovr: 0.9666





===== LORA_DROPOUT_0.05 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 108,609,798 || trainable%: 0.2736576307783944


Epoch,Training Loss,Validation Loss
1,0.7121,0.645091
2,0.3819,0.351876
3,0.4344,0.308609
4,0.2184,0.31341



[Classification Metrics]
accuracy: 0.7190
precision: 0.4512
recall: 0.5120
f1: 0.4796
auc_ovr: 0.8566





[Classification Metrics]
accuracy: 0.8678
precision: 0.8313
recall: 0.8827
f1: 0.8500
auc_ovr: 0.9604





[Classification Metrics]
accuracy: 0.9008
precision: 0.9160
recall: 0.8774
f1: 0.8951
auc_ovr: 0.9683





[Classification Metrics]
accuracy: 0.9050
precision: 0.9185
recall: 0.8893
f1: 0.9029
auc_ovr: 0.9679





===== LORA_DROPOUT_0.1 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.1, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 108,609,798 || trainable%: 0.2736576307783944


Epoch,Training Loss,Validation Loss
1,0.7199,0.644869
2,0.3755,0.37091
3,0.4342,0.302856
4,0.2128,0.30789



[Classification Metrics]
accuracy: 0.7149
precision: 0.4494
recall: 0.5120
f1: 0.4782
auc_ovr: 0.8553





[Classification Metrics]
accuracy: 0.8595
precision: 0.8168
recall: 0.8781
f1: 0.8384
auc_ovr: 0.9590





[Classification Metrics]
accuracy: 0.9008
precision: 0.9064
recall: 0.8870
f1: 0.8961
auc_ovr: 0.9692





[Classification Metrics]
accuracy: 0.9008
precision: 0.9064
recall: 0.8870
f1: 0.8961
auc_ovr: 0.9685




Best Dropout: 0.05


## Stage 6 — LoRA (Class Weights)

In [14]:
for use_weights in [False, True]:
    cfg = copy.deepcopy(best_cfg)
    cfg["use_class_weights"] = use_weights

    metrics = run_experiment(cfg, f"LORA_CLASS_WEIGHT")
    score = metrics["f1"]

    if score > best_metric:
        best_metric = score
        best_cfg["use_class_weights"] = use_weights

print("Best use_class_weights:", best_cfg["use_class_weights"])



===== LORA_CLASS_WEIGHT | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 108,609,798 || trainable%: 0.2736576307783944


Epoch,Training Loss,Validation Loss
1,0.7121,0.645091
2,0.3819,0.351876
3,0.4344,0.308609
4,0.2184,0.31341



[Classification Metrics]
accuracy: 0.7190
precision: 0.4512
recall: 0.5120
f1: 0.4796
auc_ovr: 0.8566





[Classification Metrics]
accuracy: 0.8678
precision: 0.8313
recall: 0.8827
f1: 0.8500
auc_ovr: 0.9604





[Classification Metrics]
accuracy: 0.9008
precision: 0.9160
recall: 0.8774
f1: 0.8951
auc_ovr: 0.9683





[Classification Metrics]
accuracy: 0.9050
precision: 0.9185
recall: 0.8893
f1: 0.9029
auc_ovr: 0.9679





===== LORA_CLASS_WEIGHT | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': True} =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 108,609,798 || trainable%: 0.2736576307783944


Epoch,Training Loss,Validation Loss
1,0.7164,0.560984
2,0.4179,0.364531
3,0.455,0.354481
4,0.1808,0.358583



[Classification Metrics]
accuracy: 0.7975
precision: 0.7808
recall: 0.7837
f1: 0.7793
auc_ovr: 0.9102





[Classification Metrics]
accuracy: 0.8595
precision: 0.8273
recall: 0.8827
f1: 0.8505
auc_ovr: 0.9633





[Classification Metrics]
accuracy: 0.8802
precision: 0.8619
recall: 0.8873
f1: 0.8737
auc_ovr: 0.9654





[Classification Metrics]
accuracy: 0.8760
precision: 0.8589
recall: 0.8826
f1: 0.8700
auc_ovr: 0.9651




Best use_class_weights: False


In [15]:
print("\n===== FINAL BEST CONFIG =====")
print(best_cfg)



===== FINAL BEST CONFIG =====
{'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False}
