# LLM Fine-Tuning with LoRA (Encoder)

## Environment & Version Checks

In [1]:
import transformers
print(transformers.__version__)

import sys, os, json, copy
sys.path.append("..")

from datasets import load_dataset
from transformers import TrainingArguments, DataCollatorWithPadding
from src.EncoderTrainer import EncoderTrainer
from peft import LoraConfig, TaskType
import numpy as np
import torch
import time

5.0.0


## Global Configuration

In [2]:
# ===== paths & constants =====
OUTPUT_FOLDER = "../outputs"
DATA_FOLDER = "../data"

MODEL_NAME = "bert-base-uncased"

VAL_JSON  = f"{DATA_FOLDER}/processed/val.json"
TEST_JSON = f"{DATA_FOLDER}/processed/test.json"

MAX_LENGTH = 128
LABELS = ["negative", "neutral", "positive"]
NUM_LABELS = len(LABELS)

LORA_TUNING_DIR = f"{OUTPUT_FOLDER}/lora_tuning/{MODEL_NAME}"
os.makedirs(LORA_TUNING_DIR, exist_ok=True)

## Dataset Loading

In [3]:
dataset = load_dataset(
    "json",
    data_files={
        "train": f"{DATA_FOLDER}/processed/train.json",
        "validation": f"{DATA_FOLDER}/processed/val.json",
        "test": f"{DATA_FOLDER}/processed/test.json",
    }
)


In [4]:
from collections import Counter

# Extract outputs
outputs = dataset['test']['output']

# Count occurrences
label_counts = Counter(outputs)
total = sum(label_counts.values())

counts = np.array([label_counts.get(label, 0) for label in LABELS])

print("Counts:", counts)

inverse_weights = total / (len(LABELS) * np.maximum(counts, 1))

# Normalize weights
inverse_weights = inverse_weights / inverse_weights.sum()

CLASS_WEIGHTS = torch.tensor(inverse_weights, dtype=torch.float)

print("Class Weights (Inverse Normalized):", CLASS_WEIGHTS)

Counts: [ 23 147  72]
Class Weights (Inverse Normalized): tensor([0.6775, 0.1060, 0.2164])


## Prompt Formatting (Instruction-Tuning Style)

In [5]:
trainer = EncoderTrainer(
    model_name=MODEL_NAME,
    num_labels=NUM_LABELS,
    load_in_4bit=False,
)

tokenizer = trainer.tokenizer

def format_encoder(example):
    input = example["input"]
    label = LABELS.index(example["output"])  # or map from output

    return {
        "input": input,
        "label": label,
    }

dataset = dataset.map(
    format_encoder,
    batched=False,
    num_proc=1,
    desc="Formatting prompts"
)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


## Tokenization & Data Collation

In [6]:
from transformers import DataCollatorWithPadding

def tokenize_encoder(batch):
    enc = tokenizer(
        batch["input"],
        truncation=True,
        max_length=MAX_LENGTH,
    )
    enc["labels"] = int(batch["label"])
    return enc

tokenized_ds = dataset.map(
    tokenize_encoder,
    batched=False,
    remove_columns=dataset["train"].column_names,
    desc="Tokenizing encoder inputs",
)

data_collator = DataCollatorWithPadding(tokenizer)

## Training Configuration

In [7]:
BASE_TRAINING_ARGS = dict(
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,   # effective batch = 8
    num_train_epochs=4,
    lr_scheduler_type="cosine", 
    weight_decay=0.01,
    # warmup_steps=100,
    warmup_ratio=0.1,

    # precision (BF16 ONLY)
    fp16=False,
    bf16=True,

    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,

    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    
    save_total_limit=1,
    report_to="none",
)


In [8]:
RESULTS = []

# -------------------------
# Base configuration
# -------------------------
best_cfg = {
    "use_lora": True,              
    "learning_rate": 2e-4,
    "r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "use_class_weights": False,
}

def run_experiment(cfg, stage_name):
    tag = f"{stage_name}_" + "_".join([f"{k}_{v}" for k, v in cfg.items()])
    out_dir = f"{LORA_TUNING_DIR}/{tag}"
    os.makedirs(out_dir, exist_ok=True)

    print(f"\n===== {stage_name} | Running config: {cfg} =====")

    # ----------------------------------
    # Build TrainingArguments
    # ----------------------------------
    training_args = TrainingArguments(
        output_dir=out_dir,
        learning_rate=cfg["learning_rate"],
        **BASE_TRAINING_ARGS,
    )

    # ----------------------------------
    # Initialize Trainer
    # ----------------------------------
    trainer = EncoderTrainer(
        model_name=MODEL_NAME,
        num_labels=NUM_LABELS,
        load_in_4bit=False,
    )

    # ----------------------------------
    # Configure LoRA (if enabled)
    # ----------------------------------
    if cfg.get("use_lora", True):
        trainer.configure_lora(
            r=cfg["r"],
            lora_alpha=cfg["lora_alpha"],
            lora_dropout=cfg["lora_dropout"],
            target_modules=["query", "value"],
        )

    # ----------------------------------
    # Class weights
    # ----------------------------------
    if cfg["use_class_weights"]:
        trainer.class_weights = CLASS_WEIGHTS

    # ----- Train -----
    metrics = trainer.train(
        train_dataset=tokenized_ds["train"],
        eval_dataset=tokenized_ds["validation"],
        training_args=training_args,
        data_collator=data_collator,
        classification_eval_fn=lambda: trainer.evaluate_classification(
            test_path=VAL_JSON,
            labels=LABELS,
            verbose=False,
        )
    )

    # ----------------------------------
    # Save model (FFT + LoRA unified)
    # ----------------------------------
    trainer.save_model(out_dir)
    
    # ----------------------------------
    # Save metrics
    # ----------------------------------
    with open(os.path.join(out_dir, "metrics.json"), "w") as f:
        json.dump(metrics, f, indent=2)

    # ----------------------------------
    # Save experiment metadata (FULL INFO)
    # ----------------------------------
    experiment_metadata = {
        "model": MODEL_NAME,
        "experiment_type": stage_name,
        "learning_rate": cfg.get("learning_rate"),
        "r": cfg.get("r"),
        "lora_alpha": cfg.get("lora_alpha"),
        "lora_dropout": cfg.get("lora_dropout"),
        "use_lora": cfg.get("use_lora"),
        "use_class_weights": cfg.get("use_class_weights"),
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }

    with open(os.path.join(out_dir, "exp_config.json"), "w") as f:
        json.dump(experiment_metadata, f, indent=2)

    # ----------------------------------
    # Store in memory
    # ----------------------------------
    RESULTS.append({
        "stage": stage_name,
        "config": copy.deepcopy(cfg),
        "metrics": metrics,
        "output_dir": out_dir,
    })

    return metrics


## Stage 1 — Full Fine-tuning

In [9]:
full_ft_cfg = {
    "use_lora": False,      # Full FT
    "learning_rate": 2e-4,
    "r": None,
    "lora_alpha": None,
    "lora_dropout": None,
    "use_class_weights": False,
}

lr_candidates = [2e-4, 3e-4, 5e-4, 6e-4]
best_metric = -1
best_lr = None

for lr in lr_candidates:
    cfg = copy.deepcopy(full_ft_cfg)
    cfg["learning_rate"] = lr

    metrics = run_experiment(cfg, f"FFT_LR_{lr}")
    score = metrics["f1"]   # use macro F1 ideally

    if score > best_metric:
        best_metric = score
        best_lr = lr

print("Best FFT LR:", best_lr)

warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.



===== FFT_LR_0.0002 | Running config: {'use_lora': False, 'learning_rate': 0.0002, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====




Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,1.47251,0.657768
2,1.246045,0.489435
3,0.46678,0.697106
4,0.270694,0.715878



[Classification Metrics]
accuracy: 0.6942
precision: 0.7839
recall: 0.5497
f1: 0.5036
auc_ovr: 0.8615


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


[Classification Metrics]
accuracy: 0.8347
precision: 0.8596
recall: 0.7671
f1: 0.7935
auc_ovr: 0.9355


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


[Classification Metrics]
accuracy: 0.8388
precision: 0.8251
recall: 0.8481
f1: 0.8349
auc_ovr: 0.9372


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


[Classification Metrics]
accuracy: 0.8347
precision: 0.8216
recall: 0.8456
f1: 0.8316
auc_ovr: 0.9391


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.



===== FFT_LR_0.0003 | Running config: {'use_lora': False, 'learning_rate': 0.0003, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,1.978683,1.002726
2,1.931284,1.003352
3,1.867265,0.984713
4,1.823957,0.977276



[Classification Metrics]
accuracy: 0.5579
precision: 0.1860
recall: 0.3333
f1: 0.2387
auc_ovr: 0.4422


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


[Classification Metrics]
accuracy: 0.5579
precision: 0.1860
recall: 0.3333
f1: 0.2387
auc_ovr: 0.5067


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


[Classification Metrics]
accuracy: 0.5579
precision: 0.1860
recall: 0.3333
f1: 0.2387
auc_ovr: 0.4933


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


[Classification Metrics]
accuracy: 0.5579
precision: 0.1860
recall: 0.3333
f1: 0.2387
auc_ovr: 0.5289


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.



===== FFT_LR_0.0005 | Running config: {'use_lora': False, 'learning_rate': 0.0005, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,1.986979,1.005775
2,1.96952,1.008132
3,1.854762,0.985963
4,1.833894,0.978716



[Classification Metrics]
accuracy: 0.5579
precision: 0.1860
recall: 0.3333
f1: 0.2387
auc_ovr: 0.5034


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


[Classification Metrics]
accuracy: 0.5579
precision: 0.1860
recall: 0.3333
f1: 0.2387
auc_ovr: 0.5474


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


[Classification Metrics]
accuracy: 0.5579
precision: 0.1860
recall: 0.3333
f1: 0.2387
auc_ovr: 0.4874


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


[Classification Metrics]
accuracy: 0.5579
precision: 0.1860
recall: 0.3333
f1: 0.2387
auc_ovr: 0.4864


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.



===== FFT_LR_0.0006 | Running config: {'use_lora': False, 'learning_rate': 0.0006, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,1.922699,1.033008
2,1.921536,1.006345
3,1.850137,0.989057
4,1.815227,0.979629



[Classification Metrics]
accuracy: 0.5579
precision: 0.1860
recall: 0.3333
f1: 0.2387
auc_ovr: 0.4921


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


[Classification Metrics]
accuracy: 0.5579
precision: 0.1860
recall: 0.3333
f1: 0.2387
auc_ovr: 0.5064


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


[Classification Metrics]
accuracy: 0.5579
precision: 0.1860
recall: 0.3333
f1: 0.2387
auc_ovr: 0.5115


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


[Classification Metrics]
accuracy: 0.5579
precision: 0.1860
recall: 0.3333
f1: 0.2387
auc_ovr: 0.5044


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Best FFT LR: 0.0002


## Stage 2 — LoRA (Tune Learning Rate)

In [10]:
lr_candidates = [2e-4, 3e-4, 5e-4, 6e-4]
best_metric = -1

for lr in lr_candidates:
    cfg = copy.deepcopy(best_cfg)
    cfg["learning_rate"] = lr

    metrics = run_experiment(cfg, f"LORA_LR_{lr}")
    score = metrics["f1"]
    
    if score > best_metric:
        best_metric = score
        best_cfg["learning_rate"] = lr

print("Best LR:", best_cfg["learning_rate"])


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.



===== LORA_LR_0.0002 | Running config: {'use_lora': True, 'learning_rate': 0.0002, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


Epoch,Training Loss,Validation Loss
1,1.563674,0.752205
2,1.316699,0.650335
3,1.135657,0.593303
4,0.987146,0.58847



[Classification Metrics]
accuracy: 0.6983
precision: 0.6324
recall: 0.5815
f1: 0.5906
auc_ovr: 0.8160

[Classification Metrics]
accuracy: 0.7355
precision: 0.7058
recall: 0.6329
f1: 0.6551
auc_ovr: 0.8671

[Classification Metrics]
accuracy: 0.7769
precision: 0.7466
recall: 0.7214
f1: 0.7275
auc_ovr: 0.8942

[Classification Metrics]
accuracy: 0.7727
precision: 0.7353
recall: 0.7166
f1: 0.7187
auc_ovr: 0.8962


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.



===== LORA_LR_0.0003 | Running config: {'use_lora': True, 'learning_rate': 0.0003, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


Epoch,Training Loss,Validation Loss
1,1.504509,0.725704
2,1.217048,0.548403
3,0.876184,0.463813
4,0.632345,0.461341



[Classification Metrics]
accuracy: 0.7149
precision: 0.6684
recall: 0.6006
f1: 0.6188
auc_ovr: 0.8318

[Classification Metrics]
accuracy: 0.8058
precision: 0.8083
recall: 0.7547
f1: 0.7763
auc_ovr: 0.9129

[Classification Metrics]
accuracy: 0.8347
precision: 0.8201
recall: 0.8253
f1: 0.8227
auc_ovr: 0.9354

[Classification Metrics]
accuracy: 0.8306
precision: 0.8120
recall: 0.8205
f1: 0.8161
auc_ovr: 0.9366


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.



===== LORA_LR_0.0005 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


Epoch,Training Loss,Validation Loss
1,1.444384,0.561085
2,0.955193,0.374953
3,0.659902,0.365365
4,0.350679,0.373572



[Classification Metrics]
accuracy: 0.7727
precision: 0.7820
recall: 0.7127
f1: 0.7392
auc_ovr: 0.9053

[Classification Metrics]
accuracy: 0.8636
precision: 0.8663
recall: 0.8521
f1: 0.8578
auc_ovr: 0.9602

[Classification Metrics]
accuracy: 0.8719
precision: 0.8677
recall: 0.8701
f1: 0.8684
auc_ovr: 0.9655

[Classification Metrics]
accuracy: 0.8719
precision: 0.8620
recall: 0.8701
f1: 0.8656
auc_ovr: 0.9650


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.



===== LORA_LR_0.0006 | Running config: {'use_lora': True, 'learning_rate': 0.0006, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


Epoch,Training Loss,Validation Loss
1,1.454703,0.558488
2,0.923037,0.37647
3,0.595298,0.379462
4,0.333378,0.394424



[Classification Metrics]
accuracy: 0.7810
precision: 0.7850
recall: 0.7310
f1: 0.7531
auc_ovr: 0.9055

[Classification Metrics]
accuracy: 0.8595
precision: 0.8614
recall: 0.8496
f1: 0.8537
auc_ovr: 0.9590

[Classification Metrics]
accuracy: 0.8678
precision: 0.8638
recall: 0.8677
f1: 0.8651
auc_ovr: 0.9646

[Classification Metrics]
accuracy: 0.8636
precision: 0.8601
recall: 0.8652
f1: 0.8618
auc_ovr: 0.9640
Best LR: 0.0005


## Stage 3 — LoRA (Tune Rank)

In [11]:
rank_candidates = [4, 8, 16]
best_metric = -1

for r in rank_candidates:
    cfg = copy.deepcopy(best_cfg)
    cfg["r"] = r
    cfg["lora_alpha"] = 2 * r

    metrics = run_experiment(cfg, f"LORA_RANK_{r}")
    score = metrics["f1"]
    
    if score > best_metric:
        best_metric = score
        best_cfg["r"] = r
        best_cfg["lora_alpha"] = 2 * r

print("Best Rank:", best_cfg["r"])

warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.



===== LORA_RANK_4 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 4, 'lora_alpha': 8, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


trainable params: 149,763 || all params: 109,634,310 || trainable%: 0.1366023099885428


Epoch,Training Loss,Validation Loss
1,1.477591,0.693529
2,1.058945,0.449143
3,0.755677,0.362629
4,0.489949,0.371866



[Classification Metrics]
accuracy: 0.7231
precision: 0.6791
recall: 0.6163
f1: 0.6321
auc_ovr: 0.8548

[Classification Metrics]
accuracy: 0.8223
precision: 0.8286
recall: 0.8002
f1: 0.8132
auc_ovr: 0.9386

[Classification Metrics]
accuracy: 0.8554
precision: 0.8433
recall: 0.8580
f1: 0.8499
auc_ovr: 0.9603

[Classification Metrics]
accuracy: 0.8554
precision: 0.8381
recall: 0.8625
f1: 0.8482
auc_ovr: 0.9600


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.



===== LORA_RANK_8 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375


Epoch,Training Loss,Validation Loss
1,1.444384,0.561085
2,0.955193,0.374953
3,0.659902,0.365365
4,0.350679,0.373572



[Classification Metrics]
accuracy: 0.7727
precision: 0.7820
recall: 0.7127
f1: 0.7392
auc_ovr: 0.9053

[Classification Metrics]
accuracy: 0.8636
precision: 0.8663
recall: 0.8521
f1: 0.8578
auc_ovr: 0.9602

[Classification Metrics]
accuracy: 0.8719
precision: 0.8677
recall: 0.8701
f1: 0.8684
auc_ovr: 0.9655

[Classification Metrics]
accuracy: 0.8719
precision: 0.8620
recall: 0.8701
f1: 0.8656
auc_ovr: 0.9650


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.



===== LORA_RANK_16 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


trainable params: 592,131 || all params: 110,076,678 || trainable%: 0.5379259355919153


Epoch,Training Loss,Validation Loss
1,1.425545,0.529034
2,0.907697,0.359742
3,0.545796,0.398455
4,0.326202,0.413574



[Classification Metrics]
accuracy: 0.8058
precision: 0.8050
recall: 0.7662
f1: 0.7826
auc_ovr: 0.9113

[Classification Metrics]
accuracy: 0.8843
precision: 0.8912
recall: 0.8733
f1: 0.8810
auc_ovr: 0.9610

[Classification Metrics]
accuracy: 0.8802
precision: 0.8739
recall: 0.8796
f1: 0.8759
auc_ovr: 0.9635

[Classification Metrics]
accuracy: 0.8802
precision: 0.8739
recall: 0.8796
f1: 0.8759
auc_ovr: 0.9630
Best Rank: 16


## Stage 4 — LoRA (Tune Alpha)

In [12]:
alpha_candidates = [
    best_cfg["r"],
    2 * best_cfg["r"],
    4 * best_cfg["r"]
]

best_metric = -1

for alpha in alpha_candidates:
    cfg = copy.deepcopy(best_cfg)
    cfg["lora_alpha"] = alpha

    metrics = run_experiment(cfg, f"LORA_ALPHA_{alpha}")
    score = metrics["f1"]
    
    if score > best_metric:
        best_metric = score
        best_cfg["lora_alpha"] = alpha

print("Best Alpha:", best_cfg["lora_alpha"])


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.



===== LORA_ALPHA_16 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


trainable params: 592,131 || all params: 110,076,678 || trainable%: 0.5379259355919153


Epoch,Training Loss,Validation Loss
1,1.422801,0.559475
2,0.962306,0.36577
3,0.599337,0.365259
4,0.407137,0.367431



[Classification Metrics]
accuracy: 0.7893
precision: 0.7791
recall: 0.7291
f1: 0.7487
auc_ovr: 0.8991

[Classification Metrics]
accuracy: 0.8595
precision: 0.8698
recall: 0.8496
f1: 0.8575
auc_ovr: 0.9578

[Classification Metrics]
accuracy: 0.8760
precision: 0.8713
recall: 0.8772
f1: 0.8734
auc_ovr: 0.9653

[Classification Metrics]
accuracy: 0.8719
precision: 0.8676
recall: 0.8747
f1: 0.8701
auc_ovr: 0.9655


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.



===== LORA_ALPHA_32 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


trainable params: 592,131 || all params: 110,076,678 || trainable%: 0.5379259355919153


Epoch,Training Loss,Validation Loss
1,1.425545,0.529034
2,0.907697,0.359742
3,0.545796,0.398455
4,0.326202,0.413574



[Classification Metrics]
accuracy: 0.8058
precision: 0.8050
recall: 0.7662
f1: 0.7826
auc_ovr: 0.9113

[Classification Metrics]
accuracy: 0.8843
precision: 0.8912
recall: 0.8733
f1: 0.8810
auc_ovr: 0.9610

[Classification Metrics]
accuracy: 0.8802
precision: 0.8739
recall: 0.8796
f1: 0.8759
auc_ovr: 0.9635

[Classification Metrics]
accuracy: 0.8802
precision: 0.8739
recall: 0.8796
f1: 0.8759
auc_ovr: 0.9630


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.



===== LORA_ALPHA_64 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 16, 'lora_alpha': 64, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


trainable params: 592,131 || all params: 110,076,678 || trainable%: 0.5379259355919153


Epoch,Training Loss,Validation Loss
1,1.341035,0.43493
2,0.900194,0.331426
3,0.512427,0.403356
4,0.189305,0.426829



[Classification Metrics]
accuracy: 0.8471
precision: 0.8299
recall: 0.8488
f1: 0.8375
auc_ovr: 0.9397

[Classification Metrics]
accuracy: 0.8967
precision: 0.8934
recall: 0.8961
f1: 0.8943
auc_ovr: 0.9646

[Classification Metrics]
accuracy: 0.8843
precision: 0.8766
recall: 0.8860
f1: 0.8811
auc_ovr: 0.9661

[Classification Metrics]
accuracy: 0.8760
precision: 0.8577
recall: 0.8880
f1: 0.8711
auc_ovr: 0.9658
Best Alpha: 64


## Stage 5 — LoRA (Tune Dropout)

In [13]:
dropout_candidates = [0.0, 0.05, 0.1]
best_metric = -1

for d in dropout_candidates:
    cfg = copy.deepcopy(best_cfg)
    cfg["lora_dropout"] = d

    metrics = run_experiment(cfg, f"LORA_DROPOUT_{d}")
    score = metrics["f1"]
    
    if score > best_metric:
        best_metric = score
        best_cfg["lora_dropout"] = d

print("Best Dropout:", best_cfg["lora_dropout"])


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.



===== LORA_DROPOUT_0.0 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 16, 'lora_alpha': 64, 'lora_dropout': 0.0, 'use_class_weights': False} =====


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


trainable params: 592,131 || all params: 110,076,678 || trainable%: 0.5379259355919153


Epoch,Training Loss,Validation Loss
1,1.261231,0.460118
2,0.87808,0.328202
3,0.455514,0.444267
4,0.168029,0.469816



[Classification Metrics]
accuracy: 0.8388
precision: 0.8450
recall: 0.8062
f1: 0.8225
auc_ovr: 0.9360

[Classification Metrics]
accuracy: 0.8719
precision: 0.8739
recall: 0.8701
f1: 0.8714
auc_ovr: 0.9676

[Classification Metrics]
accuracy: 0.8802
precision: 0.8800
recall: 0.8839
f1: 0.8813
auc_ovr: 0.9675

[Classification Metrics]
accuracy: 0.8760
precision: 0.8769
recall: 0.8749
f1: 0.8751
auc_ovr: 0.9662


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.



===== LORA_DROPOUT_0.05 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 16, 'lora_alpha': 64, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


trainable params: 592,131 || all params: 110,076,678 || trainable%: 0.5379259355919153


Epoch,Training Loss,Validation Loss
1,1.341035,0.43493
2,0.900194,0.331426
3,0.512427,0.403356
4,0.189305,0.426829



[Classification Metrics]
accuracy: 0.8471
precision: 0.8299
recall: 0.8488
f1: 0.8375
auc_ovr: 0.9397

[Classification Metrics]
accuracy: 0.8967
precision: 0.8934
recall: 0.8961
f1: 0.8943
auc_ovr: 0.9646

[Classification Metrics]
accuracy: 0.8843
precision: 0.8766
recall: 0.8860
f1: 0.8811
auc_ovr: 0.9661

[Classification Metrics]
accuracy: 0.8760
precision: 0.8577
recall: 0.8880
f1: 0.8711
auc_ovr: 0.9658


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.



===== LORA_DROPOUT_0.1 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 16, 'lora_alpha': 64, 'lora_dropout': 0.1, 'use_class_weights': False} =====


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


trainable params: 592,131 || all params: 110,076,678 || trainable%: 0.5379259355919153


Epoch,Training Loss,Validation Loss
1,1.428423,0.447164
2,0.885715,0.34525
3,0.548438,0.4393
4,0.172363,0.470965



[Classification Metrics]
accuracy: 0.8388
precision: 0.8250
recall: 0.8216
f1: 0.8232
auc_ovr: 0.9376

[Classification Metrics]
accuracy: 0.8636
precision: 0.8659
recall: 0.8632
f1: 0.8626
auc_ovr: 0.9635

[Classification Metrics]
accuracy: 0.8595
precision: 0.8527
recall: 0.8647
f1: 0.8583
auc_ovr: 0.9642

[Classification Metrics]
accuracy: 0.8636
precision: 0.8554
recall: 0.8717
f1: 0.8626
auc_ovr: 0.9631
Best Dropout: 0.05


## Stage 6 — LoRA (Class Weights)

In [14]:
for use_weights in [False, True]:
    cfg = copy.deepcopy(best_cfg)
    cfg["use_class_weights"] = use_weights

    metrics = run_experiment(cfg, f"LORA_CLASS_WEIGHT")
    score = metrics["f1"]

    if score > best_metric:
        best_metric = score
        best_cfg["use_class_weights"] = use_weights

print("Best use_class_weights:", best_cfg["use_class_weights"])


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.



===== LORA_CLASS_WEIGHT | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 16, 'lora_alpha': 64, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


trainable params: 592,131 || all params: 110,076,678 || trainable%: 0.5379259355919153


Epoch,Training Loss,Validation Loss
1,1.341035,0.43493
2,0.900194,0.331426
3,0.512427,0.403356
4,0.189305,0.426829



[Classification Metrics]
accuracy: 0.8471
precision: 0.8299
recall: 0.8488
f1: 0.8375
auc_ovr: 0.9397

[Classification Metrics]
accuracy: 0.8967
precision: 0.8934
recall: 0.8961
f1: 0.8943
auc_ovr: 0.9646

[Classification Metrics]
accuracy: 0.8843
precision: 0.8766
recall: 0.8860
f1: 0.8811
auc_ovr: 0.9661

[Classification Metrics]
accuracy: 0.8760
precision: 0.8577
recall: 0.8880
f1: 0.8711
auc_ovr: 0.9658


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.



===== LORA_CLASS_WEIGHT | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 16, 'lora_alpha': 64, 'lora_dropout': 0.05, 'use_class_weights': True} =====


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


trainable params: 592,131 || all params: 110,076,678 || trainable%: 0.5379259355919153


Epoch,Training Loss,Validation Loss
1,1.550428,0.466922
2,1.08039,0.335172
3,0.605393,0.459066
4,0.320743,0.434284



[Classification Metrics]
accuracy: 0.8099
precision: 0.7990
recall: 0.7925
f1: 0.7949
auc_ovr: 0.9298

[Classification Metrics]
accuracy: 0.8678
precision: 0.8581
recall: 0.8745
f1: 0.8643
auc_ovr: 0.9616

[Classification Metrics]
accuracy: 0.8678
precision: 0.8668
recall: 0.8650
f1: 0.8659
auc_ovr: 0.9568

[Classification Metrics]
accuracy: 0.8636
precision: 0.8552
recall: 0.8806
f1: 0.8658
auc_ovr: 0.9578
Best use_class_weights: False


In [15]:
print("\n===== FINAL BEST CONFIG =====")
print(best_cfg)



===== FINAL BEST CONFIG =====
{'use_lora': True, 'learning_rate': 0.0005, 'r': 16, 'lora_alpha': 64, 'lora_dropout': 0.05, 'use_class_weights': False}
