# LLM Fine-Tuning with Encoder

## Environment & Version Checks

In [1]:
import transformers
print(transformers.__version__)

import sys, os, json, copy
sys.path.append("..")

from datasets import load_dataset
from transformers import TrainingArguments, DataCollatorWithPadding
from src.EncoderTrainer import EncoderTrainer
from peft import LoraConfig, TaskType
import numpy as np
import torch
import time



4.41.2


## Global Configuration

In [2]:
# ===== paths & constants =====
OUTPUT_FOLDER = "../outputs"
DATA_FOLDER = "../data"

MODEL_NAME = "microsoft/deberta-v3-base"
FOLDER_NAME = "deberta-v3-base"

VAL_JSON  = f"{DATA_FOLDER}/processed/val.json"
TEST_JSON = f"{DATA_FOLDER}/processed/test.json"

MAX_LENGTH = 128
LABELS = ["negative", "neutral", "positive"]
NUM_LABELS = len(LABELS)

LORA_TUNING_DIR = f"{OUTPUT_FOLDER}/lora_tuning/{FOLDER_NAME}"
os.makedirs(LORA_TUNING_DIR, exist_ok=True)

## Dataset Loading

In [3]:
dataset = load_dataset(
    "json",
    data_files={
        "train": f"{DATA_FOLDER}/processed/train.json",
        "validation": f"{DATA_FOLDER}/processed/val.json",
        "test": f"{DATA_FOLDER}/processed/test.json",
    }
)


In [4]:
from collections import Counter

# Extract outputs
outputs = dataset['test']['output']

# Count occurrences
label_counts = Counter(outputs)
total = sum(label_counts.values())

counts = np.array([label_counts.get(label, 0) for label in LABELS])

print("Counts:", counts)

inverse_weights = total / (len(LABELS) * np.maximum(counts, 1))

# Normalize weights
inverse_weights = inverse_weights / inverse_weights.sum()

CLASS_WEIGHTS = torch.tensor(inverse_weights, dtype=torch.float)

print("Class Weights (Inverse Normalized):", CLASS_WEIGHTS)

Counts: [ 34 143  65]
Class Weights (Inverse Normalized): tensor([0.5679, 0.1350, 0.2971])


## Prompt Formatting

In [5]:
trainer = EncoderTrainer(
    model_name=MODEL_NAME,
    labels=LABELS,
    load_in_4bit=False,
)

tokenizer = trainer.tokenizer

def format_encoder(example):
    input = example["input"]
    label = LABELS.index(example["output"])  # or map from output

    return {
        "input": input,
        "label": label,
    }

dataset = dataset.map(
    format_encoder,
    batched=False,
    num_proc=1,
    desc="Formatting prompts"
)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenization & Data Collation

In [6]:
from transformers import DataCollatorWithPadding

def tokenize_encoder(batch):
    enc = tokenizer(
        batch["input"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
        add_special_tokens=True,
    )
    enc["labels"] = int(batch["label"])
    return enc

tokenized_ds = dataset.map(
    tokenize_encoder,
    batched=False,
    remove_columns=dataset["train"].column_names,
    desc="Tokenizing encoder inputs",
)

data_collator = DataCollatorWithPadding(tokenizer)

## Training Configuration

In [7]:
BASE_TRAINING_ARGS = dict(
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,   # effective batch = 8
    num_train_epochs=4,
    lr_scheduler_type="cosine", 
    weight_decay=0.01,
    # warmup_steps=100,
    warmup_ratio=0.1,

    # precision (BF16 ONLY)
    fp16=False,
    bf16=True,

    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,

    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    
    save_total_limit=1,
    report_to="none",
)


## Define the Experiment Function

In [8]:
RESULTS = []

# -------------------------
# Base configuration
# -------------------------
best_cfg = {
    "use_lora": True,              
    "learning_rate": 2e-4,
    "r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "use_class_weights": False,
}

def run_experiment(cfg, stage_name):
    tag = f"{stage_name}_" + "_".join([f"{k}_{v}" for k, v in cfg.items()])
    out_dir = f"{LORA_TUNING_DIR}/{tag}"
    os.makedirs(out_dir, exist_ok=True)

    print(f"\n===== {stage_name} | Running config: {cfg} =====")

    # ----------------------------------
    # Build TrainingArguments
    # ----------------------------------
    training_args = TrainingArguments(
        output_dir=out_dir,
        learning_rate=cfg["learning_rate"],
        **BASE_TRAINING_ARGS,
    )

    # ----------------------------------
    # Initialize Trainer
    # ----------------------------------
    trainer = EncoderTrainer(
        model_name=MODEL_NAME,
        labels=LABELS,
        load_in_4bit=False,
    )

    # ----------------------------------
    # Configure LoRA (if enabled)
    # ----------------------------------
    if cfg.get("use_lora", True):
        trainer.configure_lora(
            r=cfg["r"],
            lora_alpha=cfg["lora_alpha"],
            lora_dropout=cfg["lora_dropout"],
            target_modules=["query_proj", "value_proj"]
        )

    # ----------------------------------
    # Class weights
    # ----------------------------------
    if cfg["use_class_weights"]:
        trainer.class_weights = CLASS_WEIGHTS

    # ----- Train -----
    metrics = trainer.train(
        train_dataset=tokenized_ds["train"],
        eval_dataset=tokenized_ds["validation"],
        training_args=training_args,
        data_collator=data_collator,
        classification_eval_fn=lambda: trainer.evaluate_classification(
            test_path=VAL_JSON,
            labels=LABELS,
            verbose=False,
        )
    )

    # ----------------------------------
    # Save model (FFT + LoRA unified)
    # ----------------------------------
    print("Before save:",
      trainer.model.classifier.weight.mean())

    trainer.save_model(out_dir)
    
    # ----------------------------------
    # Save metrics
    # ----------------------------------
    with open(os.path.join(out_dir, "metrics.json"), "w") as f:
        json.dump(metrics, f, indent=2)

    # ----------------------------------
    # Save experiment metadata (FULL INFO)
    # ----------------------------------
    experiment_metadata = {
        "model": MODEL_NAME,
        "experiment_type": stage_name,
        "learning_rate": cfg.get("learning_rate"),
        "r": cfg.get("r"),
        "lora_alpha": cfg.get("lora_alpha"),
        "lora_dropout": cfg.get("lora_dropout"),
        "use_lora": cfg.get("use_lora"),
        "use_class_weights": cfg.get("use_class_weights"),
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }

    with open(os.path.join(out_dir, "exp_config.json"), "w") as f:
        json.dump(experiment_metadata, f, indent=2)

    # ----------------------------------
    # Store in memory
    # ----------------------------------
    RESULTS.append({
        "stage": stage_name,
        "config": copy.deepcopy(cfg),
        "metrics": metrics,
        "output_dir": out_dir,
    })

    return metrics


## Stage 1 — Full Fine-tuning

In [9]:
full_ft_cfg = {
    "use_lora": False,      # Full FT
    "learning_rate": 2e-4,
    "r": None,
    "lora_alpha": None,
    "lora_dropout": None,
    "use_class_weights": False,
}

lr_candidates = [5e-5, 1e-4, 2e-4, 3e-4, 5e-4, 6e-4]
best_metric = -1
best_lr = None

for lr in lr_candidates:
    cfg = copy.deepcopy(full_ft_cfg)
    cfg["learning_rate"] = lr

    metrics = run_experiment(cfg, f"FFT_LR_{lr}")
    score = metrics["f1"]   # use macro F1 ideally

    if score > best_metric:
        best_metric = score
        best_lr = lr

print("Best FFT LR:", best_lr)


===== FFT_LR_5e-05 | Running config: {'use_lora': False, 'learning_rate': 5e-05, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.7224,0.492738
2,0.2705,0.393013
3,0.2745,0.354403
4,0.1829,0.355174



[Classification Metrics]
accuracy: 0.7851
precision: 0.8313
recall: 0.6184
f1: 0.6269
auc_ovr: 0.9300

[Classification Metrics]
accuracy: 0.8760
precision: 0.8455
recall: 0.8944
f1: 0.8665
auc_ovr: 0.9700

[Classification Metrics]
accuracy: 0.9008
precision: 0.8758
recall: 0.9156
f1: 0.8936
auc_ovr: 0.9696

[Classification Metrics]
accuracy: 0.9008
precision: 0.8758
recall: 0.9156
f1: 0.8936
auc_ovr: 0.9694

===== FFT_LR_0.0001 | Running config: {'use_lora': False, 'learning_rate': 0.0001, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.7894,0.62307
2,0.3343,0.488249
3,0.2959,0.443855
4,0.1385,0.436957



[Classification Metrics]
accuracy: 0.7479
precision: 0.4733
recall: 0.5401
f1: 0.5040
auc_ovr: 0.8867

[Classification Metrics]
accuracy: 0.8636
precision: 0.8352
recall: 0.8850
f1: 0.8564
auc_ovr: 0.9579

[Classification Metrics]
accuracy: 0.8843
precision: 0.8600
recall: 0.8919
f1: 0.8746
auc_ovr: 0.9580

[Classification Metrics]
accuracy: 0.8884
precision: 0.8689
recall: 0.8943
f1: 0.8808
auc_ovr: 0.9580

===== FFT_LR_0.0002 | Running config: {'use_lora': False, 'learning_rate': 0.0002, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.9517,0.855348
2,0.7847,0.920697
3,0.8713,0.873712
4,0.8574,0.869512



[Classification Metrics]
accuracy: 0.5785
precision: 0.3878
recall: 0.4398
f1: 0.3990
auc_ovr: 0.6516

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5928

[Classification Metrics]
accuracy: 0.6240
precision: 0.3771
recall: 0.4158
f1: 0.3918
auc_ovr: 0.6954

[Classification Metrics]
accuracy: 0.6281
precision: 0.3802
recall: 0.4181
f1: 0.3941
auc_ovr: 0.6801

===== FFT_LR_0.0003 | Running config: {'use_lora': False, 'learning_rate': 0.0003, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.9664,0.94348
2,0.9141,0.93368
3,0.9381,0.919472
4,0.9675,0.91799



[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5082

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5811

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.6153

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.6128

===== FFT_LR_0.0005 | Running config: {'use_lora': False, 'learning_rate': 0.0005, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.9696,0.948234
2,0.916,0.934408
3,0.9371,0.914556
4,0.9673,0.920475



[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5339

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.6278

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.6108

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5364

===== FFT_LR_0.0006 | Running config: {'use_lora': False, 'learning_rate': 0.0006, 'r': None, 'lora_alpha': None, 'lora_dropout': None, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.9693,0.951288
2,0.9155,0.938651
3,0.9381,0.922399
4,0.9683,0.921514



[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.4904

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5111

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5054

[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.5023
Best FFT LR: 5e-05


## Stage 2 — LoRA (Tune Learning Rate)

In [10]:
lr_candidates = [5e-5, 1e-4, 2e-4, 3e-4, 5e-4, 6e-4]
best_metric = -1

for lr in lr_candidates:
    cfg = copy.deepcopy(best_cfg)
    cfg["learning_rate"] = lr

    metrics = run_experiment(cfg, f"LORA_LR_{lr}")
    score = metrics["f1"]
    
    if score > best_metric:
        best_metric = score
        best_cfg["learning_rate"] = lr

print("Best LR:", best_cfg["learning_rate"])



===== LORA_LR_5e-05 | Running config: {'use_lora': True, 'learning_rate': 5e-05, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 184,721,670 || trainable%: 0.1609009922874777


Epoch,Training Loss,Validation Loss
1,1.0038,0.93002
2,0.8262,0.843596
3,0.8591,0.831479
4,0.8843,0.830839



[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.7104





[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.7514





[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.7624





[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.7635





===== LORA_LR_0.0001 | Running config: {'use_lora': True, 'learning_rate': 0.0001, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 184,721,670 || trainable%: 0.1609009922874777


Epoch,Training Loss,Validation Loss
1,0.9247,0.852881
2,0.7075,0.746384
3,0.737,0.701291
4,0.7084,0.699855



[Classification Metrics]
accuracy: 0.5909
precision: 0.1970
recall: 0.3333
f1: 0.2476
auc_ovr: 0.7723





[Classification Metrics]
accuracy: 0.6901
precision: 0.4258
recall: 0.4578
f1: 0.4330
auc_ovr: 0.8123





[Classification Metrics]
accuracy: 0.6901
precision: 0.4206
recall: 0.4602
f1: 0.4343
auc_ovr: 0.8368





[Classification Metrics]
accuracy: 0.6901
precision: 0.4206
recall: 0.4602
f1: 0.4343
auc_ovr: 0.8390





===== LORA_LR_0.0002 | Running config: {'use_lora': True, 'learning_rate': 0.0002, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 184,721,670 || trainable%: 0.1609009922874777


Epoch,Training Loss,Validation Loss
1,0.864,0.758514
2,0.5399,0.559263
3,0.6121,0.520731
4,0.5272,0.518468



[Classification Metrics]
accuracy: 0.6818
precision: 0.4310
recall: 0.4461
f1: 0.4225
auc_ovr: 0.7923





[Classification Metrics]
accuracy: 0.7686
precision: 0.8217
recall: 0.5899
f1: 0.5823
auc_ovr: 0.8831





[Classification Metrics]
accuracy: 0.7686
precision: 0.8209
recall: 0.5661
f1: 0.5413
auc_ovr: 0.9027





[Classification Metrics]
accuracy: 0.7686
precision: 0.8209
recall: 0.5661
f1: 0.5413
auc_ovr: 0.9031





===== LORA_LR_0.0003 | Running config: {'use_lora': True, 'learning_rate': 0.0003, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 184,721,670 || trainable%: 0.1609009922874777


Epoch,Training Loss,Validation Loss
1,0.7559,0.624615
2,0.5214,0.507792
3,0.4965,0.401503
4,0.3262,0.40081



[Classification Metrics]
accuracy: 0.7066
precision: 0.4302
recall: 0.4719
f1: 0.4453
auc_ovr: 0.8654





[Classification Metrics]
accuracy: 0.7934
precision: 0.7247
recall: 0.7310
f1: 0.7278
auc_ovr: 0.9150





[Classification Metrics]
accuracy: 0.8512
precision: 0.8761
recall: 0.8017
f1: 0.8308
auc_ovr: 0.9480





[Classification Metrics]
accuracy: 0.8512
precision: 0.8759
recall: 0.8112
f1: 0.8374
auc_ovr: 0.9486





===== LORA_LR_0.0005 | Running config: {'use_lora': True, 'learning_rate': 0.0005, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 184,721,670 || trainable%: 0.1609009922874777


Epoch,Training Loss,Validation Loss
1,0.7455,0.624204
2,0.3905,0.405358
3,0.3914,0.361064
4,0.2113,0.362144



[Classification Metrics]
accuracy: 0.7190
precision: 0.4469
recall: 0.5025
f1: 0.4729
auc_ovr: 0.8620





[Classification Metrics]
accuracy: 0.8760
precision: 0.8754
recall: 0.8419
f1: 0.8569
auc_ovr: 0.9498





[Classification Metrics]
accuracy: 0.8884
precision: 0.8758
recall: 0.8560
f1: 0.8649
auc_ovr: 0.9553





[Classification Metrics]
accuracy: 0.8802
precision: 0.8690
recall: 0.8490
f1: 0.8580
auc_ovr: 0.9549





===== LORA_LR_0.0006 | Running config: {'use_lora': True, 'learning_rate': 0.0006, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 184,721,670 || trainable%: 0.1609009922874777


Epoch,Training Loss,Validation Loss
1,0.7425,0.64662
2,0.3646,0.367829
3,0.3923,0.34184
4,0.223,0.350998



[Classification Metrics]
accuracy: 0.7149
precision: 0.4471
recall: 0.4978
f1: 0.4702
auc_ovr: 0.8436





[Classification Metrics]
accuracy: 0.8843
precision: 0.8661
recall: 0.8632
f1: 0.8644
auc_ovr: 0.9574





[Classification Metrics]
accuracy: 0.9008
precision: 0.8969
recall: 0.8821
f1: 0.8892
auc_ovr: 0.9594





[Classification Metrics]
accuracy: 0.8967
precision: 0.8913
recall: 0.8894
f1: 0.8903
auc_ovr: 0.9598




Best LR: 0.0006


## Stage 3 — LoRA (Tune Rank)

In [11]:
rank_candidates = [4, 8, 16]
best_metric = -1

for r in rank_candidates:
    cfg = copy.deepcopy(best_cfg)
    cfg["r"] = r
    cfg["lora_alpha"] = 2 * r

    metrics = run_experiment(cfg, f"LORA_RANK_{r}")
    score = metrics["f1"]
    
    if score > best_metric:
        best_metric = score
        best_cfg["r"] = r
        best_cfg["lora_alpha"] = 2 * r

print("Best Rank:", best_cfg["r"])


===== LORA_RANK_4 | Running config: {'use_lora': True, 'learning_rate': 0.0006, 'r': 4, 'lora_alpha': 8, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 149,763 || all params: 184,574,214 || trainable%: 0.08113971976605573


Epoch,Training Loss,Validation Loss
1,0.723,0.607552
2,0.4807,0.473501
3,0.4473,0.42698
4,0.267,0.422223



[Classification Metrics]
accuracy: 0.7190
precision: 0.4485
recall: 0.5049
f1: 0.4749
auc_ovr: 0.8609





[Classification Metrics]
accuracy: 0.8264
precision: 0.7866
recall: 0.7902
f1: 0.7883
auc_ovr: 0.9209





[Classification Metrics]
accuracy: 0.8678
precision: 0.8881
recall: 0.8492
f1: 0.8651
auc_ovr: 0.9434





[Classification Metrics]
accuracy: 0.8760
precision: 0.8944
recall: 0.8609
f1: 0.8732
auc_ovr: 0.9461





===== LORA_RANK_8 | Running config: {'use_lora': True, 'learning_rate': 0.0006, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 184,721,670 || trainable%: 0.1609009922874777


Epoch,Training Loss,Validation Loss
1,0.7425,0.64662
2,0.3646,0.367829
3,0.3923,0.34184
4,0.223,0.350998



[Classification Metrics]
accuracy: 0.7149
precision: 0.4471
recall: 0.4978
f1: 0.4702
auc_ovr: 0.8436





[Classification Metrics]
accuracy: 0.8843
precision: 0.8661
recall: 0.8632
f1: 0.8644
auc_ovr: 0.9574





[Classification Metrics]
accuracy: 0.9008
precision: 0.8969
recall: 0.8821
f1: 0.8892
auc_ovr: 0.9594





[Classification Metrics]
accuracy: 0.8967
precision: 0.8913
recall: 0.8894
f1: 0.8903
auc_ovr: 0.9598





===== LORA_RANK_16 | Running config: {'use_lora': True, 'learning_rate': 0.0006, 'r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 592,131 || all params: 185,016,582 || trainable%: 0.320042124656697


Epoch,Training Loss,Validation Loss
1,0.7503,0.654235
2,0.3627,0.377403
3,0.3295,0.384532
4,0.183,0.411137



[Classification Metrics]
accuracy: 0.7231
precision: 0.4615
recall: 0.5308
f1: 0.4911
auc_ovr: 0.8354





[Classification Metrics]
accuracy: 0.8595
precision: 0.8494
recall: 0.8494
f1: 0.8494
auc_ovr: 0.9501





[Classification Metrics]
accuracy: 0.8884
precision: 0.8657
recall: 0.8919
f1: 0.8774
auc_ovr: 0.9568





[Classification Metrics]
accuracy: 0.8802
precision: 0.8571
recall: 0.8873
f1: 0.8708
auc_ovr: 0.9550




Best Rank: 8


## Stage 4 — LoRA (Tune Alpha)

In [12]:
alpha_candidates = [
    best_cfg["r"],
    2 * best_cfg["r"],
    4 * best_cfg["r"]
]

best_metric = -1

for alpha in alpha_candidates:
    cfg = copy.deepcopy(best_cfg)
    cfg["lora_alpha"] = alpha

    metrics = run_experiment(cfg, f"LORA_ALPHA_{alpha}")
    score = metrics["f1"]
    
    if score > best_metric:
        best_metric = score
        best_cfg["lora_alpha"] = alpha

print("Best Alpha:", best_cfg["lora_alpha"])



===== LORA_ALPHA_8 | Running config: {'use_lora': True, 'learning_rate': 0.0006, 'r': 8, 'lora_alpha': 8, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 184,721,670 || trainable%: 0.1609009922874777


Epoch,Training Loss,Validation Loss
1,0.7581,0.635214
2,0.4768,0.424465
3,0.4061,0.378098
4,0.2307,0.3737



[Classification Metrics]
accuracy: 0.7479
precision: 0.4752
recall: 0.5236
f1: 0.4965
auc_ovr: 0.8534





[Classification Metrics]
accuracy: 0.8595
precision: 0.8472
recall: 0.8183
f1: 0.8310
auc_ovr: 0.9341





[Classification Metrics]
accuracy: 0.8802
precision: 0.8994
recall: 0.8417
f1: 0.8640
auc_ovr: 0.9552





[Classification Metrics]
accuracy: 0.8760
precision: 0.8827
recall: 0.8418
f1: 0.8569
auc_ovr: 0.9568





===== LORA_ALPHA_16 | Running config: {'use_lora': True, 'learning_rate': 0.0006, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 184,721,670 || trainable%: 0.1609009922874777


Epoch,Training Loss,Validation Loss
1,0.7425,0.64662
2,0.3646,0.367829
3,0.3923,0.34184
4,0.223,0.350998



[Classification Metrics]
accuracy: 0.7149
precision: 0.4471
recall: 0.4978
f1: 0.4702
auc_ovr: 0.8436





[Classification Metrics]
accuracy: 0.8843
precision: 0.8661
recall: 0.8632
f1: 0.8644
auc_ovr: 0.9574





[Classification Metrics]
accuracy: 0.9008
precision: 0.8969
recall: 0.8821
f1: 0.8892
auc_ovr: 0.9594





[Classification Metrics]
accuracy: 0.8967
precision: 0.8913
recall: 0.8894
f1: 0.8903
auc_ovr: 0.9598





===== LORA_ALPHA_32 | Running config: {'use_lora': True, 'learning_rate': 0.0006, 'r': 8, 'lora_alpha': 32, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 184,721,670 || trainable%: 0.1609009922874777


Epoch,Training Loss,Validation Loss
1,0.8401,0.729153
2,0.5128,0.47643
3,0.4073,0.338555
4,0.2517,0.364118



[Classification Metrics]
accuracy: 0.7107
precision: 0.4490
recall: 0.4955
f1: 0.4694
auc_ovr: 0.8044





[Classification Metrics]
accuracy: 0.8223
precision: 0.8584
recall: 0.7374
f1: 0.7681
auc_ovr: 0.9223





[Classification Metrics]
accuracy: 0.8926
precision: 0.8968
recall: 0.8487
f1: 0.8699
auc_ovr: 0.9609





[Classification Metrics]
accuracy: 0.8926
precision: 0.8944
recall: 0.8679
f1: 0.8802
auc_ovr: 0.9620




Best Alpha: 16


## Stage 5 — LoRA (Tune Dropout)

In [13]:
dropout_candidates = [0.0, 0.05, 0.1]
best_metric = -1

for d in dropout_candidates:
    cfg = copy.deepcopy(best_cfg)
    cfg["lora_dropout"] = d

    metrics = run_experiment(cfg, f"LORA_DROPOUT_{d}")
    score = metrics["f1"]
    
    if score > best_metric:
        best_metric = score
        best_cfg["lora_dropout"] = d

print("Best Dropout:", best_cfg["lora_dropout"])



===== LORA_DROPOUT_0.0 | Running config: {'use_lora': True, 'learning_rate': 0.0006, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.0, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 184,721,670 || trainable%: 0.1609009922874777


Epoch,Training Loss,Validation Loss
1,0.7786,0.669886
2,0.4863,0.444559
3,0.4016,0.38861
4,0.2451,0.38359



[Classification Metrics]
accuracy: 0.7273
precision: 0.4536
recall: 0.5024
f1: 0.4752
auc_ovr: 0.8501





[Classification Metrics]
accuracy: 0.8430
precision: 0.8562
recall: 0.8088
f1: 0.8242
auc_ovr: 0.9334





[Classification Metrics]
accuracy: 0.8719
precision: 0.8607
recall: 0.8490
f1: 0.8528
auc_ovr: 0.9576





[Classification Metrics]
accuracy: 0.8760
precision: 0.8579
recall: 0.8609
f1: 0.8586
auc_ovr: 0.9599





===== LORA_DROPOUT_0.05 | Running config: {'use_lora': True, 'learning_rate': 0.0006, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 184,721,670 || trainable%: 0.1609009922874777


Epoch,Training Loss,Validation Loss
1,0.7425,0.64662
2,0.3646,0.367829
3,0.3923,0.34184
4,0.223,0.350998



[Classification Metrics]
accuracy: 0.7149
precision: 0.4471
recall: 0.4978
f1: 0.4702
auc_ovr: 0.8436





[Classification Metrics]
accuracy: 0.8843
precision: 0.8661
recall: 0.8632
f1: 0.8644
auc_ovr: 0.9574





[Classification Metrics]
accuracy: 0.9008
precision: 0.8969
recall: 0.8821
f1: 0.8892
auc_ovr: 0.9594





[Classification Metrics]
accuracy: 0.8967
precision: 0.8913
recall: 0.8894
f1: 0.8903
auc_ovr: 0.9598





===== LORA_DROPOUT_0.1 | Running config: {'use_lora': True, 'learning_rate': 0.0006, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.1, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 184,721,670 || trainable%: 0.1609009922874777


Epoch,Training Loss,Validation Loss
1,0.8304,0.788386
2,0.5343,0.546837
3,0.5531,0.46308
4,0.3471,0.439963



[Classification Metrics]
accuracy: 0.6901
precision: 0.4352
recall: 0.4602
f1: 0.4374
auc_ovr: 0.6850





[Classification Metrics]
accuracy: 0.7810
precision: 0.5049
recall: 0.5871
f1: 0.5384
auc_ovr: 0.8831





[Classification Metrics]
accuracy: 0.8554
precision: 0.8504
recall: 0.7847
f1: 0.8077
auc_ovr: 0.9222





[Classification Metrics]
accuracy: 0.8636
precision: 0.8872
recall: 0.8133
f1: 0.8388
auc_ovr: 0.9314




Best Dropout: 0.05


## Stage 6 — LoRA (Class Weights)

In [9]:
best_metric = -1

for use_weights in [False, True]:
    cfg = copy.deepcopy(best_cfg)
    cfg["use_class_weights"] = use_weights

    metrics = run_experiment(cfg, f"LORA_CLASS_WEIGHT")
    score = metrics["f1"]

    if score > best_metric:
        best_metric = score
        best_cfg["use_class_weights"] = use_weights

print("Best use_class_weights:", best_cfg["use_class_weights"])



===== LORA_CLASS_WEIGHT | Running config: {'use_lora': True, 'learning_rate': 0.0002, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 184,721,670 || trainable%: 0.1609009922874777


Epoch,Training Loss,Validation Loss
1,0.871,0.751567
2,0.5498,0.545538
3,0.6027,0.518142
4,0.5423,0.518353



[Classification Metrics]
accuracy: 0.6777
precision: 0.4228
recall: 0.4485
f1: 0.4249
auc_ovr: 0.8066





[Classification Metrics]
accuracy: 0.7603
precision: 0.8159
recall: 0.5733
f1: 0.5567
auc_ovr: 0.8813





[Classification Metrics]
accuracy: 0.7727
precision: 0.4936
recall: 0.5683
f1: 0.5264
auc_ovr: 0.8950





[Classification Metrics]
accuracy: 0.7686
precision: 0.4902
recall: 0.5636
f1: 0.5227
auc_ovr: 0.8952




<class 'peft.utils.other.ModulesToSaveWrapper'>
Before save: tensor(0.0005, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)





===== LORA_CLASS_WEIGHT | Running config: {'use_lora': True, 'learning_rate': 0.0002, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': True} =====


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 184,721,670 || trainable%: 0.1609009922874777


Epoch,Training Loss,Validation Loss
1,1.0201,0.901173
2,0.6929,0.598332
3,0.586,0.426829
4,0.4428,0.421008



[Classification Metrics]
accuracy: 0.6653
precision: 0.4097
recall: 0.4462
f1: 0.4221
auc_ovr: 0.7590





[Classification Metrics]
accuracy: 0.7603
precision: 0.7191
recall: 0.7675
f1: 0.7390
auc_ovr: 0.9025





[Classification Metrics]
accuracy: 0.8058
precision: 0.8025
recall: 0.8167
f1: 0.8069
auc_ovr: 0.9438





[Classification Metrics]
accuracy: 0.8099
precision: 0.8144
recall: 0.8214
f1: 0.8148
auc_ovr: 0.9453




<class 'peft.utils.other.ModulesToSaveWrapper'>
Before save: tensor(-4.7684e-05, device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<MeanBackward0>)




Best use_class_weights: True


In [15]:
print("\n===== FINAL BEST CONFIG =====")
print(best_cfg)



===== FINAL BEST CONFIG =====
{'use_lora': True, 'learning_rate': 0.0006, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'use_class_weights': False}
