# Setup

In [1]:
from dataclasses import dataclass
from tqdm.autonotebook import tqdm
from utils.utils import set_seeds
import wandb

import numpy as np
import torch
from datasets import Dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    TrainingArguments
)

set_seeds(seed=42)
tqdm.pandas()

  from tqdm.autonotebook import tqdm


# Config

In [2]:
@dataclass
class Config:
    data_path: str = "../../data/unlp-2025/"
    cv_path: str = "../../data/unlp-2025/cv_split.csv"
    
    pretrained: str = "meta-llama/Llama-3.2-3B"
    adapter_args = {
        'run_id': "6jg7ywjs",
        'version': "latest"
    }
    max_length: int = 1024
    
    wandb_init_args = {
        'project': "sl-unlp-2025",
        'entity': "havlytskyi-thesis",
        'name': "billama-3.2-3B--mlm-eng"
    }

config = Config()

# Training Arguments

In [3]:
training_args = TrainingArguments(
    output_dir=f'./checkpoints/{config.wandb_init_args["name"]}',
    logging_dir=f'./logs/{config.wandb_init_args["name"]}',
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    warmup_ratio=0.0,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=4,
    # gradient_accumulation_steps=1,
    bf16=True,
    report_to="wandb",
    optim='adamw_torch',
    eval_strategy='steps',
    save_strategy="steps",
    eval_steps=100,
    logging_steps=10,
    save_steps=100,
    save_total_limit=10,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    load_best_model_at_end=True,
)

# Instantiate the tokenizer & model

## Base Model

In [4]:
from src.models.llama import biLlamaForTokenClassification
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training


tokenizer = AutoTokenizer.from_pretrained(config.pretrained)
tokenizer.pad_token = tokenizer.eos_token


quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# model = AutoModelForTokenClassification.from_pretrained(
base_model = biLlamaForTokenClassification.from_pretrained(
    config.pretrained,
    id2label={0: 0, 1: 1},
    label2id={0: 0, 1: 1},
    quantization_config=quant_config
)
base_model = prepare_model_for_kbit_training(base_model)

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of biLlamaForTokenClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-3B and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Pretrained LoRA

In [5]:
# to HF

out = !source ../../scripts/wandb_to_hf.sh {config.adapter_args['run_id']} {config.adapter_args['version']}

hf_path_line = [l for l in out if l.startswith("RUN_NAME=")][0]
hf_path = hf_path_line.split("=", 1)[1]

config.adapter_args['subfolder'] = hf_path
hf_path

'llama_3_2_3B_mlm_no_quant_eng'

In [6]:
from peft import PeftModel

model = PeftModel.from_pretrained(
    base_model,
    "nuinashco/GPT2Vec",
    subfolder=config.adapter_args['subfolder']
)

model = model.merge_and_unload()

adapter_config.json:   0%|          | 0.00/798 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/389M [00:00<?, ?B/s]



## New LoRA

In [7]:
from peft import get_peft_model, LoraConfig, TaskType

lora_config = LoraConfig(
    r=64,  # the dimension of the low-rank matrices
    lora_alpha=128, # scaling factor for LoRA activations vs pre-trained weight activations
    lora_dropout=0.05, 
    bias='none',
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM,
    target_modules=['o_proj', 'v_proj', "q_proj", "k_proj", "gate_proj", "down_proj", "up_proj"]
) 

model = get_peft_model(model, lora_config)
# Trainable Parameters
model.print_trainable_parameters()


trainable params: 97,255,424 || all params: 3,310,011,394 || trainable%: 2.9382


# Callbacks

In [8]:
from src.callbacks.directionality import AttentionGeometryCallback

callbacks = [
    AttentionGeometryCallback(
        q_path="base_model.model.model.layers[layer_idx].self_attn.q_proj",
        k_path="base_model.model.model.layers[layer_idx].self_attn.k_proj",
        attention_type="grouped",
        is_lora=True, merge_lora=merge_lora,
        is_quantized=True
    )
    for merge_lora in [True, False]
]

# Data

In [9]:
import pandas as pd

df = pd.read_parquet(config.data_path + "train.parquet")
cv = pd.read_csv(config.cv_path)
df = df.merge(cv, on='id', how='left')

df_test = pd.read_csv(config.data_path + "test.csv")

In [10]:
from utils.data import preprocess_df

df.trigger_words = df.trigger_words.apply(lambda x: [] if x is None else x)

is_valid_mask = (df.fold == 4)
df_train = df[~is_valid_mask].copy()
df_valid = df[is_valid_mask].copy()


df_train = preprocess_df(df_train, tokenizer=tokenizer, max_length=config.max_length)
df_valid = preprocess_df(df_valid, tokenizer=tokenizer, max_length=None)
df_test = preprocess_df(df_test, tokenizer=tokenizer, max_length=None)

  0%|          | 0/3058 [00:00<?, ?it/s]

  0%|          | 0/764 [00:00<?, ?it/s]

  0%|          | 0/5735 [00:00<?, ?it/s]

In [11]:
train_columns = list(df_train.seq_labels.iloc[0].keys()) +\
                ['content', 'trigger_words']
test_columns = list(df_train.seq_labels.iloc[0].keys()) + ['content']

ds_train = Dataset.from_pandas(df_train[train_columns].reset_index(drop=True))
ds_valid = Dataset.from_pandas(df_valid[train_columns].reset_index(drop=True))
ds_test = Dataset.from_pandas(df_test[test_columns].reset_index(drop=True))

# Train

In [12]:
from itertools import chain

train_labels = df_train.labels.tolist() + df_valid.labels.tolist()
positive_class_balance = pd.Series(list(chain(*train_labels))).mean()

positive_class_balance

0.23824465120696162

In [13]:
from transformers import DataCollatorForTokenClassification
from utils.trainer import SpanIdentificationTrainer

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = SpanIdentificationTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
    desired_positive_ratio=positive_class_balance,
    callbacks=callbacks
)

  super().__init__(
You are adding a <class 'src.callbacks.directionality.AttentionGeometryCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback
AttentionGeometryCallback
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [14]:
wandb.init(**config.wandb_init_args)
wandb.define_metric("*", summary="none")

trainer.train()

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mivan-havlytskyi[0m ([33mivan-havlytskyiz[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Thold
100,0.4376,0.485262,0.531475,0.732184,0.61589,0.15
200,0.3533,0.397243,0.549548,0.745914,0.632849,0.26
300,0.4281,0.414845,0.561649,0.732227,0.635694,0.21
400,0.3563,0.405434,0.545224,0.74958,0.631275,0.24
500,0.3177,0.406454,0.55507,0.737989,0.633592,0.29
600,0.285,0.451972,0.570747,0.698199,0.628072,0.44
700,0.2518,0.457681,0.558876,0.702994,0.622705,0.25
800,0.234,0.449963,0.563218,0.6894,0.619953,0.32
900,0.2129,0.459397,0.558693,0.69377,0.618947,0.26


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-100)... Done. 1.5s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-100)... Done. 1.5s
Done. 1.5s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-200)... Done. 1.5s
Done. 1.6s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-200)... Done. 1.6s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-300)... Done. 1.5s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-300)... Done. 1.5s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-300)... Done. 1.6s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-400)... Done. 1.6s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-400)... Done. 1.6s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-400)... Done. 1.5s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-500)... Done. 1.5s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-500)... Done. 1.6s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-500)... Done. 1.5s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-600)... Done. 1.5s
Done. 1.6s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-600)... Done. 1.6s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

Done. 1.5s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-700)... Done. 1.6s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-700)... Done. 1.6s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-800)... Done. 1.5s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-800)... Done. 1.5s
Done. 1.6s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-900)... Done. 1.5s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-900)... Done. 1.6s
Done. 1.6s
  return fn(*args, **kwargs)
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-960)... Done. 1.6s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-960)... Done. 1.6s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--mlm-eng/checkpoint-960)... Done. 1.6s
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input argume

TrainOutput(global_step=960, training_loss=0.32780181529621283, metrics={'train_runtime': 3000.8566, 'train_samples_per_second': 5.095, 'train_steps_per_second': 0.32, 'total_flos': 1.8235552478359616e+17, 'train_loss': 0.32780181529621283, 'epoch': 5.0})

# Inference

## Checkpoint

In [15]:
from utils.metric import score as char_f1
from utils.utils import inference_aggregation

FINETUNED_MODEL = f'./checkpoints/{config.wandb_init_args["name"]}/checkpoint-300'

In [16]:
trainer._load_from_checkpoint(FINETUNED_MODEL)

## Threshold Selection

In [17]:
valid_preds = trainer.predict(ds_valid)
valid_metrics = trainer.compute_metrics((valid_preds.predictions, valid_preds.label_ids))

valid_metrics

  0%|          | 0/41 [00:00<?, ?it/s]

  0%|          | 0/41 [00:00<?, ?it/s]

{'precision': 0.5616485612189772,
 'recall': 0.7322268044736351,
 'f1': 0.6356935793401429,
 'thold': 0.21000000000000002}

In [18]:
from utils.utils import find_class_balance_threshold

test_preds = trainer.predict(ds_test)
test_probabilities = torch.softmax(torch.tensor(test_preds.predictions), dim=-1).cpu().numpy()

test_distr_th = find_class_balance_threshold(
    desired_positive_ratio=positive_class_balance,
    probabilities=test_probabilities,
    labels=test_preds.label_ids
    )

print(test_distr_th)

  0%|          | 0/41 [00:00<?, ?it/s]

0.2772727272727273


In [19]:
final_th = valid_metrics['thold']

## CV-Score

In [20]:
valid_probabilities = torch.softmax(torch.tensor(valid_preds.predictions), dim=-1).cpu().numpy()
valid_results = inference_aggregation(
    probabilities=valid_probabilities,
    labels=valid_preds.label_ids,
    offset_mappings=ds_valid['offset_mapping'],
    thold=final_th
)

In [21]:
from copy import deepcopy

df_valid_gt = df[df.fold==4][['id', 'trigger_words']].reset_index(drop=True)
df_valid = deepcopy(df_valid_gt)
df_valid['trigger_words'] = valid_results

cv_score = char_f1(df_valid_gt, df_valid, row_id_column_name='id')
cv_score

0.6356935793401429

## Predict Test

In [22]:
test_results = inference_aggregation(
    probabilities=test_probabilities,
    labels=test_preds.label_ids,
    offset_mappings=ds_test['offset_mapping'],
    thold=final_th
)

In [23]:
df_test_gt = pd.read_csv(config.data_path + 'solution.csv')[['id', 'trigger_words']]
df_test = deepcopy(df_test_gt)
df_test['trigger_words'] = test_results

test_score = char_f1(df_test_gt, df_test, row_id_column_name='id')
test_score

0.6126795738295568