# Setup

In [1]:
from dataclasses import dataclass
from tqdm.autonotebook import tqdm
from utils.utils import set_seeds
import wandb

import numpy as np
import torch
from datasets import Dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    TrainingArguments
)

set_seeds(seed=42)
tqdm.pandas()

  from tqdm.autonotebook import tqdm


# Config

In [2]:
@dataclass
class Config:
    data_path: str = "../../data/unlp-2025/"
    cv_path: str = "../../data/unlp-2025/cv_split.csv"
    
    pretrained: str = "meta-llama/Llama-3.2-3B"
    max_length: int = 1024

    
    wandb_init_args = {
        'project': "sl-unlp-2025",
        'entity': "havlytskyi-thesis",
        'name': "billama-3.2-3B--no-pretrain"
    }

config = Config()

# Training Arguments

In [3]:
training_args = TrainingArguments(
    output_dir=f'./checkpoints/{config.wandb_init_args["name"]}',
    logging_dir=f'./logs/{config.wandb_init_args["name"]}',
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    warmup_ratio=0.0,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=4,
    # gradient_accumulation_steps=1,
    bf16=True,
    report_to="wandb",
    optim='adamw_torch',
    eval_strategy='steps',
    save_strategy="steps",
    eval_steps=100,
    logging_steps=10,
    save_steps=100,
    save_total_limit=10,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    load_best_model_at_end=True,
)

# Instantiate the tokenizer & model

In [4]:
from gpt2vec.models.llama import biLlamaForTokenClassification
from transformers import BitsAndBytesConfig

tokenizer = AutoTokenizer.from_pretrained(config.pretrained)
tokenizer.pad_token = tokenizer.eos_token


quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# model = AutoModelForTokenClassification.from_pretrained(
model = biLlamaForTokenClassification.from_pretrained(
    config.pretrained,
    id2label={0: 0, 1: 1},
    label2id={0: 0, 1: 1},
    quantization_config=quant_config
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of biLlamaForTokenClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-3B and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig, TaskType


lora_config = LoraConfig(
    r=64,  # the dimension of the low-rank matrices
    lora_alpha=128, # scaling factor for LoRA activations vs pre-trained weight activations
    lora_dropout=0.05, 
    bias='none',
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM,
    target_modules=['o_proj', 'v_proj', "q_proj", "k_proj", "gate_proj", "down_proj", "up_proj"]
) 

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
# Trainable Parameters
model.print_trainable_parameters()


trainable params: 97,255,424 || all params: 3,310,011,394 || trainable%: 2.9382


# Callbacks

In [6]:
from gpt2vec.callbacks.directionality import AttentionGeometryCallback

callbacks = [
    AttentionGeometryCallback(
        q_path="base_model.model.model.layers[layer_idx].self_attn.q_proj",
        k_path="base_model.model.model.layers[layer_idx].self_attn.k_proj",
        attention_type="grouped",
        is_lora=True, merge_lora=merge_lora,
        is_quantized=True
    )
    for merge_lora in [True, False]
]

# Data

In [7]:
import pandas as pd

df = pd.read_parquet(config.data_path + "train.parquet")
cv = pd.read_csv(config.cv_path)
df = df.merge(cv, on='id', how='left')

df_test = pd.read_csv(config.data_path + "test.csv")

In [8]:
from utils.data import preprocess_df

df.trigger_words = df.trigger_words.apply(lambda x: [] if x is None else x)

is_valid_mask = (df.fold == 4)
df_train = df[~is_valid_mask].copy()
df_valid = df[is_valid_mask].copy()


df_train = preprocess_df(df_train, tokenizer=tokenizer, max_length=config.max_length)
df_valid = preprocess_df(df_valid, tokenizer=tokenizer, max_length=None)
df_test = preprocess_df(df_test, tokenizer=tokenizer, max_length=None)

  0%|          | 0/3058 [00:00<?, ?it/s]

  0%|          | 0/764 [00:00<?, ?it/s]

  0%|          | 0/5735 [00:00<?, ?it/s]

In [9]:
train_columns = list(df_train.seq_labels.iloc[0].keys()) +\
                ['content', 'trigger_words']
test_columns = list(df_train.seq_labels.iloc[0].keys()) + ['content']

ds_train = Dataset.from_pandas(df_train[train_columns].reset_index(drop=True))
ds_valid = Dataset.from_pandas(df_valid[train_columns].reset_index(drop=True))
ds_test = Dataset.from_pandas(df_test[test_columns].reset_index(drop=True))

# Train

In [10]:
from itertools import chain

train_labels = df_train.labels.tolist() + df_valid.labels.tolist()
positive_class_balance = pd.Series(list(chain(*train_labels))).mean()

positive_class_balance

0.23824465120696162

In [11]:
from transformers import DataCollatorForTokenClassification
from utils.trainer import SpanIdentificationTrainer

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = SpanIdentificationTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
    desired_positive_ratio=positive_class_balance,
    callbacks=callbacks
)

  super().__init__(
You are adding a <class 'gpt2vec.callbacks.directionality.AttentionGeometryCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback
AttentionGeometryCallback
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [12]:
wandb.init(**config.wandb_init_args)
wandb.define_metric("*", summary="none")

trainer.train()

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mivan-havlytskyi[0m ([33mivan-havlytskyiz[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Thold
100,0.4772,0.510233,0.478367,0.750471,0.584293,0.1
200,0.3895,0.401102,0.527211,0.753399,0.62033,0.3
300,0.4385,0.410873,0.557735,0.703418,0.622162,0.23
400,0.3711,0.41281,0.554543,0.724318,0.628161,0.26
500,0.343,0.409053,0.54495,0.755724,0.633259,0.24
600,0.2864,0.478634,0.544993,0.742825,0.628714,0.4
700,0.2479,0.495387,0.523764,0.742766,0.614331,0.12
800,0.2188,0.472846,0.525448,0.752007,0.618637,0.16
900,0.2102,0.478435,0.527207,0.754739,0.620781,0.17


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-100)... Done. 1.7s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-100)... Done. 1.8s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-100)... Done. 2.0s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-200)... Done. 1.7s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-200)... Done. 1.8s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-200)... Done. 1.9s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-300)... Done. 1.8s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-300)... Done. 1.8s
Done. 2.0s
  return fn(*args, **kwargs)
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-400)... Done. 1.7s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-400)... Done. 1.9s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-400)... Done. 1.9s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

Done. 1.8s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-500)... Done. 1.8s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-500)... Done. 2.0s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-600)... Done. 1.8s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-600)... Done. 1.8s
Done. 2.1s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-700)... Done. 1.7s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-700)... Done. 1.8s
Done. 1.9s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

Done. 1.8s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-800)... Done. 2.3s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-800)... Done. 1.8s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-900)... Done. 1.8s
Done. 1.9s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-900)... Done. 1.9s
  return fn(*args, **kwargs)
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-960)... Done. 1.8s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-960)... Done. 1.8s
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/billama-3.2-3B--no-pretrain/checkpoint-960)... Done. 2.3s
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base

TrainOutput(global_step=960, training_loss=0.3493352657804886, metrics={'train_runtime': 3272.6377, 'train_samples_per_second': 4.672, 'train_steps_per_second': 0.293, 'total_flos': 1.8235552478359616e+17, 'train_loss': 0.3493352657804886, 'epoch': 5.0})

# Inference

## Checkpoint

In [13]:
from utils.metric import score as char_f1
from utils.utils import inference_aggregation

FINETUNED_MODEL = f'./checkpoints/{config.wandb_init_args["name"]}/checkpoint-500'

In [14]:
trainer._load_from_checkpoint(FINETUNED_MODEL)

## Threshold Selection

In [15]:
valid_preds = trainer.predict(ds_valid)
valid_metrics = trainer.compute_metrics((valid_preds.predictions, valid_preds.label_ids))

valid_metrics

  0%|          | 0/41 [00:00<?, ?it/s]

  0%|          | 0/41 [00:00<?, ?it/s]

{'precision': 0.5449500079546706,
 'recall': 0.7557235714406938,
 'f1': 0.6332589095252994,
 'thold': 0.24000000000000002}

In [16]:
from utils.utils import find_class_balance_threshold

test_preds = trainer.predict(ds_test)
test_probabilities = torch.softmax(torch.tensor(test_preds.predictions), dim=-1).cpu().numpy()

test_distr_th = find_class_balance_threshold(
    desired_positive_ratio=positive_class_balance,
    probabilities=test_probabilities,
    labels=test_preds.label_ids
    )

print(test_distr_th)

  0%|          | 0/41 [00:00<?, ?it/s]

0.39606060606060606


In [17]:
final_th = valid_metrics['thold']

## CV-Score

In [18]:
valid_probabilities = torch.softmax(torch.tensor(valid_preds.predictions), dim=-1).cpu().numpy()
valid_results = inference_aggregation(
    probabilities=valid_probabilities,
    labels=valid_preds.label_ids,
    offset_mappings=ds_valid['offset_mapping'],
    thold=final_th
)

In [19]:
from copy import deepcopy

df_valid_gt = df[df.fold==4][['id', 'trigger_words']].reset_index(drop=True)
df_valid = deepcopy(df_valid_gt)
df_valid['trigger_words'] = valid_results

cv_score = char_f1(df_valid_gt, df_valid, row_id_column_name='id')
cv_score

0.6332589095252994

## Predict Test

In [20]:
test_results = inference_aggregation(
    probabilities=test_probabilities,
    labels=test_preds.label_ids,
    offset_mappings=ds_test['offset_mapping'],
    thold=final_th
)

In [21]:
df_test_gt = pd.read_csv(config.data_path + 'solution.csv')[['id', 'trigger_words']]
df_test = deepcopy(df_test_gt)
df_test['trigger_words'] = test_results

test_score = char_f1(df_test_gt, df_test, row_id_column_name='id')
test_score

0.6179660369300621