# Setup

In [1]:
import sys
sys.path.append('..')

In [2]:
from dataclasses import dataclass
from tqdm.autonotebook import tqdm
from utils.utils import set_seeds
import wandb

import numpy as np
import torch
from datasets import Dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    TrainingArguments
)

set_seeds(seed=42)
tqdm.pandas()

  from tqdm.autonotebook import tqdm


# Config

In [3]:
@dataclass
class Config:
    data_path: str = "../../../data/unlp-2025/"
    cv_path: str = "../../../data/unlp-2025/cv_split.csv"
    
    pretrained: str = "google/gemma-3-4b-pt"
    adapter_args = {
        'run_id': "g2zlfok5",
        'version': "latest"
    }
    max_length: int = 1024
    
    wandb_init_args = {
        'project': "sl-unlp-2025",
        'entity': "havlytskyi-thesis",
        'name': "gemma-3-4B--mntp-ukr"
    }

config = Config()

# Training Arguments

In [4]:
training_args = TrainingArguments(
    output_dir=f'./checkpoints/{config.wandb_init_args["name"]}',
    logging_dir=f'./logs/{config.wandb_init_args["name"]}',
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    warmup_ratio=0.0,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=4,
    # gradient_accumulation_steps=1,
    bf16=True,
    report_to="wandb",
    optim='adamw_torch',
    eval_strategy='steps',
    save_strategy="steps",
    eval_steps=100,
    logging_steps=10,
    save_steps=100,
    save_total_limit=10,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    load_best_model_at_end=True,
)

# Instantiate the tokenizer & model

## Base Model

In [5]:
from src.models.gemma3 import biGemma3ForTokenClassification
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training


tokenizer = AutoTokenizer.from_pretrained(config.pretrained)
tokenizer.pad_token = tokenizer.eos_token


quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16
)


base_model = biGemma3ForTokenClassification.from_pretrained(
    config.pretrained,
    id2label={0: 0, 1: 1},
    label2id={0: 0, 1: 1},
    quantization_config=quant_config
)
base_model = prepare_model_for_kbit_training(base_model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of biGemma3ForTokenClassification were not initialized from the model checkpoint at google/gemma-3-4b-pt and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Pretrained LoRA

In [6]:
# to HF

out = !source ../../../scripts/wandb_to_hf.sh {config.adapter_args['run_id']} {config.adapter_args['version']}

hf_path_line = [l for l in out if l.startswith("RUN_NAME=")][0]
hf_path = hf_path_line.split("=", 1)[1]

config.adapter_args['subfolder'] = hf_path
hf_path

'gemma_3_4B_mntp_no_quant_ukr'

In [7]:
from peft import PeftModel

model = PeftModel.from_pretrained(
    base_model,
    "nuinashco/GPT2Vec",
    subfolder=config.adapter_args['subfolder']
)

model = model.merge_and_unload()

adapter_config.json:   0%|          | 0.00/795 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/477M [00:00<?, ?B/s]



## New LoRA

In [8]:
from peft import get_peft_model, LoraConfig, TaskType

lora_config = LoraConfig(
    r=64,  # the dimension of the low-rank matrices
    lora_alpha=128, # scaling factor for LoRA activations vs pre-trained weight activations
    lora_dropout=0.05, 
    bias='none',
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM,
    target_modules=['o_proj', 'v_proj', "q_proj", "k_proj", "gate_proj", "down_proj", "up_proj"]
) 

model = get_peft_model(model, lora_config)
# Trainable Parameters
model.print_trainable_parameters()


trainable params: 119,209,984 || all params: 3,999,478,274 || trainable%: 2.9806


# Data

In [9]:
import pandas as pd

df = pd.read_parquet(config.data_path + "train.parquet")
cv = pd.read_csv(config.cv_path)
df = df.merge(cv, on='id', how='left')

df_test = pd.read_csv(config.data_path + "test.csv")

In [10]:
from utils.data import preprocess_df

df.trigger_words = df.trigger_words.apply(lambda x: [] if x is None else x)

is_valid_mask = (df.fold == 4)
df_train = df[~is_valid_mask].copy()
df_valid = df[is_valid_mask].copy()


df_train = preprocess_df(df_train, tokenizer=tokenizer, max_length=config.max_length)
df_valid = preprocess_df(df_valid, tokenizer=tokenizer, max_length=None)
df_test = preprocess_df(df_test, tokenizer=tokenizer, max_length=None)

  0%|          | 0/3058 [00:00<?, ?it/s]

  0%|          | 0/764 [00:00<?, ?it/s]

  0%|          | 0/5735 [00:00<?, ?it/s]

In [11]:
train_columns = list(df_train.seq_labels.iloc[0].keys()) +\
                ['content', 'trigger_words']
test_columns = list(df_train.seq_labels.iloc[0].keys()) + ['content']

ds_train = Dataset.from_pandas(df_train[train_columns].reset_index(drop=True))
ds_valid = Dataset.from_pandas(df_valid[train_columns].reset_index(drop=True))
ds_test = Dataset.from_pandas(df_test[test_columns].reset_index(drop=True))

# Train

In [12]:
from itertools import chain

train_labels = df_train.labels.tolist() + df_valid.labels.tolist()
positive_class_balance = pd.Series(list(chain(*train_labels))).mean()

positive_class_balance

0.2294743405414578

In [13]:
from transformers import DataCollatorForTokenClassification
from utils.trainer import SpanIdentificationTrainer

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = SpanIdentificationTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
    desired_positive_ratio=positive_class_balance
)

  super().__init__(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [14]:
wandb.init(**config.wandb_init_args)

trainer.train()

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mivan-havlytskyi[0m ([33mivan-havlytskyiz[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Thold
100,0.4962,0.505602,0.554363,0.690868,0.615134,0.1
200,0.34,0.413327,0.573869,0.70946,0.634502,0.49
300,0.4237,0.386297,0.54553,0.755477,0.633564,0.2
400,0.3574,0.389951,0.549643,0.760586,0.638134,0.23
500,0.3259,0.385599,0.550944,0.758295,0.6382,0.25
600,0.2576,0.417451,0.541292,0.778711,0.63865,0.25
700,0.2281,0.428584,0.553616,0.746211,0.635645,0.24
800,0.2131,0.429388,0.556852,0.734772,0.633558,0.23
900,0.199,0.440697,0.543598,0.758447,0.633297,0.18


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/gemma-3-4B--mntp-ukr/checkpoint-100)... Done. 1.5s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/gemma-3-4B--mntp-ukr/checkpoint-200)... Done. 1.5s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/gemma-3-4B--mntp-ukr/checkpoint-300)... Done. 1.5s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/gemma-3-4B--mntp-ukr/checkpoint-400)... Done. 1.5s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/gemma-3-4B--mntp-ukr/checkpoint-500)... Done. 1.5s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/gemma-3-4B--mntp-ukr/checkpoint-600)... Done. 1.5s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/gemma-3-4B--mntp-ukr/checkpoint-700)... Done. 1.5s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/gemma-3-4B--mntp-ukr/checkpoint-800)... Done. 1.5s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/gemma-3-4B--mntp-ukr/checkpoint-900)... Done. 1.5s
  return fn(*args, **kwargs)
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/gemma-3-4B--mntp-ukr/checkpoint-960)... Done. 1.5s
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


TrainOutput(global_step=960, training_loss=0.3255849284430345, metrics={'train_runtime': 3859.4682, 'train_samples_per_second': 3.962, 'train_steps_per_second': 0.249, 'total_flos': 2.015598036588244e+17, 'train_loss': 0.3255849284430345, 'epoch': 5.0})

# Inference

## Checkpoint

In [15]:
from utils.metric import score as char_f1
from utils.utils import inference_aggregation

FINETUNED_MODEL = f'./checkpoints/{config.wandb_init_args["name"]}/checkpoint-600'

In [16]:
trainer._load_from_checkpoint(FINETUNED_MODEL)

## Threshold Selection

In [17]:
valid_preds = trainer.predict(ds_valid)
valid_metrics = trainer.compute_metrics((valid_preds.predictions, valid_preds.label_ids))

valid_metrics

  0%|          | 0/41 [00:00<?, ?it/s]

  0%|          | 0/41 [00:00<?, ?it/s]

{'precision': 0.5412923585100422,
 'recall': 0.778711199361879,
 'f1': 0.6386504327008397,
 'thold': 0.25}

In [18]:
from utils.utils import find_class_balance_threshold

test_preds = trainer.predict(ds_test)
test_probabilities = torch.softmax(torch.tensor(test_preds.predictions), dim=-1).cpu().numpy()

test_distr_th = find_class_balance_threshold(
    desired_positive_ratio=positive_class_balance,
    probabilities=test_probabilities,
    labels=test_preds.label_ids
    )

print(test_distr_th)

  0%|          | 0/41 [00:00<?, ?it/s]

0.5445454545454546


In [19]:
final_th = valid_metrics['thold']

## CV-Score

In [20]:
valid_probabilities = torch.softmax(torch.tensor(valid_preds.predictions), dim=-1).cpu().numpy()
valid_results = inference_aggregation(
    probabilities=valid_probabilities,
    labels=valid_preds.label_ids,
    offset_mappings=ds_valid['offset_mapping'],
    thold=final_th
)

In [21]:
from copy import deepcopy

df_valid_gt = df[df.fold==4][['id', 'trigger_words']].reset_index(drop=True)
df_valid = deepcopy(df_valid_gt)
df_valid['trigger_words'] = valid_results

cv_score = char_f1(df_valid_gt, df_valid, row_id_column_name='id')
cv_score

0.6386504327008397

## Predict Test

In [22]:
test_results = inference_aggregation(
    probabilities=test_probabilities,
    labels=test_preds.label_ids,
    offset_mappings=ds_test['offset_mapping'],
    thold=final_th
)

In [23]:
df_test_gt = pd.read_csv(config.data_path + 'solution.csv')[['id', 'trigger_words']]
df_test = deepcopy(df_test_gt)
df_test['trigger_words'] = test_results

test_score = char_f1(df_test_gt, df_test, row_id_column_name='id')
test_score

0.6329901429504362