# Setup

In [1]:
import sys
sys.path.append('..')

In [2]:
from dataclasses import dataclass
from tqdm.autonotebook import tqdm
from utils.utils import set_seeds
import wandb

import numpy as np
import torch
from datasets import Dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    TrainingArguments
)

set_seeds(seed=42)
tqdm.pandas()

  from tqdm.autonotebook import tqdm


# Config

In [3]:
@dataclass
class Config:
    data_path: str = "../../../data/unlp-2025/"
    cv_path: str = "../../../data/unlp-2025/cv_split.csv"
    
    pretrained: str = "google/gemma-3-4b-pt"
    max_length: int = 1024

    
    wandb_init_args = {
        'project': "sl-unlp-2025",
        'entity': "havlytskyi-thesis",
        'name': "gemma-3-4B--no-pretrain"
    }

config = Config()

# Training Arguments

In [4]:
training_args = TrainingArguments(
    output_dir=f'./checkpoints/{config.wandb_init_args["name"]}',
    logging_dir=f'./logs/{config.wandb_init_args["name"]}',
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    warmup_ratio=0.0,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=4,
    # gradient_accumulation_steps=1,
    bf16=True,
    report_to="wandb",
    optim='adamw_torch',
    eval_strategy='steps',
    save_strategy="steps",
    eval_steps=100,
    logging_steps=10,
    save_steps=100,
    save_total_limit=10,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    load_best_model_at_end=True,
)

# Instantiate the tokenizer & model

In [5]:
from src.models.gemma3 import biGemma3ForTokenClassification
from transformers import BitsAndBytesConfig

tokenizer = AutoTokenizer.from_pretrained(config.pretrained)


quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16
)


model = biGemma3ForTokenClassification.from_pretrained(
    config.pretrained,
    id2label={0: 0, 1: 1},
    label2id={0: 0, 1: 1},
    quantization_config=quant_config
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of biGemma3ForTokenClassification were not initialized from the model checkpoint at google/gemma-3-4b-pt and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig, TaskType


lora_config = LoraConfig(
    r=64,  # the dimension of the low-rank matrices
    lora_alpha=128, # scaling factor for LoRA activations vs pre-trained weight activations
    lora_dropout=0.05, 
    bias='none',
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM,
    target_modules=['o_proj', 'v_proj', "q_proj", "k_proj", "gate_proj", "down_proj", "up_proj"]
) 

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
# Trainable Parameters
model.print_trainable_parameters()


trainable params: 119,209,984 || all params: 3,999,478,274 || trainable%: 2.9806


# Data

In [7]:
import pandas as pd

df = pd.read_parquet(config.data_path + "train.parquet")
cv = pd.read_csv(config.cv_path)
df = df.merge(cv, on='id', how='left')

df_test = pd.read_csv(config.data_path + "test.csv")

In [8]:
from utils.data import preprocess_df

df.trigger_words = df.trigger_words.apply(lambda x: [] if x is None else x)

is_valid_mask = (df.fold == 4)
df_train = df[~is_valid_mask].copy()
df_valid = df[is_valid_mask].copy()


df_train = preprocess_df(df_train, tokenizer=tokenizer, max_length=config.max_length)
df_valid = preprocess_df(df_valid, tokenizer=tokenizer, max_length=None)
df_test = preprocess_df(df_test, tokenizer=tokenizer, max_length=None)

  0%|          | 0/3058 [00:00<?, ?it/s]

  0%|          | 0/764 [00:00<?, ?it/s]

  0%|          | 0/5735 [00:00<?, ?it/s]

In [9]:
train_columns = list(df_train.seq_labels.iloc[0].keys()) +\
                ['content', 'trigger_words']
test_columns = list(df_train.seq_labels.iloc[0].keys()) + ['content']

ds_train = Dataset.from_pandas(df_train[train_columns].reset_index(drop=True))
ds_valid = Dataset.from_pandas(df_valid[train_columns].reset_index(drop=True))
ds_test = Dataset.from_pandas(df_test[test_columns].reset_index(drop=True))

# Train

In [10]:
from itertools import chain

train_labels = df_train.labels.tolist() + df_valid.labels.tolist()
positive_class_balance = pd.Series(list(chain(*train_labels))).mean()

positive_class_balance

0.2294743405414578

In [11]:
from transformers import DataCollatorForTokenClassification
from utils.trainer import SpanIdentificationTrainer

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = SpanIdentificationTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
    desired_positive_ratio=positive_class_balance
)

  super().__init__(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [12]:
wandb.init(**config.wandb_init_args)

trainer.train()

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mivan-havlytskyi[0m ([33mivan-havlytskyiz[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Thold
100,0.6633,0.538956,0.382927,0.686913,0.491733,0.11
200,0.4775,0.474512,0.410803,0.717606,0.522496,0.24
300,0.4788,0.471829,0.418401,0.73428,0.533059,0.16
400,0.441,0.45755,0.416574,0.751082,0.535913,0.19
500,0.4286,0.465574,0.424926,0.747773,0.541909,0.3
600,0.3638,0.460517,0.436286,0.719906,0.543309,0.26
700,0.3722,0.466023,0.435753,0.698759,0.536771,0.25
800,0.3297,0.469881,0.417691,0.752244,0.537133,0.19
900,0.3217,0.475636,0.419306,0.746746,0.537052,0.17


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/gemma-3-4B--no-pretrain/checkpoint-100)... Done. 1.5s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/gemma-3-4B--no-pretrain/checkpoint-200)... Done. 1.5s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/gemma-3-4B--no-pretrain/checkpoint-300)... Done. 1.5s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/gemma-3-4B--no-pretrain/checkpoint-400)... Done. 1.5s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/gemma-3-4B--no-pretrain/checkpoint-500)... Done. 1.5s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/gemma-3-4B--no-pretrain/checkpoint-600)... Done. 1.5s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/gemma-3-4B--no-pretrain/checkpoint-700)... Done. 1.5s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/gemma-3-4B--no-pretrain/checkpoint-800)... Done. 1.5s
  return fn(*args, **kwargs)


  0%|          | 0/41 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/gemma-3-4B--no-pretrain/checkpoint-900)... Done. 1.5s
  return fn(*args, **kwargs)
[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/gemma-3-4B--no-pretrain/checkpoint-960)... Done. 1.5s
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


TrainOutput(global_step=960, training_loss=0.4300747687617938, metrics={'train_runtime': 3865.1302, 'train_samples_per_second': 3.956, 'train_steps_per_second': 0.248, 'total_flos': 2.015598036588244e+17, 'train_loss': 0.4300747687617938, 'epoch': 5.0})

# Inference

## Checkpoint

In [13]:
from utils.metric import score as char_f1
from utils.utils import inference_aggregation

FINETUNED_MODEL = f'./checkpoints/{config.wandb_init_args["name"]}/checkpoint-600'

In [14]:
trainer._load_from_checkpoint(FINETUNED_MODEL)

## Threshold Selection

In [15]:
valid_preds = trainer.predict(ds_valid)
valid_metrics = trainer.compute_metrics((valid_preds.predictions, valid_preds.label_ids))

valid_metrics

  0%|          | 0/41 [00:00<?, ?it/s]

  0%|          | 0/41 [00:00<?, ?it/s]

{'precision': 0.4362860301869327,
 'recall': 0.7199056395634982,
 'f1': 0.5433091792853689,
 'thold': 0.26}

In [16]:
from utils.utils import find_class_balance_threshold

test_preds = trainer.predict(ds_test)
test_probabilities = torch.softmax(torch.tensor(test_preds.predictions), dim=-1).cpu().numpy()

test_distr_th = find_class_balance_threshold(
    desired_positive_ratio=positive_class_balance,
    probabilities=test_probabilities,
    labels=test_preds.label_ids
    )

print(test_distr_th)

  0%|          | 0/41 [00:00<?, ?it/s]

0.44555555555555554


In [17]:
final_th = valid_metrics['thold']

## CV-Score

In [18]:
valid_probabilities = torch.softmax(torch.tensor(valid_preds.predictions), dim=-1).cpu().numpy()
valid_results = inference_aggregation(
    probabilities=valid_probabilities,
    labels=valid_preds.label_ids,
    offset_mappings=ds_valid['offset_mapping'],
    thold=final_th
)

In [19]:
from copy import deepcopy

df_valid_gt = df[df.fold==4][['id', 'trigger_words']].reset_index(drop=True)
df_valid = deepcopy(df_valid_gt)
df_valid['trigger_words'] = valid_results

cv_score = char_f1(df_valid_gt, df_valid, row_id_column_name='id')
cv_score

0.5433091792853689

## Predict Test

In [20]:
test_results = inference_aggregation(
    probabilities=test_probabilities,
    labels=test_preds.label_ids,
    offset_mappings=ds_test['offset_mapping'],
    thold=final_th
)

In [21]:
df_test_gt = pd.read_csv(config.data_path + 'solution.csv')[['id', 'trigger_words']]
df_test = deepcopy(df_test_gt)
df_test['trigger_words'] = test_results

test_score = char_f1(df_test_gt, df_test, row_id_column_name='id')
test_score

0.5391665327161375