In [1]:
import os
from recruitair.modeling.custom_qwen import customize_qwen_model, freeze_custom_qwen_backbone
import torch
from torch import nn
import contextlib
from transformers import AutoModelForCausalLM, AutoTokenizer
import time
import pathlib
from tqdm import tqdm
from recruitair.modeling.tokenize import ResumeAndCriteriaTokenizer
import mlflow

# Optional debugging switches, read by the training loop further down:
# os.environ["STORE_INPUT_TENSORS"] = "true" # Uncomment this line to store input tensors that cause OOM errors
# os.environ["STORE_MEMORY_SNAPSHOTS"] = "true" # Uncomment this line to store CUDA memory snapshots on OOM errors
# Use tool https://docs.pytorch.org/memory_viz to analyze the memory snapshots

# Enable MLflow system-metrics logging for the runs started by this notebook.
os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "true"
# Honour the standard MLFLOW_TRACKING_URI variable when it is set, so the notebook can be
# pointed at another tracking server without editing code; otherwise use the project server.
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI", "http://nattech.fib.upc.edu:40380/"))
mlflow.set_experiment("criteria-evaluation/custom-qwen-finetune")
mlflow.pytorch.autolog()

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained causal LM and its tokenizer. The weights are placed on the CPU here;
# the customized model is moved to `device` in a later cell.
model_name = "Qwen/Qwen3-0.6B"
# `dtype` replaces the deprecated `torch_dtype` keyword (the installed transformers version
# emits a deprecation warning for the old name); "auto" keeps the checkpoint's own dtype.
original_model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", device_map="cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Wrap the pretrained network into the project's custom model (it exposes `.backbone` and
# `.head`, which the training loop relies on), then freeze the backbone.
# NOTE(review): presumably freezing sets requires_grad=False on the backbone so only the
# head is trained - confirm in recruitair.modeling.custom_qwen.
model = customize_qwen_model(original_model)
freeze_custom_qwen_backbone(model)

`torch_dtype` is deprecated! Use `dtype` instead!


In [3]:
# Load the training and validation datasets from "../data/processed/train.jsonl" and "../data/processed/validation.jsonl"
import pandas as pd

# Both splits are JSON Lines files (one JSON record per line) and are read the same way.
train_df, val_df = (
    pd.read_json(jsonl_path, lines=True, encoding="utf-8")
    for jsonl_path in ("../data/processed/train.jsonl", "../data/processed/validation.jsonl")
)

# Convert the DataFrames to PyTorch Datasets
from torch.utils.data import Dataset, DataLoader


class CriteriaDataset(Dataset):
    """Map-style dataset that serves the rows of a DataFrame as training examples.

    The wrapped DataFrame must provide the columns "resume", "criteria" and "score".
    Each item is a ``(resume, criteria, score / 10.0)`` tuple.
    """

    def __init__(self, dataframe):
        # Only a reference is kept; rows are looked up on demand in __getitem__.
        self.dataframe = dataframe

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the example stored at positional index ``idx``."""
        record = self.dataframe.iloc[idx]
        resume_text = record["resume"]
        criteria_text = record["criteria"]
        # Normalize score to [0, 1] (presumably raw scores range from 0 to 10 - verify against the data).
        normalized_score = record["score"] / 10.0
        return resume_text, criteria_text, normalized_score


# Wrap each split in a Dataset, then batch it. Only the training split is shuffled;
# validation batches keep the order of the file.
train_dataset, val_dataset = (CriteriaDataset(split_df) for split_df in (train_df, val_df))
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=8, shuffle=False)

In [4]:
# Move the model to the target device before the optimizer is created.
model.to(device)
# Optimize only the parameters that still require gradients. The backbone was frozen when
# the model was built, so presumably this is just the head - verify if that changes.
optimizer = torch.optim.Adam(
    (param for param in model.parameters() if param.requires_grad),
    lr=1e-3,
    weight_decay=1e-4,
)
# Mean squared error between the predicted and the normalized target score.
criterion = nn.MSELoss()

# Project tokenizer wrapper; the training loop calls it with a batch of resumes and a batch
# of criteria and receives (padded_input_tokens, attention_mask).
custom_tokenizer = ResumeAndCriteriaTokenizer(tokenizer)

In [5]:
def _env_flag(name):
    """Return True when the environment variable `name` is set to "true" (case-insensitive)."""
    return os.getenv(name, "false").lower() == "true"


# Debugging switches. They are read once so that every check below agrees with the
# directories prepared here.
store_input_tensors = _env_flag("STORE_INPUT_TENSORS")
# CUDA memory snapshots need a CUDA device, so the switch is ignored on CPU-only machines.
store_memory_snapshots = _env_flag("STORE_MEMORY_SNAPSHOTS") and torch.cuda.is_available()

if store_input_tensors:
    pathlib.Path("../data/raw/input-data").mkdir(parents=True, exist_ok=True)
if store_memory_snapshots:
    # Snapshots are written into this directory, so make sure it exists up front.
    pathlib.Path("../data/raw/cuda-mem-snapshots").mkdir(parents=True, exist_ok=True)

num_epochs = 5
with mlflow.start_run(run_name="custom-qwen-finetune"):
    # Log the hyperparameters from the live objects instead of repeating literals, so the
    # recorded values cannot drift from what is actually used for training.
    optimizer_name = type(optimizer).__name__
    optimizer_settings = optimizer.param_groups[0]
    mlflow.log_params(
        {
            "num_epochs": num_epochs,
            "optimizer": optimizer_name,
            f"optimizer/{optimizer_name}/learning_rate": optimizer_settings["lr"],
            f"optimizer/{optimizer_name}/weight_decay": optimizer_settings["weight_decay"],
            "criterion": type(criterion).__name__,
            "batch_size": train_loader.batch_size,
            "original-model": model_name,
        }
    )

    train_start = time.monotonic()

    if store_memory_snapshots:
        # Private PyTorch API: start recording allocation history so that the snapshots
        # dumped below contain it. Skipped unless snapshots were requested.
        torch.cuda.memory._record_memory_history()
    for epoch in range(num_epochs):
        epoch_start = time.monotonic()
        # Put the model in training mode, but keep the backbone in eval mode; only the
        # head is switched (back) to training mode.
        model.train()
        model.backbone.eval()
        model.head.train()
        running_loss = 0.0
        # The context manager closes the progress bar even if a batch raises.
        with tqdm(
            total=len(train_loader), desc=f"Epoch {epoch+1}/{num_epochs}", leave=False, unit="batch"
        ) as bar:
            for batch_index, (resume_batch, criteria_batch, score_batch) in enumerate(train_loader):
                batch_number = batch_index + 1  # 1-based, used in file names
                padded_input_tokens, attention_mask = custom_tokenizer(resume_batch, criteria_batch)
                padded_input_tokens = padded_input_tokens.to(device)
                attention_mask = attention_mask.to(device)
                # Targets as a float32 column (unsqueeze adds a trailing dimension),
                # presumably matching the shape of the head's output - TODO confirm.
                score_batch = score_batch.type(torch.float32).unsqueeze(1).to(device)

                optimizer.zero_grad()

                if store_input_tensors:
                    # Persist the inputs BEFORE the forward pass: if this batch crashes the
                    # process, the offending inputs are already on disk for debugging.
                    torch.save(
                        {
                            "padded_input_tokens": padded_input_tokens,
                            "attention_mask": attention_mask,
                            "score_batch": score_batch,
                            "resume_batch": resume_batch,
                            "criteria_batch": criteria_batch,
                            "epoch": epoch,
                            "batch_index": batch_index,
                        },
                        f"../data/raw/input-data/epoch{epoch+1}-batch{batch_number}-inputs.pth",
                    )

                try:
                    preds = model(input_ids=padded_input_tokens, attention_mask=attention_mask)
                    loss = criterion(preds, score_batch)
                    loss.backward()
                    optimizer.step()
                except Exception:
                    if store_memory_snapshots:
                        # Best effort: a failing dump must not mask the original error.
                        with contextlib.suppress(Exception):
                            torch.cuda.memory._dump_snapshot(
                                f"../data/raw/cuda-mem-snapshots/epoch{epoch+1}-batch{batch_number}-error.pickle"
                            )
                    # Bare raise keeps the original exception and traceback untouched.
                    raise

                batch_loss = loss.item()
                # The criterion returns a per-batch mean; weight it by the batch size so the
                # epoch loss is a per-sample mean even when the last batch is smaller.
                running_loss += batch_loss * len(resume_batch)
                # Update progress bar's description with current loss
                bar.set_postfix(loss=batch_loss)
                bar.update(1)
                if store_memory_snapshots:
                    try:
                        torch.cuda.memory._dump_snapshot(
                            f"../data/raw/cuda-mem-snapshots/epoch{epoch+1}-batch{batch_number}.pickle"
                        )
                    except Exception as e:
                        print(f"Could not dump CUDA memory snapshot: {e}")
        epoch_loss = running_loss / len(train_loader.dataset)
        mlflow.log_metric("epoch_duration_seconds", time.monotonic() - epoch_start, step=epoch)
        mlflow.log_metric("train_loss", epoch_loss, step=epoch)
        print(f"Epoch {epoch+1}/{num_epochs} loss: {epoch_loss:.4f}")
        # Validation: each epoch's validation metrics are logged to their own nested child run.
        with mlflow.start_run(run_name="custom-qwen-finetune-validation", nested=True):
            validation_start = time.monotonic()
            model.eval()
            val_loss = 0.0
            with tqdm(
                total=len(val_loader), desc=f"Validation {epoch+1}/{num_epochs}", leave=False, unit="batch"
            ) as bar, torch.no_grad():
                for resume_batch, criteria_batch, score_batch in val_loader:
                    padded_input_tokens, attention_mask = custom_tokenizer(resume_batch, criteria_batch)
                    padded_input_tokens = padded_input_tokens.to(device)
                    attention_mask = attention_mask.to(device)
                    score_batch = score_batch.type(torch.float32).unsqueeze(1).to(device)

                    preds = model(input_ids=padded_input_tokens, attention_mask=attention_mask)
                    batch_loss = criterion(preds, score_batch).item()
                    val_loss += batch_loss * len(resume_batch)
                    bar.set_postfix(loss=batch_loss)
                    bar.update(1)
            val_epoch_loss = val_loss / len(val_loader.dataset)
            print(f"Validation loss: {val_epoch_loss:.4f}")
            mlflow.log_metric("validation_loss", val_epoch_loss, step=epoch)
            mlflow.log_metric("validation_duration_seconds", time.monotonic() - validation_start, step=epoch)
        # Log and register a new model version after every epoch (in the parent run).
        mlflow.pytorch.log_model(
            model, artifact_path="model", registered_model_name="custom-qwen-finetuned", step=epoch
        )
    mlflow.log_metric("total_training_duration_seconds", time.monotonic() - train_start)

2025/10/10 21:28:12 INFO mlflow.system_metrics.system_metrics_monitor: Skip logging GPU metrics. Set logger level to DEBUG for more details.
2025/10/10 21:28:12 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
2025/10/10 22:14:06 INFO mlflow.system_metrics.system_metrics_monitor: Skip logging GPU metrics. Set logger level to DEBUG for more details.
2025/10/10 22:14:06 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Epoch 1/5 loss: 0.3272


                                                                                

Validation loss: 0.3197


2025/10/10 22:32:13 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/10/10 22:32:13 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


🏃 View run custom-qwen-finetune-validation at: http://nattech.fib.upc.edu:40380/#/experiments/2/runs/1763dc3643cb4f1faca0fc242dac7da3
🧪 View experiment at: http://nattech.fib.upc.edu:40380/#/experiments/2


Successfully registered model 'custom-qwen-finetuned'.
2025/10/10 22:32:30 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: custom-qwen-finetuned, version 1
Created version '1' of model 'custom-qwen-finetuned'.
2025/10/10 23:22:18 INFO mlflow.system_metrics.system_metrics_monitor: Skip logging GPU metrics. Set logger level to DEBUG for more details.
2025/10/10 23:22:18 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Epoch 2/5 loss: 0.3272


                                                                                

Validation loss: 0.3197


2025/10/10 23:35:35 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...


🏃 View run custom-qwen-finetune-validation at: http://nattech.fib.upc.edu:40380/#/experiments/2/runs/b0bdacf339464739af1cbb1ff9ab65b5
🧪 View experiment at: http://nattech.fib.upc.edu:40380/#/experiments/2


2025/10/10 23:35:35 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
Registered model 'custom-qwen-finetuned' already exists. Creating a new version of this model...
2025/10/10 23:36:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: custom-qwen-finetuned, version 2
Created version '2' of model 'custom-qwen-finetuned'.
2025/10/11 00:28:13 INFO mlflow.system_metrics.system_metrics_monitor: Skip logging GPU metrics. Set logger level to DEBUG for more details.
2025/10/11 00:28:13 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Epoch 3/5 loss: 0.3272


                                                                                

Validation loss: 0.3197


2025/10/11 00:41:21 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...


🏃 View run custom-qwen-finetune-validation at: http://nattech.fib.upc.edu:40380/#/experiments/2/runs/5c34360979504f35806c115baf2f876e
🧪 View experiment at: http://nattech.fib.upc.edu:40380/#/experiments/2


2025/10/11 00:41:21 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
Registered model 'custom-qwen-finetuned' already exists. Creating a new version of this model...
2025/10/11 00:41:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: custom-qwen-finetuned, version 3
Created version '3' of model 'custom-qwen-finetuned'.
2025/10/11 01:33:06 INFO mlflow.system_metrics.system_metrics_monitor: Skip logging GPU metrics. Set logger level to DEBUG for more details.
2025/10/11 01:33:06 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Epoch 4/5 loss: 0.3272


                                                                                

Validation loss: 0.3197


2025/10/11 01:46:21 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...


🏃 View run custom-qwen-finetune-validation at: http://nattech.fib.upc.edu:40380/#/experiments/2/runs/607d667718d84e1f9d2a4ccc5c878d94
🧪 View experiment at: http://nattech.fib.upc.edu:40380/#/experiments/2


2025/10/11 01:46:21 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
Registered model 'custom-qwen-finetuned' already exists. Creating a new version of this model...
2025/10/11 01:46:39 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: custom-qwen-finetuned, version 4
Created version '4' of model 'custom-qwen-finetuned'.
2025/10/11 02:36:43 INFO mlflow.system_metrics.system_metrics_monitor: Skip logging GPU metrics. Set logger level to DEBUG for more details.
2025/10/11 02:36:43 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Epoch 5/5 loss: 0.2978


                                                                                

Validation loss: 0.2909


2025/10/11 02:49:43 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...


🏃 View run custom-qwen-finetune-validation at: http://nattech.fib.upc.edu:40380/#/experiments/2/runs/0d2a986341884d11b8070b8649424173
🧪 View experiment at: http://nattech.fib.upc.edu:40380/#/experiments/2


2025/10/11 02:49:43 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
Registered model 'custom-qwen-finetuned' already exists. Creating a new version of this model...
2025/10/11 02:50:21 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: custom-qwen-finetuned, version 5
Created version '5' of model 'custom-qwen-finetuned'.
2025/10/11 02:50:22 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...


🏃 View run custom-qwen-finetune at: http://nattech.fib.upc.edu:40380/#/experiments/2/runs/fffdb50c061246fcbcff1eb1cc1fc51e
🧪 View experiment at: http://nattech.fib.upc.edu:40380/#/experiments/2


2025/10/11 02:50:22 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
