# Prediction of Department: Fine-Tuning a Classification Model


In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, classification_report

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
from sklearn.utils import resample
from matplotlib import pyplot as plt

In [11]:
GH_USER = "luisadosch"
GH_REPO = "Final-Project-snapAddy"
BRANCH = "main"


def get_github_url(relative_path):
    return f"https://raw.githubusercontent.com/{GH_USER}/{GH_REPO}/{BRANCH}/{relative_path}"

In [12]:
jobs_annotated = pd.read_csv(get_github_url("data/processed/jobs_annotated.csv"))
df_department = pd.read_csv(get_github_url("data/raw/department-v2.csv"))

# test set from annotated CVs (ACTIVE only) -> out-of-production set
jobs_annotated_df = jobs_annotated.loc[
    jobs_annotated["status"] == "ACTIVE",
    ["position", "department"]
].copy()

jobs_annotated_df = jobs_annotated_df.rename(columns={"position": "text", "department": "label"})

# base train/test dfs (strings)
df_department = df_department[["text", "label"]].copy()
jobs_annotated_df = jobs_annotated_df[["text", "label"]].copy()

print("fine tuning data:", len(df_department), "CV labeled data:", len(jobs_annotated_df))
print("fine-tune label counts:\n", df_department["label"].value_counts())
print("out-of-production label counts:\n", jobs_annotated_df["label"].value_counts())

fine tuning data: 10145 CV labeled data: 623
fine-tune label counts:
 label
Marketing                 4295
Sales                     3328
Information Technology    1305
Business Development       620
Project Management         201
Consulting                 167
Administrative              83
Other                       42
Purchasing                  40
Customer Support            33
Human Resources             31
Name: count, dtype: int64
out-of-production label counts:
 label
Other                     344
Information Technology     62
Sales                      46
Consulting                 39
Project Management         39
Marketing                  22
Business Development       20
Human Resources            16
Purchasing                 15
Administrative             14
Customer Support            6
Name: count, dtype: int64


In [13]:
train_df_base, temp_df = train_test_split(
    df_department,
    test_size=0.30,              # 70% train, 30% temp
    stratify=df_department["label"],
    random_state=42,
)

val_df_base, df_department_test = train_test_split(
    temp_df,
    test_size=0.50,              # 15% val, 15% test
    stratify=temp_df["label"],
    random_state=42,
)

print("train:", len(train_df_base), "val:", len(val_df_base), "test:", len(df_department_test))
print("train label counts:\n", train_df_base["label"].value_counts())
print("val label counts:\n", val_df_base["label"].value_counts())
print("test label counts:\n", df_department_test["label"].value_counts())

train: 7101 val: 1522 test: 1522
train label counts:
 label
Marketing                 3006
Sales                     2330
Information Technology     913
Business Development       434
Project Management         141
Consulting                 117
Administrative              58
Other                       29
Purchasing                  28
Customer Support            23
Human Resources             22
Name: count, dtype: int64
val label counts:
 label
Marketing                 644
Sales                     499
Information Technology    196
Business Development       93
Project Management         30
Consulting                 25
Administrative             12
Other                       7
Purchasing                  6
Human Resources             5
Customer Support            5
Name: count, dtype: int64
test label counts:
 label
Marketing                 645
Sales                     499
Information Technology    196
Business Development       93
Project Management         30
Consulting      

In [14]:
def add_synthetic_department(train_df: pd.DataFrame, synthetic_csv_relpath: str) -> pd.DataFrame:
    syn = pd.read_csv(get_github_url(synthetic_csv_relpath))

    # expect columns: position, department
    syn = syn[["position", "department"]].copy()
    syn = syn.rename(columns={"position": "text", "department": "label"})
    syn = syn.dropna(subset=["text", "label"])

    out = pd.concat([train_df[["text", "label"]], syn[["text", "label"]]], ignore_index=True)
    return out


In [15]:
MODEL_CKPT = "xlm-roberta-base"
MAX_LEN = 80

tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )

In [16]:
def to_hf_dataset(df: pd.DataFrame, label_col: str = "labels") -> Dataset:
    ds = Dataset.from_pandas(df[["text", label_col]].copy(), preserve_index=False)
    ds = ds.map(tokenize, batched=True)
    ds.set_format(type="torch", columns=["input_ids", "attention_mask", label_col])
    return ds

In [17]:
def eval_split(trainer, train_ds, val_ds, test_ds):
    train_metrics = trainer.evaluate(train_ds)
    val_metrics   = trainer.evaluate(val_ds)
    test_metrics  = trainer.evaluate(test_ds)
    print("train:", train_metrics)
    print("val:  ", val_metrics)
    print("test: ", test_metrics)
    return train_metrics, val_metrics, test_metrics

In [18]:
def loss_curves(trainer):
    logs = pd.DataFrame(trainer.state.log_history)

    train_loss = logs.dropna(subset=["loss"])[["step", "loss"]].copy()
    eval_loss  = logs.dropna(subset=["eval_loss"])[["step", "eval_loss"]].copy()

    plt.figure()
    plt.plot(train_loss["step"], train_loss["loss"])
    plt.xlabel("step")
    plt.ylabel("train loss")
    plt.title("Train loss")
    plt.show()

    plt.figure()
    plt.plot(eval_loss["step"], eval_loss["eval_loss"])
    plt.xlabel("step")
    plt.ylabel("eval loss")
    plt.title("Eval loss")
    plt.show()

    return logs

classification pipeline

In [19]:
def compute_metrics_cls(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
    }

In [20]:
def predict_table_cls(trainer, ds, df_text_label, id2label):
    pred = trainer.predict(ds)
    probs = pred.predictions
    y_pred = np.argmax(probs, axis=-1)

    out = df_text_label.copy().reset_index(drop=True)
    out["pred_id"] = y_pred
    out["pred_label"] = [id2label[int(i)] for i in y_pred]
    out["correct"] = (out["pred_label"] == out["label"])
    return out

In [21]:
def report_cls(trainer, ds, id2label):
    pred = trainer.predict(ds)
    y_pred = np.argmax(pred.predictions, axis=-1)
    y_true = pred.label_ids
    print(classification_report(y_true, y_pred, target_names=[id2label[i] for i in range(len(id2label))]))

In [22]:
def oversample_df(df, label_col="label", random_state=42):
    max_n = df[label_col].value_counts().max()
    parts = []
    for lab, g in df.groupby(label_col):
        parts.append(resample(g, replace=True, n_samples=max_n, random_state=random_state))
    return pd.concat(parts).sample(frac=1, random_state=random_state).reset_index(drop=True)


def run_classification(
    train_df,
    test_df,
    val_df,
    out_dir="dept_cls",
    do_oversample=False,
    random_state=42,
):
    # label space ONLY from train_df (no leakage)
    label_list = sorted(train_df["label"].unique())
    label2id = {l: i for i, l in enumerate(label_list)}
    id2label = {i: l for l, i in label2id.items()}

    tr = train_df.copy()
    va = val_df.copy()
    te = test_df.copy()

    tr["labels"] = tr["label"].map(label2id).astype(int)
    va["labels"] = va["label"].map(label2id)
    te["labels"] = te["label"].map(label2id)

    # oversample train only (optional)
    train_os = tr
    if do_oversample:
        train_os = oversample_df(tr, label_col="label", random_state=random_state)

    train_ds = to_hf_dataset(train_os, label_col="labels")
    val_ds = to_hf_dataset(va, label_col="labels")
    test_ds = to_hf_dataset(te, label_col="labels")

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_CKPT,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
    )

    args = TrainingArguments(
        output_dir=out_dir,
        learning_rate=1e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=10,
        weight_decay=0.05,
        warmup_ratio=0.06,
        do_eval=True,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_f1_macro",
        greater_is_better=True,
        fp16=True,
        report_to="none",
        dataloader_num_workers=0,
        logging_strategy="steps",
        logging_steps=50,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_cls,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    trainer.train()

    eval_split(trainer, train_ds, val_ds, test_ds)
    loss_curves(trainer)

    train_pred = predict_table_cls(trainer, train_ds, train_os[["text", "label"]], id2label)
    val_pred = predict_table_cls(trainer, val_ds, va[["text", "label"]], id2label)
    test_pred = predict_table_cls(trainer, test_ds, te[["text", "label"]], id2label)

    report_cls(trainer, val_ds, id2label)

    return trainer, (train_pred, val_pred, test_pred), (label2id, id2label)


model runs

classification without synth. data

In [23]:
import torch
print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
print("cuda device count:", torch.cuda.device_count())
print("current device:", torch.cuda.current_device() if torch.cuda.is_available() else None)
print("gpu name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)

torch: 2.9.0+cu126
cuda available: True
cuda device count: 1
current device: 0
gpu name: Tesla T4


classification with synthetic data

1) Normal (no synthetic, no oversampling)

In [24]:
# In-distribution evaluation (df_department test split)
dept_trainer_base, (dept_train_pred, dept_val_pred, dept_test_pred) = run_classification(
    train_df_base,
    val_df_base,
    df_department_test,
    out_dir="department_cls_base",
    do_oversample=False
)

Map:   0%|          | 0/7101 [00:00<?, ? examples/s]

Map:   0%|          | 0/1522 [00:00<?, ? examples/s]

Map:   0%|          | 0/1522 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.4079,0.223609,0.946781,0.413382
2,0.1765,0.100302,0.978975,0.692988
3,0.0712,0.062259,0.984888,0.724712
4,0.078,0.029901,0.994744,0.972552
5,0.0421,0.017625,0.997372,0.986763
6,0.0181,0.019242,0.996715,0.978264
7,0.0057,0.009037,0.998686,0.993267
8,0.021,0.016996,0.996715,0.977693
9,0.0082,0.012633,0.997372,0.979573


train: {'eval_loss': 0.005571706686168909, 'eval_accuracy': 0.9990142233488241, 'eval_f1_macro': 0.9936828129319177, 'eval_runtime': 7.3406, 'eval_samples_per_second': 967.366, 'eval_steps_per_second': 30.243, 'epoch': 9.0}
val:   {'eval_loss': 0.009037076495587826, 'eval_accuracy': 0.9986859395532195, 'eval_f1_macro': 0.9932668795527442, 'eval_runtime': 1.5884, 'eval_samples_per_second': 958.174, 'eval_steps_per_second': 30.218, 'epoch': 9.0}
test:  {'eval_loss': 0.044849976897239685, 'eval_accuracy': 0.9947437582128777, 'eval_f1_macro': 0.9823252190623815, 'eval_runtime': 1.5852, 'eval_samples_per_second': 960.139, 'eval_steps_per_second': 30.28, 'epoch': 9.0}


NameError: name 'plt' is not defined

In [None]:
# Out-of-production evaluation (CV dataset)
dep_cv_pred = predict_table_cls(
    dept_trainer_base,
    to_hf_dataset(jobs_annotated_df.assign(labels=jobs_annotated_df["label"].map(ORD_MAP).astype(float)), label_col="labels"),
    jobs_annotated_df[["text", "label"]]
)

In [None]:
1/0

2) No synthetic + oversampling

In [None]:
dept_trainer_os, dept_preds_os, dept_maps_os = run_classification(
    train_df_base, test_df_base,
    out_dir="department_cls_oversample",
    do_oversample=True
)


## 3) With synthetic

In [None]:
train_df_aug = add_synthetic_department(train_df_base, "data/results/gemini_synthetic.csv")  # example path

dept_trainer_syn, dept_preds_syn, dept_maps_syn = run_classification(
    train_df_aug, test_df_base,
    out_dir="department_cls_synth",
    do_oversample=False
)
