# Description Model Approach 3: Fine-Tuning a Classification Model

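* Fine-tuned classification model: use the CSV files to fine-tune a pre-trained transformer, then apply the model to the LinkedIn data. In this notebook seniority is encoded as an ordinal score (1–6), trained with a regression head, and the predicted score is thresholded back into the seniority classes.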


In [211]:
import pandas as pd

In [212]:
# Coordinates of the GitHub repository that hosts the project data.
GH_USER = "luisadosch"
GH_REPO = "Final-Project-snapAddy"
BRANCH = "main"


def get_github_url(relative_path):
    """Return the raw.githubusercontent.com URL of a file in the project repo.

    relative_path: location of the file relative to the repository root.
    """
    url_template = "https://raw.githubusercontent.com/{}/{}/{}/{}"
    return url_template.format(GH_USER, GH_REPO, BRANCH, relative_path)

In [213]:
# Load the annotated jobs table from the project repository.
jobs_annotated_csv_url = get_github_url("data/processed/jobs_annotated.csv")
jobs_annotated = pd.read_csv(jobs_annotated_csv_url)
# Preview the first rows to check the columns.
jobs_annotated.head()

Unnamed: 0,cv_id,job_index,organization,position,startDate,endDate,status,department,seniority
0,0,0,Depot4Design GmbH,Prokurist,2019-08,,ACTIVE,Other,Management
1,0,1,Depot4Design GmbH,CFO,2019-07,,ACTIVE,Other,Management
2,0,2,Depot4Design GmbH,Betriebswirtin,2019-07,,ACTIVE,Other,Professional
3,0,3,Depot4Design GmbH,Prokuristin,2019-07,,ACTIVE,Other,Management
4,0,4,Depot4Design GmbH,CFO,2019-07,,ACTIVE,Other,Management


In [214]:
# Count the rows whose status is ACTIVE.
int(jobs_annotated["status"].eq("ACTIVE").sum())


623

In [215]:
# Load the labelled seniority data (columns `text` and `label`) used for fine-tuning.
seniority_url = get_github_url("data/raw/seniority-v2.csv")
df_seniority = pd.read_csv(seniority_url)
# Preview the first rows.
df_seniority.head()

Unnamed: 0,text,label
0,Analyst,Junior
1,Analyste financier,Junior
2,Anwendungstechnischer Mitarbeiter,Junior
3,Application Engineer,Senior
4,Applications Engineer,Senior


In [216]:
# Build the evaluation frame from the annotated jobs:
#   * keep only the rows whose status is ACTIVE,
#   * keep the job title and its annotated seniority,
#   * rename them to `text` / `label` so the schema matches df_seniority.
# Written as one chained expression (no inplace mutation, no column that is
# selected only to be dropped again), so re-running the cell always rebuilds
# the frame from jobs_annotated.
df_seniority_test = (
    jobs_annotated
    .loc[jobs_annotated["status"] == "ACTIVE", ["position", "seniority"]]
    .rename(columns={"position": "text", "seniority": "label"})
)

df_seniority_test.head()

Unnamed: 0,text,label
0,Prokurist,Management
1,CFO,Management
2,Betriebswirtin,Professional
3,Prokuristin,Management
4,CFO,Management


In [217]:
# Re-display the first rows of the labelled seniority data (same view as when it was loaded).
df_seniority.head()

Unnamed: 0,text,label
0,Analyst,Junior
1,Analyste financier,Junior
2,Anwendungstechnischer Mitarbeiter,Junior
3,Application Engineer,Senior
4,Applications Engineer,Senior


In [218]:
# Training frame: labelled titles from the seniority CSV.
# Test frame: ACTIVE positions from the annotated jobs.
# Both are independent copies restricted to the `text` and `label` columns.
train_df = df_seniority[["text", "label"]].copy()
test_df = df_seniority_test[["text", "label"]].copy()


In [219]:
# Ordinal encoding of the seniority levels; the numeric value is the regression target.
ord_map = {
    "Junior": 1.0,
    "Professional": 2.0,   # does not occur in the training data, which is fine
    "Senior": 3.0,
    "Lead": 4.0,
    "Management": 5.0,
    "Director": 6.0
}

train_df["y_reg"] = train_df["label"].map(ord_map).astype(float)
test_df["y_reg"]  = test_df["label"].map(ord_map).astype(float)

# Series.map yields NaN for any label that is missing from ord_map. Fail here
# with a clear message instead of silently training/evaluating on NaN targets.
for _frame_name, _frame in (("train_df", train_df), ("test_df", test_df)):
    _unmapped = sorted(
        _frame.loc[_frame["y_reg"].isna(), "label"].astype(str).unique()
    )
    if _unmapped:
        raise ValueError(
            f"{_frame_name} contains labels that are missing from ord_map: {_unmapped}"
        )

In [220]:
# Number of labelled training examples (before the train/validation split)
len(train_df)


9428

In [221]:
# Number of test examples (the ACTIVE positions from jobs_annotated)
len(test_df)

623

In [222]:
from transformers import AutoTokenizer

# Pretrained checkpoint; the same name is used later to load the model weights.
model_ckpt = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

MAX_LEN = 48  # Job titles are short; 48 is the value to use if many of them are very long


In [223]:
from sklearn.model_selection import train_test_split
from datasets import Dataset


def _as_regression_dataset(frame):
    """Turn a frame with `text` / `y_reg` columns into a Dataset with `text` / `labels`."""
    renamed = frame[["text", "y_reg"]].rename(columns={"y_reg": "labels"})
    return Dataset.from_pandas(renamed, preserve_index=False)


# Hold out 20% of the labelled titles for validation, stratified by label
# (stratifying on train_df["y_reg"].astype(int) would be an alternative).
train_sub_df, val_df = train_test_split(
    train_df,
    stratify=train_df["label"],
    test_size=0.2,
    random_state=42,
)

train_ds = _as_regression_dataset(train_sub_df)
val_ds = _as_regression_dataset(val_df)

# The final test set stays separate from the train/validation split.
test_ds = _as_regression_dataset(test_df)


In [224]:
def tokenize(batch):
    """Tokenize a batch of job titles with the notebook's tokenizer.

    NOTE(review): the notebook never defined `tokenize`, so this cell only ran
    with leftover kernel state and failed with NameError on a fresh kernel.
    This definition is a reconstruction — confirm the truncation/padding
    settings against the run that produced the saved outputs.

    batch: mapping with a "text" entry holding the titles of the batch.
    Returns the tokenizer output (input_ids, attention_mask, ...).
    """
    # Fixed-length padding keeps every example the same shape, whichever
    # data collator is used downstream.
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )


train_ds = train_ds.map(tokenize, batched=True)
val_ds   = val_ds.map(tokenize, batched=True)
test_ds  = test_ds.map(tokenize, batched=True)

# Expose only the model inputs and the target as torch tensors.
cols = ["input_ids", "attention_mask", "labels"]
train_ds.set_format(type="torch", columns=cols)
val_ds.set_format(type="torch", columns=cols)
test_ds.set_format(type="torch", columns=cols)


Map:   0%|          | 0/7542 [00:00<?, ? examples/s]

Map:   0%|          | 0/1886 [00:00<?, ? examples/s]

Map:   0%|          | 0/623 [00:00<?, ? examples/s]

In [225]:
from transformers import AutoModelForSequenceClassification

# A single output: seniority is predicted as one continuous score.
# The earlier `num_labels = len(label2id)` referenced a name that is never
# defined in this notebook (NameError on a fresh kernel), and its value was
# not passed to the model anyway.
num_labels = 1

model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    problem_type="regression"
)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [226]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error

def score_to_label(s):
    """Map a continuous seniority score to its class name.

    Each class covers the scores strictly below its upper bound; anything
    that is not below 5.5 is "Director".
    """
    upper_bounds = (
        (1.5, "Junior"),
        (2.5, "Professional"),
        (3.5, "Senior"),
        (4.5, "Lead"),
        (5.5, "Management"),
    )
    for bound, name in upper_bounds:
        if s < bound:
            return name
    return "Director"

def compute_metrics_reg(eval_pred):
    """Compute regression and thresholded-classification metrics.

    Args:
        eval_pred: pair ``(preds, labels)``. ``preds`` holds the continuous
            scores predicted by the model, ``labels`` the ordinal targets
            (the float values of ``ord_map``).

    Returns:
        dict with
        ``mae``        - mean absolute error between targets and scores,
        ``acc_thresh`` - accuracy after mapping scores to classes with
                         ``score_to_label``,
        ``f1_macro``   - macro-averaged F1 of the same class predictions.
    """
    preds, labels = eval_pred
    # Flatten to 1-D. np.squeeze would collapse a one-example evaluation to a
    # 0-d array, which cannot be iterated below.
    scores = np.asarray(preds).reshape(-1)     # continuous predictions
    labels = np.asarray(labels).reshape(-1)
    mae = mean_absolute_error(labels, scores)

    y_pred = [score_to_label(s) for s in scores]
    # targets are the floats 1..6 -> map them back to the class names
    inv_ord = {v: k for k, v in ord_map.items()}
    y_true = [inv_ord[float(int(round(x)))] for x in labels]

    acc = accuracy_score(y_true, y_pred)
    f1m = f1_score(y_true, y_pred, average="macro")
    return {"mae": mae, "acc_thresh": acc, "f1_macro": f1m}



In [227]:
import os
# Set before the Trainer is created; presumably read by the Weights & Biases
# integration to switch experiment logging off — confirm for the installed version.
os.environ["WANDB_DISABLED"] = "true"


In [228]:
import torch
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback


# Training configuration: evaluate and checkpoint once per epoch and restore
# the checkpoint with the lowest validation MAE at the end of training.
args = TrainingArguments(
    output_dir="seniority_ft",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.05,
    warmup_ratio=0.06,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="mae",
    greater_is_better=False,       # lower MAE is better
    # fp16 mixed precision needs a CUDA GPU; fall back to full precision on
    # runtimes without one instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    report_to="none",              # important: no external experiment trackers
    dataloader_num_workers=0       # often more stable in Colab
)



In [229]:
# Show the GPU available to this runtime (shell command run through IPython).
!nvidia-smi


Mon Jan  5 14:56:10 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   59C    P0             28W /   70W |    8598MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [230]:
from transformers import Trainer

# Fine-tune with early stopping: training stops once the validation MAE has
# not improved for two consecutive evaluations.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    # `tokenizer=` is deprecated for Trainer.__init__ (it triggered the warning
    # shown under this cell); `processing_class` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics_reg,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

# Validation metrics of the model kept after training (best checkpoint, since
# load_best_model_at_end=True).
trainer.evaluate()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Mae,Acc Thresh,F1 Macro
1,0.3146,0.217892,0.308096,0.866384,0.601013
2,0.1134,0.09593,0.226848,0.974019,0.802044
3,0.0523,0.10765,0.293493,0.984624,0.815572
4,0.0282,0.101359,0.269743,0.988335,0.982119


{'eval_loss': 0.09593001753091812,
 'eval_mae': 0.2268475443124771,
 'eval_acc_thresh': 0.9740190880169671,
 'eval_f1_macro': 0.8020442893582252,
 'eval_runtime': 1.3642,
 'eval_samples_per_second': 1382.487,
 'eval_steps_per_second': 43.249,
 'epoch': 4.0}

In [232]:
# Evaluate on the separate test set. The returned keys keep the default
# "eval_" prefix even though test_ds is passed.
trainer.evaluate(test_ds)

{'eval_loss': 1.073161244392395,
 'eval_mae': 0.8183680176734924,
 'eval_acc_thresh': 0.42536115569823435,
 'eval_f1_macro': 0.3879731288797403,
 'eval_runtime': 0.5298,
 'eval_samples_per_second': 1175.831,
 'eval_steps_per_second': 37.747,
 'epoch': 4.0}