# Description Model Approach 3: Fine-Tuning a Classification Model

* Fine-tuned classification model: use the CSV files to fine-tune a pre-trained classification model, then apply the model to the LinkedIn data.


In [33]:
import pandas as pd

In [34]:
# Coordinates of the GitHub repository that hosts the project data.
GH_USER, GH_REPO, BRANCH = "luisadosch", "Final-Project-snapAddy", "main"


def get_github_url(relative_path):
    """Return the raw.githubusercontent.com URL of a file in the project repo.

    Args:
        relative_path: Path of the file relative to the repository root.

    Returns:
        The URL string built from GH_USER, GH_REPO, BRANCH and the path.
    """
    repo_root = f"https://raw.githubusercontent.com/{GH_USER}/{GH_REPO}/{BRANCH}"
    return f"{repo_root}/{relative_path}"

In [35]:
# Read the annotated jobs table straight from the repository and preview it.
jobs_annotated_csv_url = get_github_url("data/processed/jobs_annotated.csv")
jobs_annotated = pd.read_csv(filepath_or_buffer=jobs_annotated_csv_url)
jobs_annotated.head(n=5)

Unnamed: 0,cv_id,job_index,organization,position,startDate,endDate,status,department,seniority
0,0,0,Depot4Design GmbH,Prokurist,2019-08,,ACTIVE,Other,Management
1,0,1,Depot4Design GmbH,CFO,2019-07,,ACTIVE,Other,Management
2,0,2,Depot4Design GmbH,Betriebswirtin,2019-07,,ACTIVE,Other,Professional
3,0,3,Depot4Design GmbH,Prokuristin,2019-07,,ACTIVE,Other,Management
4,0,4,Depot4Design GmbH,CFO,2019-07,,ACTIVE,Other,Management


In [36]:
# Number of annotated job rows whose status is ACTIVE.
active_mask = jobs_annotated["status"] == "ACTIVE"
len(jobs_annotated[active_mask])


623

In [37]:
# Read the labelled seniority CSV (columns used below: text, label) and preview it.
seniority_url = get_github_url("data/raw/seniority-v2.csv")
df_seniority = pd.read_csv(filepath_or_buffer=seniority_url)
df_seniority.head(n=5)

Unnamed: 0,text,label
0,Analyst,Junior
1,Analyste financier,Junior
2,Anwendungstechnischer Mitarbeiter,Junior
3,Application Engineer,Senior
4,Applications Engineer,Senior


In [38]:
# Build the seniority test set from the annotated jobs:
# keep only rows with status ACTIVE, and expose the job title as "text" and
# the annotated seniority as "label" (the column names of the training CSV).
# A single non-mutating select + rename replaces the earlier
# select-cv_id-then-drop-it sequence, so re-running the cell is safe.
active_jobs = jobs_annotated[jobs_annotated["status"] == "ACTIVE"]
df_seniority_test = active_jobs[["position", "seniority"]].rename(
    columns={"position": "text", "seniority": "label"}
)

df_seniority_test.head()

Unnamed: 0,text,label
0,Prokurist,Management
1,CFO,Management
2,Betriebswirtin,Professional
3,Prokuristin,Management
4,CFO,Management


In [39]:
# Preview the first rows of the training source table.
df_seniority.head(n=5)

Unnamed: 0,text,label
0,Analyst,Junior
1,Analyste financier,Junior
2,Anwendungstechnischer Mitarbeiter,Junior
3,Application Engineer,Senior
4,Applications Engineer,Senior


In [40]:
# Independent copies holding only the two columns the models need.
model_columns = ["text", "label"]
train_df = df_seniority[model_columns].copy()
test_df = df_seniority_test[model_columns].copy()


In [41]:
# Ordinal encoding of the seniority levels, lowest to highest.
# "Professional" does not occur in the training data; that is fine.
seniority_levels = ["Junior", "Professional", "Senior", "Lead", "Management", "Director"]
ord_map = {level: float(rank) for rank, level in enumerate(seniority_levels, start=1)}

# Attach the numeric regression target to both frames
# (a label that is not a key of ord_map becomes NaN).
for frame in (train_df, test_df):
    frame["y_reg"] = frame["label"].map(ord_map).astype(float)

In [42]:
# Number of rows in the training frame.
train_df.shape[0]


9428

In [43]:
# Number of rows in the test frame.
test_df.shape[0]

623

In [44]:
from transformers import AutoTokenizer

# Job titles are short; if many are very long: 48.
MAX_LEN = 48

# Checkpoint name shared by the tokenizer and the model loaded later.
model_ckpt = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


In [45]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

def _as_regression_dataset(frame):
    """Convert a frame with ``text`` / ``y_reg`` columns into a Dataset.

    The ``y_reg`` column is exposed under the name ``labels`` and the
    pandas index is not carried over.
    """
    renamed = frame[["text", "y_reg"]].rename(columns={"y_reg": "labels"}).copy()
    return Dataset.from_pandas(renamed, preserve_index=False)


# Stratified 80/20 split of the training data
# (alternative: stratify=train_df["y_reg"].astype(int)).
train_sub_df, val_df = train_test_split(
    train_df,
    stratify=train_df["label"],
    test_size=0.2,
    random_state=42,
)

train_ds = _as_regression_dataset(train_sub_df)
val_ds = _as_regression_dataset(val_df)

# The final test set stays separate from the train/validation split.
test_ds = _as_regression_dataset(test_df)


In [46]:
def tokenize(batch):
    """Tokenize the ``text`` field of a batch with the notebook-level tokenizer.

    Truncation is enabled and padding is requested to ``max_length`` with
    ``MAX_LEN`` as the length limit.

    Args:
        batch: Mapping with a ``"text"`` entry, as passed by ``Dataset.map``.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": MAX_LEN,
    }
    return tokenizer(batch["text"], **encode_options)


In [47]:
# Tokenize the three splits in batches (train, validation, test - in that order).
train_ds, val_ds, test_ds = (
    split.map(tokenize, batched=True) for split in (train_ds, val_ds, test_ds)
)

# Expose only the model inputs and targets, as torch tensors.
cols = ["input_ids", "attention_mask", "labels"]
for split in (train_ds, val_ds, test_ds):
    split.set_format(type="torch", columns=cols)


Map:   0%|          | 0/7542 [00:00<?, ? examples/s]

Map:   0%|          | 0/1886 [00:00<?, ? examples/s]

Map:   0%|          | 0/623 [00:00<?, ? examples/s]

In [32]:
from transformers import AutoModelForSequenceClassification


# One output unit with problem_type="regression": the head predicts a single
# continuous seniority score instead of class logits.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, problem_type="regression", num_labels=1
)


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error

def score_to_label(s):
    """Map a continuous seniority score to its label name.

    Each label covers the half-open interval below its upper bound; any score
    that is not below 5.5 is reported as "Director".

    Args:
        s: Numeric score.

    Returns:
        One of "Junior", "Professional", "Senior", "Lead", "Management",
        "Director".
    """
    upper_bounds = (
        (1.5, "Junior"),
        (2.5, "Professional"),
        (3.5, "Senior"),
        (4.5, "Lead"),
        (5.5, "Management"),
    )
    for bound, name in upper_bounds:
        if s < bound:
            return name
    return "Director"

def compute_metrics_reg(eval_pred):
    """Compute regression and thresholded-classification metrics.

    Args:
        eval_pred: Pair ``(predictions, labels)``. Predictions are the
            model's continuous scores; labels are the numeric targets
            produced via ``ord_map``.

    Returns:
        dict with ``"mae"`` (mean absolute error on the raw scores),
        ``"acc_thresh"`` (accuracy after mapping scores to label names with
        ``score_to_label``) and ``"f1_macro"`` (macro-averaged F1 on the same
        label names).
    """
    preds, labels = eval_pred
    # Flatten to 1-D. np.squeeze would collapse a single-example batch to a
    # 0-d array, which can neither be iterated nor scored.
    scores = np.asarray(preds).reshape(-1)
    targets = np.asarray(labels).reshape(-1)
    mae = mean_absolute_error(targets, scores)

    y_pred = [score_to_label(s) for s in scores]
    # Targets are the float codes from ord_map -> map them back to label names.
    inv_ord = {v: k for k, v in ord_map.items()}
    y_true = [inv_ord[float(int(round(x)))] for x in targets]

    acc = accuracy_score(y_true, y_pred)
    # NOTE(review): no explicit label list is passed, so the macro average is
    # taken over the labels that occur in y_true or y_pred for this batch.
    f1m = f1_score(y_true, y_pred, average="macro")
    return {"mae": mae, "acc_thresh": acc, "f1_macro": f1m}



In [49]:
import os

# Set the WANDB_DISABLED environment flag for this process.
os.environ.update({"WANDB_DISABLED": "true"})


In [50]:
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback


import torch  # local import: only needed to decide whether fp16 can be enabled

# Training configuration for the seniority fine-tuning run.
# Evaluation and checkpointing happen once per epoch; the checkpoint with the
# lowest validation "mae" is reloaded at the end of training.
args = TrainingArguments(
    output_dir="seniority_ft",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.05,
    warmup_ratio=0.06,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="mae",
    greater_is_better=False,
    # Mixed precision needs a CUDA device; fall back to full precision
    # elsewhere instead of failing when the arguments are created.
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    report_to="none",              # important
    dataloader_num_workers=0       # often more stable in Colab
)



In [51]:
# Shell escape: print the NVIDIA driver / GPU status of the current runtime.
!nvidia-smi


Mon Jan  5 18:17:09 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   48C    P0             27W /   70W |     102MiB /  15360MiB |      3%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [52]:
from transformers import Trainer

# Fine-tune on the training split and validate on the held-out split.
# The tokenizer is passed as `processing_class`; the older `tokenizer=`
# keyword of Trainer is deprecated and emits a warning.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=tokenizer,
    compute_metrics=compute_metrics_reg,
    # Stop once the monitored metric has not improved for 2 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

# Validation metrics of the model held by the trainer after training.
trainer.evaluate()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Mae,Acc Thresh,F1 Macro
1,0.3972,0.282984,0.358571,0.825027,0.55712
2,0.14,0.149293,0.285629,0.939024,0.713767
3,0.0655,0.087387,0.231577,0.983033,0.810075
4,0.0504,0.071625,0.211397,0.993107,0.825674
5,0.0291,0.0546,0.161019,0.993637,0.825895
6,0.0296,0.070539,0.197664,0.994168,0.825331
7,0.0345,0.042861,0.166496,0.995758,0.83012


{'eval_loss': 0.05459996312856674,
 'eval_mae': 0.16101889312267303,
 'eval_acc_thresh': 0.9936373276776246,
 'eval_f1_macro': 0.8258954553732889,
 'eval_runtime': 1.3427,
 'eval_samples_per_second': 1404.64,
 'eval_steps_per_second': 43.942,
 'epoch': 7.0}

In [53]:
# Score the fine-tuned model on the separate test set built from the annotated jobs.
trainer.evaluate(eval_dataset=test_ds)

{'eval_loss': 1.3661303520202637,
 'eval_mae': 0.8400365710258484,
 'eval_acc_thresh': 0.5040128410914928,
 'eval_f1_macro': 0.45385702983580223,
 'eval_runtime': 0.4481,
 'eval_samples_per_second': 1390.429,
 'eval_steps_per_second': 44.637,
 'epoch': 7.0}

So the model performs very well on data from the training set (the held-out validation split) but not on the test set.
- Looking at which labels perform worst, it is "Professional": the model is bad at predicting it because that label is not part of the training DataFrame.

-> That is why we need more training data, but without data leakage.

Try the model with new synthetic data

In [77]:
# Read the synthetic job rows from the repository and preview them.
synthetic_url = get_github_url("data/results/gemini_synthetic.csv")
data_synthetic = pd.read_csv(filepath_or_buffer=synthetic_url)
data_synthetic.head(n=5)

Unnamed: 0,row_id,row_id.1,row_id.2,cv_id,job_index,organization,position,startDate,endDate,status,department,seniority
0,0,0,0,0,0,"Keeping The Books, Bookkeeping",Bookkeeper,2023-03,,ACTIVE,,2.0
1,1,1,1,0,1,Playful Paws,Co-Owner,2018-11,,ACTIVE,,5.0
2,2,2,8,1,0,Erste Bank und Sparkasse,Strategy & Investments,2025-03,,ACTIVE,,2.0
3,3,3,21,2,0,Guido Meyer,Corporate Auditor,2022-07,,ACTIVE,,2.0
4,4,4,22,2,1,Guido Meyer,Corporate Auditor,2022-07,,ACTIVE,,2.0


In [78]:
# Reduce the synthetic data to the job title and its numeric seniority.
keep_cols = ["position", "seniority"]
data_synthetic = data_synthetic[keep_cols].copy()



2. Train with synthetic data

In [79]:
# The original ordinal mapping: label name -> numeric score.
ord_map = dict(
    zip(
        ["Junior", "Professional", "Senior", "Lead", "Management", "Director"],
        [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
    )
)

# Inverted mapping: numeric score -> label name.
id2label = {score: name for name, score in ord_map.items()}

# Derive the text label from the numeric seniority before that column is renamed.
data_synthetic["label"] = data_synthetic["seniority"].map(id2label)

# Rename both columns in one call: position -> text, seniority -> y_reg.
data_synthetic.rename(columns={"position": "text", "seniority": "y_reg"}, inplace=True)



In [80]:
# Preview the relabelled synthetic rows.
data_synthetic.head(n=5)

Unnamed: 0,text,y_reg,label
0,Bookkeeper,2.0,Professional
1,Co-Owner,5.0,Management
2,Strategy & Investments,2.0,Professional
3,Corporate Auditor,2.0,Professional
4,Corporate Auditor,2.0,Professional


In [86]:
# Number of missing values per column.
data_synthetic.isnull().sum()





Unnamed: 0,0
text,0
y_reg,10
label,10


In [87]:
# Remove rows that lack the numeric target or the text label.
target_cols = ["y_reg", "label"]
data_synthetic = data_synthetic.dropna(subset=target_cols, how="any")


In [88]:
# Fail loudly if any missing target survived the dropna step; a bare
# expression in the middle of a cell is evaluated but never displayed.
assert data_synthetic[["y_reg", "label"]].notna().all().all(), "NaN targets remain in data_synthetic"

# Remaining number of synthetic rows.
len(data_synthetic)


409

In [90]:
# Display the cleaned synthetic frame (the recorded output shows it truncated).
data_synthetic

Unnamed: 0,text,y_reg,label
0,Bookkeeper,2.0,Professional
1,Co-Owner,5.0,Management
2,Strategy & Investments,2.0,Professional
3,Corporate Auditor,2.0,Professional
4,Corporate Auditor,2.0,Professional
...,...,...,...
414,General Counsel / Leiter Rechtsabteilung,6.0,Director
415,Team Lead Business Services,4.0,Lead
416,CEO & Co-founder,6.0,Director
417,Juristischer Berater,2.0,Professional


In [91]:
# Columns to train on (add "y_reg" here as well if you use regression).
feature_cols = ["text", "label"]
syn_rows = data_synthetic[feature_cols].copy()
train_rows = train_df[feature_cols].copy()

# Append the synthetic rows below the original training rows with a fresh index.
train_df_aug = pd.concat([train_rows, syn_rows], ignore_index=True)

print(len(train_df), "->", len(train_df_aug))


9428 -> 9837


In [92]:
# Preview the augmented training frame.
train_df_aug.head(n=5)

Unnamed: 0,text,label
0,Analyst,Junior
1,Analyste financier,Junior
2,Anwendungstechnischer Mitarbeiter,Junior
3,Application Engineer,Senior
4,Applications Engineer,Senior


In [96]:
# Alphabetically sorted distinct labels of the augmented training data.
label_list = sorted(set(train_df_aug["label"]))
label_list

['Director', 'Junior', 'Lead', 'Management', 'Professional', 'Senior']

In [99]:
# Class distribution of the augmented training data, most frequent first.
train_df_aug["label"].value_counts(sort=True, ascending=False)


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Senior,3764
Lead,3566
Director,1054
Management,915
Junior,445
Professional,93


In [100]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the augmented data.
# This rebinds train_sub_df / val_df from the earlier split.
train_sub_df, val_df = train_test_split(
    train_df_aug,
    stratify=train_df_aug["label"],
    test_size=0.2,
    random_state=42,
)

# Per-split class counts, to confirm the stratification.
for title, part in (("Train:\n", train_sub_df), ("Val:\n", val_df)):
    print(title, part["label"].value_counts())



Train:
 label
Senior          3011
Lead            2853
Director         843
Management       732
Junior           356
Professional      74
Name: count, dtype: int64
Val:
 label
Senior          753
Lead            713
Director        211
Management      183
Junior           89
Professional     19
Name: count, dtype: int64


In [101]:
# Integer class ids follow the alphabetical order of the label names.
label_list = sorted(set(train_sub_df["label"]))
label2id = {name: idx for idx, name in enumerate(label_list)}
id2label = dict(enumerate(label_list))

print(label_list)


['Director', 'Junior', 'Lead', 'Management', 'Professional', 'Senior']


In [102]:
# Encode the label names as integer class ids on both splits.
for split_df in (train_sub_df, val_df):
    split_df["label_id"] = split_df["label"].map(label2id)

# A label missing from label2id would show up here as NaN.
for split_name, split_df in (("train", train_sub_df), ("val", val_df)):
    print(f"{split_name} missing:", split_df["label_id"].isna().sum())


train missing: 0
val missing: 0
