# Description Model Approach 3: Fine-Tuning a Classification Model

* Fine-tuned classification model: use the CSV files to fine-tune a pre-trained classification model, then apply the fine-tuned model to the LinkedIn data.


In [118]:
import pandas as pd
import torch

In [119]:
# Coordinates of the GitHub repository that hosts the project data.
GH_USER = "luisadosch"
GH_REPO = "Final-Project-snapAddy"
BRANCH = "main"


def get_github_url(relative_path):
    """Return the raw.githubusercontent.com URL of a file in the project repo.

    `relative_path` is the file path relative to the repository root, e.g.
    ``"data/raw/seniority-v2.csv"``. User, repository and branch are read from
    the module-level constants at call time.
    """
    repo_root = f"https://raw.githubusercontent.com/{GH_USER}/{GH_REPO}/{BRANCH}"
    return f"{repo_root}/{relative_path}"

In [120]:
# Load the annotated jobs CSV from the GitHub repository and preview the first rows.
jobs_annotated_csv_url = get_github_url("data/processed/jobs_annotated.csv")
jobs_annotated = pd.read_csv(jobs_annotated_csv_url)
jobs_annotated.head()

Unnamed: 0,cv_id,job_index,organization,position,startDate,endDate,status,department,seniority
0,0,0,Depot4Design GmbH,Prokurist,2019-08,,ACTIVE,Other,Management
1,0,1,Depot4Design GmbH,CFO,2019-07,,ACTIVE,Other,Management
2,0,2,Depot4Design GmbH,Betriebswirtin,2019-07,,ACTIVE,Other,Professional
3,0,3,Depot4Design GmbH,Prokuristin,2019-07,,ACTIVE,Other,Management
4,0,4,Depot4Design GmbH,CFO,2019-07,,ACTIVE,Other,Management


In [121]:
# Number of job entries whose status is ACTIVE.
len(jobs_annotated.loc[jobs_annotated["status"] == "ACTIVE"])


623

In [122]:
# Load the labelled seniority CSV (columns "text" and "label") used for fine-tuning.
seniority_url = get_github_url("data/raw/seniority-v2.csv")
df_seniority = pd.read_csv(seniority_url)
df_seniority.head()

Unnamed: 0,text,label
0,Analyst,Junior
1,Analyste financier,Junior
2,Anwendungstechnischer Mitarbeiter,Junior
3,Application Engineer,Senior
4,Applications Engineer,Senior


In [123]:
# Build the test frame from the ACTIVE job entries only: the position title
# becomes the model input ("text") and the annotated seniority the target
# ("label"). cv_id is not needed downstream, so it is not carried over.
df_seniority_test = (
    jobs_annotated
    .loc[jobs_annotated["status"] == "ACTIVE", ["position", "seniority"]]
    .rename(columns={"position": "text", "seniority": "label"})
)

df_seniority_test.head()

Unnamed: 0,text,label
0,Prokurist,Management
1,CFO,Management
2,Betriebswirtin,Professional
3,Prokuristin,Management
4,CFO,Management


In [124]:
df_seniority.head()

Unnamed: 0,text,label
0,Analyst,Junior
1,Analyste financier,Junior
2,Anwendungstechnischer Mitarbeiter,Junior
3,Application Engineer,Senior
4,Applications Engineer,Senior


In [125]:
# Both frames share the same two-column layout: title text and seniority label.
train_df = df_seniority[["text", "label"]].copy()
test_df = df_seniority_test[["text", "label"]].copy()


In [126]:
# Ordinal encoding of the seniority levels, used as the regression target.
ord_map = {
    "Junior": 1.0,
    "Professional": 2.0,   # does not occur in the training data, which is fine
    "Senior": 3.0,
    "Lead": 4.0,
    "Management": 5.0,
    "Director": 6.0
}

# Attach the numeric target to both frames; labels that are not a key of
# ord_map end up as NaN.
for _frame in (train_df, test_df):
    _frame["y_reg"] = _frame["label"].map(ord_map).astype(float)

In [127]:
# len of train_df
len(train_df)


9428

In [128]:
len(test_df)

623

In [129]:
from transformers import AutoTokenizer

# Pre-trained checkpoint whose tokenizer is loaded here.
model_ckpt = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

MAX_LEN = 48  # job titles are short; if many are very long: 48


In [130]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Hold out 20% of the labelled titles for validation, stratified by label.
train_sub_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df["label"],  # alternatively: train_df["y_reg"].astype(int)
)


def _to_regression_dataset(frame):
    """Build a Dataset from `frame` with the text and y_reg renamed to "labels"."""
    renamed = frame[["text", "y_reg"]].rename(columns={"y_reg": "labels"})
    return Dataset.from_pandas(renamed, preserve_index=False)


train_ds = _to_regression_dataset(train_sub_df)
val_ds = _to_regression_dataset(val_df)

# The final test set stays separate from the train/validation split.
test_ds = _to_regression_dataset(test_df)


In [131]:
def tokenize(batch):
    """Tokenize the "text" entries of `batch`.

    Sequences are truncated and padded to exactly MAX_LEN tokens. The
    module-level `tokenizer` is looked up at call time.
    """
    encode_kwargs = {
        "truncation": True,
        "padding": "max_length",
        "max_length": MAX_LEN,
    }
    return tokenizer(batch["text"], **encode_kwargs)


In [132]:
# Tokenize every split in batches (train, then validation, then test).
train_ds, val_ds, test_ds = (
    split.map(tokenize, batched=True) for split in (train_ds, val_ds, test_ds)
)

# Expose only the model inputs and the target, as torch tensors.
cols = ["input_ids", "attention_mask", "labels"]
for _split in (train_ds, val_ds, test_ds):
    _split.set_format(type="torch", columns=cols)


Map:   0%|          | 0/7542 [00:00<?, ? examples/s]

Map:   0%|          | 0/1886 [00:00<?, ? examples/s]

Map:   0%|          | 0/623 [00:00<?, ? examples/s]

In [133]:
from transformers import AutoModelForSequenceClassification


# Load the checkpoint with a single-output head configured for regression;
# the target is the ordinal seniority score (y_reg).
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=1,
    problem_type="regression"
)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [134]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error

def score_to_label(s):
    """Map a continuous seniority score to its label.

    Each level owns the half-open band ending at its upper bound (e.g. scores
    below 1.5 are "Junior", scores in [1.5, 2.5) are "Professional"). Anything
    that is not below 5.5 is "Director".
    """
    upper_bounds = (
        (1.5, "Junior"),
        (2.5, "Professional"),
        (3.5, "Senior"),
        (4.5, "Lead"),
        (5.5, "Management"),
    )
    for bound, level in upper_bounds:
        if s < bound:
            return level
    return "Director"

def compute_metrics_reg(eval_pred):
    """Compute regression and thresholded classification metrics.

    Parameters
    ----------
    eval_pred : pair ``(preds, labels)``
        Model outputs and the numeric targets (values of ``ord_map``).

    Returns
    -------
    dict
        ``"mae"``: mean absolute error between targets and raw scores.
        ``"acc_thresh"`` / ``"f1_macro"``: accuracy and macro-F1 after mapping
        the scores to labels with ``score_to_label``.
    """
    preds, labels = eval_pred
    # Flatten explicitly instead of np.squeeze: squeezing a single-sample
    # prediction array yields a 0-d array that cannot be iterated below.
    scores = np.asarray(preds).reshape(-1)     # continuous predictions
    labels = np.asarray(labels).reshape(-1)
    mae = mean_absolute_error(labels, scores)

    y_pred = [score_to_label(s) for s in scores]
    # Targets are the float levels of ord_map -> translate back to label text.
    inv_ord = {v: k for k, v in ord_map.items()}
    y_true = [inv_ord[float(int(round(x)))] for x in labels]

    acc = accuracy_score(y_true, y_pred)
    f1m = f1_score(y_true, y_pred, average="macro")
    return {"mae": mae, "acc_thresh": acc, "f1_macro": f1m}



In [135]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [136]:
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback


# Training configuration for the regression run. The best checkpoint is the
# one with the lowest validation MAE (as reported by compute_metrics_reg).
args = TrainingArguments(
    output_dir="seniority_ft",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.05,
    warmup_ratio=0.06,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="mae",
    greater_is_better=False,
    # Mixed precision needs a CUDA device; enabling it unconditionally makes
    # this cell fail on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    report_to="none",              # important: no experiment-tracking integration
    dataloader_num_workers=0       # often more stable in Colab
)



In [137]:
!nvidia-smi


Mon Jan  5 19:18:44 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   70C    P0             32W /   70W |    5404MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
from transformers import Trainer

# Fine-tune the regression model; validation metrics come from
# compute_metrics_reg and training stops early after 2 evaluations without
# improvement of the monitored metric.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_reg,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

# Evaluate on the validation split (eval_dataset).
trainer.evaluate()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Mae,Acc Thresh,F1 Macro
1,0.3934,0.28308,0.329992,0.796925,0.547655
2,0.1474,0.096732,0.181925,0.962884,0.776061
3,0.0523,0.091099,0.227242,0.985684,0.977092
4,0.0458,0.065971,0.181117,0.987275,0.821152


In [None]:
trainer.evaluate(test_ds)

So basically the model works really well on the training data but not on the test data.
- Looking at which labels perform worst, it is "Professional": the model is bad at predicting it because this label is not part of the training data.

-> That is why we need more training data, but without introducing data leakage.

Try the model with new synthetic data

In [None]:
synthetic_url = get_github_url("data/results/gemini_synthetic.csv")
data_synthetic = pd.read_csv(synthetic_url )
data_synthetic.head()

In [None]:
# change data_synthetic to only keep columns position, seniority

data_synthetic = data_synthetic[["position", "seniority"]].copy()



2. Train with synthetic data

In [None]:
# The original ordinal mapping (seniority label -> numeric level).
ord_map = {
    "Junior": 1.0,
    "Professional": 2.0,
    "Senior": 3.0,
    "Lead": 4.0,
    "Management": 5.0,
    "Director": 6.0
}

# Inverted mapping: numeric level -> label string.
id2label = {level: name for name, level in ord_map.items()}

# Derive the textual label from the seniority column, then rename
# position -> text and seniority -> y_reg.
data_synthetic = (
    data_synthetic
    .assign(label=data_synthetic["seniority"].map(id2label))
    .rename(columns={"position": "text", "seniority": "y_reg"})
)



In [None]:
data_synthetic.head()

In [None]:
# Anzahl NaNs pro Spalte
data_synthetic.isna().sum()





In [None]:
# Zeilen mit NaN in y_reg oder label entfernen
data_synthetic = data_synthetic.dropna(subset=["y_reg", "label"])


In [None]:
# A notebook cell only displays its last expression, so the NaN counts have to
# be printed explicitly to be visible next to the row count.
print(data_synthetic[["y_reg", "label"]].isna().sum())
len(data_synthetic)


In [None]:
data_synthetic

In [None]:
# Restrict both sources to the columns used for training
# (add "y_reg" to the list as well when training the regression variant).
shared_cols = ["text", "label"]
syn_rows = data_synthetic.loc[:, shared_cols].copy()
train_rows = train_df.loc[:, shared_cols].copy()

# Stack the original rows and the synthetic rows under a fresh 0..n-1 index.
train_df_aug = pd.concat([train_rows, syn_rows], ignore_index=True)

print(len(train_df), "->", len(train_df_aug))


In [None]:
train_df_aug.head()

In [None]:
label_list = sorted(train_df_aug["label"].unique())
label_list

In [None]:
train_df_aug["label"].value_counts()


In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the augmented training data.
train_sub_df, val_df = train_test_split(
    train_df_aug,
    test_size=0.2,
    random_state=42,
    stratify=train_df_aug["label"],
)

# Label distribution of each part.
for _heading, _part in (("Train:\n", train_sub_df), ("Val:\n", val_df)):
    print(_heading, _part["label"].value_counts())



In [None]:
# Class ids follow the alphabetical order of the labels in the training split.
label_list = sorted(train_sub_df["label"].unique())
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

print(label_list)


In [None]:
# Encode the string labels as integer class ids. `assign` returns new frames
# instead of writing into the frames produced by train_test_split, which
# avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
train_sub_df = train_sub_df.assign(label_id=train_sub_df["label"].map(label2id))
val_df = val_df.assign(label_id=val_df["label"].map(label2id))
test_df = test_df.assign(label_id=test_df["label"].map(label2id))

# A missing id (NaN) means the label is not a key of label2id, i.e. it does
# not occur in the training split.
print("test missing:", test_df["label_id"].isna().sum())
print("train missing:", train_sub_df["label_id"].isna().sum())
print("val missing:", val_df["label_id"].isna().sum())


In [None]:
test_df

In [None]:
from datasets import Dataset

def _to_labelled_frame(frame):
    """Return the text column plus the class id, with the id column named "labels"."""
    return frame[["text", "label_id"]].rename(columns={"label_id": "labels"})


train_hf_df = _to_labelled_frame(train_sub_df)
val_hf_df = _to_labelled_frame(val_df)
test_hf_df = _to_labelled_frame(test_df)

# Convert each frame to a Dataset without carrying over the pandas index.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(hf_frame, preserve_index=False)
    for hf_frame in (train_hf_df, val_hf_df, test_hf_df)
)


In [None]:
from transformers import AutoTokenizer

# Reload the tokenizer for the classification run (same checkpoint name as in
# the regression run).
model_ckpt = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

MAX_LEN = 48  # job titles are short; if many are very long: 48

In [None]:
# Tokenize every split in batches (train, then validation, then test).
train_ds, val_ds, test_ds = (
    split.map(tokenize, batched=True) for split in (train_ds, val_ds, test_ds)
)

# Expose only the model inputs and the class ids, as torch tensors.
cols = ["input_ids", "attention_mask", "labels"]
for _split in (train_ds, val_ds, test_ds):
    _split.set_format(type="torch", columns=cols)


In [None]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with one output per seniority class and store the
# id <-> label mappings in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Return accuracy and macro-averaged F1 for a ``(logits, labels)`` pair.

    The predicted class is the arg-max over the last axis of the logits.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predicted_ids)
    macro_f1 = f1_score(labels, predicted_ids, average="macro")
    return {"accuracy": accuracy, "f1_macro": macro_f1}


In [None]:
from transformers import TrainingArguments

# Training configuration for the classification run. The best checkpoint is
# the one with the highest validation macro-F1 (as reported by compute_metrics).
args = TrainingArguments(
    output_dir="seniority_cls_xlmr",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.05,
    warmup_ratio=0.06,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    # Mixed precision needs a CUDA device; enabling it unconditionally makes
    # this cell fail on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    report_to="none",
    dataloader_num_workers=0
)


In [None]:
from transformers import Trainer, EarlyStoppingCallback

# Fine-tune the classifier; validation metrics come from compute_metrics and
# training stops early after 2 evaluations without improvement of the
# monitored metric.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()
# Evaluate on the validation split (eval_dataset).
trainer.evaluate()


In [None]:
trainer.evaluate(test_ds)

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Per-class precision/recall/F1 on the validation split.
pred = trainer.predict(val_ds)
y_pred = np.argmax(pred.predictions, axis=-1)
y_true = pred.label_ids

# Pass the class ids explicitly: without `labels`, classification_report
# derives the classes from y_true/y_pred and raises when their number differs
# from the number of target_names (e.g. a class that is never seen nor predicted).
class_ids = sorted(id2label)
print(classification_report(
    y_true, y_pred,
    labels=class_ids,
    target_names=[id2label[i] for i in class_ids],
    zero_division=0
))


In [None]:
import numpy as np

# Inverse-frequency weight per class id, based on the training split.
counts = train_sub_df["label"].value_counts()
weights = np.zeros(len(label2id), dtype=np.float32)

for class_name, n_rows in counts.items():
    weights[label2id[class_name]] = 1.0 / n_rows

weights /= weights.mean()  # normalise so that the weights average to 1
weights


In [None]:
import torch
from transformers import Trainer
import torch.nn as nn

class WeightedTrainer(Trainer):
    """Trainer whose training loss is a class-weighted cross-entropy.

    Parameters
    ----------
    class_weights : torch.Tensor or None
        Weight per class, passed as ``weight`` to ``nn.CrossEntropyLoss``.
        ``None`` (the default) gives an unweighted cross-entropy.
    *args, **kwargs
        Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy between the logits and the labels.

        `num_items_in_batch` is accepted only for signature compatibility:
        recent transformers releases pass it as a keyword argument, which the
        previous three-argument override rejected with a TypeError. It is not
        used here.

        Returns the loss, or ``(loss, outputs)`` when `return_outputs` is true.
        """
        labels = inputs.get("labels")
        # Call the model without labels so it does not compute its own loss.
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.logits

        # The weights must live on the same device as the logits.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)

        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss


In [None]:
class_weights = torch.tensor(weights)

# Start from a freshly loaded checkpoint. When the notebook is run top to
# bottom, `model` has already been fine-tuned by the unweighted run, so reusing
# it would continue that training instead of measuring the effect of the
# class weights.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    class_weights=class_weights
)

trainer.train()
# Evaluate on the validation split (eval_dataset).
trainer.evaluate()


Alternative: oversample the "Professional" class

In [None]:
import pandas as pd

# Separate the "Professional" rows from all other rows of the training split.
is_professional = train_sub_df["label"] == "Professional"
prof_df = train_sub_df[is_professional]
rest_df = train_sub_df[~is_professional]

# Oversample "Professional": draw 5x as many rows as exist, with replacement
# and a fixed seed, and append them to the remaining rows.
prof_oversampled = prof_df.sample(len(prof_df)*5, replace=True, random_state=42)
train_sub_df_up = pd.concat([rest_df, prof_oversampled], ignore_index=True)

train_sub_df_up["label_id"] = train_sub_df_up["label"].map(label2id)
