In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Label lexicons used by the rule-based matchers; later cells read their
# "text" and "label" columns.
department_df = pd.read_csv("../data/raw/department-v2.csv")
seniority_df = pd.read_csv("../data/raw/seniority-v2.csv")
# Annotated job records; their "seniority" and "department" columns serve as
# the ground truth in the evaluations below.
jobs_annotated_df = pd.read_csv("../data/processed/jobs_annotated.csv")

# Predicting Seniority: Exploring Hybrid Finetuned Approach with Additional Data and No Fallback

In [3]:
# Inspect the annotated job records as loaded from disk.
jobs_annotated_df

Unnamed: 0,row_id,cv_id,job_index,organization,position,startDate,endDate,status,department,seniority
0,0,0,0,Depot4Design GmbH,Prokurist,2019-08,,ACTIVE,Other,Management
1,1,0,1,Depot4Design GmbH,CFO,2019-07,,ACTIVE,Other,Management
2,2,0,2,Depot4Design GmbH,Betriebswirtin,2019-07,,ACTIVE,Other,Professional
3,3,0,3,Depot4Design GmbH,Prokuristin,2019-07,,ACTIVE,Other,Management
4,4,0,4,Depot4Design GmbH,CFO,2019-07,,ACTIVE,Other,Management
...,...,...,...,...,...,...,...,...,...,...
2633,2633,607,1,Bistum,Justitiar,2014-03,,ACTIVE,Other,Professional
2634,2634,607,2,FORESTA Management,Geschäftsführer,2010-01,,ACTIVE,Other,Management
2635,2635,607,3,Malteser Hilfsdienst,Präsidium,1999,,ACTIVE,Other,Management
2636,2636,607,4,Spies & Brunner RAe,Rechtsanwalt,1995-01,,ACTIVE,Other,Professional


In [4]:
def normalize_text(s):
    """Return a canonical form of ``s`` for case-insensitive substring matching.

    Values that ``pd.isna`` reports as missing become the empty string.
    Anything else is converted with ``str``, lower-cased, trimmed, and has
    every run of whitespace collapsed to a single space.
    """
    if pd.isna(s):
        return ""
    # ``split()`` without arguments discards leading/trailing whitespace and
    # splits on whitespace runs, so re-joining both trims and collapses.
    return " ".join(str(s).lower().split())


In [5]:
# Store a normalized copy of every text column that takes part in matching,
# so lexicon entries and job titles are compared in the same canonical form.
department_df["text_norm"] = department_df["text"].map(normalize_text)
seniority_df["text_norm"] = seniority_df["text"].map(normalize_text)
jobs_annotated_df["job_title_norm"] = jobs_annotated_df["position"].map(normalize_text)

In [6]:
def predict_seniority_rule_based(title, seniority_df_):
    """Predict a seniority label for a job title by substring lookup.

    The title is normalized with ``normalize_text`` and compared against the
    ``text_norm`` column of ``seniority_df_`` in row order. The ``label`` of
    the first row whose pattern occurs inside the normalized title wins.

    Parameters
    ----------
    title
        Raw job title; missing values are handled by ``normalize_text``.
    seniority_df_ : pandas.DataFrame
        Lexicon with a ``text_norm`` column (patterns) and a ``label`` column.

    Returns
    -------
    The label of the first matching row, or ``"Professional"`` when no
    pattern matches.

    Notes
    -----
    Matching is a plain substring test without word boundaries, so a short
    pattern also matches when it appears inside a longer word of the title.
    """
    title_norm = normalize_text(title)

    # Walk the two columns directly instead of using iterrows(): iterrows()
    # builds a Series object for every lexicon row on every call, which
    # dominates the runtime when this function is applied to many titles.
    for pattern, label in zip(seniority_df_["text_norm"], seniority_df_["label"]):
        # Skip missing and empty patterns: a non-string value cannot be used
        # with `in`, and "" is a substring of every title.
        if isinstance(pattern, str) and pattern and pattern in title_norm:
            return label

    return "Professional"  # fallback

In [7]:
# 3) Build the prediction frame (row_id is kept so rows can be traced back).
# Restrict the annotated jobs to the entries whose status is ACTIVE.
jobs_annotated_df = jobs_annotated_df.loc[jobs_annotated_df["status"] == "ACTIVE"].copy()
pred_df = (
    jobs_annotated_df[["row_id", "position", "seniority"]]
    .copy()
    .rename(columns={"seniority": "seniority_true"})
)

In [8]:
# Preview the evaluation frame: row id, raw title, and the annotated seniority.
pred_df

Unnamed: 0,row_id,position,seniority_true
0,0,Prokurist,Management
1,1,CFO,Management
2,2,Betriebswirtin,Professional
3,3,Prokuristin,Management
4,4,CFO,Management
...,...,...,...
2632,2632,Kanzler der deutschen Assoziation,Management
2633,2633,Justitiar,Professional
2634,2634,Geschäftsführer,Management
2635,2635,Präsidium,Management


In [9]:
# Run the rule-based matcher on every job title, using the original seniority lexicon.
pred_df["seniority_pred"] = pred_df["position"].apply(
    predict_seniority_rule_based, args=(seniority_df,)
)

In [10]:
# Flag every row whose predicted seniority equals the annotated one, then
# report the share of correct rows.
pred_df["correct"] = pred_df["seniority_pred"].eq(pred_df["seniority_true"])
total = pred_df.shape[0]
accuracy = pred_df["correct"].sum() / total

print(f"Total predictions: {total}")
print(f"Accuracy: {accuracy:.4f}")

Total predictions: 623
Accuracy: 0.5666


In [11]:
# Same accuracy expressed as a percentage, followed by the first scored rows.
print("Accuracy:", round(accuracy * 100, 2), "%")
pred_df.head()

Accuracy: 56.66 %


Unnamed: 0,row_id,position,seniority_true,seniority_pred,correct
0,0,Prokurist,Management,Professional,False
1,1,CFO,Management,Professional,False
2,2,Betriebswirtin,Professional,Professional,True
3,3,Prokuristin,Management,Professional,False
4,4,CFO,Management,Professional,False


In [12]:
# Show only the rows that were predicted correctly.
# "correct" is already a boolean column, so it serves as the mask directly;
# comparing it with `== True` is redundant.
pred_df[pred_df["correct"]]

Unnamed: 0,row_id,position,seniority_true,seniority_pred,correct
2,2,Betriebswirtin,Professional,Professional,True
6,6,Solutions Architect,Professional,Professional,True
14,14,Medizintechnik Beratung,Professional,Professional,True
19,19,Administrador Unico,Professional,Professional,True
32,32,Kaufmännischer Leiter,Lead,Lead,True
...,...,...,...,...,...
2618,2618,Owner - CSO,Management,Management,True
2626,2626,"Buying, Porecurement & Allocation",Professional,Professional,True
2633,2633,Justitiar,Professional,Professional,True
2634,2634,Geschäftsführer,Management,Management,True


In [13]:
# Keep only the rows whose prediction is not "Professional", i.e. drop the
# rows that received the fallback label.
# NOTE(review): this also drops rows where a lexicon entry labelled
# "Professional" matched; presumably the lexicon has no such entries - confirm.
no_fallback_pred = pred_df[pred_df["seniority_pred"] != "Professional"]

# Recompute the accuracy on the remaining rows.
total = len(no_fallback_pred)
accuracy = no_fallback_pred["correct"].sum() / total
print(f"Total predictions (no fallback): {total}")
print(f"Accuracy (no fallback): {accuracy:.4f}")

Total predictions (no fallback): 383
Accuracy (no fallback): 0.5744


In [14]:
# How many rows in no_fallback_pred are annotated as 'Professional'?
# The baseline cannot get these right, since every prediction left in this
# subset is something other than "Professional".
count_professional_true = no_fallback_pred["seniority_true"].eq("Professional").sum()
print("Number of predictions where seniority_true is 'Professional' (no fallback):", count_professional_true)
# Express that count as a percentage of all rows in no_fallback_pred.
percent_professional_true = count_professional_true / no_fallback_pred.shape[0] * 100
print("Percentage of predictions where seniority_true is 'Professional' (no fallback):", round(percent_professional_true, 2), "%")



Number of predictions where seniority_true is 'Professional' (no fallback): 83
Percentage of predictions where seniority_true is 'Professional' (no fallback): 21.67 %


In [15]:
# Additionally drop the rows whose annotated label is 'Professional' and
# score the baseline once more on what is left.
no_professional_true_pred = no_fallback_pred.loc[
    no_fallback_pred["seniority_true"].ne("Professional")
]
total = no_professional_true_pred.shape[0]
accuracy = no_professional_true_pred["correct"].sum() / total
print(f"Total predictions (no fallback, no 'Professional' true): {total}")
print(f"Accuracy (no fallback, no 'Professional' true): {accuracy:.4f}")

Total predictions (no fallback, no 'Professional' true): 300
Accuracy (no fallback, no 'Professional' true): 0.7333


Because the predictions are considerably better once the fallback and the 'Professional' class are excluded (accuracy of 0.73 on the remaining 300 rows), we want to see how well the baseline performs with additional data and without a fallback.

We are analysing this because, if the baseline did well in this scenario, we could use the baseline with additional data and no fallback instead of the fine-tuned model with fallback. It would then make sense to use a hybrid approach that combines the baseline with a fallback.

So it seems that our baseline would work really well if we had 'Professional' labels and no fallback. This is why we try the baseline with additional data and without a fallback next.

In [16]:
# Ordinal encoding of the seniority levels: the levels are listed from lowest
# to highest and numbered 1.0, 2.0, ... in that order.
ORD_MAP = {
    level: float(rank)
    for rank, level in enumerate(
        ("Junior", "Professional", "Senior", "Lead", "Management", "Director"),
        start=1,
    )
}
# Reverse lookup: ordinal value -> seniority label.
INV_ORD = dict((rank, level) for level, rank in ORD_MAP.items())

In [17]:
def add_synthetic(
    train_df: pd.DataFrame,
    synthetic_csv_relpath: str,
    project_root: str = "..",
) -> pd.DataFrame:
    """Append synthetic, ordinal-coded examples to a labelled training frame.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Existing training data; only its ``text`` and ``label`` columns are used.
    synthetic_csv_relpath : str
        Location of the synthetic CSV, resolved against ``project_root``.
        An absolute path is used as-is. The file must provide the columns
        ``position`` and ``seniority``.
    project_root : str, default ".."
        Directory that ``synthetic_csv_relpath`` is relative to.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``text`` and ``label`` and a fresh
        0..n-1 index: the rows of ``train_df`` followed by the synthetic rows.
        Synthetic rows with a missing title, or with a ``seniority`` value
        that has no entry in ``INV_ORD``, are dropped.
    """
    # Honour the path argument instead of a hard-coded file name.
    syn = pd.read_csv(Path(project_root) / synthetic_csv_relpath)
    syn = syn[["position", "seniority"]].rename(columns={"position": "text"})

    # Translate the ordinal seniority codes to label names with the shared
    # reverse mapping; codes without an entry become NaN and are dropped below.
    syn["label"] = syn["seniority"].map(INV_ORD)
    syn = syn.dropna(subset=["text", "label"])

    return pd.concat(
        [train_df[["text", "label"]], syn[["text", "label"]]],
        ignore_index=True,
    )

In [18]:
# Combine the seniority lexicon with the synthetic examples into one training frame.
train_df_aug = add_synthetic(seniority_df, "data/results/gemini_synthetic.csv")

In [19]:
# Inspect the augmented lexicon (original rows first, synthetic rows appended).
train_df_aug

Unnamed: 0,text,label
0,Analyst,Junior
1,Analyste financier,Junior
2,Anwendungstechnischer Mitarbeiter,Junior
3,Application Engineer,Senior
4,Applications Engineer,Senior
...,...,...
11309,Juristischer Berater,Professional
11310,"Leitung Personal, Finanzen, Einkauf, IT | Folk...",Management
11311,Verwaltungsleitung Landesspracheninstitut in d...,Management
11312,"Leitung Gebäudemanagement, Einkauf und Control...",Management


In [20]:
# The augmented frame only carries "text" and "label", so the normalized
# pattern column has to be computed again before matching.
train_df_aug["text_norm"] = train_df_aug["text"].map(normalize_text)

In [21]:
def predict_seniority_rule_based_no_fallback(title, seniority_df_):
    """Predict a seniority label by substring lookup, without a default class.

    The title is normalized with ``normalize_text`` and compared against the
    ``text_norm`` column of ``seniority_df_`` in row order. The ``label`` of
    the first row whose pattern occurs inside the normalized title wins.

    Parameters
    ----------
    title
        Raw job title; missing values are handled by ``normalize_text``.
    seniority_df_ : pandas.DataFrame
        Lexicon with a ``text_norm`` column (patterns) and a ``label`` column.

    Returns
    -------
    The label of the first matching row, or the sentinel ``"no-label"`` when
    no pattern matches, so callers can filter unmatched titles out.

    Notes
    -----
    Matching is a plain substring test without word boundaries, so a short
    pattern also matches when it appears inside a longer word of the title.
    """
    title_norm = normalize_text(title)

    # Walk the two columns directly instead of using iterrows(): iterrows()
    # builds a Series object for every lexicon row on every call, which
    # dominates the runtime when this function is applied to many titles.
    for pattern, label in zip(seniority_df_["text_norm"], seniority_df_["label"]):
        # Skip missing and empty patterns: a non-string value cannot be used
        # with `in`, and "" is a substring of every title.
        if isinstance(pattern, str) and pattern and pattern in title_norm:
            return label

    return "no-label"  # sentinel for "nothing matched"

In [22]:
# Fresh evaluation frame for the augmented baseline: row id, raw title, and
# the annotated seniority under the name "seniority_true".
pred_baseline_df = (
    jobs_annotated_df[["row_id", "position", "seniority"]]
    .copy()
    .rename(columns={"seniority": "seniority_true"})
)

In [23]:
# Match every job title against the augmented lexicon; titles without a
# match receive the "no-label" sentinel.
pred_baseline_df["seniority_pred"] = pred_baseline_df["position"].apply(
    predict_seniority_rule_based_no_fallback, args=(train_df_aug,)
)

In [24]:
# Drop the titles that no lexicon entry matched (seniority_pred == "no-label").
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so adding the "correct" column below cannot raise pandas'
# SettingWithCopyWarning.
pred_baseline_df = pred_baseline_df[pred_baseline_df["seniority_pred"] != "no-label"].copy()
# Flag the rows whose prediction equals the annotated seniority.
pred_baseline_df["correct"] = pred_baseline_df["seniority_true"] == pred_baseline_df["seniority_pred"]

pred_baseline_df

Unnamed: 0,row_id,position,seniority_true,seniority_pred,correct
0,0,Prokurist,Management,Management,True
1,1,CFO,Management,Management,True
3,3,Prokuristin,Management,Management,True
4,4,CFO,Management,Management,True
17,17,Director expansión de negocio.,Director,Management,False
...,...,...,...,...,...
2620,2620,Site Manager,Professional,Senior,False
2625,2625,"SAP Consultant, Geschäftsführer, Gesellschafter",Management,Senior,False
2633,2633,Justitiar,Professional,Professional,True
2634,2634,Geschäftsführer,Management,Management,True


In [25]:
# Share of correct predictions among the titles that received a label.
total = len(pred_baseline_df)
# Plain string literal: the original f-string had no placeholders.
print("accuracy:", pred_baseline_df["correct"].sum() / total)

accuracy: 0.5829787234042553


Unfortunately, this performance (an accuracy of about 0.58) is not good enough and is worse than using only the fine-tuned model. This is most likely because the additional data is too noisy and not of sufficient quality. Therefore, it does not make sense to use this hybrid approach, and we do not pursue it any further.

However, we want to give it a try with the department. Here the labeled data should be sufficient, because we do not have the problem that a label occurring in the test data is missing from the training data. Also, the LLM was much better at predicting the department than the seniority, so the additional data may be of higher quality there.

## Department Hybrid Finetuned Approach with Additional Data and No Fallback

First we want to see which approach works best for department prediction: using only the given labeled data, or using our additional synthetic data as well.

In [26]:
def predict_department_rule_based(title, department_df):
    """Predict a department label for a job title by substring lookup.

    The title is normalized with ``normalize_text`` and compared against the
    ``text_norm`` column of ``department_df`` in row order. The ``label`` of
    the first row whose pattern occurs inside the normalized title wins.

    Parameters
    ----------
    title
        Raw job title; missing values are handled by ``normalize_text``.
    department_df : pandas.DataFrame
        Lexicon with a ``text_norm`` column (patterns) and a ``label`` column.

    Returns
    -------
    The label of the first matching row, or the sentinel ``"no-label"`` when
    no pattern matches.

    Notes
    -----
    Matching is a plain substring test without word boundaries, so a short
    pattern also matches when it appears inside a longer word of the title.
    """
    title_norm = normalize_text(title)

    # Walk the two columns directly instead of using iterrows(): iterrows()
    # builds a Series object for every lexicon row on every call.
    for pattern, label in zip(department_df["text_norm"], department_df["label"]):
        # An empty pattern is a substring of every string, so without this
        # guard a single blank lexicon row would assign its label to every
        # title. Non-string (missing) patterns are skipped for the same
        # reason the seniority matchers skip them.
        if isinstance(pattern, str) and pattern and pattern in title_norm:
            return label

    return "no-label"

In [27]:
# Inspect the job records again; the frame now also carries "job_title_norm".
jobs_annotated_df

Unnamed: 0,row_id,cv_id,job_index,organization,position,startDate,endDate,status,department,seniority,job_title_norm
0,0,0,0,Depot4Design GmbH,Prokurist,2019-08,,ACTIVE,Other,Management,prokurist
1,1,0,1,Depot4Design GmbH,CFO,2019-07,,ACTIVE,Other,Management,cfo
2,2,0,2,Depot4Design GmbH,Betriebswirtin,2019-07,,ACTIVE,Other,Professional,betriebswirtin
3,3,0,3,Depot4Design GmbH,Prokuristin,2019-07,,ACTIVE,Other,Management,prokuristin
4,4,0,4,Depot4Design GmbH,CFO,2019-07,,ACTIVE,Other,Management,cfo
...,...,...,...,...,...,...,...,...,...,...,...
2632,2632,607,0,Malteserorden,Kanzler der deutschen Assoziation,2015-01,,ACTIVE,Other,Management,kanzler der deutschen assoziation
2633,2633,607,1,Bistum,Justitiar,2014-03,,ACTIVE,Other,Professional,justitiar
2634,2634,607,2,FORESTA Management,Geschäftsführer,2010-01,,ACTIVE,Other,Management,geschäftsführer
2635,2635,607,3,Malteser Hilfsdienst,Präsidium,1999,,ACTIVE,Other,Management,präsidium


In [28]:
# Keep only ACTIVE jobs, then build the department evaluation frame with the
# annotated department under the name "department_true".
jobs_annotated_df = jobs_annotated_df.loc[jobs_annotated_df["status"] == "ACTIVE"].copy()
pred_department_df = (
    jobs_annotated_df[["row_id", "position", "department"]]
    .copy()
    .rename(columns={"department": "department_true"})
)

In [29]:
# Match every job title against the department lexicon; titles without a
# match receive the "no-label" sentinel.
pred_department_df["department_pred"] = pred_department_df["position"].apply(
    predict_department_rule_based, args=(department_df,)
)

In [30]:
# Flag the rows whose predicted department equals the annotated one.
pred_department_df["correct"] = pred_department_df["department_pred"].eq(
    pred_department_df["department_true"]
)


In [31]:
# Remove the rows where no department lexicon entry matched
# (department_pred == "no-label").
pred_department_df = pred_department_df[pred_department_df["department_pred"] != "no-label"]

In [32]:
# Inspect the department predictions that remain after dropping "no-label" rows.
pred_department_df

Unnamed: 0,row_id,position,department_true,department_pred,correct
6,6,Solutions Architect,Information Technology,Information Technology,True
32,32,Kaufmännischer Leiter,Sales,Information Technology,False
54,54,Projektmanager,Project Management,Project Management,True
56,56,Leistungssachbearbeiter SGB II,Other,Information Technology,False
57,57,energy and sustainability consultant bei,Consulting,Consulting,True
...,...,...,...,...,...
2592,2592,Softwareentwickler,Information Technology,Information Technology,True
2609,2609,Advanced Purchasing - Commercial Vehicles and ...,Purchasing,Purchasing,True
2620,2620,Site Manager,Project Management,Information Technology,False
2625,2625,"SAP Consultant, Geschäftsführer, Gesellschafter",Consulting,Consulting,True


In [33]:
# Accuracy over the titles that received a department label.
total = pred_department_df.shape[0]
accuracy = pred_department_df["correct"].sum() / total

print(f"Total predictions: {total}")
print(f"Accuracy: {accuracy:.4f}")

Total predictions: 204
Accuracy: 0.4853


The accuracy of about 0.49 is still not good enough, so we will not pursue this any further.