In [1]:
import pandas as pd

In [15]:
# Read the annotated job rows and the Gemini result rows from their CSV files.
annotated_jobs_csv = "../../data/processed/jobs_annotated_active.csv"
gemini_results_csv = "../../data/results/gemini_results.csv"

test_data = pd.read_csv(annotated_jobs_csv)
prediction_df = pd.read_csv(gemini_results_csv)

In [16]:
# Preview the first five annotated rows.
test_data.head(n=5)

Unnamed: 0,row_id,cv_id,job_index,organization,position,startDate,endDate,status,department,seniority
0,0,0,0,Depot4Design GmbH,Prokurist,2019-08,,ACTIVE,Other,Management
1,1,0,1,Depot4Design GmbH,CFO,2019-07,,ACTIVE,Other,Management
2,2,0,2,Depot4Design GmbH,Betriebswirtin,2019-07,,ACTIVE,Other,Professional
3,3,0,3,Depot4Design GmbH,Prokuristin,2019-07,,ACTIVE,Other,Management
4,4,0,4,Depot4Design GmbH,CFO,2019-07,,ACTIVE,Other,Management


In [4]:
# Preview the first five prediction rows.
prediction_df.head(n=5)

Unnamed: 0,row_id,position,seniority,department
0,0,Prokurist,5.0,Other
1,1,CFO,6.0,Other
2,2,Betriebswirtin,2.0,Business Development
3,3,Prokuristin,5.0,Other
4,4,CFO,6.0,Other


In [6]:
# Attach the predictions to the annotated rows via `row_id`. The prediction columns are
# renamed first so they do not collide with the annotated `seniority` / `department` columns.
prediction_df = prediction_df.rename(
    columns={"seniority": "predicted_seniority", "department": "predicted_department"}
)

# The left join keeps every annotated row; rows without a prediction get NaN.
# validate="many_to_one" makes pandas raise a MergeError if `row_id` is duplicated in
# the predictions, which would otherwise silently duplicate rows and skew the
# accuracy figures computed from `results_df`.
results_df = pd.merge(
    test_data,
    prediction_df[["row_id", "predicted_seniority", "predicted_department"]],
    on="row_id",
    how="left",
    validate="many_to_one",
)

# Drop the columns that are not used in the comparison: job_index, startDate, endDate, status.
results_df = results_df.drop(columns=["job_index", "startDate", "endDate", "status"])
results_df.head()

Unnamed: 0,row_id,cv_id,organization,position,department,seniority,predicted_seniority,predicted_department
0,0,0,Depot4Design GmbH,Prokurist,Other,Management,5.0,Other
1,1,0,Depot4Design GmbH,CFO,Other,Management,6.0,Other
2,2,0,Depot4Design GmbH,Betriebswirtin,Other,Professional,2.0,Business Development
3,3,0,Depot4Design GmbH,Prokuristin,Other,Management,5.0,Other
4,4,0,Depot4Design GmbH,CFO,Other,Management,6.0,Other


In [7]:
# Encode the annotated seniority labels as floats so they can be compared directly
# with the numeric `predicted_seniority` column.
seniority_map = {
    "Junior": 1.0,
    "Professional": 2.0,   # does not occur in the training data, which is fine
    "Senior": 3.0,
    "Lead": 4.0,
    "Management": 5.0,
    "Director": 6.0
}

# Only encode while the column still holds labels. Mapping an already-encoded
# (numeric) column again would turn every value into NaN, so re-running this cell
# must be a no-op.
if not pd.api.types.is_numeric_dtype(results_df["seniority"]):
    # Labels missing from the map would silently become NaN and then be counted as
    # wrong predictions; fail loudly instead. Missing labels (NaN) are left as NaN.
    unmapped_labels = set(results_df["seniority"].dropna()) - set(seniority_map)
    if unmapped_labels:
        raise ValueError(
            f"Seniority labels missing from seniority_map: {sorted(unmapped_labels, key=str)}"
        )
    results_df["seniority"] = results_df["seniority"].map(seniority_map)

results_df.head()

Unnamed: 0,row_id,cv_id,organization,position,department,seniority,predicted_seniority,predicted_department
0,0,0,Depot4Design GmbH,Prokurist,Other,5.0,5.0,Other
1,1,0,Depot4Design GmbH,CFO,Other,5.0,6.0,Other
2,2,0,Depot4Design GmbH,Betriebswirtin,Other,2.0,2.0,Business Development
3,3,0,Depot4Design GmbH,Prokuristin,Other,5.0,5.0,Other
4,4,0,Depot4Design GmbH,CFO,Other,5.0,6.0,Other


In [10]:
# Percentage of rows whose predicted seniority equals the annotated seniority.
seniority_match = results_df["seniority"].eq(results_df["predicted_seniority"])
correct_seniority = results_df.loc[seniority_match]

n_seniority_correct = len(correct_seniority)
n_rows_total = len(results_df)
accuracy_seniority = n_seniority_correct / n_rows_total * 100

print(f"Seniority Prediction Accuracy: {accuracy_seniority:.2f}%")

Seniority Prediction Accuracy: 51.04%


In [11]:
# Show the first ten rows where the predicted seniority differs from the annotated one.
seniority_mismatch = results_df["seniority"].ne(results_df["predicted_seniority"])
incorrect_seniority = results_df.loc[seniority_mismatch]
incorrect_seniority.head(n=10)

Unnamed: 0,row_id,cv_id,organization,position,department,seniority,predicted_seniority,predicted_department
1,1,0,Depot4Design GmbH,CFO,Other,5.0,6.0,Other
4,4,0,Depot4Design GmbH,CFO,Other,5.0,6.0,Other
5,5,1,Computer Solutions,Solutions Architect,Information Technology,2.0,3.0,Information Technology
8,8,3,Air & Ground Operations Consultancy,Gerente comercial,Sales,4.0,5.0,Sales
9,9,3,Viajes Oceano S.L.,Administrador Unico,Administrative,2.0,5.0,Administrative
10,10,4,Himmelstalunds Utbildningscentrum,"APL-ansvarig, samordning",Administrative,4.0,2.0,Other
11,11,5,FSD Fahrzeugsystemdaten GmbH,Kaufmännischer Leiter,Sales,4.0,5.0,Other
14,14,8,PhoboSys GmbH,Managing Director,Other,5.0,6.0,Other
15,15,11,Twentieth Century Fox,Set Lighting,Other,2.0,1.0,Other
16,16,11,Sony,Set Lighting,Other,2.0,1.0,Other


In [12]:
# Percentage of rows whose predicted department equals the annotated department.
department_match = results_df["department"].eq(results_df["predicted_department"])
correct_department = results_df.loc[department_match]

n_department_correct = len(correct_department)
n_rows_total = len(results_df)
accuracy_department = n_department_correct / n_rows_total * 100

print(f"Department Prediction Accuracy: {accuracy_department:.2f}%")

Department Prediction Accuracy: 72.87%


In [17]:
# Show the first ten rows where the predicted department differs from the annotated one.
department_mismatch = results_df["department"].ne(results_df["predicted_department"])
incorrect_department = results_df.loc[department_mismatch]
incorrect_department.head(n=10)

Unnamed: 0,row_id,cv_id,organization,position,department,seniority,predicted_seniority,predicted_department
2,2,0,Depot4Design GmbH,Betriebswirtin,Other,2.0,2.0,Business Development
10,10,4,Himmelstalunds Utbildningscentrum,"APL-ansvarig, samordning",Administrative,4.0,2.0,Other
11,11,5,FSD Fahrzeugsystemdaten GmbH,Kaufmännischer Leiter,Sales,4.0,5.0,Other
21,21,13,Landeshauptstadt Wiesbaden,Leistungssachbearbeiter SGB II,Other,2.0,2.0,Administrative
28,28,21,Gebrüder Weiß & Cie. KG,Geschäftsführender Gesellschafter,Other,5.0,6.0,Business Development
32,32,27,Stork Lijmen,Co-Owner,Other,5.0,5.0,Business Development
34,34,27,ESBA Horti Solutions BV,CEO/Founder,Other,5.0,6.0,Business Development
43,43,39,Quest ontheFRONTIER,Human Resources Consultant,Consulting,2.0,2.0,Human Resources
52,52,49,Wehner Consulting GmbH,Geschäftsführer,Other,5.0,5.0,Business Development
57,57,55,myclimate,Projektleiterin Bildung,Other,4.0,4.0,Project Management
