<a href="https://colab.research.google.com/github/luisadosch/Final-Project-snapAddy/blob/main/model2_embedding_based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Github-Zugangsdaten

In [44]:
# GitHub-Zugangsdaten
import pandas as pd

# Coordinates of the GitHub repository that hosts the project's CSV files.
GH_USER = "luisadosch"
GH_REPO = "Final-Project-snapAddy"
BRANCH = "main"


def get_github_url(relative_path):
    """Return the raw.githubusercontent.com URL of a file in the project repo.

    Args:
        relative_path: Path of the file relative to the repository root.

    Returns:
        The URL assembled from GH_USER, GH_REPO, BRANCH and ``relative_path``.
    """
    url_parts = (
        "https://raw.githubusercontent.com",
        GH_USER,
        GH_REPO,
        BRANCH,
        f"{relative_path}",
    )
    return "/".join(url_parts)


# Download the three CSV files used throughout the notebook, in this order:
# the annotated ACTIVE jobs, the department table and the seniority table.
jobs_annotated_active_df, department_df, seniority_df = (
    pd.read_csv(get_github_url(csv_path))
    for csv_path in (
        "data/processed/jobs_annotated_active.csv",
        "data/raw/department-v2.csv",
        "data/raw/seniority-v2.csv",
    )
)

# 2. Modell Seniority

In [45]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score, classification_report
from collections import defaultdict
import numpy as np

# --- Label names and their example texts from seniority_df ---
slabel_names = seniority_df["label"].astype(str).tolist()
slabel_texts = seniority_df["text"].astype(str).tolist()

# --- Ground-truth labels for evaluation (ACTIVE jobs) ---
strue_seniority = jobs_annotated_active_df["seniority"].astype(str).tolist()

# --- Load the embedding model ---
sembed_model = SentenceTransformer("all-MiniLM-L6-v2")

# --- Embed the label texts ---
X = sembed_model.encode(slabel_texts, convert_to_tensor=True)
# Convert to a CPU NumPy array once; scikit-learn works on NumPy, not on torch tensors.
X_np = X.cpu().numpy()

# --- One centroid (mean embedding) per seniority label ---
# defaultdict keeps first-seen label order, which fixes the column order of
# proto_embs and therefore the tie-breaking of argmax below.
by_label = defaultdict(list)
for emb, lab in zip(X_np, slabel_names):
    by_label[lab].append(emb)

proto_labels = list(by_label.keys())
proto_embs = np.vstack([
    np.mean(by_label[label], axis=0)
    for label in proto_labels
])

# --- Embed the ACTIVE job titles ---
E = sembed_model.encode(
    jobs_annotated_active_df["position"].astype(str).tolist(),
    convert_to_tensor=True
)

# --- Predictions ---
# A single call yields the (n_jobs, n_labels) similarity matrix, so the
# centroids are not re-normalised once per job title.
sims = cosine_similarity(E.cpu().numpy(), proto_embs)
spred_seniority = [proto_labels[idx] for idx in sims.argmax(axis=1)]

# --- Evaluation ---
s_eval_accuracy = accuracy_score(strue_seniority, spred_seniority)
s_eval_macro_f1 = f1_score(strue_seniority, spred_seniority, average="macro")

print("Embedding-based Seniority Prediction on ACTIVE Jobs")
print("Accuracy:", round(s_eval_accuracy, 3))
print("Macro F1:", round(s_eval_macro_f1, 3))
print("\nClassification Report:\n")
# zero_division=0 reports never-predicted classes as 0.0 (the same value as the
# default) without emitting an undefined-metric warning for each average.
print(classification_report(strue_seniority, spred_seniority, zero_division=0))

Embedding-based Seniority Prediction on ACTIVE Jobs
Accuracy: 0.409
Macro F1: 0.35

Classification Report:

              precision    recall  f1-score   support

    Director       0.41      0.82      0.55        34
      Junior       0.07      0.33      0.12        12
        Lead       0.39      0.43      0.41       125
  Management       0.73      0.70      0.71       192
Professional       0.00      0.00      0.00       216
      Senior       0.19      0.77      0.31        44

    accuracy                           0.41       623
   macro avg       0.30      0.51      0.35       623
weighted avg       0.34      0.41      0.36       623



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [46]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np

# MODEL KNOWLEDGE (ONLY seniority_df): every (text, label) row is a reference example.
stexts = seniority_df["text"].astype(str).tolist()
slabels = seniority_df["label"].astype(str).tolist()

# EVALUATION DATA (ONLY ACTIVE jobs)
seval_texts = jobs_annotated_active_df["position"].astype(str).tolist()
strue_seniority = jobs_annotated_active_df["seniority"].astype(str).tolist()

# Embedding model
sembed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Embeddings
smodel_embeddings = sembed_model.encode(stexts, convert_to_tensor=True)
seval_embeddings = sembed_model.encode(seval_texts, convert_to_tensor=True)

# Prediction: each job title takes the label of its most similar reference text.
# The tensors are moved to CPU NumPy arrays first, since scikit-learn works on
# NumPy input; the other model cells already call .cpu() before this step.
ssims = cosine_similarity(
    seval_embeddings.cpu().numpy(),
    smodel_embeddings.cpu().numpy(),
)
spred_seniority = [slabels[idx] for idx in ssims.argmax(axis=1)]

# Evaluation
# NOTE: this reassigns s_eval_accuracy / s_eval_macro_f1, so the comparison
# tables further down report this nearest-neighbour variant as the seniority baseline.
s_eval_accuracy = accuracy_score(strue_seniority, spred_seniority)
s_eval_macro_f1 = f1_score(strue_seniority, spred_seniority, average="macro")

print("Embedding-based Seniority Prediction on ACTIVE Jobs")
print("Accuracy:", round(s_eval_accuracy, 3))
print("Macro F1:", round(s_eval_macro_f1, 3))
print("\nClassification Report:\n")
# zero_division=0 reports never-predicted classes as 0.0 without a warning per average.
print(classification_report(strue_seniority, spred_seniority, zero_division=0))

Embedding-based Seniority Prediction on ACTIVE Jobs
Accuracy: 0.43
Macro F1: 0.392

Classification Report:

              precision    recall  f1-score   support

    Director       0.42      0.82      0.55        34
      Junior       0.19      0.42      0.26        12
        Lead       0.45      0.57      0.50       125
  Management       0.87      0.65      0.75       192
Professional       0.00      0.00      0.00       216
      Senior       0.17      0.89      0.28        44

    accuracy                           0.43       623
   macro avg       0.35      0.56      0.39       623
weighted avg       0.40      0.43      0.39       623



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


This block performs zero-shot classification of job seniority using embeddings. Each job title is converted into a dense vector using a pre-trained embedding model, and each possible seniority label is also mapped into the same embedding space. Cosine similarity is then computed between the job title embeddings and the label embeddings, and the label with the highest similarity is assigned to the job. This method does not require any training, making it a simple and interpretable embedding-based approach.

Applying this embedding-based approach to predict seniority on ACTIVE jobs results in an accuracy of 0.43 and a macro F1 score of 0.392. The detailed classification report is as follows:

| Label            | Precision | Recall | F1-score | Support |
| ---------------- | --------- | ------ | -------- | ------- |
| Director         | 0.42      | 0.82   | 0.55     | 34      |
| Junior           | 0.19      | 0.42   | 0.26     | 12      |
| Lead             | 0.45      | 0.57   | 0.50     | 125     |
| Management       | 0.87      | 0.65   | 0.75     | 192     |
| Professional     | 0.00      | 0.00   | 0.00     | 216     |
| Senior           | 0.17      | 0.89   | 0.28     | 44      |
| **Accuracy**     |           |        | 0.43     | 623     |
| **Macro avg**    | 0.35      | 0.56   | 0.39     | 623     |
| **Weighted avg** | 0.40      | 0.43   | 0.39     | 623     |


These results indicate that while some seniority levels, such as Director and Senior, are predicted with reasonable recall, the overall performance across all classes is limited. The approach captures high-level trends but struggles with frequent and ambiguous classes like Professional, highlighting the challenges of zero-shot classification in a domain with class imbalance and short, noisy job titles.

# 3. Modell Department

In [47]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score, classification_report
from collections import defaultdict
import numpy as np

# --- Label names and their example texts from department_df ---
dlabel_names = department_df["label"].astype(str).tolist()
dlabel_texts = department_df["text"].astype(str).tolist()

# --- Ground-truth labels for evaluation (ACTIVE jobs) ---
dtrue_department = jobs_annotated_active_df["department"].astype(str).tolist()

# --- Load the embedding model ---
dembed_model = SentenceTransformer("all-MiniLM-L6-v2")

# --- Embed the label texts ---
X = dembed_model.encode(dlabel_texts, convert_to_tensor=True)
# Convert to a CPU NumPy array once; scikit-learn works on NumPy, not on torch tensors.
X_np = X.cpu().numpy()

# --- One centroid (mean embedding) per department label ---
# defaultdict keeps first-seen label order, which fixes the column order of
# proto_embs and therefore the tie-breaking of argmax below.
by_label = defaultdict(list)
for emb, lab in zip(X_np, dlabel_names):
    by_label[lab].append(emb)

proto_labels = list(by_label.keys())
proto_embs = np.vstack([
    np.mean(by_label[label], axis=0)
    for label in proto_labels])

# --- Embed the ACTIVE job titles ---
E = dembed_model.encode(
    jobs_annotated_active_df["position"].astype(str).tolist(),
    convert_to_tensor=True)

# --- Predictions ---
# A single call yields the (n_jobs, n_labels) similarity matrix, so the
# centroids are not re-normalised once per job title.
sims = cosine_similarity(E.cpu().numpy(), proto_embs)
dpred_department = [proto_labels[idx] for idx in sims.argmax(axis=1)]

# --- Evaluation ---
d_eval_accuracy = accuracy_score(dtrue_department, dpred_department)
d_eval_macro_f1 = f1_score(dtrue_department, dpred_department, average="macro")

print("Embedding-based Department Prediction on ACTIVE Jobs")
print("Accuracy:", round(d_eval_accuracy, 3))
print("Macro F1:", round(d_eval_macro_f1, 3))
print("\nClassification Report:\n")
# zero_division=0 keeps the report free of warnings if a class is never predicted.
print(classification_report(dtrue_department, dpred_department, zero_division=0))

Embedding-based Department Prediction on ACTIVE Jobs
Accuracy: 0.315
Macro F1: 0.315

Classification Report:

                        precision    recall  f1-score   support

        Administrative       0.03      0.21      0.05        14
  Business Development       0.17      0.35      0.23        20
            Consulting       0.28      0.67      0.40        39
      Customer Support       0.29      0.33      0.31         6
       Human Resources       0.31      0.62      0.42        16
Information Technology       0.35      0.31      0.32        62
             Marketing       0.39      0.50      0.44        22
                 Other       0.74      0.22      0.33       344
    Project Management       0.57      0.59      0.58        39
            Purchasing       0.02      0.07      0.03        15
                 Sales       0.29      0.43      0.35        46

              accuracy                           0.31       623
             macro avg       0.31      0.39      0.31   

This block applies the embedding-based zero-shot classification method to predict the department of each job. Each job title and each possible department label is converted into a dense vector using a pre-trained embedding model. Cosine similarity is then computed between the job title embeddings and the label embeddings, and the label with the highest similarity is assigned to each job. This method requires no training, making it a simple, interpretable embedding-based approach based purely on semantic similarity.

Applying this embedding-based approach to predict departments on ACTIVE jobs results in an accuracy of 0.315 and a macro F1 score of 0.315. The detailed classification report is:

| Label                    | Precision | Recall | F1-score | Support |
|--------------------------|-----------|--------|----------|---------|
| Administrative           | 0.03      | 0.21   | 0.05     | 14      |
| Business Development     | 0.17      | 0.35   | 0.23     | 20      |
| Consulting               | 0.28      | 0.67   | 0.40     | 39      |
| Customer Support         | 0.29      | 0.33   | 0.31     | 6       |
| Human Resources          | 0.31      | 0.62   | 0.42     | 16      |
| Information Technology   | 0.35      | 0.31   | 0.32     | 62      |
| Marketing                | 0.39      | 0.50   | 0.44     | 22      |
| Other                    | 0.74      | 0.22   | 0.33     | 344     |
| Project Management       | 0.57      | 0.59   | 0.58     | 39      |
| Purchasing               | 0.02      | 0.07   | 0.03     | 15      |
| Sales                    | 0.29      | 0.43   | 0.35     | 46      |
| **Accuracy**             |           |        | **0.31** | **623** |
| **Macro avg**            | **0.31**  | **0.39** | **0.31** | **623** |
| **Weighted avg**         | **0.55**  | **0.31** | **0.34** | **623** |

These results indicate that while some departments, such as Human Resources and Project Management, are predicted with moderate recall, the overall performance is low, particularly for highly frequent and ambiguous labels like Other. This demonstrates the limitations of zero-shot embedding-based classification in domains with unbalanced class distributions and short, noisy job titles.

In [48]:
# --- Side-by-side metrics of the seniority and department models (ACTIVE jobs) ---
comparison_metrics_active = pd.DataFrame(
    [
        ("Seniority (ACTIVE Jobs)", s_eval_accuracy, s_eval_macro_f1),
        ("Department (ACTIVE Jobs)", d_eval_accuracy, d_eval_macro_f1),
    ],
    columns=["Target", "Accuracy", "Macro F1"],
)

print("Embedding-based Model Results (ACTIVE Jobs):\n")
print(comparison_metrics_active)

Embedding-based Model Results (ACTIVE Jobs):

                     Target  Accuracy  Macro F1
0   Seniority (ACTIVE Jobs)  0.430177  0.392017
1  Department (ACTIVE Jobs)  0.314607  0.314955


This block summarizes the evaluation metrics for the embedding-based zero-shot models applied to ACTIVE job titles. Accuracy represents the fraction of correct predictions, while Macro F1 accounts for class imbalance by averaging the F1-score across all labels. Presenting both metrics allows a direct comparison of performance between the Seniority and Department predictions on the same dataset.

The results of the embedding-based models on ACTIVE jobs are as follows:

| Target                   | Accuracy | Macro F1 |
| ------------------------ | -------- | -------- |
| Seniority (ACTIVE Jobs)  | 0.430    | 0.392    |
| Department (ACTIVE Jobs) | 0.315    | 0.315    |

These results indicate that the embedding-based zero-shot approach struggles with predicting both seniority and department on ACTIVE jobs. Seniority achieves a higher accuracy and macro F1 than Department, but overall performance remains limited for both targets, reflecting the challenges of zero-shot classification in domains with noisy, short job titles and class imbalance.

# 4. Modell Seniority mit synthetic Daten

In [49]:
# Seniority label -> numeric rank (stored as floats), plus the reverse lookup.
ORD_MAP = dict(
    zip(
        ("Junior", "Professional", "Senior", "Lead", "Management", "Director"),
        (1.0, 2.0, 3.0, 4.0, 5.0, 6.0),
    )
)
INV_ORD = dict(zip(ORD_MAP.values(), ORD_MAP.keys()))

In [50]:
def add_synthetic(train_df: pd.DataFrame, synthetic_csv_relpath: str) -> pd.DataFrame:
    """Append synthetic seniority examples to a ``text``/``label`` frame.

    The synthetic CSV is fetched via ``get_github_url``. Its ``seniority``
    column is translated to label names through the inverse of ``ORD_MAP``,
    and ``position`` is renamed to ``text``. Rows with a missing text or an
    unmapped seniority value are dropped.

    Args:
        train_df: Frame providing at least the columns ``text`` and ``label``.
        synthetic_csv_relpath: Repository-relative path of the synthetic CSV,
            which must contain the columns ``position`` and ``seniority``.

    Returns:
        A new frame with columns ``text`` and ``label`` and a fresh index:
        the rows of ``train_df`` followed by the synthetic rows.
    """
    rank_to_name = dict(zip(ORD_MAP.values(), ORD_MAP.keys()))
    wanted = ["text", "label"]

    synthetic = (
        pd.read_csv(get_github_url(synthetic_csv_relpath))
        .loc[:, ["position", "seniority"]]
        .assign(label=lambda frame: frame["seniority"].map(rank_to_name))
        .rename(columns={"position": "text"})
        .dropna(subset=wanted)
    )

    return pd.concat([train_df[wanted], synthetic[wanted]], ignore_index=True)

In [51]:
# Seniority reference data extended with the synthetic examples.
sdf_aug = add_synthetic(
    train_df=seniority_df,
    synthetic_csv_relpath="data/results/gemini_synthetic.csv",
)
sdf_aug

Unnamed: 0,text,label
0,Analyst,Junior
1,Analyste financier,Junior
2,Anwendungstechnischer Mitarbeiter,Junior
3,Application Engineer,Senior
4,Applications Engineer,Senior
...,...,...
11309,Juristischer Berater,Professional
11310,"Leitung Personal, Finanzen, Einkauf, IT | Folk...",Management
11311,Verwaltungsleitung Landesspracheninstitut in d...,Management
11312,"Leitung Gebäudemanagement, Einkauf und Control...",Management


In [52]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score, classification_report
from collections import defaultdict
import numpy as np

# --- Label names and example texts from sdf_aug (seniority_df + synthetic rows) ---
slabel_names = sdf_aug["label"].astype(str).tolist()
slabel_texts = sdf_aug["text"].astype(str).tolist()

# --- Ground-truth labels for evaluation (ACTIVE jobs) ---
sstrue_seniority = jobs_annotated_active_df["seniority"].astype(str).tolist()

# --- Load the embedding model ---
sembed_model = SentenceTransformer("all-MiniLM-L6-v2")

# --- Embed the reference texts ---
X = sembed_model.encode(slabel_texts, convert_to_tensor=True)
# Convert to a CPU NumPy array once; scikit-learn works on NumPy, not on torch tensors.
X_np = X.cpu().numpy()

# --- One centroid (mean embedding) per seniority label ---
# defaultdict keeps first-seen label order, which fixes the column order of
# proto_embs and therefore the tie-breaking of argmax below.
by_label = defaultdict(list)
for emb, lab in zip(X_np, slabel_names):
    by_label[lab].append(emb)

proto_labels = list(by_label.keys())
proto_embs = np.vstack([
    np.mean(by_label[label], axis=0)
    for label in proto_labels
])

# --- Embed the ACTIVE job titles ---
E = sembed_model.encode(
    jobs_annotated_active_df["position"].astype(str).tolist(),
    convert_to_tensor=True
)

# --- Predictions ---
# A single call yields the (n_jobs, n_labels) similarity matrix, so the
# centroids are not re-normalised once per job title.
sims = cosine_similarity(E.cpu().numpy(), proto_embs)
sspred_seniority = [proto_labels[idx] for idx in sims.argmax(axis=1)]

# --- Evaluation ---
ss_eval_accuracy = accuracy_score(sstrue_seniority, sspred_seniority)
ss_eval_macro_f1 = f1_score(sstrue_seniority, sspred_seniority, average="macro")

print("Embedding-based Seniority Prediction on ACTIVE Jobs")
print("Accuracy:", round(ss_eval_accuracy, 3))
print("Macro F1:", round(ss_eval_macro_f1, 3))
print("\nClassification Report:\n")
# zero_division=0 keeps the report free of warnings if a class is never predicted.
print(classification_report(sstrue_seniority, sspred_seniority, zero_division=0))

Embedding-based Seniority Prediction on ACTIVE Jobs
Accuracy: 0.478
Macro F1: 0.409

Classification Report:

              precision    recall  f1-score   support

    Director       0.44      0.82      0.58        34
      Junior       0.03      0.17      0.05        12
        Lead       0.48      0.35      0.41       125
  Management       0.83      0.62      0.71       192
Professional       0.57      0.40      0.47       216
      Senior       0.17      0.41      0.24        44

    accuracy                           0.48       623
   macro avg       0.42      0.46      0.41       623
weighted avg       0.59      0.48      0.51       623



# 5. Modell Department mit synthetic Daten

In [53]:
def add_synthetic_department(train_df: pd.DataFrame, synthetic_csv_relpath: str) -> pd.DataFrame:
    """Append synthetic department examples to a ``text``/``label`` frame.

    The synthetic CSV is fetched via ``get_github_url``. Its ``position`` and
    ``department`` columns are renamed to ``text`` and ``label``, and rows
    missing either value are dropped.

    Args:
        train_df: Frame providing at least the columns ``text`` and ``label``.
        synthetic_csv_relpath: Repository-relative path of the synthetic CSV,
            which must contain the columns ``position`` and ``department``.

    Returns:
        A new frame with columns ``text`` and ``label`` and a fresh index:
        the rows of ``train_df`` followed by the synthetic rows.
    """
    wanted = ["text", "label"]

    synthetic = (
        pd.read_csv(get_github_url(synthetic_csv_relpath))
        .loc[:, ["position", "department"]]
        .rename(columns={"position": "text", "department": "label"})
        .dropna(subset=wanted)
    )

    return pd.concat([train_df[wanted], synthetic[wanted]], ignore_index=True)

In [54]:
# Department reference data extended with the synthetic examples.
ddf_aug = add_synthetic_department(
    train_df=department_df,
    synthetic_csv_relpath="data/results/gemini_synthetic.csv",
)
ddf_aug

Unnamed: 0,text,label
0,Adjoint directeur communication,Marketing
1,Advisor Strategy and Projects,Project Management
2,Beratung & Projekte,Project Management
3,Beratung & Projektmanagement,Project Management
4,Beratung und Projektmanagement kommunale Partner,Project Management
...,...,...
12026,Juristischer Berater,Consulting
12027,"Leitung Personal, Finanzen, Einkauf, IT | Folk...",Human Resources
12028,Verwaltungsleitung Landesspracheninstitut in d...,Administrative
12029,"Leitung Gebäudemanagement, Einkauf und Control...",Purchasing


In [55]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score, classification_report
from collections import defaultdict
import numpy as np

# --- Label names and example texts from ddf_aug (department_df + synthetic rows) ---
dlabel_names = ddf_aug["label"].astype(str).tolist()
dlabel_texts = ddf_aug["text"].astype(str).tolist()

# --- Ground-truth labels for evaluation (ACTIVE jobs) ---
sdtrue_department = jobs_annotated_active_df["department"].astype(str).tolist()

# --- Load the embedding model ---
dembed_model = SentenceTransformer("all-MiniLM-L6-v2")

# --- Embed the reference texts ---
X = dembed_model.encode(dlabel_texts, convert_to_tensor=True)
# Convert to a CPU NumPy array once; scikit-learn works on NumPy, not on torch tensors.
X_np = X.cpu().numpy()

# --- One centroid (mean embedding) per department label ---
# defaultdict keeps first-seen label order, which fixes the column order of
# proto_embs and therefore the tie-breaking of argmax below.
by_label = defaultdict(list)
for emb, lab in zip(X_np, dlabel_names):
    by_label[lab].append(emb)

proto_labels = list(by_label.keys())
proto_embs = np.vstack([
    np.mean(by_label[label], axis=0)
    for label in proto_labels])

# --- Embed the ACTIVE job titles ---
E = dembed_model.encode(
    jobs_annotated_active_df["position"].astype(str).tolist(),
    convert_to_tensor=True)

# --- Predictions ---
# A single call yields the (n_jobs, n_labels) similarity matrix, so the
# centroids are not re-normalised once per job title.
sims = cosine_similarity(E.cpu().numpy(), proto_embs)
sdpred_department = [proto_labels[idx] for idx in sims.argmax(axis=1)]

# --- Evaluation ---
sd_eval_accuracy = accuracy_score(sdtrue_department, sdpred_department)
sd_eval_macro_f1 = f1_score(sdtrue_department, sdpred_department, average="macro")

print("Embedding-based Department Prediction on ACTIVE Jobs")
print("Accuracy:", round(sd_eval_accuracy, 3))
print("Macro F1:", round(sd_eval_macro_f1, 3))
print("\nClassification Report:\n")
# zero_division=0 keeps the report free of warnings if a class is never predicted.
print(classification_report(sdtrue_department, sdpred_department, zero_division=0))

Embedding-based Department Prediction on ACTIVE Jobs
Accuracy: 0.496
Macro F1: 0.44

Classification Report:

                        precision    recall  f1-score   support

        Administrative       0.03      0.21      0.05        14
  Business Development       0.16      0.35      0.22        20
            Consulting       0.43      0.51      0.47        39
      Customer Support       0.55      1.00      0.71         6
       Human Resources       0.56      0.62      0.59        16
Information Technology       0.54      0.32      0.40        62
             Marketing       0.48      0.45      0.47        22
                 Other       0.78      0.53      0.64       344
    Project Management       0.71      0.62      0.66        39
            Purchasing       0.07      0.20      0.10        15
                 Sales       0.63      0.48      0.54        46

              accuracy                           0.50       623
             macro avg       0.45      0.48      0.44    

# 6. Modell Seniority mit synthetic Daten und Oversampling

In [56]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score, classification_report
from collections import defaultdict
from imblearn.over_sampling import RandomOverSampler
import numpy as np

# --- Label names and example texts from sdf_aug (includes synthetic rows) ---
slabel_names = sdf_aug["label"].astype(str).tolist()
slabel_texts = sdf_aug["text"].astype(str).tolist()

# --- Oversampling: balance the label counts ---
# NOTE(review): the classifier below uses per-label mean embeddings, so
# duplicating rows of a label should only perturb its centroid slightly;
# large metric changes from this step are not expected.
ros = RandomOverSampler(random_state=42)
slabel_texts_res, slabel_names_res = ros.fit_resample(
    np.array(slabel_texts).reshape(-1, 1),  # RandomOverSampler needs a 2-D feature array
    slabel_names
)
slabel_texts_res = slabel_texts_res.flatten()

# --- Ground-truth labels and texts for evaluation (ACTIVE jobs) ---
sostrue_seniority = jobs_annotated_active_df["seniority"].astype(str).tolist()
eval_texts = jobs_annotated_active_df["position"].astype(str).tolist()

# --- Load the embedding model ---
sembed_model = SentenceTransformer("all-MiniLM-L6-v2")

# --- Embed the reference texts (after oversampling) ---
X = sembed_model.encode(slabel_texts_res, convert_to_tensor=True)
# Convert to a CPU NumPy array once; scikit-learn works on NumPy, not on torch tensors.
X_np = X.cpu().numpy()

# --- One centroid (mean embedding) per seniority label ---
# defaultdict keeps first-seen label order, which fixes the column order of
# proto_embs and therefore the tie-breaking of argmax below.
by_label = defaultdict(list)
for emb, lab in zip(X_np, slabel_names_res):
    by_label[lab].append(emb)

proto_labels = list(by_label.keys())
proto_embs = np.vstack([
    np.mean(by_label[label], axis=0)
    for label in proto_labels
])

# --- Embed the ACTIVE job titles ---
E = sembed_model.encode(eval_texts, convert_to_tensor=True)

# --- Predictions ---
# A single call yields the (n_jobs, n_labels) similarity matrix, so the
# centroids are not re-normalised once per job title.
sims = cosine_similarity(E.cpu().numpy(), proto_embs)
sospred_seniority = [proto_labels[idx] for idx in sims.argmax(axis=1)]

# --- Evaluation ---
sos_eval_accuracy = accuracy_score(sostrue_seniority, sospred_seniority)
sos_eval_macro_f1 = f1_score(sostrue_seniority, sospred_seniority, average="macro")

print("Embedding-based Seniority Prediction on ACTIVE Jobs (mit Oversampling)")
print("Accuracy:", round(sos_eval_accuracy, 3))
print("Macro F1:", round(sos_eval_macro_f1, 3))
print("\nClassification Report:\n")
# zero_division=0 keeps the report free of warnings if a class is never predicted.
print(classification_report(sostrue_seniority, sospred_seniority, zero_division=0))

Embedding-based Seniority Prediction on ACTIVE Jobs (mit Oversampling)
Accuracy: 0.474
Macro F1: 0.405

Classification Report:

              precision    recall  f1-score   support

    Director       0.44      0.82      0.57        34
      Junior       0.03      0.17      0.05        12
        Lead       0.47      0.35      0.40       125
  Management       0.83      0.61      0.70       192
Professional       0.56      0.40      0.47       216
      Senior       0.16      0.39      0.23        44

    accuracy                           0.47       623
   macro avg       0.42      0.46      0.40       623
weighted avg       0.58      0.47      0.51       623



# 7. Modell Department mit synthetic Daten und Oversampling

In [57]:
# --- Label names and example texts from ddf_aug (includes synthetic rows) ---
dlabel_names = ddf_aug["label"].astype(str).tolist()
dlabel_texts = ddf_aug["text"].astype(str).tolist()

# --- Oversampling: balance the label counts ---
# NOTE(review): the classifier below uses per-label mean embeddings, so
# duplicating rows of a label should only perturb its centroid slightly;
# large metric changes from this step are not expected.
ros = RandomOverSampler(random_state=42)
dlabel_texts_res, dlabel_names_res = ros.fit_resample(
    np.array(dlabel_texts).reshape(-1, 1),  # RandomOverSampler needs a 2-D feature array
    dlabel_names
)
dlabel_texts_res = dlabel_texts_res.flatten()

# --- Ground-truth labels and texts for evaluation (ACTIVE jobs) ---
sodtrue_department = jobs_annotated_active_df["department"].astype(str).tolist()
eval_texts = jobs_annotated_active_df["position"].astype(str).tolist()

# --- Load the embedding model ---
dembed_model = SentenceTransformer("all-MiniLM-L6-v2")

# --- Embed the reference texts (after oversampling) ---
X = dembed_model.encode(dlabel_texts_res, convert_to_tensor=True)
# Convert to a CPU NumPy array once; scikit-learn works on NumPy, not on torch tensors.
X_np = X.cpu().numpy()

# --- One centroid (mean embedding) per department label ---
# defaultdict keeps first-seen label order, which fixes the column order of
# proto_embs and therefore the tie-breaking of argmax below.
by_label = defaultdict(list)
for emb, lab in zip(X_np, dlabel_names_res):
    by_label[lab].append(emb)

proto_labels = list(by_label.keys())
proto_embs = np.vstack([
    np.mean(by_label[label], axis=0)
    for label in proto_labels
])

# --- Embed the ACTIVE job titles ---
E = dembed_model.encode(eval_texts, convert_to_tensor=True)

# --- Predictions ---
# A single call yields the (n_jobs, n_labels) similarity matrix, so the
# centroids are not re-normalised once per job title.
sims = cosine_similarity(E.cpu().numpy(), proto_embs)
sodpred_department = [proto_labels[idx] for idx in sims.argmax(axis=1)]

# --- Evaluation ---
sod_eval_accuracy = accuracy_score(sodtrue_department, sodpred_department)
sod_eval_macro_f1 = f1_score(sodtrue_department, sodpred_department, average="macro")

print("Embedding-based Department Prediction on ACTIVE Jobs (mit Oversampling)")
print("Accuracy:", round(sod_eval_accuracy, 3))
print("Macro F1:", round(sod_eval_macro_f1, 3))
print("\nClassification Report:\n")
# zero_division=0 keeps the report free of warnings if a class is never predicted.
print(classification_report(sodtrue_department, sodpred_department, zero_division=0))

Embedding-based Department Prediction on ACTIVE Jobs (mit Oversampling)
Accuracy: 0.502
Macro F1: 0.446

Classification Report:

                        precision    recall  f1-score   support

        Administrative       0.05      0.36      0.09        14
  Business Development       0.17      0.35      0.23        20
            Consulting       0.42      0.51      0.46        39
      Customer Support       0.55      1.00      0.71         6
       Human Resources       0.53      0.62      0.57        16
Information Technology       0.59      0.35      0.44        62
             Marketing       0.48      0.45      0.47        22
                 Other       0.79      0.53      0.64       344
    Project Management       0.71      0.64      0.68        39
            Purchasing       0.07      0.20      0.10        15
                 Sales       0.66      0.46      0.54        46

              accuracy                           0.50       623
             macro avg       0.45    

# 8. Modell Seniority mit synthetic Daten, Oversampling und normierten Embeddings

In [65]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score, classification_report
from collections import defaultdict
from imblearn.over_sampling import RandomOverSampler
import numpy as np

# --- Label names and example texts from sdf_aug (includes synthetic rows) ---
slabel_names = sdf_aug["label"].astype(str).tolist()
slabel_texts = sdf_aug["text"].astype(str).tolist()

# --- Oversampling: balance the label counts ---
ros = RandomOverSampler(random_state=42)
slabel_texts_res, slabel_names_res = ros.fit_resample(
    np.array(slabel_texts).reshape(-1, 1),  # RandomOverSampler needs a 2-D feature array
    slabel_names
)
slabel_texts_res = slabel_texts_res.flatten()

# --- Ground-truth labels and texts for evaluation (ACTIVE jobs) ---
nsostrue_seniority = jobs_annotated_active_df["seniority"].astype(str).tolist()
eval_texts = jobs_annotated_active_df["position"].astype(str).tolist()

# --- Load the embedding model ---
# NOTE: this cell uses "all-mpnet-base-v2" while the other model cells use
# "all-MiniLM-L6-v2", so a difference in metrics reflects the model change as
# well as the normalisation step.
sembed_model = SentenceTransformer("all-mpnet-base-v2")

# --- Embed the reference texts (after oversampling) ---
X = sembed_model.encode(slabel_texts_res, convert_to_tensor=True)

# --- L2-normalise every embedding (row-wise, vectorised) ---
X_np = X.cpu().numpy()
X_norm = X_np / np.linalg.norm(X_np, axis=1, keepdims=True)

# --- One centroid (mean of the normalised embeddings) per seniority label ---
# defaultdict keeps first-seen label order, which fixes the column order of
# proto_embs and therefore the tie-breaking of argmax below.
by_label = defaultdict(list)
for emb, lab in zip(X_norm, slabel_names_res):
    by_label[lab].append(emb)

proto_labels = list(by_label.keys())
proto_embs = np.vstack([
    np.mean(by_label[label], axis=0)
    for label in proto_labels
])

# --- Embed and normalise the ACTIVE job titles ---
E = sembed_model.encode(eval_texts, convert_to_tensor=True)
E_np = E.cpu().numpy()
E_norm = E_np / np.linalg.norm(E_np, axis=1, keepdims=True)

# --- Predictions ---
# A single call yields the (n_jobs, n_labels) similarity matrix, so the
# centroids are not re-normalised once per job title.
sims = cosine_similarity(E_norm, proto_embs)
nsospred_seniority = [proto_labels[idx] for idx in sims.argmax(axis=1)]

# --- Evaluation ---
nsos_eval_accuracy = accuracy_score(nsostrue_seniority, nsospred_seniority)
nsos_eval_macro_f1 = f1_score(nsostrue_seniority, nsospred_seniority, average="macro")

print("Embedding-based Seniority Prediction on ACTIVE Jobs (mit Oversampling & normierten Embeddings)")
print("Accuracy:", round(nsos_eval_accuracy, 3))
print("Macro F1:", round(nsos_eval_macro_f1, 3))
print("\nClassification Report:\n")
# zero_division=0 keeps the report free of warnings if a class is never predicted.
print(classification_report(nsostrue_seniority, nsospred_seniority, zero_division=0))

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding-based Seniority Prediction on ACTIVE Jobs (mit Oversampling & normierten Embeddings)
Accuracy: 0.475
Macro F1: 0.4

Classification Report:

              precision    recall  f1-score   support

    Director       0.31      0.85      0.46        34
      Junior       0.11      0.25      0.15        12
        Lead       0.42      0.38      0.40       125
  Management       0.75      0.57      0.65       192
Professional       0.58      0.41      0.48       216
      Senior       0.20      0.39      0.26        44

    accuracy                           0.48       623
   macro avg       0.39      0.48      0.40       623
weighted avg       0.55      0.48      0.49       623



In [64]:
import pandas as pd

# --- Full comparison: Seniority & Department (Baseline vs Synthetic vs Synthetic + Oversampling) ---
# Each row is (label, accuracy, macro F1). Keeping the three values of a row
# together means a label can never be paired with another model's metrics.
# The original list was missing a comma between the last two labels, which
# silently fused them into one string; the second of them had no metric behind it.
comparison_rows = [
    # Seniority baseline
    ("Seniority (ACTIVE Jobs – Baseline)", s_eval_accuracy, s_eval_macro_f1),
    # Seniority + synthetic
    ("Seniority (ACTIVE Jobs – with Synthetic)", ss_eval_accuracy, ss_eval_macro_f1),
    # Seniority + synthetic + oversampling
    ("Seniority (ACTIVE Jobs – Synthetic + Oversampling)", sos_eval_accuracy, sos_eval_macro_f1),
    # Seniority + synthetic + oversampling + normalised embeddings. Every
    # variant builds centroids, so the label names the normalisation step.
    (
        "Seniority (ACTIVE Jobs – Synthetic + Oversampling + Normalized Embeddings)",
        nsos_eval_accuracy,
        nsos_eval_macro_f1,
    ),
    # Department baseline
    ("Department (ACTIVE Jobs – Baseline)", d_eval_accuracy, d_eval_macro_f1),
    # Department + synthetic
    ("Department (ACTIVE Jobs – with Synthetic)", sd_eval_accuracy, sd_eval_macro_f1),
    # Department + synthetic + oversampling
    ("Department (ACTIVE Jobs – Synthetic + Oversampling)", sod_eval_accuracy, sod_eval_macro_f1),
    # TODO: add a department row for normalised embeddings once that model is evaluated.
]

full_comparison = pd.DataFrame(comparison_rows, columns=["Target", "Accuracy", "Macro F1"])

print("\nFull Model Comparison: Seniority & Department (Baseline vs Synthetic vs Synthetic + Oversampling)\n")
print(full_comparison)



Full Model Comparison: Seniority & Department (Baseline vs Synthetic vs Synthetic + Oversampling)

                                              Target  Accuracy  Macro F1
0                 Seniority (ACTIVE Jobs – Baseline)  0.430177  0.392017
1           Seniority (ACTIVE Jobs – with Synthetic)  0.478331  0.409459
2  Seniority (ACTIVE Jobs – Synthetic + Oversampl...  0.473515  0.404537
3  Seniority (ACTIVE Jobs – Synthetic + Oversampl...  0.473515  0.404537
4                Department (ACTIVE Jobs – Baseline)  0.314607  0.314955
5          Department (ACTIVE Jobs – with Synthetic)  0.495987  0.440235
6  Department (ACTIVE Jobs – Synthetic + Oversamp...  0.502408  0.446475
