## Dependencies

In [None]:
import pandas as pd

from datetime import datetime, timedelta

import sys
from pathlib import Path

# Make the repository root importable so that `config.config` can be resolved.
# Assumes the kernel's working directory is the notebook folder, one level
# below the repo root (e.g. 'notebooks/').
repo_root = Path().resolve().parent
# Guard the append so re-running this cell does not stack duplicate entries.
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

from config.config import get_environment

from config.config import data_import_pkl, data_export_pkl, data_import_pandas, data_export_pandas

## ENV

In [None]:
# Load the run configuration through the project helper, pointed at
# ../environments/env.json.
ENV = get_environment(env_path="../environments", env_name="env.json")

# Run identifiers used to locate input and output artifacts.
# content_date is taken from ENV rather than derived from datetime.now().
content_date, website, version = ENV['CONTENT_DATE'], ENV['SOURCE']['NAME'], ENV['VERSION']

# Switches that steer the classification section of this notebook.
classification_cfg = ENV['CLASSIFICATION']
grid_search = classification_cfg['GRID_SEARCH']      # run the hyperparameter grid search
load_pipeline = classification_cfg['LOAD_PIPELINE']  # start from a previously exported pipeline
is_weight = classification_cfg['IS_WEIGHT']          # weight samples by inverse class frequency
is_pca = classification_cfg['IS_PCA']                # reduce TF-IDF / embedding dimensionality

## Classification

In [None]:
# Load the embeddings table for the configured source, content date and version.
# The cells below read these columns from it: 'qualification_embedding',
# 'related_experience', 'min_years', 'min_degree', 'country' and 'level'.
df_embed = data_import_pandas(
    folder_name='embeddings',
    additional_info='embeddings',
    website=website,
    content_date=content_date,
    version=version,
)

### Preprocessing

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix


from sklearn.metrics import classification_report, accuracy_score
from sklearn.decomposition import PCA, TruncatedSVD
import xgboost as xgb
import numpy as np

In [None]:
# Feature extraction: builds the three feature blocks and the encoded target.
# NOTE(review): this cell overwrites columns of df_embed in place; re-running it
# without reloading df_embed re-maps the already-encoded 'min_degree' column.

# Dense block: stack the stored per-row qualification embeddings into one array.
X_qual_embed = np.array(df_embed['qualification_embedding'].to_list())

# Sparse text block: TF-IDF (unigrams + bigrams, at most 1000 terms) over the
# free-text experience field; missing text is treated as an empty document.
# NOTE(review): the vectorizer is fitted on all rows, before the train/test split.
df_embed['related_experience'] = df_embed['related_experience'].fillna('')
tfidf_exp = TfidfVectorizer(ngram_range=(1, 2), max_features=1000)
X_exp = tfidf_exp.fit_transform(df_embed['related_experience'])

# Numeric block.
# Ordinal rank per known degree label; labels absent from this table become 0 below.
degree_ranks = {
    0: ['', '0', 'unspecified'],
    1: ['high school diploma', 'mbo', 'hbo'],
    2: ['associate', 'associates'],
    3: ['bachelor', 'bachelors', 'ba/bs'],
    4: ['master', 'mba'],
    # 'doctor' sometimes appears in place of 'doctoral'. The FAA certificate is a
    # domain-specific advanced license -> treated as postgraduate level.
    5: ['phd', 'doctoral', 'doctor', 'jd', 'faa airline transport pilot certificate'],
}
degree_map = {label: rank for rank, labels in degree_ranks.items() for label in labels}

# Missing or empty year requirements count as 0 years.
min_years_clean = df_embed['min_years'].fillna(0).replace('', 0)
df_embed['min_years'] = min_years_clean.astype(int)

df_embed['min_degree'] = df_embed['min_degree'].map(degree_map).fillna(0).astype(int)

# Countries become arbitrary integer codes (no ordering is implied).
country_encoder = LabelEncoder()
df_embed['country_encoded'] = country_encoder.fit_transform(df_embed['country'])

numeric_columns = ['min_years', 'min_degree', 'country_encoded']
X_numeric = df_embed[numeric_columns].fillna(0).values

# Target: integer-encoded job level. `le` is kept to decode predictions later.
le = LabelEncoder()
df_embed['level_encoded'] = le.fit_transform(df_embed['level'])
y = df_embed['level_encoded']

# Row count per original (un-encoded) level label.
y_level_count = df_embed['level'].value_counts().to_dict()

### Weight

In [None]:
# Optional class re-weighting to counter the imbalance between job levels.
# When is_weight is off, nothing is computed and `sample_weights` stays undefined.
if is_weight:
    total_samples = len(df_embed)
    level_counts = df_embed['level'].value_counts()

    # Inverse-frequency weight per level: total rows / rows in that level.
    class_weights = {
        level_name: total_samples / n_rows
        for level_name, n_rows in level_counts.items()
    }
    print(class_weights)

    # One weight per row, looked up from the row's level.
    sample_weights = df_embed['level'].map(class_weights)

### Load Saved Pipeline

In [None]:
# Optionally replace the freshly computed features and encoders with the ones
# stored in a previously exported pipeline. Nothing happens when load_pipeline is off.
if load_pipeline:
    # NOTE(review): the helper name suggests a pickle file; only load artifacts
    # from a trusted location.
    pipeline_objects = data_import_pkl(
        website=website,
        version=version,
        content_date=content_date,
        folder_name='classification',
        additional_info='pipeline-job_level',
    )

    # Fitted objects
    model = pipeline_objects['model']
    tfidf_exp = pipeline_objects['tfidf_exp']

    # Stored feature blocks (these overwrite the arrays built during preprocessing)
    X_qual_embed = pipeline_objects['embeddings_qual']
    X_exp = pipeline_objects['X_exp']
    X_numeric = pipeline_objects['X_numeric']

    # Stored target, its label encoder, and the stored hyperparameters
    y_level_count = pipeline_objects['y_level_count']
    y = pipeline_objects['y_level_encoded']
    le = pipeline_objects['label_encoder_level']
    best_params_ = pipeline_objects['best_params_']

    print("Pipeline and model loaded successfully!")

### PCA

In [None]:
# Optional dimensionality reduction of the text and embedding blocks.
# NOTE(review): both reducers are fitted on all rows, before the train/test split.
if is_pca:
    # Reduce the sparse TF-IDF block with truncated SVD (it accepts sparse input).
    # Clamp the target size to the number of TF-IDF terms: with the default
    # randomized solver n_components may not exceed n_features, so a small
    # vocabulary would otherwise make the hard-coded 50 fail.
    svd_dim = min(X_exp.shape[1], 50)
    svd_exp = TruncatedSVD(n_components=svd_dim, random_state=42)
    X_exp_reduced = svd_exp.fit_transform(X_exp)

    # Reduce the dense embeddings (e.g. 1536-dim) to at most 100 components.
    # PCA cannot return more components than min(n_samples, n_features), so the
    # target is clamped for small datasets.
    pca_dim = min(X_qual_embed.shape[0], X_qual_embed.shape[1], 100)
    pca_qual = PCA(n_components=pca_dim, random_state=42)
    pca_qual.fit(X_qual_embed)  # fit on the source embeddings

    X_qual_embed_reduced = pca_qual.transform(X_qual_embed)

else:
    # No reduction: pass both blocks through unchanged.
    X_exp_reduced = X_exp
    X_qual_embed_reduced = X_qual_embed

### Sparse Feature Stack and Train/Test Split

In [None]:
# Wrap the embedding and numeric blocks as CSR matrices so they can be stacked
# next to the TF-IDF block.
X_qual_embed_sparse = csr_matrix(X_qual_embed_reduced)
X_numeric_sparse = csr_matrix(X_numeric)

# Column order of X: experience text | qualification embeddings | numeric features.
X = hstack([X_exp_reduced, X_qual_embed_sparse, X_numeric_sparse])

# Stratified 80/20 split with a fixed seed. The per-row weights are split
# alongside the data when class weighting is enabled.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
if is_weight:
    X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
        X, y, sample_weights, **split_options
    )
else:
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

### Grid Search for the Best Hyperparameters

In [None]:
# Train the job-level classifier.
#   - grid_search on and load_pipeline off: tune hyperparameters with GridSearchCV.
#   - otherwise: fit one XGBoost model directly, using the loaded `best_params_`
#     when load_pipeline is on, or the fixed defaults below when it is off.
# Every path leaves `model`, `best_model` (the same fitted estimator),
# `best_params_` and `y_pred` defined, because later cells read all of them.
from sklearn.model_selection import GridSearchCV

# Hyperparameters that are tuned / stored in `best_params_`.
tuned_param_names = ['n_estimators', 'max_depth', 'learning_rate', 'subsample', 'colsample_bytree']

# Per-row training weights are only passed when class weighting is enabled.
fit_kwargs = {'sample_weight': w_train} if is_weight else {}

if grid_search and not load_pipeline:
    param_grid = {
        'max_depth': [4, 6, 8],
        'learning_rate': [0.05, 0.1, 0.2],
        'n_estimators': [100, 200, 300],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }

    xgb_clf = xgb.XGBClassifier(
        objective='multi:softprob',
        random_state=42,
        eval_metric='mlogloss'
    )

    # NOTE: this rebinds `grid_search` from the boolean ENV flag to the search
    # object. It is kept because later cells read `grid_search.best_params_`
    # and only test the name for truthiness.
    grid_search = GridSearchCV(
        estimator=xgb_clf,
        param_grid=param_grid,
        scoring='f1_weighted',
        cv=5,
        verbose=3,
        n_jobs=-2
    )

    grid_search.fit(X_train, y_train, **fit_kwargs)
    best_params_ = grid_search.best_params_
    print("Best params:", best_params_)

    best_model = grid_search.best_estimator_
    # Alias so cells that refer to `model` work after a grid search too.
    model = best_model

### Direct Train XGBoost
else:
    if not load_pipeline:
        # No tuned parameters available: fall back to fixed defaults.
        best_params_ = {
            'n_estimators': 300,
            'max_depth': 6,
            'learning_rate': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
        }

    # With load_pipeline on, `best_params_` comes from the loaded pipeline.
    model = xgb.XGBClassifier(
        **{name: best_params_[name] for name in tuned_param_names},
        objective='multi:softprob',
        eval_metric='mlogloss',
        random_state=42
    )
    model.fit(X_train, y_train, **fit_kwargs)

    # Alias so cells that refer to `best_model` work without a grid search too.
    best_model = model

# Held-out predictions used by the evaluation cells.
y_pred = best_model.predict(X_test)

### Evaluate Performance

In [None]:
# Held-out evaluation: overall accuracy, then per-class precision / recall / F1.
# Classes are reported by their encoded integer labels.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test set:", test_accuracy)

test_report_text = classification_report(y_test, y_pred)
print(test_report_text)

In [None]:
# Persist the held-out metrics as a table, one row per class / average.
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred, output_dict=True)

# Rows = classes and averages, columns = metrics; rounded to 2 decimals.
df_report = pd.DataFrame(report).transpose().round(2)

# File tag records which options were active for this run.
# `grid_search` may be the ENV flag or the fitted search object; only its
# truthiness is used here.
option_tags = (
    (is_pca, '-pca'),
    (is_weight, '-weight'),
    (grid_search, '-grid_search'),
)
additional_info = 'performance-job_level' + ''.join(
    tag for enabled, tag in option_tags if enabled
)

data_export_pandas(
    df_output=df_report,
    folder_name='classification',
    additional_info=additional_info,
    website=website,
    content_date=content_date,
    version=version,
)

### Predict on the Complete Dataset

In [None]:
# Predict the job level for every row (train and test) and export the result.
# The training cell binds the fitted estimator to `best_model` on the
# grid-search path and to `model` on the direct-training path; accept whichever
# exists so this cell runs on either path.
try:
    final_model = best_model
except NameError:
    final_model = model

# NOTE(review): X must have one row per df_embed row, in the same order. This
# holds for freshly built features; confirm it when features come from a
# loaded pipeline.
y_pred_all = final_model.predict(X)
df_embed['predicted_level_encoded'] = y_pred_all
# Decode the integer predictions back to the original level labels.
df_embed['predicted_level'] = le.inverse_transform(y_pred_all)
data_export_pandas(
    df_output=df_embed,
    website=website,
    content_date=content_date,
    version=version,
    folder_name='classification',
    additional_info='classification',
    incl_excel=True
)

### Export pipeline

In [None]:
# Export the trained model together with the features and encoders it was
# built from. The dictionary keys match what the "Load Saved Pipeline" cell reads.

# The fitted estimator is bound to `best_model` on the grid-search path and to
# `model` on the direct-training path; accept whichever exists.
try:
    trained_model = best_model
except NameError:
    trained_model = model

# Hyperparameters to store:
#   - `grid_search` is the fitted search object only if a search actually ran;
#     otherwise it is still the boolean ENV flag and has no `best_params_`.
#   - when a pipeline was loaded, keep the parameters that came with it.
#   - otherwise record the values the trained model was built with.
if hasattr(grid_search, 'best_params_'):
    saved_best_params = grid_search.best_params_
elif load_pipeline:
    saved_best_params = best_params_
else:
    model_params = trained_model.get_params()
    saved_best_params = {
        name: model_params[name]
        for name in ('n_estimators', 'max_depth', 'learning_rate', 'subsample', 'colsample_bytree')
    }

# Save pipeline + features
pipeline_objects = {
    'model': trained_model,               # trained XGBoost classifier
    'tfidf_exp': tfidf_exp,               # fitted TF-IDF vectorizer
    'embeddings_qual': X_qual_embed,      # raw embeddings (before any PCA)
    'X_exp': X_exp,                       # raw TF-IDF matrix (before any SVD)
    'X_numeric': X_numeric,               # numeric features
    'y_level_count': y_level_count,
    'y_level_encoded': y,
    # Reuse the encoder that produced `y` and decoded the predictions, so the
    # stored encoder always matches the stored target.
    'label_encoder_level': le,
    'best_params_': saved_best_params
}

data_export_pkl(
    pipeline_objects=pipeline_objects,
    website=website,
    folder_name='classification',
    version=version,
    content_date=content_date,
    additional_info='pipeline-job_level'
)
print("All features, embeddings, preprocessing, and model saved successfully!")


## Visualize Model Performance

In [None]:
# a) Confusion matrix over all rows of df_embed (training and test rows together).
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

actual_codes = df_embed['level_encoded']
predicted_codes = df_embed['predicted_level_encoded']

cm = confusion_matrix(actual_codes, predicted_codes)
# Label the axes with the original level names instead of the integer codes.
disp = ConfusionMatrixDisplay(cm, display_labels=le.classes_)
disp.plot(xticks_rotation=45, cmap='Blues')

In [None]:
# a) Confusion matrix over all rows of df_embed.
# NOTE(review): this cell repeats the confusion-matrix cell directly above it
# and draws the same figure a second time.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

cm = confusion_matrix(
    df_embed['level_encoded'],
    df_embed['predicted_level_encoded'],
)
disp = ConfusionMatrixDisplay(cm, display_labels=le.classes_)
disp.plot(xticks_rotation=45, cmap='Blues')

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Per-level precision / recall / F1 over all rows of df_embed, as a bar chart.
report = classification_report(
    df_embed['level_encoded'],
    df_embed['predicted_level_encoded'],
    target_names=le.classes_,
    output_dict=True,
)
report_df = pd.DataFrame(report).transpose()

# The last three rows are the 'accuracy', 'macro avg' and 'weighted avg'
# summaries; keep only the per-level rows.
per_level_scores = report_df[['precision', 'recall', 'f1-score']].iloc[:-3]

ax = per_level_scores.plot(kind='bar')
ax.set_title("Performance Metrics per Job Level")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
plt.show()

## Visualize Predictions and Features

In [None]:
# a) Predicted vs actual: for each actual level, count rows per predicted level.
import seaborn as sns

ax = sns.countplot(data=df_embed, x='level', hue='predicted_level')
ax.set_title("Actual vs Predicted Job Levels")
plt.xticks(rotation=45)
plt.show()


In [None]:
# b) Feature importance
max_num_features = 10

# Use one estimator for both the plot and the table. The fitted model is bound
# to `best_model` on the grid-search path and to `model` on the direct-training
# path; accept whichever exists.
try:
    importance_model = best_model
except NameError:
    importance_model = model

# Sizes of each feature block, in the column order used to build X
n_tfidf = X_exp_reduced.shape[1]           # e.g., 50
n_embed = X_qual_embed_reduced.shape[1]    # e.g., 100
n_numeric = X_numeric.shape[1]             # e.g., 3

# Text features (SVD components when is_pca is on, raw TF-IDF terms otherwise)
tfidf_names = [f"tfidf_{i}" for i in range(n_tfidf)]

# Embedding features (PCA components when is_pca is on)
embed_names = [f"embed_pca_{i}" for i in range(n_embed)]

# Numeric features (original numeric column names)
numeric_names = ['min_years', 'min_degree', 'country_encoded']
# A mismatch here would silently shift or mislabel features in the table.
assert len(numeric_names) == n_numeric, "numeric_names must list one name per X_numeric column"

# Combine all names in the same order as X
feature_names = tfidf_names + embed_names + numeric_names

# The plot shows the booster's own feature keys (f0..fn); the table below maps
# them to readable names.
xgb.plot_importance(importance_model, max_num_features=max_num_features, importance_type='weight',
                    xlabel='F Score', grid=True)
plt.show()

# 'weight' importance = number of times a feature is used to split.
importance = importance_model.get_booster().get_score(importance_type='weight')
# Map f0..fn to actual names (the key is 'f' followed by the column index)
importance_named = {feature_names[int(k[1:])]: v for k, v in importance.items()}
importance_df = pd.DataFrame.from_dict(importance_named, orient='index', columns=['fscore'])
importance_df = importance_df.sort_values(by='fscore', ascending=False)
print(importance_df.head(max_num_features))

In [None]:
# c) Embedding visualization (optional, for portfolio)
# Project the qualification embeddings (PCA-reduced when is_pca is on) to 2-D
# with t-SNE and colour each point by its actual job level.
from sklearn.manifold import TSNE

tsne = TSNE(random_state=42, n_components=2)
X_emb_2d = tsne.fit_transform(X_qual_embed_reduced)

# Store the 2-D coordinates on df_embed so seaborn can plot them by column name.
df_embed['emb_2d_x'] = X_emb_2d[:, 0]
df_embed['emb_2d_y'] = X_emb_2d[:, 1]

sns.scatterplot(data=df_embed, x='emb_2d_x', y='emb_2d_y', hue='level', palette='tab10')
plt.title("Qualification Embeddings (2D)")
plt.show()