# Salary Prediction from LinkedIn Job Postings - Train Neural Network

In [13]:
import xgboost as xgb
import pandas as pd, numpy as np
import salary
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import tensorflow.keras as keras
from sklearn.model_selection import KFold, GridSearchCV
from skopt import BayesSearchCV
import joblib
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from scikeras.wrappers import KerasRegressor

In [14]:
# Global notebook configuration: random seeds and plotting theme.
SEED = 42  # same value as the KFold random_state used for cross-validation

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation, dropout and batch shuffling are repeatable when the notebook
# is executed top to bottom on a fresh kernel.
keras.utils.set_random_seed(SEED)

# Apply seaborn's default theme to the matplotlib figures drawn afterwards.
sns.set_theme()

## Train & Evaluate Models

In [15]:
# Load both splits from the project-local `salary` module. Each helper returns
# a pair that is unpacked into features (X_*) and target (y_*); X_train/y_train
# are later passed to `.fit`, X_train/X_test to `.predict`.
X_train, y_train = salary.get_train_dataset()
X_test, y_test = salary.get_test_dataset()

In [16]:
# Categorical columns that are encoded twice: once as one-hot indicator
# columns and once with a target encoding.
categorical_columns = [
    'norm_title',
    'clustered_edu_req',
    'clustered_pref_qual',
    'clustered_req_skill',
    'location_state',
    'company_industries',
    'formatted_experience_level',
    'formatted_work_type',
]

# Column-wise feature construction; any column not listed here is dropped.
column_transformer = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(use_cat_names=True), categorical_columns),
        ('target_encoder', TargetEncoder(), categorical_columns),
        # Project-specific encoders supplied by the `salary` module.
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        # Forwarded unchanged (no imputation is applied to this column).
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        # Missing employee counts are replaced by the median seen during fit.
        ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
    ],
    remainder='drop',
)

# Full preprocessing: build the feature matrix, then standardise every column.
preprocessor = make_pipeline(column_transformer, StandardScaler())

In [31]:
def build_model(
    n_units_1=256,
    n_units_2=128,
    n_units_3=64,
    n_units_4=None,
    dropout_rate=0.2,
    learning_rate=1e-2,
    optimizer_name="adamw",
    meta=None,
):
    """Build and compile a feed-forward Keras network for regression.

    The network is a stack of hidden blocks, each made of
    ``Dense(relu) -> LayerNormalization -> Dropout``, followed by a single
    linear output unit.

    Parameters
    ----------
    n_units_1, n_units_2 : int
        Widths of the first two hidden blocks, which are always added.
    n_units_3, n_units_4 : int or None
        Widths of the optional third and fourth hidden blocks. A falsy value
        (``None`` or ``0``) skips the corresponding block.
    dropout_rate : float
        Dropout rate applied after every hidden block.
    learning_rate : float
        Initial learning rate handed to the optimizer.
    optimizer_name : {"adam", "adamw"}
        Which Keras optimizer to compile the model with.
    meta : dict or None
        Fit-time metadata. scikeras passes this automatically to model
        functions that declare a ``meta`` argument; when it contains
        ``"n_features_in_"`` that value is used as the input width. When
        absent, the previous hard-coded width of 318 is used.

    Returns
    -------
    keras.Sequential
        Model compiled with MSE loss and the ``mae`` / ``r2_score`` metrics.

    Raises
    ------
    ValueError
        If ``optimizer_name`` is not one of the supported names.
    """
    # Validate first so a bad configuration fails before any layer is built.
    if optimizer_name not in ("adam", "adamw"):
        raise ValueError(f"Unsupported optimizer: {optimizer_name}")

    # Fallback input width; presumably the number of columns the notebook's
    # preprocessor produced on the training data -- TODO confirm.
    n_features = 318
    if meta is not None and meta.get("n_features_in_") is not None:
        n_features = int(meta["n_features_in_"])

    # Blocks 1 and 2 are mandatory, blocks 3 and 4 only when a width is given.
    hidden_widths = [n_units_1, n_units_2]
    hidden_widths.extend(width for width in (n_units_3, n_units_4) if width)

    model = keras.Sequential()
    model.add(keras.layers.Input(shape=(n_features,)))

    for width in hidden_widths:
        model.add(keras.layers.Dense(width, activation='relu'))
        model.add(keras.layers.LayerNormalization())
        model.add(keras.layers.Dropout(dropout_rate))

    model.add(keras.layers.Dense(1))  # Output layer for regression

    if optimizer_name == "adam":
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    else:  # "adamw" -- the only other name that passes validation above
        optimizer = keras.optimizers.AdamW(learning_rate=learning_rate)

    model.compile(optimizer=optimizer, loss='mse', metrics=['mae', 'r2_score'])
    return model


In [35]:
# Grid search over the network's initial learning rate.
#
# NOTE(review): the preprocessor sits in front of GridSearchCV, so it (including
# the TargetEncoder, which uses y) is fitted once on all of X_train before the
# folds are split. The cross-validated scores are therefore computed on features
# that already saw the validation folds' targets -- treat them as optimistic.

# Training callbacks; no `monitor` is given, so each uses its Keras default.
search_callbacks = [
    EarlyStopping(patience=10, restore_best_weights=True, verbose=1),
    ReduceLROnPlateau(patience=5, factor=0.5, min_lr=1e-6, verbose=1),
]

# scikit-learn compatible wrapper around `build_model`; 20% of the data given to
# each fit is held out by Keras for the validation metrics the callbacks watch.
search_regressor = KerasRegressor(
    model=build_model,
    epochs=100,
    batch_size=32,
    verbose=1,
    validation_split=0.2,
    callbacks=search_callbacks,
)

# 3 candidate learning rates x 5 shuffled folds, ranked by R^2.
learning_rate_search = GridSearchCV(
    estimator=search_regressor,
    param_grid={'model__learning_rate': [1e-1, 1e-2, 1e-3]},
    scoring="r2",
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    verbose=1,
)

model = make_pipeline(clone(preprocessor), learning_rate_search).fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Epoch 1/100
558/558 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - loss: 8435720192.0000 - mae: 68183.1250 - r2_score: -1.2322 - val_loss: 2528091648.0000 - val_mae: 30069.5566 - val_r2_score: 0.3373 - learning_rate: 0.1000
Epoch 2/100
558/558 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2598951168.0000 - mae: 30132.9160 - r2_score: 0.3438 - val_loss: 2349121792.0000 - val_mae: 29219.2617 - val_r2_score: 0.3842 - learning_rate: 0.1000
Epoch 3/100
558/558 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2219572992.0000 - mae: 28956.9199 - r2_score: 0.4113 - val_loss: 2294774272.0000 - val_mae: 28814.9414 - val_r2_score: 0.3984 - learning_rate: 0.1000
Epoch 4/100
558/558 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2175843072.0000 - mae: 28211.9473 - r2_score: 0.4340 - val_loss: 2339815424.0000 - val_mae: 28752.0859 - val_

In [None]:
# The fitted GridSearchCV is the final step of the pipeline; `steps` holds
# (name, estimator) pairs, so take the estimator of the last pair.
search = model.steps[-1][1]

# Winning hyper-parameters, keyed as 'model__<arg>' so they can be forwarded
# straight to KerasRegressor. Displayed as the cell output.
best_params = search.best_params_
best_params

{'model__learning_rate': 0.1}

In [36]:
# Predict on the training features with the grid-search pipeline, then hand the
# predictions to the project helper (it reports R2 / RMSE / MAE in the output).
train_predictions = model.predict(X_train)
result_train = salary.evaluate_train_predictions(train_predictions)

872/872 ━━━━━━━━━━━━━━━━━━━━ 1s 647us/step
Train R2: 0.6536
Train RMSE: 36205.9423
Train MAE: 20947.4501


In [38]:
# Same evaluation on the held-out test features with the grid-search pipeline.
test_predictions = model.predict(X_test)
result_test = salary.evaluate_test_predictions(test_predictions)

218/218 ━━━━━━━━━━━━━━━━━━━━ 0s 918us/step
Test R2: 0.4918
Test RMSE: 41558.8136
Test MAE: 26346.0318


## Train & Evaluate Best Model

In [41]:
# Retrain a single network on the full training set using the hyper-parameters
# selected by the grid search (`best_params` carries 'model__...' keys that
# KerasRegressor forwards to `build_model`).
final_callbacks = [
    EarlyStopping(patience=10, restore_best_weights=True, verbose=1),
    ReduceLROnPlateau(patience=5, factor=0.5, min_lr=1e-6, verbose=1),
]

final_regressor = KerasRegressor(
    model=build_model,
    epochs=100,
    batch_size=32,
    verbose=1,
    validation_split=0.2,
    callbacks=final_callbacks,
    **best_params,
)

# A fresh, unfitted copy of the preprocessor is fitted together with the network.
best_model = make_pipeline(clone(preprocessor), final_regressor).fit(X_train, y_train)

Epoch 1/100
698/698 ━━━━━━━━━━━━━━━━━━━━ 4s 3ms/step - loss: 7679608832.0000 - mae: 63115.7891 - r2_score: -0.9903 - val_loss: 2421245696.0000 - val_mae: 30053.0664 - val_r2_score: 0.3553 - learning_rate: 0.1000
Epoch 2/100
698/698 ━━━━━━━━━━━━━━━━━━━━ 3s 3ms/step - loss: 2251360000.0000 - mae: 29627.8105 - r2_score: 0.3844 - val_loss: 2377335552.0000 - val_mae: 28677.4863 - val_r2_score: 0.3670 - learning_rate: 0.1000
Epoch 3/100
698/698 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - loss: 2304815360.0000 - mae: 28680.3145 - r2_score: 0.3991 - val_loss: 2239586048.0000 - val_mae: 28379.7656 - val_r2_score: 0.4037 - learning_rate: 0.1000
Epoch 4/100
698/698 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - loss: 2101372032.0000 - mae: 27978.3262 - r2_score: 0.4306 - val_loss: 2232780544.0000 - val_mae: 27964.0762 - val_r2_score: 0.4055 - learning_rate: 0.1000
Epoch 5/100
698

In [42]:
# Evaluate the retrained model on both splits, train first and then test.
# NOTE(review): `results_train` (plural) differs from the `result_train` name
# used for the grid-search pipeline, and `result_test` replaces the value
# assigned there. Names are kept as-is in case later cells rely on them.
best_train_predictions = best_model.predict(X_train)
results_train = salary.evaluate_train_predictions(best_train_predictions)

best_test_predictions = best_model.predict(X_test)
result_test = salary.evaluate_test_predictions(best_test_predictions)

[1m872/872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Train R2: 0.6232
Train RMSE: 37758.2855
Train MAE: 22090.5336
218/218 ━━━━━━━━━━━━━━━━━━━━ 0s 733us/step
Test R2: 0.4953
Test RMSE: 41418.2689
Test MAE: 26489.2360
