# Salary Prediction from LinkedIn Job Postings - Train Neural Network

In [1]:
import salary
import seaborn as sns
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, TargetEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import tensorflow.keras as keras
from sklearn.model_selection import KFold, GridSearchCV
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from scikeras.wrappers import KerasRegressor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Switch on seaborn's default plot theme for the rest of the session.
sns.set_theme()

## Train & Evaluate Models

In [3]:
# Load the train and test splits from the project's `salary` helper module;
# each call returns a (features, target) pair.
X_train, y_train = salary.get_train_dataset()
X_test, y_test = salary.get_test_dataset()

In [4]:
# Categorical columns that are encoded twice below: once one-hot, once
# target-encoded.
categorical_columns = [
    'norm_title',
    'clustered_edu_req',
    'clustered_pref_qual',
    'clustered_req_skill',
    'location_state',
    'company_industries',
    'formatted_experience_level',
    'formatted_work_type',
]

# Column-wise feature encoders; any column not listed here is dropped.
column_encoders = ColumnTransformer(
    transformers=[
        (
            'one_hot_encoder',
            # Categories unseen during fit are ignored instead of raising.
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            categorical_columns,
        ),
        ('target_encoder', TargetEncoder(random_state=42), categorical_columns),
        # Project-provided encoders for these two columns.
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        # Missing employee counts are filled with the median.
        ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
    ],
    remainder='drop',
)

# Encode the columns, then standardise every resulting feature.
preprocessor = make_pipeline(column_encoders, StandardScaler())

In [5]:
def build_model(
    n_units_1=256,
    n_units_2=192,
    n_units_3=64,
    n_units_4=32,
    dropout_rate=0.3,
    learning_rate=0.05,
    optimizer_name="adamw",
    meta=None,
):
    """Build and compile a feed-forward Keras network for regression.

    The network is a stack of hidden blocks, each made of
    ``Dense(leaky_relu) -> BatchNormalization -> Dropout``, followed by a
    single linear output unit.

    Args:
        n_units_1: Width of the first hidden block (always present).
        n_units_2: Width of the second hidden block (always present).
        n_units_3: Width of the third hidden block; a falsy value
            (``0`` / ``None``) omits the block.
        n_units_4: Width of the fourth hidden block; a falsy value
            (``0`` / ``None``) omits the block.
        dropout_rate: Dropout rate applied after every hidden block.
        learning_rate: Initial learning rate handed to the optimizer.
        optimizer_name: ``"adam"`` or ``"adamw"``.
        meta: Optional metadata dict. scikeras passes this automatically when
            the build function accepts a ``meta`` argument; if it contains
            ``"n_features_in_"`` that value is used as the input width.
            Without it the input width falls back to 318, the value that was
            previously hard-coded.

    Returns:
        A compiled ``keras.Sequential`` model (MSE loss; MAE and R2 metrics).

    Raises:
        ValueError: If ``optimizer_name`` is not a supported optimizer.
    """
    # Validate the optimizer first so a bad name fails before any layer is built.
    if optimizer_name == "adam":
        optimizer_class = keras.optimizers.Adam
    elif optimizer_name == "adamw":
        optimizer_class = keras.optimizers.AdamW
    else:
        raise ValueError(f"Unsupported optimizer: {optimizer_name}")

    # Prefer the feature count observed at fit time over the legacy constant,
    # so the network keeps working when the preprocessor's output width changes.
    n_features = (meta or {}).get("n_features_in_", 318)

    model = keras.Sequential()
    model.add(keras.layers.Input(shape=(n_features,)))

    # Blocks 1 and 2 are mandatory; blocks 3 and 4 are skipped when falsy.
    hidden_units = [n_units_1, n_units_2]
    hidden_units += [units for units in (n_units_3, n_units_4) if units]

    for units in hidden_units:
        model.add(keras.layers.Dense(units, activation='leaky_relu'))
        model.add(keras.layers.BatchNormalization())
        model.add(keras.layers.Dropout(dropout_rate))

    model.add(keras.layers.Dense(1))  # Output layer for regression

    model.compile(
        optimizer=optimizer_class(learning_rate=learning_rate),
        loss='mse',
        metrics=['mae', 'r2_score'],
    )
    return model


In [6]:
# Full training pipeline: a fresh copy of the preprocessor feeding a grid
# search over the Keras regressor.
#
# NOTE(review): the preprocessor (including TargetEncoder and StandardScaler)
# is fitted once on all of X_train before GridSearchCV splits the data, so the
# validation part of every CV fold has already influenced the preprocessing.
# The cross-validated scores may therefore be optimistic. Nesting the
# preprocessor inside the searched estimator would fix that but changes how
# the fitted search is reached through `model[-1]`, so it is left as is here.
model = make_pipeline(
    clone(preprocessor),
    GridSearchCV(
        estimator=KerasRegressor(
            model=build_model,
            epochs=150,
            batch_size=64,
            verbose=1,
            # Keras holds out 20% of each training set to compute the val_*
            # metrics that the callbacks below monitor.
            validation_split=0.2,
            # Seed the network training like KFold and TargetEncoder (42), so
            # weight init, dropout and shuffling are repeatable across reruns.
            random_state=42,
            callbacks=[
                # Stop once validation loss stalls for 10 epochs and roll back
                # to the best weights seen.
                EarlyStopping(patience=10, restore_best_weights=True, verbose=1),
                # Halve the learning rate after 5 stalled epochs, down to 1e-6.
                ReduceLROnPlateau(patience=5, factor=0.5, min_lr=1e-6, verbose=1),
            ],
        ),
        # Single-candidate grid: effectively a 5-fold CV of one configuration.
        param_grid={
            'model__learning_rate': [5e-2]
        },
        scoring="r2",
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        verbose=1,
    )
).fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Epoch 1/150
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 12900023296.0000 - mae: 95258.2500 - r2_score: -2.2993 - val_loss: 8236237824.0000 - val_mae: 77363.8750 - val_r2_score: -1.1591 - learning_rate: 0.0500
Epoch 2/150
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 8612246528.0000 - mae: 76600.5703 - r2_score: -1.2921 - val_loss: 5254558720.0000 - val_mae: 55707.8906 - val_r2_score: -0.3775 - learning_rate: 0.0500
Epoch 3/150
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 4236187648.0000 - mae: 47914.9688 - r2_score: -0.1287 - val_loss: 3849106432.0000 - val_mae: 41505.1875 - val_r2_score: -0.0090 - learning_rate: 0.0500
Epoch 4/150
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 2904269312.0000 - mae: 33758.8984 - r2_score: 0.2763 - val_loss: 3289467136.0000 - val_mae: 30346.1152 -

In [10]:
# The last pipeline step is the fitted GridSearchCV; pull it out and display
# the hyper-parameters of its best candidate.
_, search = model.steps[-1]
best_params = search.best_params_
best_params

{'model__learning_rate': 0.05}

In [11]:
# Predict on the training split, then hand the predictions to the project's
# evaluation helper.
train_predictions = model.predict(X_train)
result_train = salary.evaluate_train_predictions(train_predictions)

[1m436/436[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 880us/step
Train R2: 0.5996
Train RMSE: 38926.7294
Train MAE: 24265.8338


In [12]:
# Predict on the test split, then hand the predictions to the project's
# evaluation helper.
test_predictions = model.predict(X_test)
result_test = salary.evaluate_test_predictions(test_predictions)

[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 814us/step
Test R2: 0.4907
Test RMSE: 41605.9919
Test MAE: 26693.9291
