# Salary Prediction from LinkedIn Job Postings - Train Neural Network

In [1]:
import pandas as pd, numpy as np
import salary
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import tensorflow.keras as keras
from sklearn.model_selection import KFold, GridSearchCV
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from scikeras.wrappers import KerasRegressor

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd, numpy as np
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Apply seaborn's default theme to every figure drawn later in the notebook.
sns.set_theme()

## Train & Evaluate Models

In [3]:
# Load the train and test splits (features, target) from the project's
# `salary` helper module.
X_train, y_train = salary.get_train_dataset()
X_test, y_test = salary.get_test_dataset()

In [4]:
# Categorical columns that are fed to BOTH the one-hot encoder and the target
# encoder; kept in one list so the two encoders cannot drift apart.
categorical_columns = [
    'norm_title',
    'clustered_edu_req',
    'clustered_pref_qual',
    'clustered_req_skill',
    'location_state',
    'company_industries',
    'formatted_experience_level',
    'formatted_work_type',
]

# Column-wise feature encoding. Any column not listed here is dropped.
feature_encoder = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(use_cat_names=True), categorical_columns),
        ('target_encoder', TargetEncoder(), categorical_columns),
        # Project-specific encoders defined in the `salary` module.
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        # Missing employee counts are filled with the training median.
        ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
    ],
    remainder='drop',
)

# Encode first, then standardise every resulting column for the network.
preprocessor = make_pipeline(feature_encoder, StandardScaler())

In [5]:
def build_model(
    n_units_1=256,
    n_units_2=192,
    n_units_3=64,
    n_units_4=32,
    dropout_rate=0.3,
    learning_rate=0.05,
    optimizer_name="adamw",
    meta=None,
):
    """Build and compile a feed-forward Keras network for salary regression.

    The network is a stack of hidden blocks, each made of
    ``Dense(leaky_relu) -> BatchNormalization -> Dropout``, followed by a
    single linear output unit.

    Parameters
    ----------
    n_units_1, n_units_2 : int
        Widths of the first two hidden blocks. These blocks are always added.
    n_units_3, n_units_4 : int or None
        Widths of the optional third and fourth hidden blocks. A falsy value
        (``0`` or ``None``) skips the corresponding block.
    dropout_rate : float
        Dropout rate applied after every hidden block.
    learning_rate : float
        Initial learning rate handed to the optimizer.
    optimizer_name : {"adam", "adamw"}
        Which Keras optimizer to compile the model with.
    meta : dict or None
        Fit-time metadata. scikeras supplies this automatically when the build
        function declares a ``meta`` parameter; its ``"n_features_in_"`` entry
        sets the input width so the network follows whatever number of columns
        the preprocessing step actually produced. When ``meta`` is absent, the
        historical width of 318 is used (presumably the column count the
        preprocessor produced on the original training data -- TODO confirm).

    Returns
    -------
    keras.Sequential
        The compiled model (MSE loss; MAE and R2 reported as metrics).

    Raises
    ------
    ValueError
        If ``optimizer_name`` is not one of the supported optimizers.
    """
    # Validate the optimizer choice up front so a typo fails before any layers
    # are allocated.
    if optimizer_name == "adam":
        optimizer_cls = keras.optimizers.Adam
    elif optimizer_name == "adamw":
        optimizer_cls = keras.optimizers.AdamW
    else:
        raise ValueError(f"Unsupported optimizer: {optimizer_name}")

    # Prefer the feature count observed at fit time over the hard-coded default,
    # which silently breaks whenever the encoders emit a different number of
    # columns (e.g. a new category in a one-hot encoded feature).
    n_features = 318
    if meta is not None and meta.get("n_features_in_") is not None:
        n_features = meta["n_features_in_"]

    # Blocks 1 and 2 are mandatory; blocks 3 and 4 are added only when a
    # non-zero width is given.
    hidden_sizes = [n_units_1, n_units_2]
    hidden_sizes.extend(n_units for n_units in (n_units_3, n_units_4) if n_units)

    model = keras.Sequential()
    model.add(keras.layers.Input(shape=(n_features,)))

    for n_units in hidden_sizes:
        model.add(keras.layers.Dense(n_units, activation='leaky_relu'))
        model.add(keras.layers.BatchNormalization())
        model.add(keras.layers.Dropout(dropout_rate))

    model.add(keras.layers.Dense(1))  # Output layer for regression

    model.compile(
        optimizer=optimizer_cls(learning_rate=learning_rate),
        loss='mse',
        metrics=['mae', 'r2_score'],
    )
    return model


In [6]:
# Grid-search the network's learning rate with 5-fold cross-validation, scored by R2.
# Each Keras fit holds out the last 20% of its training fold for early stopping and
# learning-rate scheduling.
#
# NOTE(review): the preprocessor (including the TargetEncoder and StandardScaler) is
# fitted once on all of X_train/y_train *before* GridSearchCV splits the folds, so the
# cross-validation scores are computed on features that have already seen each
# validation fold's targets. Wrapping the whole pipeline inside GridSearchCV would
# remove that leakage, but it changes how the fitted search and its parameter names
# are accessed, so it is left for a coordinated follow-up.
model = make_pipeline(
    clone(preprocessor),
    GridSearchCV(
        estimator=KerasRegressor(
            model=build_model,
            epochs=150,
            batch_size=64,
            verbose=1,
            validation_split=0.2,
            callbacks=[
                EarlyStopping(patience=10, restore_best_weights=True, verbose=1),
                ReduceLROnPlateau(patience=5, factor=0.5, min_lr=1e-6, verbose=1)
            ],
            # Seed weight initialisation, dropout and batch shuffling so that the
            # search gives the same ranking on every Restart & Run All.
            random_state=42,
        ),
        param_grid={
            'model__learning_rate': [1e-2, 5e-2]
        },
        scoring="r2",
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        verbose=1,
    )
).fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Epoch 1/150
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 13094193152.0000 - mae: 96726.5234 - r2_score: -2.5084 - val_loss: 13175993344.0000 - val_mae: 97297.7109 - val_r2_score: -2.4540 - learning_rate: 0.0100
Epoch 2/150
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 12959981568.0000 - mae: 95593.9922 - r2_score: -2.3289 - val_loss: 12633285632.0000 - val_mae: 95318.1719 - val_r2_score: -2.3118 - learning_rate: 0.0100
Epoch 3/150
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 12543975424.0000 - mae: 94111.0781 - r2_score: -2.1979 - val_loss: 11938516992.0000 - val_mae: 92966.8516 - val_r2_score: -2.1296 - learning_rate: 0.0100
Epoch 4/150
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 11326956544.0000 - mae: 90177.5078 - r2_score: -2.1159 - val_loss: 10928696320.0000 - val_mae: 887

In [7]:
# The fitted GridSearchCV is the final step of the pipeline; pull it out and
# show the winning hyper-parameters.
search = model.steps[-1][1]
best_params = search.best_params_
best_params

{'model__learning_rate': 0.05}

In [8]:
# Score the grid-searched pipeline on the training split.
train_predictions = model.predict(X_train)
result_train = salary.evaluate_train_predictions(train_predictions)

[1m436/436[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 702us/step
Train R2: 0.5695
Train RMSE: 40360.7444
Train MAE: 24419.8215


In [9]:
# Score the grid-searched pipeline on the held-out test split.
test_predictions = model.predict(X_test)
result_test = salary.evaluate_test_predictions(test_predictions)

[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 681us/step
Test R2: 0.5106
Test RMSE: 40784.4440
Test MAE: 26181.9552


## Train & Evaluate Best Model

In [14]:
# Refit a single network on the full training set using the hyper-parameters
# selected by the grid search (`best_params` uses scikeras' `model__` routing
# prefix, so it can be splatted straight into KerasRegressor).
best_model = make_pipeline(
    clone(preprocessor),
    KerasRegressor(
        model=build_model,
        epochs=150,
        batch_size=64,
        verbose=1,
        validation_split=0.2,
        callbacks=[
            EarlyStopping(patience=10, restore_best_weights=True, verbose=1),
            ReduceLROnPlateau(patience=5, factor=0.5, min_lr=1e-6, verbose=1)
        ],
        # Seed weight initialisation, dropout and batch shuffling so the reported
        # metrics are reproducible on every Restart & Run All.
        random_state=42,
        **best_params
    )
).fit(X_train, y_train)

Epoch 1/100
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 12704825344.0000 - mae: 94878.9766 - r2_score: -2.3119 - val_loss: 7874866176.0000 - val_mae: 75407.8203 - val_r2_score: -1.0968 - learning_rate: 0.0500
Epoch 2/100
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 7290342400.0000 - mae: 67988.2969 - r2_score: -0.8460 - val_loss: 4445874176.0000 - val_mae: 50241.9102 - val_r2_score: -0.1838 - learning_rate: 0.0500
Epoch 3/100
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 3078017792.0000 - mae: 37809.7812 - r2_score: 0.1505 - val_loss: 2789701120.0000 - val_mae: 31797.0801 - val_r2_score: 0.2572 - learning_rate: 0.0500
Epoch 4/100
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 2689631488.0000 - mae: 30840.7285 - r2_score: 0.3514 - val_loss: 2533696000.0000 - val_mae: 28245.9941 - val_r2_score: 0.3254 - learning_rate: 0.0500
Epoch 5/100
[1

In [16]:
# Score the refitted best model on both splits.
# NOTE(review): `results_train` (plural) is inconsistent with the `result_train`
# name used for the grid-searched model, while `result_test` below overwrites
# the earlier value -- consider harmonising the names.
best_train_predictions = best_model.predict(X_train)
results_train = salary.evaluate_train_predictions(best_train_predictions)
best_test_predictions = best_model.predict(X_test)
result_test = salary.evaluate_test_predictions(best_test_predictions)

[1m436/436[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Train R2: 0.5881
Train RMSE: 39481.3863
Train MAE: 24281.0941
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 843us/step
Test R2: 0.5072
Test RMSE: 40926.0630
Test MAE: 26105.7880
