# Salary Prediction from LinkedIn Job Postings - Train a Multi-Layer Perceptron

In [1]:
import salary
import numpy as np
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, GridSearchCV
import torch
from torch import nn, optim
import random
from skorch import NeuralNetRegressor, dataset
from skorch.callbacks import EarlyStopping, LRScheduler, EpochScoring

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed every random number generator this notebook can touch so that a
# Restart & Run All reproduces the same weights, shuffles and splits.
# NumPy's global generator is seeded as well: scikit-learn and skorch fall
# back to it whenever a component is not given an explicit random_state.
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

## Train & Evaluate Models

In [3]:
# Load the train and test splits from the project's `salary` helper module.
# Each loader is unpacked into a (features, target) pair.
X_train, y_train = salary.get_train_dataset()
X_test, y_test = salary.get_test_dataset()

In [4]:
# Categorical columns that are encoded twice: once as one-hot indicators and
# once as target-encoded values.
categorical_columns = [
    'norm_title',
    'clustered_edu_req',
    'clustered_pref_qual',
    'clustered_req_skill',
    'location_state',
    'company_industries',
    'formatted_experience_level',
    'formatted_work_type',
]

# Per-column encoders; any column not listed here is dropped.
column_encoders = ColumnTransformer(
    transformers=[
        # Unknown categories seen at predict time become all-zero rows.
        ('one_hot_encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_columns),
        ('target_encoder', TargetEncoder(random_state=42), categorical_columns),
        # Project-specific encoders defined in the `salary` module.
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        # Missing employee counts are filled with the training median.
        ('company_employee_count', SimpleImputer(strategy='median'), ['company_employee_count']),
    ],
    remainder='drop',
)

# Encode the columns, then standardise every resulting feature.
preprocessor = make_pipeline(column_encoders, StandardScaler())

In [5]:
class Model(nn.Module):
    """Four-hidden-layer perceptron producing a single regression output.

    Every hidden block applies Linear -> LeakyReLU -> BatchNorm1d -> Dropout.
    All parameters and buffers are stored in float64, so inputs must be
    float64 tensors of shape (batch, n_features).

    Args:
        n_units_1: Width of the first hidden layer.
        n_units_2: Width of the second hidden layer.
        n_units_3: Width of the third hidden layer.
        n_units_4: Width of the fourth hidden layer.
        dropout_rate: Dropout probability used after every hidden block.
        leaky_relu_slope: Negative slope of the shared LeakyReLU activation.
        n_features: Number of input features. Defaults to 318, the width
            that was previously hard-coded (presumably the number of columns
            emitted by the preprocessing pipeline -- confirm if the
            preprocessing changes).
    """

    def __init__(self, n_units_1=256, n_units_2=192, n_units_3=64, n_units_4=32,
                dropout_rate=0.3, leaky_relu_slope=0.2, n_features=318):
        super().__init__()
        # Layer 1
        self.linear1 = nn.Linear(n_features, n_units_1)
        self.bn1 = nn.BatchNorm1d(n_units_1)
        self.dropout1 = nn.Dropout(dropout_rate)

        # Layer 2
        self.linear2 = nn.Linear(n_units_1, n_units_2)
        self.bn2 = nn.BatchNorm1d(n_units_2)
        self.dropout2 = nn.Dropout(dropout_rate)

        # Layer 3
        self.linear3 = nn.Linear(n_units_2, n_units_3)
        self.bn3 = nn.BatchNorm1d(n_units_3)
        self.dropout3 = nn.Dropout(dropout_rate)

        # Layer 4
        self.linear4 = nn.Linear(n_units_3, n_units_4)
        self.bn4 = nn.BatchNorm1d(n_units_4)
        self.dropout4 = nn.Dropout(dropout_rate)

        # Output layer
        self.output = nn.Linear(n_units_4, 1)

        # Activation function (stateless, shared by every hidden block)
        self.leaky_relu = nn.LeakyReLU(leaky_relu_slope)

        # Cast all parameters and BatchNorm buffers to float64 in one place.
        # Dropout and LeakyReLU hold no tensors, so they need no cast.
        self.double()

    def forward(self, X):
        """Map a (batch, n_features) float64 tensor to (batch, 1) predictions."""
        # Layer 1
        X = self.leaky_relu(self.linear1(X))
        X = self.bn1(X)
        X = self.dropout1(X)

        # Layer 2
        X = self.leaky_relu(self.linear2(X))
        X = self.bn2(X)
        X = self.dropout2(X)

        # Layer 3
        X = self.leaky_relu(self.linear3(X))
        X = self.bn3(X)
        X = self.dropout3(X)

        # Layer 4
        X = self.leaky_relu(self.linear4(X))
        X = self.bn4(X)
        X = self.dropout4(X)

        # Output layer
        return self.output(X)

In [6]:
# Callbacks run by skorch at the end of every epoch.
training_callbacks = [
    # Stop after 10 epochs without a better validation loss and restore the
    # best weights seen so far.
    EarlyStopping(patience=10, monitor='valid_loss', load_best=True),
    # Halve the learning rate after 5 epochs without validation improvement.
    LRScheduler(policy=optim.lr_scheduler.ReduceLROnPlateau, patience=5, factor=0.5, monitor='valid_loss'),  # type: ignore
    # Add validation R2 to the per-epoch training log.
    EpochScoring(scoring='r2', on_train=False),
]

# skorch wrapper that makes the PyTorch module usable as an sklearn regressor.
regressor = NeuralNetRegressor(
    Model,
    criterion=nn.MSELoss,
    optimizer=optim.AdamW,
    max_epochs=150,
    batch_size=64,
    iterator_train__shuffle=True,
    train_split=dataset.ValidSplit(cv=5),
    callbacks=training_callbacks,
)

# Cross-validated search over the learning rate (a single candidate for now).
grid_search = GridSearchCV(
    regressor,
    {'lr': [5e-2]},
    scoring='r2',
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
)

# The target is passed as a single column of shape (n_samples, 1).
# NOTE(review): skorch regressors appear to require a 2-D target, while the
# preprocessing step emits a column_or_1d warning for it -- confirm.
target_column = np.array(y_train).reshape(-1, 1)

model = make_pipeline(clone(preprocessor), grid_search).fit(X_train, target_column)

  y = column_or_1d(y, warn=True)


  epoch       r2        train_loss        valid_loss      lr     dur
-------  -------  ----------------  ----------------  ------  ------
      1  [36m-1.5844[0m  [32m12241183997.2655[0m  [35m10334733339.5899[0m  0.0500  1.0196
      2  -0.5073  [32m7908640791.1345[0m  [35m6027698767.4430[0m  0.0500  0.9402
      3  -0.1633  [32m4595999017.4259[0m  [35m4651986914.3073[0m  0.0500  0.7745
      4  0.1020  [32m3241416652.2363[0m  [35m3590952239.3042[0m  0.0500  0.7950
      5  0.0139  [32m2729053634.3478[0m  3943381777.5763  0.0500  0.9503
      6  0.0893  [32m2599307056.8922[0m  3641668810.6091  0.0500  0.9463
      7  0.0893  [32m2450339603.0276[0m  3641909785.9649  0.0500  0.9222
      8  0.1194  2462910727.6348  [35m3521583475.5094[0m  0.0500  0.9933
      9  0.2367  [32m2401801479.0568[0m  [35m3052294773.2477[0m  0.0500  0.9371
     10  0.2134  [32m2355540460.8375[0m  3145681048.3908  0.0500  0.9600
     11  0.1543  [32m2354710390.7656[0m  3382040777

In [7]:
# Grab the final pipeline step (the fitted GridSearchCV) for inspection.
# NOTE(review): `seach` looks like a typo for `search`; renaming it also
# requires updating every later cell that uses the name.
seach = model.steps[-1][1]

In [8]:
# Display the cross-validation results (fit/score times and per-split R2)
# recorded by the grid search.
seach.cv_results_

{'mean_fit_time': array([59.73524218]),
 'std_fit_time': array([15.48263526]),
 'mean_score_time': array([0.09017854]),
 'std_score_time': array([0.0130369]),
 'param_lr': masked_array(data=[0.05],
              mask=[False],
        fill_value=1e+20),
 'params': [{'lr': 0.05}],
 'split0_test_score': array([0.43931413]),
 'split1_test_score': array([0.43628819]),
 'split2_test_score': array([0.40893616]),
 'split3_test_score': array([0.44645232]),
 'split4_test_score': array([0.42688541]),
 'mean_test_score': array([0.43157524]),
 'std_test_score': array([0.01294568]),
 'rank_test_score': array([1], dtype=int32)}

In [9]:
# Predict on the training data with the fitted pipeline, then let the project
# helper score the predictions (it reports R2, RMSE and MAE per the cell output).
train_predictions = model.predict(X_train)
result_train = salary.evaluate_train_predictions(train_predictions)

Train R2: 0.5371
Train RMSE: 41851.1526
Train MAE: 25747.6193


In [10]:
# Predict on the held-out test data with the fitted pipeline, then let the
# project helper score the predictions (it reports R2, RMSE and MAE per the cell output).
test_predictions = model.predict(X_test)
result_test = salary.evaluate_test_predictions(test_predictions)

Test R2: 0.4858
Test RMSE: 41803.7793
Test MAE: 27076.5028
