# Salary Prediction from LinkedIn Job Postings - Train a Multi-Layer Perceptron

In [1]:
import salary
import numpy as np
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, GridSearchCV
import torch
from torch import nn, optim
import random
from skorch import NeuralNetRegressor, dataset
from skorch.callbacks import EarlyStopping, LRScheduler, EpochScoring

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed every random number generator the notebook can touch so that
# Restart & Run All reproduces the same training run.
SEED = 42
torch.manual_seed(SEED)   # weight init, dropout masks, DataLoader shuffling
random.seed(SEED)         # Python's stdlib RNG
np.random.seed(SEED)      # NumPy's legacy global RNG (fallback for estimators built without random_state)

## Train & Tune Model

In [3]:
# Load the train and test splits exposed by the project-local `salary` module.
X_train, y_train = salary.get_train_dataset()
X_test, y_test = salary.get_test_dataset()

In [4]:
# Categorical columns that are fed to BOTH the one-hot and the target encoder.
shared_categorical_columns = [
    'clustered_edu_req',
    'clustered_pref_qual',
    'clustered_req_skill',
    'location_state',
    'company_industries',
    'formatted_experience_level',
    'formatted_work_type',
]
# The target encoder additionally sees the normalized job title.
target_encoded_columns = ['norm_title', *shared_categorical_columns]

# Title text -> sentence embedding -> standardize -> PCA (n_components=0.9).
title_embedding_steps = make_pipeline(
    salary.SentenceBertEncoder(),
    StandardScaler(),
    PCA(n_components=0.9, random_state=42),
)

# Target-encode the categoricals, then standardize the encoded values.
target_encoding_steps = make_pipeline(
    TargetEncoder(random_state=42),
    StandardScaler(),
)

# Fill missing employee counts with the median before standardizing.
employee_count_steps = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)

feature_union = ColumnTransformer(
    transformers=[
        ('sbert_pca_encoder', title_embedding_steps, ['title']),
        ('one_hot_encoder',
         OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
         list(shared_categorical_columns)),
        ('target_encoder', target_encoding_steps, target_encoded_columns),
        ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
        ('work_type', salary.work_type_encoder, ['formatted_work_type']),
        ('remote_allowed', 'passthrough', ['remote_allowed']),
        ('company_employee_count', employee_count_steps, ['company_employee_count']),
    ],
    remainder='drop',  # columns not listed above are discarded
)

# Single-step pipeline wrapping the column transformer.
preprocessor = make_pipeline(feature_union)

In [5]:
# Fit a throwaway clone of the preprocessor (the original stays unfitted) to
# learn how many rows/columns the network will receive. `num_features` is read
# later when the network's first layer is built.
transformed_shape = clone(preprocessor).fit_transform(X_train, y_train).shape
train_size, num_features = transformed_shape
(train_size, num_features)

(27885, 416)

In [6]:
class Model(nn.Module):
    """Three-hidden-layer perceptron for regression, with float64 parameters.

    Every hidden stage applies ``Linear -> LeakyReLU -> BatchNorm1d -> Dropout``;
    a final ``Linear`` layer produces one value per input row.

    Args:
        n_units_1: Width of the first hidden layer.
        n_units_2: Width of the second hidden layer.
        n_units_3: Width of the third hidden layer.
        dropout_rate: Drop probability used by all three dropout layers.
        leaky_relu_slope: Negative slope of the shared LeakyReLU activation.
        n_features: Number of input columns. When ``None`` (the default) the
            notebook-level ``num_features`` global is used, which keeps
            ``Model()`` working exactly as before while allowing the class to
            be built without that global.
    """

    def __init__(self, n_units_1=128, n_units_2=64, n_units_3=32,
                 dropout_rate=0.3, leaky_relu_slope=0.2, n_features=None):
        super().__init__()
        if n_features is None:
            # Backward-compatible fallback to the value computed earlier in the notebook.
            n_features = num_features

        # Layer 1
        self.linear1 = nn.Linear(n_features, n_units_1).double()
        self.bn1 = nn.BatchNorm1d(n_units_1).double()
        # Dropout has no parameters or buffers, so no dtype cast is needed.
        self.dropout1 = nn.Dropout(dropout_rate)

        # Layer 2
        self.linear2 = nn.Linear(n_units_1, n_units_2).double()
        self.bn2 = nn.BatchNorm1d(n_units_2).double()
        self.dropout2 = nn.Dropout(dropout_rate)

        # Layer 3
        self.linear3 = nn.Linear(n_units_2, n_units_3).double()
        self.bn3 = nn.BatchNorm1d(n_units_3).double()
        self.dropout3 = nn.Dropout(dropout_rate)

        # Output layer
        self.output = nn.Linear(n_units_3, 1).double()

        # Activation function (parameter-free, shared by all hidden layers)
        self.leaky_relu = nn.LeakyReLU(negative_slope=leaky_relu_slope)

    def forward(self, X):
        """Run a forward pass.

        Args:
            X: float64 tensor with ``n_features`` columns (one row per sample).

        Returns:
            Tensor with a single output column per input row.
        """
        # Layer 1
        X = self.leaky_relu(self.linear1(X))
        X = self.bn1(X)
        X = self.dropout1(X)

        # Layer 2
        X = self.leaky_relu(self.linear2(X))
        X = self.bn2(X)
        X = self.dropout2(X)

        # Layer 3
        X = self.leaky_relu(self.linear3(X))
        X = self.bn3(X)
        X = self.dropout3(X)

        # Output layer
        return self.output(X)

In [7]:
# Callbacks attached to the skorch training loop, in the original order.
training_callbacks = [
    EarlyStopping(patience=10, monitor='valid_loss', load_best=True),
    LRScheduler(policy=optim.lr_scheduler.ReduceLROnPlateau, patience=5, factor=0.5, monitor='valid_loss'),  # type: ignore
    EpochScoring(scoring='r2', on_train=False),
]

# skorch wrapper around `Model`: AdamW + MSE, shuffled mini-batches, and an
# internal validation split that feeds the callbacks above.
regressor = NeuralNetRegressor(
    Model,
    max_epochs=150,
    criterion=nn.MSELoss,
    batch_size=64,
    optimizer=optim.AdamW,
    iterator_train__shuffle=True,
    train_split=dataset.ValidSplit(cv=5),
    callbacks=training_callbacks,
)

# Grid search over the learning rate (a single candidate at the moment),
# scored by R^2 on a shuffled 5-fold split.
lr_search = GridSearchCV(
    regressor,
    param_grid={'lr': [5e-2]},
    scoring='r2',
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
)

# The target is passed as a single column (n_samples, 1).
y_train_column = np.array(y_train).reshape(-1, 1)

training_pipeline = make_pipeline(clone(preprocessor), lr_search)
model = training_pipeline.fit(X_train, y_train_column)

  y = column_or_1d(y, warn=True)


  epoch       r2        train_loss        valid_loss      lr     dur
-------  -------  ----------------  ----------------  ------  ------
      1  [36m-1.5616[0m  [32m12214716175.6372[0m  [35m10243652971.8926[0m  0.0500  0.8837
      2  -0.3997  [32m7763566347.2989[0m  [35m5597123205.5185[0m  0.0500  0.7853
      3  0.1612  [32m4353052402.4303[0m  [35m3354309960.0587[0m  0.0500  0.7356
      4  0.3467  [32m2927128071.0143[0m  [35m2612572143.3684[0m  0.0500  0.6585
      5  0.4717  [32m2407537233.4736[0m  [35m2112515329.5892[0m  0.0500  0.6905
      6  0.4657  [32m2221162960.7383[0m  2136601366.6568  0.0500  0.7230
      7  0.4539  [32m2144760811.9689[0m  2183656105.5181  0.0500  0.8007
      8  0.4923  [32m2074109868.9030[0m  [35m2030251776.9537[0m  0.0500  0.8047
      9  0.5092  [32m2033463272.6732[0m  [35m1962469826.6615[0m  0.0500  0.7950
     10  0.5259  [32m2026239007.4539[0m  [35m1895881934.5715[0m  0.0500  0.8315
     11  0.5243  [32m19617

In [11]:
# The last pipeline step is the fitted grid search; show its cross-validation summary.
search = model.steps[-1][1]
search.cv_results_

{'mean_fit_time': array([31.9399004]),
 'std_fit_time': array([4.38391876]),
 'mean_score_time': array([0.05998826]),
 'std_score_time': array([0.00811996]),
 'param_lr': masked_array(data=[0.05],
              mask=[False],
        fill_value=1e+20),
 'params': [{'lr': 0.05}],
 'split0_test_score': array([0.5619257]),
 'split1_test_score': array([0.54816932]),
 'split2_test_score': array([0.53213595]),
 'split3_test_score': array([0.56185541]),
 'split4_test_score': array([0.58780601]),
 'mean_test_score': array([0.55837848]),
 'std_test_score': array([0.01834965]),
 'rank_test_score': array([1], dtype=int32)}

In [12]:
# Score the fitted pipeline on the data it was trained on.
train_predictions = model.predict(X_train)
result_train = salary.evaluate_train_predictions(train_predictions)

Train R2: 0.7054
Train RMSE: 33390.5534
Train MAE: 20270.4114


In [13]:
# Score the fitted pipeline on the held-out test split.
test_predictions = model.predict(X_test)
result_test = salary.evaluate_test_predictions(test_predictions)

Test R2: 0.6133
Test RMSE: 36255.3517
Test MAE: 22850.6605
