# Salary Prediction from LinkedIn Job Postings - Train a Multi-Layer Perceptron

In [1]:
import salary
import numpy as np
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, GridSearchCV
import torch
from torch import nn, optim
import random
from skorch import NeuralNetRegressor, dataset
from skorch.callbacks import EarlyStopping, LRScheduler, EpochScoring

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed the PyTorch, Python and NumPy global RNGs so a Restart & Run All
# starts from the same random state each time.
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)
# NumPy's global RNG was previously left unseeded. Keep this call last: it
# returns None, so the cell still produces no output.
np.random.seed(SEED)

## Train & Tune Model

In [3]:
# Fetch the train and test splits (features, target) from the project's
# `salary` helper module.
X_train, y_train = salary.get_train_dataset()
X_test, y_test = salary.get_test_dataset()

In [6]:
def _sbert_pca(impute_missing=False):
    """Build one text-feature branch of the column transformer.

    The branch is salary.SentenceBertEncoder -> StandardScaler -> PCA, where
    PCA is configured with n_components=0.9 and a fixed random_state. When
    ``impute_missing`` is true, a constant-fill imputer ('Unknown') is placed
    in front of the encoder.
    """
    stages = [
        salary.SentenceBertEncoder(),
        StandardScaler(),
        PCA(n_components=0.9, random_state=42),
    ]
    if impute_missing:
        stages.insert(0, SimpleImputer(strategy='constant', fill_value='Unknown'))
    return make_pipeline(*stages)


# Column groups, named so the transformer list below reads as a table.
requirement_text_columns = [
    'Educational_Requirements',
    'Preferred_Qualifications',
    'Required_Skills',
]
one_hot_columns = ['formatted_experience_level', 'formatted_work_type']
target_encoded_columns = [
    'norm_title',
    'clustered_edu_req',
    'clustered_pref_qual',
    'clustered_req_skill',
    'location_state',
    'company_industries',
    'formatted_experience_level',
    'formatted_work_type',
]

# Target-encode the categorical columns, then standardise the encoded values.
scaled_target_encoder = make_pipeline(
    TargetEncoder(random_state=42),
    StandardScaler(),
)

# Median-impute the employee count before standardising it.
scaled_employee_count = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)

# The order of the transformers fixes the order of the output feature columns,
# so it is kept exactly as before. Columns not listed here are dropped.
preprocessor = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('title_sbert_pca_encoder', _sbert_pca(), ['title']),
            ('location_sbert_pca_encoder', _sbert_pca(), ['location']),
            ('company_industries_sbert_pca_encoder',
             _sbert_pca(impute_missing=True),
             ['company_industries']),
            ('requirements_sbert_pca_encoder',
             _sbert_pca(impute_missing=True),
             requirement_text_columns),
            ('one_hot_encoder',
             OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
             one_hot_columns),
            ('target_encoder', scaled_target_encoder, target_encoded_columns),
            ('experience_level', salary.experience_level_encoder, ['formatted_experience_level']),
            ('work_type', salary.work_type_encoder, ['formatted_work_type']),
            ('remote_allowed', 'passthrough', ['remote_allowed']),
            ('company_employee_count', scaled_employee_count, ['company_employee_count']),
        ],
        remainder='drop',
    )
)

In [7]:
# Fit a throwaway copy of the preprocessor only to learn the transformed
# matrix shape; `num_features` is read later when the network is built.
fitted_shape = clone(preprocessor).fit_transform(X_train, y_train).shape
train_size, num_features = fitted_shape
fitted_shape

(27885, 693)

In [8]:
class Model(nn.Module):
    """Three-hidden-layer MLP with a single-value output, held in float64.

    Each hidden stage applies Linear -> LeakyReLU -> BatchNorm1d -> Dropout;
    a final Linear layer maps the last hidden width to one output per row.

    Args:
        n_units_1: Width of the first hidden layer.
        n_units_2: Width of the second hidden layer.
        n_units_3: Width of the third hidden layer.
        dropout_rate: Dropout probability used after every hidden stage.
        leaky_relu_slope: Negative slope of the shared LeakyReLU activation.
        n_features: Width of the input rows. When ``None`` (the default) the
            module-level ``num_features`` value is used, which keeps existing
            ``Model()`` call sites working while letting the class be built
            without that notebook-level variable.
    """

    def __init__(self, n_units_1=256, n_units_2=128, n_units_3=64,
                 dropout_rate=0.3, leaky_relu_slope=0.2, n_features=None):
        super().__init__()
        if n_features is None:
            # Backward-compatible fallback to the value computed by the
            # notebook from the fitted preprocessor.
            n_features = num_features

        # Sub-modules are created in the same order as before so that seeded
        # weight initialisation draws from the RNG in the same sequence.
        # Layer 1
        self.linear1 = nn.Linear(n_features, n_units_1)
        self.bn1 = nn.BatchNorm1d(n_units_1)
        self.dropout1 = nn.Dropout(dropout_rate)

        # Layer 2
        self.linear2 = nn.Linear(n_units_1, n_units_2)
        self.bn2 = nn.BatchNorm1d(n_units_2)
        self.dropout2 = nn.Dropout(dropout_rate)

        # Layer 3
        self.linear3 = nn.Linear(n_units_2, n_units_3)
        self.bn3 = nn.BatchNorm1d(n_units_3)
        self.dropout3 = nn.Dropout(dropout_rate)

        # Output layer
        self.output = nn.Linear(n_units_3, 1)

        # Activation function (shared by all hidden stages)
        self.leaky_relu = nn.LeakyReLU(negative_slope=leaky_relu_slope)

        # Cast every floating-point parameter and buffer to float64 in one
        # step instead of calling .double() on each sub-module.
        self.double()

    def forward(self, X):
        """Run the hidden stages in order and return the output layer's result.

        Args:
            X: Float64 tensor whose last dimension equals the input width.

        Returns:
            Tensor with one value per input row (last dimension of size 1).
        """
        hidden_stages = (
            (self.linear1, self.bn1, self.dropout1),
            (self.linear2, self.bn2, self.dropout2),
            (self.linear3, self.bn3, self.dropout3),
        )
        for linear, batch_norm, dropout in hidden_stages:
            X = self.leaky_relu(linear(X))
            X = batch_norm(X)
            X = dropout(X)

        return self.output(X)

In [9]:
# Full training pipeline: a fresh copy of the preprocessor followed by a
# cross-validated skorch regressor. The grid currently holds one learning
# rate, so GridSearchCV acts as a 5-fold CV evaluation plus a final refit.
#
# NOTE(review): y is reshaped to a column because the network emits one value
# per row; the `column_or_1d` warning seen during fit presumably comes from a
# preprocessing step that expects 1-D targets — confirm.
model = make_pipeline(
    clone(preprocessor),
    GridSearchCV(
        NeuralNetRegressor(
            Model,
            max_epochs=150,
            criterion=nn.MSELoss,
            batch_size=64,
            optimizer=optim.AdamW,
            iterator_train__shuffle=True,
            train_split=dataset.ValidSplit(cv=5),
            callbacks=[
                # Stop after 10 epochs without validation-loss improvement and
                # restore the best weights.
                EarlyStopping(patience=10, monitor='valid_loss', load_best=True),
                # Halve the learning rate after 5 stagnant epochs.
                LRScheduler(policy=optim.lr_scheduler.ReduceLROnPlateau, patience=5, factor=0.5, monitor='valid_loss'),  # type: ignore
                # R² is a higher-is-better score. EpochScoring defaults to
                # lower_is_better=True, which marked the *worst* epoch as the
                # best one in the training log.
                EpochScoring(scoring='r2', on_train=False, lower_is_better=False),
            ]
        ),
        { 'lr': [5e-2] },
        scoring='r2',
        cv=KFold(n_splits=5, shuffle=True, random_state=42)
    )
).fit(X_train, np.array(y_train).reshape(-1, 1))

  y = column_or_1d(y, warn=True)


  epoch       r2        train_loss       valid_loss      lr     dur
-------  -------  ----------------  ---------------  ------  ------
      1  [36m-1.0049[0m  [32m11310803984.4794[0m  [35m8017317688.9673[0m  0.0500  1.2315
      2  0.2749  [32m4943660606.8604[0m  [35m2899475631.9895[0m  0.0500  1.4282
      3  0.4804  [32m2350779603.1019[0m  [35m2077917644.0610[0m  0.0500  1.2294
      4  0.5320  [32m1868645828.5832[0m  [35m1871566533.9728[0m  0.0500  1.1663
      5  0.5504  [32m1746741505.7012[0m  [35m1798026135.4735[0m  0.0500  1.2045
      6  0.5765  [32m1657345899.1574[0m  [35m1693412445.5932[0m  0.0500  1.2065
      7  0.5705  [32m1613961236.1124[0m  1717595886.5485  0.0500  1.1979
      8  0.5500  [32m1546857175.2528[0m  1799543725.6605  0.0500  1.1423
      9  0.5395  [32m1539727724.6805[0m  1841543189.0935  0.0500  1.2030
     10  0.5548  [32m1492516503.0028[0m  1780267156.8163  0.0500  1.1696
     11  0.5452  [32m1489065880.3144[0m  181887

In [10]:
# The grid search is the final step of the pipeline; show its CV results.
search = model.steps[-1][1]
search.cv_results_

{'mean_fit_time': array([60.6911325]),
 'std_fit_time': array([17.91788435]),
 'mean_score_time': array([0.10628586]),
 'std_score_time': array([0.00624012]),
 'param_lr': masked_array(data=[0.05],
              mask=[False],
        fill_value=1e+20),
 'params': [{'lr': 0.05}],
 'split0_test_score': array([0.62353731]),
 'split1_test_score': array([0.61052332]),
 'split2_test_score': array([0.56944357]),
 'split3_test_score': array([0.57867442]),
 'split4_test_score': array([0.58790514]),
 'mean_test_score': array([0.59401675]),
 'std_test_score': array([0.02010229]),
 'rank_test_score': array([1], dtype=int32)}

In [11]:
# Score the fitted pipeline on the data it was trained on.
train_predictions = model.predict(X_train)
result_train = salary.evaluate_train_predictions(train_predictions)

Train R2: 0.8703
Train RMSE: 22155.0747
Train MAE: 13860.8270


In [12]:
# Score the fitted pipeline on the held-out test split.
test_predictions = model.predict(X_test)
result_test = salary.evaluate_test_predictions(test_predictions)

Test R2: 0.6303
Test RMSE: 35445.6845
Test MAE: 22076.0723
