# Train Multi-Layer Perceptron

In [1]:
import salary
import numpy as np
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator
import pandas as pd
from sklearn.model_selection import KFold
from skopt import BayesSearchCV
import torch
from torch import nn, optim
import random
import joblib
from skorch import NeuralNetRegressor, dataset
from skorch.callbacks import EarlyStopping, LRScheduler, EpochScoring

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed every global RNG this notebook can touch so that Restart & Run All is
# reproducible. NumPy's global RNG is seeded as well because numpy-based
# libraries (scikit-learn, skopt) are used alongside torch.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
# Kept last on purpose: random.seed returns None, so the cell shows no output.
random.seed(SEED)

In [3]:
# Training features and targets, requested with include_extracted_salaries=True.
X_train, y_train = salary.get_train_dataset(include_extracted_salaries=True)

In [4]:
preprocessor = salary.get_preprocessor()

# Fit a throwaway clone only to learn the transformed shape; `preprocessor`
# itself stays unfitted so it can be cloned into the pipeline later.
train_size, num_features = (
    clone(preprocessor)
    .fit_transform(X_train, y_train)
    .shape
)
train_size, num_features

(32103, 3670)

## Train & Tune Model

In [5]:
class Model(nn.Module):
    """Fully connected regression network with a single output unit.

    The network stacks ``num_hidden_layers`` hidden blocks, each made of
    ``LazyLinear -> BatchNorm1d -> LeakyReLU -> Dropout``, followed by a final
    ``LazyLinear(1)``. Hidden widths halve from block to block, starting at
    ``n_units_last * 2 ** (num_hidden_layers - 1)`` and ending at
    ``n_units_last``. The lazy linear layers infer their input size on the
    first forward pass.

    Args:
        num_hidden_layers: Number of hidden blocks; must be between 1 and 4.
        n_units_last: Width of the last hidden block.
        dropout_rate: Dropout probability used in every hidden block.

    Raises:
        ValueError: If ``num_hidden_layers`` is outside 1..4.
    """

    def __init__(self, num_hidden_layers: int, n_units_last: int, dropout_rate: float):
        super().__init__()

        too_few = num_hidden_layers < 1
        too_many = num_hidden_layers > 4
        if too_few or too_many:
            raise ValueError("num_hidden_layers must be between 1 and 4")

        # Widest block first, narrowing by a factor of two down to n_units_last.
        widths = [
            n_units_last * (2 ** exponent)
            for exponent in range(num_hidden_layers - 1, -1, -1)
        ]

        self.layers = nn.ModuleList()
        for width in widths:
            self.layers.extend([
                nn.LazyLinear(width),
                nn.BatchNorm1d(width),
                nn.LeakyReLU(),
                nn.Dropout(dropout_rate),
            ])

        # Single-unit regression head.
        self.layers.append(nn.LazyLinear(1))

    def forward(self, X):
        """Pass ``X`` through every layer in order and return the result."""
        out = X
        for layer in self.layers:
            out = layer(out)
        return out


class CustomNeuralNetRegressor(NeuralNetRegressor):
    """NeuralNetRegressor whose loss carries an L1 penalty on the first layer.

    Args:
        lambda1: Weight of the L1 penalty added to the criterion loss.
        *args, **kwargs: Forwarded unchanged to ``NeuralNetRegressor``.
    """

    def __init__(self, *args, lambda1=0.01, **kwargs):
        super().__init__(*args, **kwargs)
        self.lambda1 = lambda1

    def get_loss(self, y_pred, y_true, X=None, training=False):
        """Return the criterion loss plus ``lambda1`` times the L1 norm.

        The norm is summed over all parameters of ``module_.layers[0]`` and is
        added regardless of the ``training`` flag.
        """
        total = super().get_loss(y_pred, y_true, X=X, training=training)
        first_layer = self.module_.layers[0]
        l1_norm = sum([param.abs().sum() for param in first_layer.parameters()])
        total += self.lambda1 * l1_norm
        return total
        

In [6]:
# Train on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

class TensorTransformer(BaseEstimator):
    """Stateless pipeline step that converts its input to a float32 tensor."""

    def fit(self, X, y=None):
        """Learn nothing and return ``self``; present for the pipeline API."""
        return self

    def transform(self, X):
        """Return a new ``torch.float32`` tensor built from ``X``."""
        as_float32 = torch.tensor(X, dtype=torch.float32)
        return as_float32
    
# Flip to True to run the full Bayesian hyperparameter search. With False the
# previously tuned configuration is evaluated as a single candidate and refit.
TUNE_HYPERPARAMETERS = False

# Targets as a float32 column of shape (n_samples, 1), matching the single-unit
# output layer of Model.
y_train_tensor = torch.tensor(np.array(y_train).reshape(-1, 1), dtype=torch.float32)

if TUNE_HYPERPARAMETERS:
    # Tuples are continuous ranges with a prior; lists are categorical choices.
    search_spaces = {
        'lambda1': (1e-4, 1e-1, 'log-uniform'),
        'lr': (1e-4, 1e-1, 'log-uniform'),
        'batch_size': [32, 64, 128, 256],
        'module__num_hidden_layers': [1, 2, 3, 4],
        'module__n_units_last': [16, 32, 64, 128, 256],
        'module__dropout_rate': (0.1, 0.5, 'uniform'),
    }
    n_search_iter = 50
else:
    # Tuned hyperparameters, each pinned to a single value.
    search_spaces = {
        'lambda1': [0.0001],
        'lr': [0.00447],
        'batch_size': [128],
        'module__num_hidden_layers': [3],
        'module__n_units_last': [256],
        'module__dropout_rate': [0.5],
    }
    n_search_iter = 1

regressor = CustomNeuralNetRegressor(
    Model,
    max_epochs=100,
    criterion=nn.MSELoss,
    optimizer=optim.AdamW,
    iterator_train__shuffle=True,
    # NOTE(review): presumably drops the final partial batch so BatchNorm1d
    # never sees a size-1 batch during training; confirm.
    iterator_train__drop_last=True,
    # Internal hold-out that produces the 'valid_loss' the callbacks monitor.
    train_split=dataset.ValidSplit(cv=5),
    callbacks=[
        EarlyStopping(patience=10, monitor='valid_loss', load_best=True),
        LRScheduler(policy=optim.lr_scheduler.ReduceLROnPlateau, patience=5, factor=0.5, monitor='valid_loss'),  # type: ignore
        EpochScoring(scoring='r2', on_train=False),
    ],
    device=DEVICE,
)

# Preprocess -> convert to tensor -> cross-validated search over the regressor.
# `model` is the fitted pipeline; its last step is the fitted search object.
model = make_pipeline(
    clone(preprocessor),
    TensorTransformer(),
    BayesSearchCV(
        regressor,
        search_spaces,
        verbose=3,
        scoring='r2',
        n_iter=n_search_iter,
        random_state=42,
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
    ),
).fit(X_train, y_train_tensor)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
  epoch       r2        train_loss        valid_loss      lr     dur
-------  -------  ----------------  ----------------  ------  ------
      1  [36m-2.4580[0m  [32m12863820505.6000[0m  [35m12860708981.0115[0m  0.0045  2.2399
      2  -2.3888  [32m12686198182.4000[0m  [35m12603213351.5686[0m  0.0045  2.4460
      3  -2.2731  [32m12311920371.2000[0m  [35m12172921234.4637[0m  0.0045  2.2827
      4  -2.1104  [32m11772511161.6000[0m  [35m11567903282.3329[0m  0.0045  2.2145
      5  -1.9158  [32m11093443270.4000[0m  [35m10844347179.9042[0m  0.0045  2.4947
      6  -1.7501  [32m10332852268.8000[0m  [35m10227930152.5653[0m  0.0045  2.1523
      7  -1.5002  [32m9505489782.4000[0m  [35m9298722113.9311[0m  0.0045  2.1174
      8  -1.3333  [32m8629473446.4000[0m  [35m8677978995.1676[0m  0.0045  2.6664
      9  -1.1410  [32m7747103155.2000[0m  [35m7962777569.5013[0m  0.0045  2.1460
     10  -0.8749 

In [7]:
# The final pipeline step is the fitted hyperparameter search.
search = model.steps[-1][1]
search.cv_results_

{'mean_fit_time': array([161.20745792]),
 'std_fit_time': array([27.78315415]),
 'mean_score_time': array([0.16876402]),
 'std_score_time': array([0.00729483]),
 'param_batch_size': masked_array(data=[128],
              mask=[False],
        fill_value=999999),
 'param_lambda1': masked_array(data=[0.0001],
              mask=[False],
        fill_value=1e+20),
 'param_lr': masked_array(data=[0.00447],
              mask=[False],
        fill_value=1e+20),
 'param_module__dropout_rate': masked_array(data=[0.5],
              mask=[False],
        fill_value=1e+20),
 'param_module__n_units_last': masked_array(data=[256],
              mask=[False],
        fill_value=999999),
 'param_module__num_hidden_layers': masked_array(data=[3],
              mask=[False],
        fill_value=999999),
 'params': [OrderedDict([('batch_size', 128),
               ('lambda1', 0.0001),
               ('lr', 0.00447),
               ('module__dropout_rate', 0.5),
               ('module__n_units_last', 2

In [8]:
# Show the hyperparameter combination the search selected.
search.best_params_

OrderedDict([('batch_size', 128),
             ('lambda1', 0.0001),
             ('lr', 0.00447),
             ('module__dropout_rate', 0.5),
             ('module__n_units_last', 256),
             ('module__num_hidden_layers', 3)])

In [9]:
# Score the fitted pipeline on the data it was trained on.
result_train = salary.evaluate_train_predictions(
    model.predict(X_train),
    y_train,
)

Train size: 32103
Train R2: 0.8977
Train RMSE: 19280.7058
Train MAE: 9552.3990


## Evaluate on Test Set

In [10]:
# Test features and the second value returned by the project loader.
X_test, y_test = salary.get_test_dataset()

In [11]:
# Score the fitted pipeline on the test features; the helper receives only the
# predictions.
result_test = salary.evaluate_test_predictions(
    model.predict(X_test),
)

Test size: 10000
Test R2: 0.6726
Test RMSE: 34290.0580
Test MAE: 20141.6577


## Export Test Predictions

In [12]:
# Test-set predictions kept for export.
# NOTE(review): this repeats the model.predict(X_test) call made in the
# evaluation cell above; the result could be computed once and reused.
y_test_preds = model.predict(X_test)

In [13]:
# Attach predictions to job ids by position. pd.concat(axis=1) aligns on the
# index, so a non-default X_test index would misalign rows (or introduce NaNs)
# against a freshly built RangeIndex Series. assign() with a flat array is
# positional and raises if the lengths differ.
X_test_preds = X_test[['job_id']].assign(predicted_salary=y_test_preds.reshape(-1))
X_test_preds.to_csv('data/test_preds_mlp.csv', index=False)

## Export Model

In [14]:
# Persist the whole fitted pipeline (preprocessor, tensor conversion, search).
# NOTE(review): the pickle refers to Model, CustomNeuralNetRegressor and
# TensorTransformer, which are defined in this notebook; presumably they must
# be defined or importable wherever the file is loaded. Confirm in the consumer.
joblib.dump(model, 'models/model_mlp.pkl')

['models/model_mlp.pkl']