# Train Linear Model

In [19]:
import salary
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from skopt import BayesSearchCV
import numpy as np
from sklearn.model_selection import KFold

In [20]:
# Load the training features and targets via the project's `salary` helper,
# requesting that extracted salaries be included.
X_train, y_train = salary.get_train_dataset(include_extracted_salaries=True)

In [21]:
# Shared (unfitted) preprocessing template; each model pipeline below clones it.
preprocessor = salary.get_preprocessor()

# Fit a throwaway clone only to learn the shape of the transformed design
# matrix, so `preprocessor` itself stays unfitted.
train_size, num_features = clone(preprocessor).fit_transform(X_train, y_train).shape
train_size, num_features

(32103, 3670)

## Train & Tune Lasso (L1)

In [22]:
# Flip to True to re-run the Bayesian search over `alpha`; leave False to refit
# with the previously tuned value (a single-candidate "search").
TUNE_LASSO = False

lasso_search_space = (
    {'alpha': (1e-3, 1e+3, 'log-uniform')}
    if TUNE_LASSO
    else {'alpha': [87.145]}
)

# NOTE(review): the recorded output shows a warning raised from the
# coordinate-descent solver on every fit, which suggests max_iter=100 may stop
# before convergence — confirm; raising max_iter will lengthen the fits.
model_lasso = make_pipeline(
    clone(preprocessor),
    BayesSearchCV(
        Lasso(warm_start=True, max_iter=100),
        lasso_search_space,
        scoring='r2',
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        # One candidate when reusing the tuned value, 50 when searching.
        n_iter=50 if TUNE_LASSO else 1,
        # Seed the optimizer so a tuning run proposes the same candidates each time.
        random_state=42,
        verbose=3
    )
).fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


  model = cd_fast.enet_coordinate_descent(


[CV 1/5] END ......................alpha=87.145;, score=0.576 total time= 8.8min


  model = cd_fast.enet_coordinate_descent(


[CV 2/5] END ......................alpha=87.145;, score=0.554 total time= 9.0min


  model = cd_fast.enet_coordinate_descent(


[CV 3/5] END ......................alpha=87.145;, score=0.581 total time= 9.0min


  model = cd_fast.enet_coordinate_descent(


[CV 4/5] END ......................alpha=87.145;, score=0.552 total time=12.0min


  model = cd_fast.enet_coordinate_descent(


[CV 5/5] END ......................alpha=87.145;, score=0.575 total time= 4.5min


  model = cd_fast.enet_coordinate_descent(


In [23]:
# The last pipeline step is the fitted search object; show the selected
# hyperparameters together with the best cross-validation score.
lasso_search = model_lasso[-1]
(lasso_search.best_params_, lasso_search.best_score_)

(OrderedDict([('alpha', 87.145)]), 0.5676450422728676)

In [24]:
# In-sample metrics: predict on the same data the pipeline was fitted on.
train_pred_lasso = model_lasso.predict(X_train)
result_train_lasso = salary.evaluate_train_predictions(train_pred_lasso, y_train)

Train size: 32103
Train R2: 0.6196
Train RMSE: 37176.0120
Train MAE: 23027.3189


## Train & Tune Ridge (L2)

In [25]:
# Flip to True to re-run the Bayesian search over `alpha`; leave False to refit
# with the previously tuned value (a single-candidate "search").
TUNE_RIDGE = False

ridge_search_space = (
    {'alpha': (1e-3, 1e+3, 'log-uniform')}
    if TUNE_RIDGE
    else {'alpha': [1000.0]}
)

# NOTE(review): the tuned alpha (1000.0) sits on the upper bound of the search
# range, so the optimum may lie beyond it — consider widening the range when re-tuning.
model_ridge = make_pipeline(
    clone(preprocessor),
    BayesSearchCV(
        Ridge(),
        ridge_search_space,
        scoring='r2',
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        # One candidate when reusing the tuned value, 50 when searching.
        n_iter=50 if TUNE_RIDGE else 1,
        # Seed the optimizer so a tuning run proposes the same candidates each time.
        random_state=42,
        verbose=3
    )
).fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ......................alpha=1000.0;, score=0.571 total time=  24.5s
[CV 2/5] END ......................alpha=1000.0;, score=0.550 total time=   6.4s
[CV 3/5] END ......................alpha=1000.0;, score=0.579 total time=   6.4s
[CV 4/5] END ......................alpha=1000.0;, score=0.547 total time=15.3min
[CV 5/5] END ......................alpha=1000.0;, score=0.573 total time=15.7min


In [26]:
# The last pipeline step is the fitted search object; show the selected
# hyperparameters together with the best cross-validation score.
ridge_search = model_ridge[-1]
(ridge_search.best_params_, ridge_search.best_score_)

(OrderedDict([('alpha', 1000.0)]), 0.5640118927071411)

In [27]:
# In-sample metrics: predict on the same data the pipeline was fitted on.
train_pred_ridge = model_ridge.predict(X_train)
result_train_ridge = salary.evaluate_train_predictions(train_pred_ridge, y_train)

Train size: 32103
Train R2: 0.6358
Train RMSE: 36374.1587
Train MAE: 22798.8486


## Train & Tune ElasticNet

In [28]:
# Flip to True to re-run the Bayesian search over `alpha` and `l1_ratio`; leave
# False to refit with the previously tuned values (a single-candidate "search").
TUNE_ELASTICNET = False

elasticnet_search_space = (
    {
        'alpha': (1e-3, 1e+3, 'log-uniform'),
        'l1_ratio': (0.0, 1.0, 'uniform'),
    }
    if TUNE_ELASTICNET
    else {
        'alpha': [0.32865],
        'l1_ratio': [0.48027],
    }
)

# NOTE(review): the recorded output shows a warning raised from the
# coordinate-descent solver on every fit, which suggests max_iter=100 may stop
# before convergence — confirm; raising max_iter will lengthen the fits.
model_elasticnet = make_pipeline(
    clone(preprocessor),
    BayesSearchCV(
        ElasticNet(warm_start=True, max_iter=100),
        elasticnet_search_space,
        scoring='r2',
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        # One candidate when reusing the tuned values, 50 when searching.
        n_iter=50 if TUNE_ELASTICNET else 1,
        # Seed the optimizer so a tuning run proposes the same candidates each time.
        random_state=42,
        verbose=3
    )
).fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


  model = cd_fast.enet_coordinate_descent(


[CV 1/5] END ...alpha=0.32865, l1_ratio=0.48027;, score=0.576 total time= 3.4min


  model = cd_fast.enet_coordinate_descent(


[CV 2/5] END ...alpha=0.32865, l1_ratio=0.48027;, score=0.553 total time= 3.6min


  model = cd_fast.enet_coordinate_descent(


[CV 3/5] END ...alpha=0.32865, l1_ratio=0.48027;, score=0.581 total time= 4.1min


  model = cd_fast.enet_coordinate_descent(


[CV 4/5] END ...alpha=0.32865, l1_ratio=0.48027;, score=0.553 total time= 4.7min


  model = cd_fast.enet_coordinate_descent(


[CV 5/5] END ...alpha=0.32865, l1_ratio=0.48027;, score=0.575 total time= 4.4min


  model = cd_fast.enet_coordinate_descent(


In [29]:
# The last pipeline step is the fitted search object; show the selected
# hyperparameters together with the best cross-validation score.
elasticnet_search = model_elasticnet[-1]
(elasticnet_search.best_params_, elasticnet_search.best_score_)

(OrderedDict([('alpha', 0.32865), ('l1_ratio', 0.48027)]), 0.5677541591106834)

In [30]:
# In-sample metrics: predict on the same data the pipeline was fitted on.
train_pred_elasticnet = model_elasticnet.predict(X_train)
result_train_elasticnet = salary.evaluate_train_predictions(train_pred_elasticnet, y_train)

Train size: 32103
Train R2: 0.6195
Train RMSE: 37178.0472
Train MAE: 22914.5429


## Select Best Model

In [31]:
# One name -> fitted pipeline mapping drives the name list, the score list and
# the final lookup, so the three always stay in the same order.
fitted_models = {
    'lasso': model_lasso,
    'ridge': model_ridge,
    'elasticnet': model_elasticnet,
}
MODEL_NAMES = list(fitted_models)

# Best cross-validation score of each search (last pipeline step), in MODEL_NAMES order.
cv_scores = [fitted_models[name][-1].best_score_ for name in MODEL_NAMES]

# Pick the model with the highest CV score (first one wins on ties).
best_model_name = MODEL_NAMES[np.argmax(cv_scores)]
best_model = fitted_models[best_model_name]

print(f'Best model: {best_model_name}')
print(f'Best model score: {best_model[-1].best_score_}')
print(f'Best model params: {best_model[-1].best_params_}')

Best model: elasticnet
Best model score: 0.5677541591106834
Best model params: OrderedDict({'alpha': 0.32865, 'l1_ratio': 0.48027})


## Evaluate on Test Set

In [32]:
# Load the held-out test split via the project's `salary` helper.
# NOTE(review): y_test is not passed to any evaluation call in the cells shown
# here — presumably `evaluate_test_predictions` obtains the targets itself; confirm.
X_test, y_test = salary.get_test_dataset()

In [33]:
# Report test metrics for the pipeline selected by cross-validation score.
test_pred_best = best_model.predict(X_test)
result_test = salary.evaluate_test_predictions(test_pred_best)

Test size: 10000
Test R2: 0.5703
Test RMSE: 39283.5080
Test MAE: 24611.0859


In [34]:
# For reference, report test metrics for every fitted pipeline, in MODEL_NAMES order.
candidate_models = (model_lasso, model_ridge, model_elasticnet)
for position, model_name in enumerate(MODEL_NAMES):
    model = candidate_models[position]
    print(f'For {model_name}...')
    salary.evaluate_test_predictions(model.predict(X_test))

For lasso...
Test size: 10000
Test R2: 0.5704
Test RMSE: 39280.8173
Test MAE: 24732.2059
For ridge...
Test size: 10000
Test R2: 0.5680
Test RMSE: 39388.4993
Test MAE: 25113.3795
For elasticnet...
Test size: 10000
Test R2: 0.5703
Test RMSE: 39283.5080
Test MAE: 24611.0859
