# Train Random Forest

In [1]:
import salary
from sklearn.base import clone
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from skopt import BayesSearchCV
from sklearn.pipeline import make_pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the training features and targets from the project-local `salary` helper.
# NOTE(review): `include_extracted_salaries=True` presumably adds rows whose
# salary was extracted from text -- confirm against `salary.get_train_dataset`.
X_train, y_train = salary.get_train_dataset(include_extracted_salaries=True)

In [3]:
# Keep `preprocessor` unfitted: it is only a template that later cells clone.
preprocessor = salary.get_preprocessor()

# Fit a throwaway clone purely to learn the shape of the transformed
# training matrix (rows, columns).
train_size, num_features = (
    clone(preprocessor)
    .fit_transform(X_train, y_train)
    .shape
)
train_size, num_features

(32103, 3670)

## Train & Tune Model

In [4]:
# Set to True to re-run the Bayesian hyperparameter search (50 iterations).
# Leave False to refit with the previously tuned values pinned below.
TUNE_HYPERPARAMETERS = False

if TUNE_HYPERPARAMETERS:
    # Ranges explored by the search: 2-tuples of ints are integer bounds,
    # the 2-tuple of strings is a categorical choice.
    rf_search_space = {
        'n_estimators': (50, 200),
        'max_depth': (10, 20),
        'min_samples_split': (2, 10),
        'min_samples_leaf': (1, 5),
        'max_features': ('sqrt', 'log2'),
    }
    rf_search_iterations = 50
else:
    # Single-value lists pin every hyperparameter, so the "search" evaluates
    # exactly one candidate and just reports its cross-validated score.
    rf_search_space = {
        'n_estimators': [194],
        'max_depth': [20],
        'min_samples_split': [2],
        'min_samples_leaf': [1],
        'max_features': ['sqrt'],
    }
    rf_search_iterations = 1

# NOTE(review): the preprocessor is a pipeline step *ahead of* the search, so
# it is fitted once on all of X_train before the CV folds are split. The
# reported CV score therefore uses features fitted on the validation rows as
# well. Moving the preprocessor inside the searched estimator would avoid
# that, but changes what `model_rf[-1]` refers to -- confirm before changing.
model_rf = make_pipeline(
    clone(preprocessor),
    BayesSearchCV(
        RandomForestRegressor(random_state=42),
        rf_search_space,
        n_iter=rf_search_iterations,
        verbose=3,
        scoring='r2',
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        # Seed the optimizer as well as the forest and the folds, so a tuning
        # run proposes the same candidates every time it is executed.
        random_state=42,
    )
).fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=194;, score=0.501 total time= 2.2min
[CV 2/5] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=194;, score=0.481 total time= 2.2min
[CV 3/5] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=194;, score=0.494 total time= 2.2min
[CV 4/5] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=194;, score=0.478 total time= 2.2min
[CV 5/5] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=194;, score=0.510 total time= 2.2min


In [5]:
# Display the fitted pipeline (preprocessor followed by the hyperparameter search).
model_rf

In [6]:
# The last pipeline step is the fitted search object; show the winning
# hyperparameters together with their mean cross-validated R2.
(
    model_rf[-1].best_params_,
    model_rf[-1].best_score_,
)

(OrderedDict([('max_depth', 20),
              ('max_features', 'sqrt'),
              ('min_samples_leaf', 1),
              ('min_samples_split', 2),
              ('n_estimators', 194)]),
 0.4929983047593227)

In [7]:
# Score the fitted pipeline on the data it was trained on. These numbers are
# optimistic by construction; compare them with the cross-validated score above.
result_rf_train = salary.evaluate_train_predictions(
    model_rf.predict(X_train),
    y_train,
)

Train size: 32103
Train R2: 0.8937
Train RMSE: 19650.2198
Train MAE: 13634.8158


## Evaluate on Test Set

In [8]:
# Load the held-out test features and targets from the project-local `salary` helper.
X_test, y_test = salary.get_test_dataset()

In [9]:
# Score the fitted pipeline on the held-out test set.
# NOTE(review): only the predictions are passed in; `y_test` is not. Presumably
# `salary.evaluate_test_predictions` loads the test targets itself -- confirm.
result_rf_test = salary.evaluate_test_predictions(
    model_rf.predict(X_test),
)

Test size: 10000
Test R2: 0.4827
Test RMSE: 43103.8318
Test MAE: 27512.3486
