In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and library edits take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

# Getting Started Tutorial with Tabular ML Toolkit

> A tutorial on getting started with Tabular ML Toolkit

> tabular_ml_toolkit is a superfast helper library to speed up your machine learning projects based on tabular or structured data.

> It comes with model parallelism and cutting-edge hyperparameter tuning techniques.

## Install

`pip install -U tabular_ml_toolkit`

## How to Best Use tabular_ml_toolkit

Start with your favorite model and then simply create the ML pipeline with a single API call.

*For example, here we are using XGBRegressor from XGBoost on [Melbourne Home Sale price data](https://www.kaggle.com/estrotococo/home-data-for-ml-course).*


*No need to install scikit-learn, as it comes preinstalled with Tabular_ML_Toolkit.*

In [2]:
from tabular_ml_toolkit.tmlt import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
import pandas as pd
import numpy as np

# for displaying diagram of pipelines 
from sklearn import set_config
set_config(display="diagram")

# Just to compare fit times
import time



In [3]:
# Dataset file names and paths.
# DIRECTORY_PATH is the remote base URL; the data-preparation cell concatenates
# it with TRAIN_FILE and TEST_FILE to build the full CSV locations.
DIRECTORY_PATH = "https://raw.githubusercontent.com/psmathur/tabular_ml_toolkit/master/input/home_data/"
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
# NOTE(review): SAMPLE_SUB_FILE is not referenced by any visible cell — confirm it is still needed.
SAMPLE_SUB_FILE = "sample_submission.csv"

In [4]:
# Baseline XGBoost hyperparameters used before any tuning.
# XGBRegressor is imported in the notebook's import cell so that all
# dependencies are declared in one place.
xgb_params = {
    "n_estimators": 250,
    "learning_rate": 0.05,
    "random_state": 42,
    # To train on a GPU, additionally set:
    #     "tree_method": "gpu_hist",
    #     "predictor": "gpu_predictor",
}

# Create the XGBoost regression model from the baseline parameters.
xgb_model = XGBRegressor(**xgb_params)

In [5]:
# Create the ML pipeline for the XGBoost model: pass the train/test file
# locations, the index and target columns, and the model to TMLT, configured
# as a regression problem with a fixed random state.
tmlt = TMLT().prepare_data_for_training(
    train_file_path=f"{DIRECTORY_PATH}{TRAIN_FILE}",
    test_file_path=f"{DIRECTORY_PATH}{TEST_FILE}",
    idx_col="Id",
    target="SalePrice",
    model=xgb_model,
    random_state=42,
    problem_type="regression",
)

2021-11-18 23:18:57,335 INFO 8 cores found, parallel processing is enabled!
2021-11-18 23:18:57,736 INFO DataFrame Memory usage decreased to 0.58 Mb (35.5% reduction)
2021-11-18 23:18:58,066 INFO DataFrame Memory usage decreased to 0.58 Mb (34.8% reduction)
2021-11-18 23:18:58,093 INFO Both Numerical & Categorical columns found, Preprocessing will done accordingly!


In [6]:
# Let's see the default pipeline. Leaving the attribute as the cell's last
# expression makes the notebook display it.
tmlt.spl

#### To get a clearer picture, let's do k-fold training on the XGBoost model

In [7]:
# Baseline k-fold run, currently disabled (the whole cell is commented out).
# NOTE(review): this calls `do_k_fold_training`, while the live k-fold cell
# later in the notebook calls `do_kfold_training` — confirm which name the
# TMLT API exposes before re-enabling this cell.
# # k-fold training
# xgb_model_metrics_score, xgb_model_preds = tmlt.do_k_fold_training(n_splits=5,
#                                                                           metrics=mean_absolute_error,
#                                                                           random_state=42)
# print("mean metrics score:", np.mean(xgb_model_metrics_score))
# # predict
# print(xgb_model_preds.shape)

##### Let's see if we can improve our k-fold score with hyperparameter tuning

In [8]:
# from sklearn.preprocessing import StandardScaler, MinMaxScaler
# from sklearn.impute import SimpleImputer
# # from sklearn.

In [9]:
# Run the Optuna-based XGBoost hyperparameter search. "rmse" is passed as the
# XGBoost eval metric, mean absolute error is the metric reported per k-fold
# split, and "output/" is the output directory handed to the toolkit.
study = tmlt.do_xgb_optuna_optimization(
    xgb_eval_metric="rmse",
    kfold_metrics=mean_absolute_error,
    output_dir_path="output/",
)

# Show the best trial recorded in the study.
print(study.best_trial)

2021-11-18 23:19:29,926 INFO direction is: minimize
[32m[I 2021-11-18 23:19:29,978][0m Using an existing study with name 'tmlt_autoxgb' instead of creating a new one.[0m


Parameters: { colsample_bytree, max_depth, subsample, tree_method } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




2021-11-18 23:19:33,234 INFO fold: 1 , mean_absolute_error: 20805.823295697774


Parameters: { colsample_bytree, max_depth, subsample, tree_method } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




2021-11-18 23:19:36,177 INFO fold: 2 , mean_absolute_error: 19226.308366331337


Parameters: { colsample_bytree, max_depth, subsample, tree_method } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




2021-11-18 23:19:39,180 INFO fold: 3 , mean_absolute_error: 20080.022802065498


Parameters: { colsample_bytree, max_depth, subsample, tree_method } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




2021-11-18 23:19:42,251 INFO fold: 4 , mean_absolute_error: 18467.855575770547


Parameters: { colsample_bytree, max_depth, subsample, tree_method } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




2021-11-18 23:19:45,392 INFO fold: 5 , mean_absolute_error: 19638.841154751713
2021-11-18 23:19:45,392 INFO  mean metrics score: 19643.770238923375
[32m[I 2021-11-18 23:19:45,428][0m Trial 7 finished with value: 19643.770238923375 and parameters: {'learning_rate': 0.09553662209579879, 'reg_lambda': 0.6052569845392561, 'reg_alpha': 8.335455567301325e-05, 'subsample': 0.32599069235941713, 'colsample_bytree': 0.7836864633954334, 'max_depth': 5, 'early_stopping_rounds': 398, 'n_estimators': 7000, 'tree_method': 'approx', 'booster': 'gblinear'}. Best is trial 6 with value: 17098.02529831978.[0m


Parameters: { colsample_bytree, max_depth, subsample, tree_method } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




2021-11-18 23:19:48,707 INFO fold: 1 , mean_absolute_error: 19168.4951171875


Parameters: { colsample_bytree, max_depth, subsample, tree_method } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




2021-11-18 23:19:51,801 INFO fold: 2 , mean_absolute_error: 17245.59119488442


Parameters: { colsample_bytree, max_depth, subsample, tree_method } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




2021-11-18 23:19:54,933 INFO fold: 3 , mean_absolute_error: 18543.452402611303


Parameters: { colsample_bytree, max_depth, subsample, tree_method } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




2021-11-18 23:19:58,041 INFO fold: 4 , mean_absolute_error: 16902.828874143837


Parameters: { colsample_bytree, max_depth, subsample, tree_method } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




2021-11-18 23:20:01,243 INFO fold: 5 , mean_absolute_error: 17934.75784594392
2021-11-18 23:20:01,244 INFO  mean metrics score: 17959.0250869542
[32m[I 2021-11-18 23:20:01,265][0m Trial 8 finished with value: 17959.0250869542 and parameters: {'learning_rate': 0.19256200478224605, 'reg_lambda': 0.004356080228500992, 'reg_alpha': 12.200425206909655, 'subsample': 0.31335741793364735, 'colsample_bytree': 0.8363127339177757, 'max_depth': 7, 'early_stopping_rounds': 115, 'n_estimators': 7000, 'tree_method': 'hist', 'booster': 'gblinear'}. Best is trial 6 with value: 17098.02529831978.[0m


Parameters: { colsample_bytree, max_depth, subsample, tree_method } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




2021-11-18 23:20:04,480 INFO fold: 1 , mean_absolute_error: 19521.103034032534


Parameters: { colsample_bytree, max_depth, subsample, tree_method } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




2021-11-18 23:20:07,871 INFO fold: 2 , mean_absolute_error: 17689.62231779752


Parameters: { colsample_bytree, max_depth, subsample, tree_method } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




2021-11-18 23:20:10,970 INFO fold: 3 , mean_absolute_error: 17897.935894691782


Parameters: { colsample_bytree, max_depth, subsample, tree_method } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




2021-11-18 23:20:13,955 INFO fold: 4 , mean_absolute_error: 16004.63112692637


Parameters: { colsample_bytree, max_depth, subsample, tree_method } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




2021-11-18 23:20:17,152 INFO fold: 5 , mean_absolute_error: 17576.910664597603
2021-11-18 23:20:17,152 INFO  mean metrics score: 17738.04060760916
[32m[I 2021-11-18 23:20:17,175][0m Trial 9 finished with value: 17738.04060760916 and parameters: {'learning_rate': 0.01849171116282053, 'reg_lambda': 0.051657995636785164, 'reg_alpha': 0.00010578539845628686, 'subsample': 0.22296944267303037, 'colsample_bytree': 0.3271278971978473, 'max_depth': 6, 'early_stopping_rounds': 499, 'n_estimators': 7000, 'tree_method': 'hist', 'booster': 'gblinear'}. Best is trial 6 with value: 17098.02529831978.[0m
2021-11-18 23:21:40,048 INFO fold: 1 , mean_absolute_error: 21102.952723672945
2021-11-18 23:23:10,856 INFO fold: 2 , mean_absolute_error: 17694.773103060787
2021-11-18 23:24:37,495 INFO fold: 3 , mean_absolute_error: 18632.003411279966


KeyboardInterrupt: 

2021-11-18 22:05:18,152 INFO direction is: minimize
[I 2021-11-18 22:05:18,207] Using an existing study with name 'tmlt_autoxgb' instead of creating a new one.
/Users/pankajmathur/anaconda3/envs/nbdev_env/lib/python3.9/site-packages/sklearn/model_selection/_split.py:676: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.
  warnings.warn(
2021-11-18 22:05:55,715 INFO fold: 1 , mean_absolute_error: 17737.759805757705
2021-11-18 22:06:31,210 INFO fold: 2 , mean_absolute_error: 13937.593455693494
2021-11-18 22:07:07,237 INFO fold: 3 , mean_absolute_error: 13931.05033979024
2021-11-18 22:07:48,523 INFO fold: 4 , mean_absolute_error: 12936.826492936643
2021-11-18 22:08:25,673 INFO fold: 5 , mean_absolute_error: 14848.775216716609
[I 2021-11-18 22:08:25,699] Trial 48 finished with value: 14678.401062178938 and parameters: {'learning_rate': 0.010227648390602546, 'reg_lambda': 8.015393563720193e-06, 'reg_alpha': 3.7753443233851705e-06, 'subsample': 0.5799411949016183, 'colsample_bytree': 0.613735233825501, 'max_depth': 4, 'early_stopping_rounds': 200, 'n_estimators': 20000, 'tree_method': 'hist', 'booster': 'gbtree', 'gamma': 0.01783928295659629, 'grow_policy': 'lossguide'}. Best is trial 48 with value: 14678.401062178938.
/Users/pankajmathur/anaconda3/envs/nbdev_env/lib/python3.9/site-packages/sklearn/model_selection/_split.py:676: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.
  warnings.warn(
2021-11-18 22:09:02,651 INFO fold: 1 , mean_absolute_error: 17615.194228916953
2021-11-18 22:09:40,540 INFO fold: 2 , mean_absolute_error: 14338.525511023116
2021-11-18 22:10:18,411 INFO fold: 3 , mean_absolute_error: 13896.110378317637
2021-11-18 22:10:56,342 INFO fold: 4 , mean_absolute_error: 13176.088907320205
2021-11-18 22:11:35,685 INFO fold: 5 , mean_absolute_error: 14923.216475813357
[I 2021-11-18 22:11:35,704] Trial 49 finished with value: 14789.827100278253 and parameters: {'learning_rate': 0.01003614304176459, 'reg_lambda': 1.3143006220261207e-05, 'reg_alpha': 3.3506330151130134e-06, 'subsample': 0.543467363947305, 'colsample_bytree': 0.6249109087231277, 'max_depth': 4, 'early_stopping_rounds': 192, 'n_estimators': 20000, 'tree_method': 'hist', 'booster': 'gbtree', 'gamma': 0.028252682493464184, 'grow_policy': 'lossguide'}. Best is trial 48 with value: 14678.401062178938.
FrozenTrial(number=48, values=[14678.401062178938], datetime_start=datetime.datetime(2021, 11, 18, 22, 5, 18, 249529), datetime_complete=datetime.datetime(2021, 11, 18, 22, 8, 25, 674630), params={'booster': 'gbtree', 'colsample_bytree': 0.613735233825501, 'early_stopping_rounds': 200, 'gamma': 0.01783928295659629, 'grow_policy': 'lossguide', 'learning_rate': 0.010227648390602546, 'max_depth': 4, 'n_estimators': 20000, 'reg_alpha': 3.7753443233851705e-06, 'reg_lambda': 8.015393563720193e-06, 'subsample': 0.5799411949016183, 'tree_method': 'hist'}, distributions={'booster': CategoricalDistribution(choices=('gbtree', 'gblinear')), 'colsample_bytree': UniformDistribution(high=1.0, low=0.1), 'early_stopping_rounds': IntUniformDistribution(high=500, low=100, step=1), 'gamma': LogUniformDistribution(high=1.0, low=1e-08), 'grow_policy': CategoricalDistribution(choices=('depthwise', 'lossguide')), 'learning_rate': LogUniformDistribution(high=0.25, low=0.01), 'max_depth': IntUniformDistribution(high=9, low=1, step=1), 'n_estimators': CategoricalDistribution(choices=(7000, 15000, 20000)), 'reg_alpha': LogUniformDistribution(high=100.0, low=1e-08), 'reg_lambda': LogUniformDistribution(high=100.0, low=1e-08), 'subsample': UniformDistribution(high=1.0, low=0.1), 'tree_method': CategoricalDistribution(choices=('exact', 'approx', 'hist'))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=49, state=TrialState.COMPLETE, value=None)

In [None]:
# Hyperparameters of the study's best trial, displayed as the cell output.
study.best_trial.params

**Awesome, we found the best params with K-fold validation in ~1 minute! (Actual timing depends on your hardware and the number of trials.)**

##### Now let's use the best params to update the model in our pipeline

In [None]:
# autoxgb_params = {'learning_rate': 0.016067642810265004,

# 'reg_lambda': 0.0005033307729410949,

# 'reg_alpha': 1.125131255655592e-06,

# 'subsample': 0.43211847297916883,

# 'colsample_bytree': 0.4106787563173376,

# 'max_depth': 5,

# 'early_stopping_rounds': 354,

# 'n_estimators': 7000,

# 'tree_method': 'approx',

# 'booster': 'gbtree',

# 'gamma': 0.2870988185671683,

# 'grow_policy': 'depthwise'}

In [None]:
# Take the best hyperparameters found by the Optuna study and rebuild the
# XGBoost model with them (both parameter names are bound to the same dict).
xgb_params = tmlt_xgb_params = study.best_trial.params
xgb_model = XGBRegressor(**xgb_params)

# Swap the tuned model into the pipeline, then display the updated pipeline.
tmlt.update_model(xgb_model)
tmlt.spl

In [None]:
# 5-fold training with the tuned model, scored with mean absolute error.
xgb_model_metrics_score, xgb_model_preds = tmlt.do_kfold_training(
    n_splits=5,
    metrics=mean_absolute_error,
    random_state=42,
)

# Shape of the returned predictions.
print(xgb_model_preds.shape)

mean metrics score: 14761.77

(1459,)

#### Yup, indeed: tuning the XGB model with Optuna has improved the MAE compared with the earlier cross-validated model!

**Amazing, our MAE has been reduced to 14761.77 by using Optuna-based hyperparameter search. If we continue tuning the hyperparameters, maybe we can do even better — take that as a challenge!**