In [1]:
%load_ext autoreload
%autoreload 2

# Getting Started Tutorial with Tabular ML Toolkit

> A tutorial on getting started with Tabular ml toolkit

> tabular_ml_toolkit is a superfast helper library to speedup your machine learning project based on Tabular or Structured data.

> It comes with model parallelism and cutting edge hyperparameter tuning techniques.

## Install

`pip install -U tabular_ml_toolkit`

## How to Best Use tabular_ml_toolkit

Start with your favorite model and then just simply create MLPipeline with one API.

*For example, Here we are using RandomForestRegressor from Scikit-Learn, on  [Melbourne Home Sale price data](https://www.kaggle.com/estrotococo/home-data-for-ml-course)*


*No need to install scikit-learn as it comes preinstall with Tabular_ML_Toolkit*

In [2]:
from tabular_ml_toolkit.mlpipeline import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import pandas as pd
import numpy as np

# for displaying diagram of pipelines 
from sklearn import set_config
set_config(display="diagram")

# Just to compare fit times
import time

In [3]:
# Dataset file names and Paths
DIRECTORY_PATH = "https://raw.githubusercontent.com/psmathur/tabular_ml_toolkit/master/input/home_data/"
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
SAMPLE_SUB_FILE = "sample_submission.csv"

In [4]:
from xgboost import XGBRegressor

xgb_params = {
    'n_estimators':250,
    'learning_rate':0.05,
    'random_state':42,
    # for GPU
#     'tree_method': 'gpu_hist',
#     'predictor': 'gpu_predictor',
}


# create xgb ml model
xgb_model = XGBRegressor(**xgb_params)

In [5]:
# createm ml pipeline for scikit-learn model
tmlt = MLPipeline().prepare_data_for_training(
    train_file_path= DIRECTORY_PATH+TRAIN_FILE,
    test_file_path= DIRECTORY_PATH+TEST_FILE,
    idx_col="Id", target="SalePrice",
    model=xgb_model,
    random_state=42)

2021-11-15 16:05:39,439 INFO 12 cores found, parallel processing is enabled!
2021-11-15 16:05:39,695 INFO DataFrame Memory usage decreased to 0.58 Mb (35.5% reduction)
2021-11-15 16:05:39,867 INFO DataFrame Memory usage decreased to 0.58 Mb (34.8% reduction)


In [6]:
tmlt.spl

In [7]:
# create train, valid split to evaulate model on valid dataset
tmlt.dfl.create_train_valid(valid_size=0.2)

start = time.time()
# Now fit
tmlt.spl.fit(tmlt.dfl.X_train, tmlt.dfl.y_train)
end = time.time()
print("Fit Time:", end - start)

#predict
preds = tmlt.spl.predict(tmlt.dfl.X_valid)
print('X_valid MAE:', mean_absolute_error(tmlt.dfl.y_valid, preds))

Fit Time: 0.491671085357666
X_valid MAE: 15851.009123501712


#### To see clear picture, let's do k_fold training on updated scikit model

In [8]:
# k-fold training
xgb_model_metrics_score, xgb_model_preds = tmlt.do_k_fold_training(n_splits=5,
                                                                          metrics=mean_absolute_error,
                                                                          random_state=42)
print("mean metrics score:", np.mean(xgb_model_metrics_score))
# predict
print(xgb_model_preds.shape)

2021-11-15 16:05:41,276 INFO fold: 1 , mean_absolute_error: 18947.19236943493
2021-11-15 16:05:41,852 INFO fold: 2 , mean_absolute_error: 15652.96465646404
2021-11-15 16:05:42,449 INFO fold: 3 , mean_absolute_error: 16128.323335830479
2021-11-15 16:05:43,028 INFO fold: 4 , mean_absolute_error: 15037.816045055652
2021-11-15 16:05:43,580 INFO fold: 5 , mean_absolute_error: 17555.253585188355


mean metrics score: 16664.309998394692
(1459,)


##### *MAE did become slightly bad with K_Fold*

#### Let's do HyperParameters Tunning for our entire MLPipeline

##### Let's see if we can improve our K_Fold score with hyperparams tunning

In [9]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

In [10]:
study = tmlt.do_xgb_optuna_optimization(task="regression", xgb_eval_metric="mae",
                                        kfold_metrics=mean_absolute_error, output_dir_path="output/")
print(study.best_trial)

[32m[I 2021-11-15 16:05:43,728][0m Using an existing study with name 'tmlt_autoxgb' instead of creating a new one.[0m


Parameters: { "colsample_bytree", "max_depth", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-11-15 16:05:45,987 INFO fold: 1 , mean_absolute_error: 19246.632772367295


Parameters: { "colsample_bytree", "max_depth", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-11-15 16:05:48,221 INFO fold: 2 , mean_absolute_error: 15704.935078660103


Parameters: { "colsample_bytree", "max_depth", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-11-15 16:05:50,502 INFO fold: 3 , mean_absolute_error: 17000.048640839042


Parameters: { "colsample_bytree", "max_depth", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-11-15 16:05:53,416 INFO fold: 4 , mean_absolute_error: 16007.503625321062


Parameters: { "colsample_bytree", "max_depth", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-11-15 16:05:55,826 INFO fold: 5 , mean_absolute_error: 18046.178045403467
[32m[I 2021-11-15 16:05:55,858][0m Trial 7 finished with value: 17201.059632518194 and parameters: {'learning_rate': 0.02142495766588104, 'reg_lambda': 2.0905054096816745e-05, 'reg_alpha': 36.1202193756502, 'subsample': 0.8332071490092481, 'colsample_bytree': 0.25944468287140465, 'max_depth': 7, 'early_stopping_rounds': 376, 'n_estimators': 7000, 'tree_method': 'hist', 'booster': 'gblinear'}. Best is trial 7 with value: 17201.059632518194.[0m
2021-11-15 16:06:21,046 INFO fold: 1 , mean_absolute_error: 18157.227659460616
2021-11-15 16:06:46,056 INFO fold: 2 , mean_absolute_error: 16507.62276594606
2021-11-15 16:07:11,457 INFO fold: 3 , mean_absolute_error: 15874.978676155823
2021-11-15 16:07:37,260 INFO fold: 4 , mean_absolute_error: 14893.212863869863
2021-11-15 16:08:02,566 INFO fold: 5 , mean_absolute_error: 16745.390237050513
[32m[I 2021-11-15 16:08:02,601][0m Trial 8 finished with value: 16435.68644

Parameters: { "colsample_bytree", "max_depth", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-11-15 16:08:38,371 INFO fold: 1 , mean_absolute_error: 19616.919199486303


Parameters: { "colsample_bytree", "max_depth", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-11-15 16:08:43,648 INFO fold: 2 , mean_absolute_error: 16925.43998822774


Parameters: { "colsample_bytree", "max_depth", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-11-15 16:08:48,689 INFO fold: 3 , mean_absolute_error: 18552.299764554795


Parameters: { "colsample_bytree", "max_depth", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-11-15 16:08:53,811 INFO fold: 4 , mean_absolute_error: 17627.064760809077


Parameters: { "colsample_bytree", "max_depth", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-11-15 16:08:59,062 INFO fold: 5 , mean_absolute_error: 18836.926831389126
[32m[I 2021-11-15 16:08:59,085][0m Trial 10 finished with value: 18311.730108893407 and parameters: {'learning_rate': 0.19340434499674977, 'reg_lambda': 0.0003481743943921397, 'reg_alpha': 0.00032522317449823414, 'subsample': 0.24181761244898764, 'colsample_bytree': 0.8695919781291696, 'max_depth': 1, 'early_stopping_rounds': 186, 'n_estimators': 15000, 'tree_method': 'hist', 'booster': 'gblinear'}. Best is trial 8 with value: 16435.686440496575.[0m
2021-11-15 16:09:08,979 INFO fold: 1 , mean_absolute_error: 19629.71943225599
2021-11-15 16:09:17,126 INFO fold: 2 , mean_absolute_error: 14900.258855950342
2021-11-15 16:09:25,519 INFO fold: 3 , mean_absolute_error: 16803.461940817637
2021-11-15 16:09:34,709 INFO fold: 4 , mean_absolute_error: 14951.752100278254
2021-11-15 16:09:43,694 INFO fold: 5 , mean_absolute_error: 17642.639120023545
[32m[I 2021-11-15 16:09:43,716][0m Trial 11 finished with value: 167

Parameters: { "colsample_bytree", "max_depth", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-11-15 16:11:24,571 INFO fold: 1 , mean_absolute_error: 22087.79153735017


Parameters: { "colsample_bytree", "max_depth", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-11-15 16:11:31,512 INFO fold: 2 , mean_absolute_error: 16808.001899614726


Parameters: { "colsample_bytree", "max_depth", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-11-15 16:11:38,362 INFO fold: 3 , mean_absolute_error: 18454.003692208906


Parameters: { "colsample_bytree", "max_depth", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-11-15 16:11:44,660 INFO fold: 4 , mean_absolute_error: 17232.824780607876


Parameters: { "colsample_bytree", "max_depth", "subsample", "tree_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-11-15 16:11:51,822 INFO fold: 5 , mean_absolute_error: 19826.390568145333
[32m[I 2021-11-15 16:11:51,846][0m Trial 14 finished with value: 18881.8024955854 and parameters: {'learning_rate': 0.01200110353506723, 'reg_lambda': 1.7433494721195334e-08, 'reg_alpha': 0.002858733837232982, 'subsample': 0.5833270860621196, 'colsample_bytree': 0.34650781620677695, 'max_depth': 4, 'early_stopping_rounds': 203, 'n_estimators': 20000, 'tree_method': 'exact', 'booster': 'gblinear'}. Best is trial 8 with value: 16435.686440496575.[0m


FrozenTrial(number=8, values=[16435.686440496575], datetime_start=datetime.datetime(2021, 11, 15, 16, 5, 55, 864211), datetime_complete=datetime.datetime(2021, 11, 15, 16, 8, 2, 567937), params={'booster': 'gbtree', 'colsample_bytree': 0.10914041370541645, 'early_stopping_rounds': 486, 'gamma': 0.007706290160321332, 'grow_policy': 'lossguide', 'learning_rate': 0.13886304297932908, 'max_depth': 7, 'n_estimators': 15000, 'reg_alpha': 0.03419106215700024, 'reg_lambda': 44.99931007602852, 'subsample': 0.6525689046101213, 'tree_method': 'exact'}, distributions={'booster': CategoricalDistribution(choices=('gbtree', 'gblinear')), 'colsample_bytree': UniformDistribution(high=1.0, low=0.1), 'early_stopping_rounds': IntUniformDistribution(high=500, low=100, step=1), 'gamma': LogUniformDistribution(high=1.0, low=1e-08), 'grow_policy': CategoricalDistribution(choices=('depthwise', 'lossguide')), 'learning_rate': LogUniformDistribution(high=0.25, low=0.01), 'max_depth': IntUniformDistribution(high=

In [11]:
study.best_trial.params

{'booster': 'gbtree',
 'colsample_bytree': 0.10914041370541645,
 'early_stopping_rounds': 486,
 'gamma': 0.007706290160321332,
 'grow_policy': 'lossguide',
 'learning_rate': 0.13886304297932908,
 'max_depth': 7,
 'n_estimators': 15000,
 'reg_alpha': 0.03419106215700024,
 'reg_lambda': 44.99931007602852,
 'subsample': 0.6525689046101213,
 'tree_method': 'exact'}

**Awesome we found best params with K-fold variations ~ 1 minute!**

##### Now let's use best params to update preprocessor and model in our pipeline

In [12]:
xgb_params = study.best_trial.params
xgb_model = XGBRegressor(**xgb_params)
tmlt.update_model(xgb_model)
tmlt.spl

In [13]:
# k-fold training
xgb_model_metrics_score, xgb_model_preds = tmlt.do_k_fold_training(n_splits=5,
                                                                          metrics=mean_absolute_error,
                                                                          random_state=42)
print("mean metrics score:", np.mean(xgb_model_metrics_score))
# predict
print(xgb_model_preds.shape)



Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-11-15 20:16:24,793 INFO fold: 1 , mean_absolute_error: 17798.944723886987


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-11-15 20:16:48,465 INFO fold: 2 , mean_absolute_error: 16216.41672463613


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-11-15 20:17:13,932 INFO fold: 3 , mean_absolute_error: 17065.856793129282


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-11-15 20:17:43,045 INFO fold: 4 , mean_absolute_error: 15368.304312928081


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




2021-11-15 20:18:09,125 INFO fold: 5 , mean_absolute_error: 16821.148330479453


mean metrics score: 16654.134177011987
(1459,)


In [None]:
# Now do cross_validation
start = time.time()

scores = tmlt.do_cross_validation(cv=5, scoring='neg_mean_absolute_error')
end = time.time()
print("Cross Validation Time:", end - start)

print("scores:", scores)
print("Average MAE score:", scores.mean())

#### Yup Indeed HyperParams tunning for data preprocessing and model has improved MAE from earlier cross validated model!

In background `prepare_data_for_training` method loads your input data into Pandas DataFrame, seprates X(features) and y(target).

The `prepare_data_for_training` methods prepare X and y DataFrames, preprocess all numerical and categorical type data found in these DataFrames using scikit-learn pipelines. Then it bundle preprocessed data with your given model and return an MLPipeline object, this class instance has dataframeloader, preprocessor and scikit-lean pipeline instances.

The `create_train_valid` method use valid_size to split X(features) into X_train, y_train, X_valid and y_valid DataFrames, so you can call fit methods on X_train and y_train and predict methods on X_valid or X_test.


Please check detail documentation and source code for more details.

*NOTE: If you want to customize data and preprocessing steps you can do so by using `DataFrameLoader` and `PreProessor` classes. Check detail documentations for these classes for more options.*

#### Let's Use XGBosst on MLPipeline

*You can also use MLPipeline with XGBoost model, Just make sure to install XGBooost first depending upon your OS.*

*After that all steps remains same. Here is example using XGBRegressor with [Melbourne Home Sale price data](https://www.kaggle.com/estrotococo/home-data-for-ml-course)*

In [None]:
#!pip install -U xgboost

In [None]:
# Best way to install xgboost if you are on macosx and windows machine is using conda
# !conda install -c conda-forge xgboost

In [None]:
from xgboost import XGBRegressor

xgb_params = {
    'n_estimators':250,
    'learning_rate':0.05,
    'random_state':42,
    # for GPU
#     'tree_method': 'gpu_hist',
#     'predictor': 'gpu_predictor',
}


# create xgb ml model
xgb_model = XGBRegressor(**xgb_params)

In [None]:
# Update pipeline with xgb model
tmlt.update_model(xgb_model)
tmlt.spl

In [None]:
# create train, valid split to evaulate model on valid dataset
tmlt.dfl.create_train_valid(valid_size=0.2)

start = time.time()
# Now fit
tmlt.spl.fit(tmlt.dfl.X_train, tmlt.dfl.y_train)
end = time.time()
print("Fit Time:", end - start)

#predict
preds = tmlt.spl.predict(tmlt.dfl.X_valid)
print('X_valid MAE:', mean_absolute_error(tmlt.dfl.y_valid, preds))

Wow, xgboost blown away scikit-model with impressive MAE from out of the box

#### Let's do Cross Validation for XGB Model on our MLPipeline

In [None]:
# cross_validation
scores = tmlt.do_cross_validation(cv=5, scoring='neg_mean_absolute_error')
print("scores:", scores)
print("Average MAE score:", scores.mean())

#### let's use hyperparam tunning to find best xgb_params using tune grid search

In [None]:
# let's tune data preprocessing and model hyperparams
param_grid = {
#     "preprocessor__num_cols__scaler": [StandardScaler(), MinMaxScaler()],
#     "preprocessor__low_card_cat_cols__imputer": [SimpleImputer(strategy='constant'),
#                                                  SimpleImputer(strategy='most_frequent')],
#     'model__n_estimators': [500,1000],
    'model__learning_rate': [0.02,0.05],
    'model__max_depth': [5,10]
}

start = time.time()
# Now do tune grid search
tune_search = tmlt.do_tune_grid_search(param_grid=param_grid,
                                       cv=5,
                                       scoring='neg_mean_absolute_error',
                                      early_stopping=False,
                                      time_budget_s=60)
end = time.time()
print("Grid Search Time:", end - start)

print("Best params:")
print(tune_search.best_params_)

print(f"Internal CV Metrics score: {-1*(tune_search.best_score_):.3f}")

**Amazing our MAE has reduced to 15689.22 by HyperParamss tunning, If we can continue doing hyperparmas tunning, may be we can even do better, take that as challenge!**

###### Let's use our newly found params for k-fold training

In [None]:
model_params = tmlt.get_model_best_params(tune_search)
xgb_model = XGBRegressor(**xgb_params)
tmlt.update_model(xgb_model)
tmlt.spl

#### Let's Use K-Fold Training for XGB model

In [None]:
# k-fold training
xgb_model_metrics_score, xgb_model_preds = tmlt.do_k_fold_training(n_splits=10, metrics=mean_absolute_error)
print("mean metrics score:", np.mean(xgb_model_metrics_score))
# predict on test dataset
print(xgb_model_preds.shape)


##### Let's mix the predictions, using weighted average

In [None]:
x1 = 0.4
x2 = 0.6

final_preds = ((x1*scikit_model_preds) + (x2*xgb_model_preds)) / 2

print(final_preds.shape)

In [None]:
# hide
# run the script to build 

from nbdev.export import notebook2script; notebook2script()