In [26]:
%load_ext autoreload
%autoreload 2

# Getting Started Tutorial with Tabular ML Toolkit

> A tutorial on getting started with Tabular ml toolkit

> tabular_ml_toolkit is a superfast helper library to speedup your machine learning project based on Tabular or Structured data.

> It comes with model parallelism and cutting edge hyperparameter tuning techniques.

## Install

`pip install -U tabular_ml_toolkit`

## How to Best Use tabular_ml_toolkit

Start with your favorite model and then just simply create MLPipeline with one API.

*For example, Here we are using RandomForestRegressor from Scikit-Learn, on  [Melbourne Home Sale price data](https://www.kaggle.com/estrotococo/home-data-for-ml-course)*


*No need to install scikit-learn as it comes preinstall with Tabular_ML_Toolkit*

In [1]:
from tabular_ml_toolkit.mlpipeline import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import pandas as pd
import numpy as np

# for displaying diagram of pipelines 
from sklearn import set_config
set_config(display="diagram")

# Just to compare fit times
import time

In [2]:
# Dataset file names and Paths
DIRECTORY_PATH = "https://raw.githubusercontent.com/psmathur/tabular_ml_toolkit/master/input/home_data/"
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
SAMPLE_SUB_FILE = "sample_submission.csv"

In [3]:
# create scikit-learn ml model
scikit_model = RandomForestRegressor(random_state=42)

# createm ml pipeline for scikit-learn model
tmlt = MLPipeline().prepare_data_for_training(
    train_file_path= DIRECTORY_PATH+TRAIN_FILE,
    test_file_path= DIRECTORY_PATH+TEST_FILE,
    idx_col="Id", target="SalePrice",
    model=scikit_model,
    random_state=42)

2021-11-15 14:48:51,682 INFO 12 cores found, parallel processing is enabled!
2021-11-15 14:48:52,293 INFO DataFrame Memory usage decreased to 0.58 Mb (35.5% reduction)
2021-11-15 14:48:52,766 INFO DataFrame Memory usage decreased to 0.58 Mb (34.8% reduction)


In [4]:
tmlt.spl

In [5]:
# create train, valid split to evaulate model on valid dataset
tmlt.dfl.create_train_valid(valid_size=0.2)

start = time.time()
# Now fit
tmlt.spl.fit(tmlt.dfl.X_train, tmlt.dfl.y_train)
end = time.time()
print("Fit Time:", end - start)

#predict
preds = tmlt.spl.predict(tmlt.dfl.X_valid)
print('X_valid MAE:', mean_absolute_error(tmlt.dfl.y_valid, preds))

Fit Time: 1.0281760692596436
X_valid MAE: 17634.989965753426




#### To see more clear picture of model performance, Let's do Cross Validation on our Pipeline

In [6]:
start = time.time()
# Now do cross_validation
scores = tmlt.do_cross_validation(cv=5, scoring='neg_mean_absolute_error')
end = time.time()
print("Cross Validation Time:", end - start)

print("scores:", scores)
print("Average MAE score:", scores.mean())

Cross Validation Time: 6.794143915176392
scores: [18028.27554795 17426.42736301 17942.29359589 15938.34671233
 19148.49674658]
Average MAE score: 17696.767993150686


##### *MAE did become slightly bad with cross validation*

#### Let's do HyperParameters Tunning for our entire MLPipeline

##### Let's see if we can improve our cross validation score with hyperparams tunning

In [7]:
# create a new scikit-learn ml model with minimum amount of estimator for warm start
scikit_model_new = RandomForestRegressor(n_estimators=10,
                                     random_state=42)
# Update pipeline with new model
tmlt.update_model(scikit_model_new)
tmlt.spl

In [8]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

In [9]:
# let's tune data preprocessing and model hyperparams

param_grid = {
    "preprocessor__num_cols__scaler": [StandardScaler(), MinMaxScaler()],
    "preprocessor__cat_cols__imputer": [SimpleImputer(strategy='constant'),
                                                 SimpleImputer(strategy='most_frequent')],
    'model__n_estimators': [250,500],
    'model__max_features': ["auto", "sqrt"],
}

start = time.time()
# Now do tune grid search
tune_search = tmlt.do_tune_grid_search(param_grid=param_grid,
                                       cv=5,
                                       scoring='neg_mean_absolute_error',
                                      early_stopping=False,
                                      time_budget_s=60)
end = time.time()
print("Grid Search Time:", end - start)

print("Best params:")
print(tune_search.best_params_)

print(f"Internal CV Metrics score: {-1*(tune_search.best_score_):.3f}")

[2m[36m(pid=48183)[0m 2021-11-15 14:50:43,272	ERROR worker.py:425 -- SystemExit was raised from the worker
[2m[36m(pid=48183)[0m Traceback (most recent call last):
[2m[36m(pid=48183)[0m   File "python/ray/_raylet.pyx", line 692, in ray._raylet.task_execution_handler
[2m[36m(pid=48183)[0m   File "python/ray/_raylet.pyx", line 521, in ray._raylet.execute_task
[2m[36m(pid=48183)[0m   File "python/ray/_raylet.pyx", line 558, in ray._raylet.execute_task
[2m[36m(pid=48183)[0m   File "python/ray/_raylet.pyx", line 565, in ray._raylet.execute_task
[2m[36m(pid=48183)[0m   File "python/ray/_raylet.pyx", line 569, in ray._raylet.execute_task
[2m[36m(pid=48183)[0m   File "python/ray/_raylet.pyx", line 519, in ray._raylet.execute_task.function_executor
[2m[36m(pid=48183)[0m   File "/Users/pamathur/miniconda3/envs/nbdev_env/lib/python3.9/site-packages/ray/_private/function_manager.py", line 576, in actor_method_executor
[2m[36m(pid=48183)[0m     return method(__ray_acto

Grid Search Time: 70.82670211791992
Best params:
{'preprocessor__num_cols__scaler': StandardScaler(), 'preprocessor__cat_cols__imputer': SimpleImputer(strategy='most_frequent'), 'model__n_estimators': 500, 'model__max_features': 'sqrt'}
Internal CV Metrics score: 17418.836


**Awesome we found best params with K-fold variations ~ 1 minute!**

##### Now let's use best params to update preprocessor and model in our pipeline

In [10]:
pp_params = tmlt.get_preprocessor_best_params(tune_search)
tmlt.update_preprocessor(**pp_params)
tmlt.spl

In [11]:
model_params = tmlt.get_model_best_params(tune_search)
scikit_model = RandomForestRegressor(**model_params)
tmlt.update_model(scikit_model)
tmlt.spl

In [12]:
# Now do cross_validation
start = time.time()

scores = tmlt.do_cross_validation(cv=5, scoring='neg_mean_absolute_error')
end = time.time()
print("Cross Validation Time:", end - start)

print("scores:", scores)
print("Average MAE score:", scores.mean())

Cross Validation Time: 18.817033052444458
scores: [15963.85982192 18682.62384932 18153.63838356 15763.98878082
 18451.25459589]
Average MAE score: 17403.073086301367


#### Yup Indeed HyperParams tunning for data preprocessing and model has improved MAE from earlier cross validated model!

In background `prepare_data_for_training` method loads your input data into Pandas DataFrame, seprates X(features) and y(target).

The `prepare_data_for_training` methods prepare X and y DataFrames, preprocess all numerical and categorical type data found in these DataFrames using scikit-learn pipelines. Then it bundle preprocessed data with your given model and return an MLPipeline object, this class instance has dataframeloader, preprocessor and scikit-lean pipeline instances.

The `create_train_valid` method use valid_size to split X(features) into X_train, y_train, X_valid and y_valid DataFrames, so you can call fit methods on X_train and y_train and predict methods on X_valid or X_test.


Please check detail documentation and source code for more details.

*NOTE: If you want to customize data and preprocessing steps you can do so by using `DataFrameLoader` and `PreProessor` classes. Check detail documentations for these classes for more options.*

#### Now let's do k_fold training on updated scikit model

In [13]:
# show_doc(tmlt.do_k_fold_training)

In [14]:
# k-fold training
scikit_model_metrics_score, scikit_model_preds = tmlt.do_k_fold_training(n_splits=5,
                                                                          metrics=mean_absolute_error,
                                                                          random_state=42)
print("mean metrics score:", np.mean(scikit_model_metrics_score))
# predict
print(scikit_model_preds.shape)

2021-11-15 14:51:13,994 INFO fold: 1 , mean_absolute_error: 18424.251904109587
2021-11-15 14:51:17,980 INFO fold: 2 , mean_absolute_error: 17082.042757990865
2021-11-15 14:51:22,280 INFO fold: 3 , mean_absolute_error: 17391.390965753424
2021-11-15 14:51:26,224 INFO fold: 4 , mean_absolute_error: 16125.604657534248
2021-11-15 14:51:30,076 INFO fold: 5 , mean_absolute_error: 18398.456000000002


mean metrics score: 17484.349257077625
(1459,)


#### Let's Use XGBosst on MLPipeline

*You can also use MLPipeline with XGBoost model, Just make sure to install XGBooost first depending upon your OS.*

*After that all steps remains same. Here is example using XGBRegressor with [Melbourne Home Sale price data](https://www.kaggle.com/estrotococo/home-data-for-ml-course)*

In [15]:
#!pip install -U xgboost

In [16]:
# Best way to install xgboost if you are on macosx and windows machine is using conda
# !conda install -c conda-forge xgboost

In [17]:
from xgboost import XGBRegressor

xgb_params = {
    'n_estimators':250,
    'learning_rate':0.05,
    'random_state':42,
    # for GPU
#     'tree_method': 'gpu_hist',
#     'predictor': 'gpu_predictor',
}


# create xgb ml model
xgb_model = XGBRegressor(**xgb_params)

In [18]:
# Update pipeline with xgb model
tmlt.update_model(xgb_model)
tmlt.spl

In [19]:
# create train, valid split to evaulate model on valid dataset
tmlt.dfl.create_train_valid(valid_size=0.2)

start = time.time()
# Now fit
tmlt.spl.fit(tmlt.dfl.X_train, tmlt.dfl.y_train)
end = time.time()
print("Fit Time:", end - start)

#predict
preds = tmlt.spl.predict(tmlt.dfl.X_valid)
print('X_valid MAE:', mean_absolute_error(tmlt.dfl.y_valid, preds))

Fit Time: 0.5464799404144287
X_valid MAE: 15982.860940175513


Wow, xgboost blown away scikit-model with impressive MAE from out of the box

#### Let's do Cross Validation for XGB Model on our MLPipeline

In [20]:
# cross_validation
scores = tmlt.do_cross_validation(cv=5, scoring='neg_mean_absolute_error')
print("scores:", scores)
print("Average MAE score:", scores.mean())

scores: [15811.14815657 16374.30316246 16706.67996843 14460.8927119
 16755.42412243]
Average MAE score: 16021.689624357876


#### let's use hyperparam tunning to find best xgb_params using tune grid search

In [21]:
# let's tune data preprocessing and model hyperparams
param_grid = {
#     "preprocessor__num_cols__scaler": [StandardScaler(), MinMaxScaler()],
#     "preprocessor__low_card_cat_cols__imputer": [SimpleImputer(strategy='constant'),
#                                                  SimpleImputer(strategy='most_frequent')],
#     'model__n_estimators': [500,1000],
    'model__learning_rate': [0.02,0.05],
    'model__max_depth': [5,10]
}

start = time.time()
# Now do tune grid search
tune_search = tmlt.do_tune_grid_search(param_grid=param_grid,
                                       cv=5,
                                       scoring='neg_mean_absolute_error',
                                      early_stopping=False,
                                      time_budget_s=60)
end = time.time()
print("Grid Search Time:", end - start)

print("Best params:")
print(tune_search.best_params_)

print(f"Internal CV Metrics score: {-1*(tune_search.best_score_):.3f}")



Grid Search Time: 33.13752293586731
Best params:
{'model__learning_rate': 0.05, 'model__max_depth': 5}
Internal CV Metrics score: 15723.134


**Amazing our MAE has reduced to 15689.22 by HyperParamss tunning, If we can continue doing hyperparmas tunning, may be we can even do better, take that as challenge!**

###### Let's use our newly found params for k-fold training

In [22]:
model_params = tmlt.get_model_best_params(tune_search)
xgb_model = XGBRegressor(**xgb_params)
tmlt.update_model(xgb_model)
tmlt.spl

#### Let's Use K-Fold Training for XGB model

In [23]:
# k-fold training
xgb_model_metrics_score, xgb_model_preds = tmlt.do_k_fold_training(n_splits=10, metrics=mean_absolute_error)
print("mean metrics score:", np.mean(xgb_model_metrics_score))
# predict on test dataset
print(xgb_model_preds.shape)

2021-11-15 14:52:07,754 INFO fold: 1 , mean_absolute_error: 16302.642016267124
2021-11-15 14:52:08,483 INFO fold: 2 , mean_absolute_error: 14976.48464255137
2021-11-15 14:52:09,205 INFO fold: 3 , mean_absolute_error: 14484.688222388699
2021-11-15 14:52:09,898 INFO fold: 4 , mean_absolute_error: 14882.516588184932
2021-11-15 14:52:10,630 INFO fold: 5 , mean_absolute_error: 14369.18535958904
2021-11-15 14:52:11,361 INFO fold: 6 , mean_absolute_error: 19155.028173159248
2021-11-15 14:52:12,138 INFO fold: 7 , mean_absolute_error: 15500.647741866438
2021-11-15 14:52:12,903 INFO fold: 8 , mean_absolute_error: 17888.65625
2021-11-15 14:52:13,606 INFO fold: 9 , mean_absolute_error: 14083.246735873288
2021-11-15 14:52:14,368 INFO fold: 10 , mean_absolute_error: 16558.240983518837


mean metrics score: 15820.133671339898
(1459,)



##### Let's mix the predictions, using weighted average

In [24]:
x1 = 0.4
x2 = 0.6

final_preds = ((x1*scikit_model_preds) + (x2*xgb_model_preds)) / 2

print(final_preds.shape)

(1459,)


In [25]:
# hide
# run the script to build 

from nbdev.export import notebook2script; notebook2script()

Converted 00_dataframeloader.ipynb.
Converted 01_preprocessor.ipynb.
Converted 02_mlpipeline.ipynb.
Converted 03_tutorial.ipynb.
Converted 07_kaggle_tps_tutorial.ipynb.
Converted 08_optuna_tutorial.ipynb.
Converted do_optuna_opt_tutorial.ipynb.
Converted index.ipynb.
Converted logger.ipynb.
