In [None]:
# default_exp Tutorial

# Getting Started Tutorial with Tabular ML Toolkit

> A tutorial on getting started with Tabular ML Toolkit

> tabular_ml_toolkit is a super-fast helper library to speed up your machine learning projects based on tabular or structured data.

> It comes with model parallelism and cutting-edge hyperparameter tuning techniques.

## Install

`pip install -U tabular_ml_toolkit`

## How to Best Use tabular_ml_toolkit

Start with your favorite model, then simply create an MLPipeline with a single API call.

*For example, here we are using RandomForestRegressor from scikit-learn on the [Melbourne Home Sale price data](https://www.kaggle.com/estrotococo/home-data-for-ml-course).*


*No need to install scikit-learn separately, as it comes preinstalled with Tabular_ML_Toolkit.*

In [None]:
#export
from tabular_ml_toolkit.MLPipeline import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import pandas as pd
import numpy as np

# Ask scikit-learn to render estimators/pipelines as a diagram when a cell displays them
from sklearn import set_config
set_config(display="diagram")

# Used only to measure and compare fit / search times in the cells below
import time

In [None]:
#export
# Dataset location: base URL of the home_data folder and the CSV file names inside it.
# Full paths are built later by concatenating DIRECTORY_PATH with a file name.
DIRECTORY_PATH = "https://raw.githubusercontent.com/psmathur/tabular_ml_toolkit/master/input/home_data/"
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
# NOTE(review): SAMPLE_SUB_FILE is not referenced by any cell visible in this notebook.
SAMPLE_SUB_FILE = "sample_submission.csv"

In [None]:
#export
# Baseline scikit-learn model
scikit_model = RandomForestRegressor(random_state=42)

# Build the ML pipeline for the scikit-learn model from the train/test CSV files
tmlt = MLPipeline().prepare_data_for_training(
    train_file_path=f"{DIRECTORY_PATH}{TRAIN_FILE}",
    test_file_path=f"{DIRECTORY_PATH}{TEST_FILE}",
    idx_col="Id",
    target="SalePrice",
    model=scikit_model,
    random_state=42,
)

In [None]:
# Bare last expression: the notebook displays the pipeline object held in tmlt.spl
tmlt.spl

In [None]:
# Create the train/validation split (valid_size=0.2) so the model can be evaluated on held-out rows
tmlt.dfl.create_train_valid(valid_size=0.2)

# Time only the fit step
fit_started = time.time()
tmlt.spl.fit(tmlt.dfl.X_train, tmlt.dfl.y_train)
fit_seconds = time.time() - fit_started
print("Fit Time:", fit_seconds)

# Predict on the validation split and report its mean absolute error
preds = tmlt.spl.predict(tmlt.dfl.X_valid)
valid_mae = mean_absolute_error(tmlt.dfl.y_valid, preds)
print('X_valid MAE:', valid_mae)

Fit Time: 1.0021109580993652
X_valid MAE: 17634.989965753426


#### To get a clearer picture of model performance, let's do cross validation on our pipeline

In [None]:
# Time a cross validation run (cv=5) of the whole pipeline
cv_started = time.time()
scores = tmlt.do_cross_validation(cv=5, scoring='neg_mean_absolute_error')
cv_seconds = time.time() - cv_started
print("Cross Validation Time:", cv_seconds)

# Per-fold scores followed by their mean
print("scores:", scores)
print("Average MAE score:", scores.mean())

Cross Validation Time: 6.714529037475586
scores: [18028.27554795 17426.42736301 17942.29359589 15938.34671233
 19148.49674658]
Average MAE score: 17696.767993150686


##### *MAE got slightly worse with cross validation*

#### Let's do hyperparameter tuning for our entire MLPipeline

##### Let's see if we can improve our cross validation score with hyperparameter tuning

In [None]:
# New scikit-learn model with a minimal number of estimators (10) for a warm start
scikit_model_new = RandomForestRegressor(random_state=42, n_estimators=10)

# Swap the new model into the pipeline, then display the pipeline
tmlt.update_model(scikit_model_new)
tmlt.spl

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

In [None]:
# Tune data preprocessing and model hyperparameters together.
# Keys use the nested "<step>__<param>" names of the pipeline.
param_grid = {
    "preprocessor__num_cols__scaler": [StandardScaler(), MinMaxScaler()],
    "preprocessor__low_card_cat_cols__imputer": [SimpleImputer(strategy='constant'),
                                                 SimpleImputer(strategy='most_frequent')],
    'model__n_estimators': [250, 500],
    # 1.0 (use all features) replaces "auto": for RandomForestRegressor "auto" meant
    # all features; it was deprecated in scikit-learn 1.1 and removed in 1.3,
    # where it raises an invalid-parameter error.
    'model__max_features': [1.0, "sqrt"],
}

# Time the whole grid search
start = time.time()
tune_search = tmlt.do_tune_grid_search(
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    early_stopping=False,
)
end = time.time()
print("Grid Search Time:", end - start)

print("Best params:")
print(tune_search.best_params_)

# best_score_ is a negated MAE (scoring='neg_mean_absolute_error'); flip the sign for display
print(f"Internal CV Metrics score: {-1*(tune_search.best_score_):.3f}")

Grid Search Time: 245.83602786064148
Best params:
{'preprocessor__num_cols__scaler': MinMaxScaler(), 'preprocessor__low_card_cat_cols__imputer': SimpleImputer(strategy='most_frequent'), 'model__n_estimators': 500, 'model__max_features': 'sqrt'}
Internal CV Metrics score: 17415.875

Grid Search Time: 494.44151496887207
Best params:
{'preprocessor__num_cols__scaler': MinMaxScaler(), 'preprocessor__low_card_cat_cols__imputer': SimpleImputer(strategy='constant'), 'model__n_estimators': 250, 'model__max_features': 'sqrt', 'model__min_samples_split': 2}
Internal CV Metrics score: 17425.125

WHEN EARLY_STOPPING : TRUE

Grid Search Time: 98.20898699760437
Best params:
{'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2}
Internal CV Metrics score: 17612.271

##### Now let's use best params to update preprocessor and model in our pipeline

In [None]:
# Apply the preprocessing choices reported in the grid search's best params
best_num_scaler = MinMaxScaler()
best_cat_imputer = SimpleImputer(strategy='most_frequent')
tmlt.update_preprocessor(num_scaler=best_num_scaler, cat_imputer=best_cat_imputer)

# Display the updated pipeline
tmlt.spl

In [None]:
# RandomForest configured with the best model params from the grid search
tuned_rf_params = dict(n_estimators=500, max_features='sqrt', random_state=42)
scikit_model_tuned = RandomForestRegressor(**tuned_rf_params)

# Put the tuned model into the pipeline and display it
tmlt.update_model(scikit_model_tuned)
tmlt.spl

In [None]:
# Cross validate (cv=5) the pipeline that now holds the tuned preprocessor and model
cv_started = time.time()
scores = tmlt.do_cross_validation(cv=5, scoring='neg_mean_absolute_error')
cv_seconds = time.time() - cv_started
print("Cross Validation Time:", cv_seconds)

# Per-fold scores followed by their mean
print("scores:", scores)
print("Average MAE score:", scores.mean())

Cross Validation Time: 17.414206981658936
scores: [15823.07915753 19165.97215068 18298.28758904 15589.08655479
 18202.94716895]
Average MAE score: 17415.874524200914


#### Yes, indeed: hyperparameter tuning for data preprocessing and the model has improved MAE compared with the earlier cross-validated model!

In the background, the `prepare_data_for_training` method loads your input data into Pandas DataFrames and separates X (features) and y (target).

The `prepare_data_for_training` method prepares the X and y DataFrames and preprocesses all numerical and categorical data found in them using scikit-learn pipelines. It then bundles the preprocessed data with your given model and returns an MLPipeline object; this class instance holds the dataframeloader, preprocessor and scikit-learn pipeline instances.

The `create_train_valid` method uses valid_size to split X (features) and y (target) into X_train, y_train, X_valid and y_valid DataFrames, so you can call fit methods on X_train and y_train and predict methods on X_valid or X_test.


Please check the detailed documentation and source code for more details.

*NOTE: If you want to customize data and preprocessing steps, you can do so by using the `DataFrameLoader` and `PreProcessor` classes. Check the detailed documentation for these classes for more options.*

#### Now let's do k_fold training on updated scikit model

In [None]:
# k-fold training (n_splits=5) of the tuned scikit-learn pipeline, scored with MAE.
# The first returned value is later passed as `k_fold` to do_k_fold_prediction;
# the second holds the metric scores that are averaged below.
kfold_args = dict(n_splits=5, metrics=mean_absolute_error, random_state=42)
scikit_model_k_fold, scikit_model_metrics_score = tmlt.do_k_fold_training(**kfold_args)
print("mean metrics score:", np.mean(scikit_model_metrics_score))



fold: 1 , mean_absolute_error: 18577.711020547948
fold: 2 , mean_absolute_error: 17243.96052739726
fold: 3 , mean_absolute_error: 17387.308623287674
fold: 4 , mean_absolute_error: 16114.673212328766
fold: 5 , mean_absolute_error: 18259.706
mean metrics score: 17516.67187671233


In [None]:
# Predict using the result of the k-fold training above
scikit_model_preds = tmlt.do_k_fold_prediction(k_fold=scikit_model_k_fold)
print(scikit_model_preds.shape)
# Only the shape is printed; evaluate `scikit_model_preds` to inspect the values

(1459,)


#### Let's Use XGBoost on MLPipeline

*You can also use MLPipeline with an XGBoost model; just make sure to install XGBoost first, depending on your OS.*

*After that, all steps remain the same. Here is an example using XGBRegressor with [Melbourne Home Sale price data](https://www.kaggle.com/estrotococo/home-data-for-ml-course)*

In [None]:
#!pip install -U xgboost

In [None]:
# Best way to install xgboost if you are on macosx and windows machine is using conda
# !conda install -c conda-forge xgboost

In [None]:
# #export
from xgboost import XGBRegressor

# Baseline XGBoost settings
xgb_params = dict(
    n_estimators=250,
    learning_rate=0.05,
    random_state=42,
    # for GPU
    # tree_method='gpu_hist',
    # predictor='gpu_predictor',
)

# Instantiate the XGBoost regressor from the settings above
xgb_model = XGBRegressor(**xgb_params)

In [None]:
# Replace the pipeline's model with the XGBoost model created above
tmlt.update_model(xgb_model)
# Bare last expression: displays the updated pipeline
tmlt.spl

In [None]:
# Create the train/validation split (valid_size=0.2) so the model can be evaluated on held-out rows
tmlt.dfl.create_train_valid(valid_size=0.2)

# Time only the fit step
fit_started = time.time()
tmlt.spl.fit(tmlt.dfl.X_train, tmlt.dfl.y_train)
fit_seconds = time.time() - fit_started
print("Fit Time:", fit_seconds)

# Predict on the validation split and report its mean absolute error
preds = tmlt.spl.predict(tmlt.dfl.X_valid)
valid_mae = mean_absolute_error(tmlt.dfl.y_valid, preds)
print('X_valid MAE:', valid_mae)

Fit Time: 0.6943919658660889
X_valid MAE: 16430.334613120718


```python
xgb_params = {
    'n_estimators':250,
    'learning_rate':0.05,
    'eval_metric': 'mae',
#     'booster': 'gbtree',
    #     'max_depth': 9,
    'random_state':42,
    'n_jobs': -1,
    # for GPU
#     'tree_method': 'gpu_hist',
#     'predictor': 'gpu_predictor',
}
```
Fit Time: 0.6319022178649902
X_valid MAE: 16430.334613120718

Wow, XGBoost blew away the scikit-learn model with an impressive MAE right out of the box!

#### Let's do Cross Validation for XGB Model on our MLPipeline

In [None]:
# Cross validation of the XGBoost pipeline.
# cv=5 (was 10) matches the five fold scores recorded under this cell and the
# 5-fold runs used for the RandomForest pipeline, so the averages are comparable.
scores = tmlt.do_cross_validation(cv=5, scoring='neg_mean_absolute_error')
print("scores:", scores)
print("Average MAE score:", scores.mean())

scores: [15949.78084867 16410.97034193 16670.44976723 14807.25535103
 17340.9091128 ]
Average MAE score: 16235.873084332194


#### Let's use hyperparameter tuning to find the best xgb_params using tune grid search

In [None]:
# Search space for the XGBoost pipeline. The commented-out entries are the
# preprocessing / n_estimators options that are switched off for this search.
param_grid = {
#     "preprocessor__num_cols__scaler": [StandardScaler(), MinMaxScaler()],
#     "preprocessor__low_card_cat_cols__imputer": [SimpleImputer(strategy='constant'),
#                                                  SimpleImputer(strategy='most_frequent')],
#     'model__n_estimators': [500,1000],
    'model__learning_rate': [0.02, 0.05],
    'model__max_depth': [5, 10],
}

# Time the whole grid search
search_started = time.time()
tune_search = tmlt.do_tune_grid_search(
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    early_stopping=False,
)
search_seconds = time.time() - search_started
print("Grid Search Time:", search_seconds)

print("Best params:")
print(tune_search.best_params_)

# best_score_ is a negated MAE (scoring='neg_mean_absolute_error'); flip the sign for display
print(f"Internal CV Metrics score: {-1*(tune_search.best_score_):.3f}")

Grid Search Time: 639.5532441139221
Best params:
{'model__n_estimators': 500, 'model__learning_rate': 0.05, 'model__max_depth': 5}
Internal CV Metrics score: 15689.292

Grid Search Time: 78.6581199169159
Best params:
{'model__n_estimators': 1000, 'model__learning_rate': 0.02}
Internal CV Metrics score: 16043.939

**Amazing, our MAE has been reduced to 15689.29 by hyperparameter tuning. If we continue tuning hyperparameters, maybe we can do even better; take that as a challenge!**

###### Let's use our newly found params for k-fold training

In [None]:
# Best parameters reported by the grid search
xgb_params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'max_depth': 5,
    'random_state': 42,
    # for GPU
#     'tree_method': 'gpu_hist',
#     'predictor': 'gpu_predictor',
}

# Build the tuned model BEFORE handing it to the pipeline. Previously
# update_model() ran first and received the old `xgb_model`
# (n_estimators=250, no max_depth), so the tuned parameters were never used.
xgb_model = XGBRegressor(**xgb_params)

# Update pipeline with the tuned xgb model
tmlt.update_model(xgb_model)

# Bare last expression so the notebook actually displays the updated pipeline
tmlt.spl

#### Let's Use K-Fold Training for XGB model

In [None]:
# k-fold training (n_splits=10) of the XGBoost pipeline, scored with MAE
xgb_model_k_fold, xgb_model_metrics_score = tmlt.do_k_fold_training(
    n_splits=10,
    metrics=mean_absolute_error,
)
print("mean metrics score:", np.mean(xgb_model_metrics_score))



fold: 1 , mean_absolute_error: 16873.444456335616
fold: 2 , mean_absolute_error: 14342.549282962329
fold: 3 , mean_absolute_error: 14823.832780393835
fold: 4 , mean_absolute_error: 14989.402290239726
fold: 5 , mean_absolute_error: 14393.218321917808
fold: 6 , mean_absolute_error: 19192.9765625
fold: 7 , mean_absolute_error: 15307.134043236301
fold: 8 , mean_absolute_error: 17752.64586900685
fold: 9 , mean_absolute_error: 14619.683807791096
fold: 10 , mean_absolute_error: 16351.175727739726
mean metrics score: 15864.606314212331


In [None]:
# Predict using the result of the XGBoost k-fold training above
xgb_model_preds = tmlt.do_k_fold_prediction(k_fold=xgb_model_k_fold)
print(xgb_model_preds.shape)
# Only the shape is printed; evaluate `xgb_model_preds` to inspect the values

(1459,)



##### Let's blend the predictions using a weighted average

In [None]:
# Blend weights for the two sets of k-fold predictions
x1 = 0.6  # weight of the scikit-learn (RandomForest) predictions
x2 = 0.4  # weight of the XGBoost predictions

# Weighted average: normalise by the sum of the weights. Dividing by 2 (the
# number of models) would halve every prediction, because the weights already
# sum to 1.
final_preds = ((x1 * scikit_model_preds) + (x2 * xgb_model_preds)) / (x1 + x2)

print(final_preds.shape)

(1459,)


In [None]:
# hide
# Run nbdev's notebook2script to build (convert) the project's notebooks
from nbdev.export import notebook2script

notebook2script()

Converted 00_DataFrameLoader.ipynb.
Converted 01_PreProcessor.ipynb.
Converted 02_MLPipeline.ipynb.
Converted 03_Tutorial.ipynb.
Converted 07_Kaggle_TPS_Tutorial.ipynb.
Converted Optuna_Tutorial.ipynb.
Converted automl_in_sklearn_pipeline.ipynb.
Converted index.ipynb.
