In [None]:
# default_exp Tutorial

# Getting Started Tutorial with Tabular ML Toolkit

> A tutorial for getting started with Tabular ML Toolkit, to jumpstart your machine learning project on tabular or structured data.

## Install

`pip install -U tabular_ml_toolkit`

## How to Best Use tabular_ml_toolkit

Start with your favorite model, then create an MLPipeline with a single API call.

*For example, here we use RandomForestRegressor from Scikit-Learn on the [Melbourne Home Sale price data](https://www.kaggle.com/estrotococo/home-data-for-ml-course).*


*There is no need to install scikit-learn separately, as it comes preinstalled with tabular_ml_toolkit.*

In [None]:
#export
from tabular_ml_toolkit.MLPipeline import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import pandas as pd
import numpy as np

In [None]:
#export
# Dataset file names and Paths
# DIRECTORY_PATH is a remote base URL (it ends with "/"), so each file location is
# formed by plain concatenation, e.g. DIRECTORY_PATH + TRAIN_FILE.
DIRECTORY_PATH = "https://raw.githubusercontent.com/psmathur/tabular_ml_toolkit/master/input/home_data/"
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
# NOTE(review): SAMPLE_SUB_FILE is not referenced by any cell visible in this notebook.
SAMPLE_SUB_FILE = "sample_submission.csv"

In [None]:
#export

# Baseline model: a seeded scikit-learn random forest regressor
scikit_model = RandomForestRegressor(n_estimators=200, random_state=42)

# Build the ML pipeline around that model; valid_size sets the share of the
# training data held out for validation
sci_ml_pl = MLPipeline().prepare_data_for_training(
    train_file_path=f"{DIRECTORY_PATH}{TRAIN_FILE}",
    test_file_path=f"{DIRECTORY_PATH}{TEST_FILE}",
    idx_col="Id",
    target="SalePrice",
    model=scikit_model,
    random_state=42,
    valid_size=0.2,
)

In [None]:
# for visualizing pipeline
from sklearn import set_config

# Ask scikit-learn to render estimators as diagrams instead of plain text.
set_config(display="diagram")
# Bare last expression: the notebook displays the pipeline as the cell output.
sci_ml_pl.scikit_pipeline

In [None]:
# Train the pipeline on the training split
sci_ml_pl.scikit_pipeline.fit(
    sci_ml_pl.dataframeloader.X_train,
    sci_ml_pl.dataframeloader.y_train,
)

# Predict on the validation split and report the mean absolute error
preds = sci_ml_pl.scikit_pipeline.predict(
    sci_ml_pl.dataframeloader.X_valid
)
print(
    'X_valid MAE:',
    mean_absolute_error(sci_ml_pl.dataframeloader.y_valid, preds),
)

X_valid MAE: 17676.01967465753


#### Let's do Cross Validation for Scikit-Learn Model on our MLPipeline

In [None]:
#export
# Build the cross-validation ML pipeline for the scikit-learn model
sci_ml_pl = MLPipeline().prepare_data_for_cv(
    train_file_path=f"{DIRECTORY_PATH}{TRAIN_FILE}",
    test_file_path=f"{DIRECTORY_PATH}{TEST_FILE}",
    idx_col="Id",
    target="SalePrice",
    model=scikit_model,
    random_state=42,
    # cv_cols_type accepts: all | num | cat
    cv_cols_type="all",
)

In [None]:
# for visualizing pipeline
from sklearn import set_config

# Ask scikit-learn to render estimators as diagrams instead of plain text.
set_config(display="diagram")
# Bare last expression: the notebook displays the cross-validation pipeline.
sci_ml_pl.scikit_pipeline

In [None]:
# Run cross validation (cv=5) on the pipeline, scored with scikit-learn's
# 'neg_mean_absolute_error' scorer.
scores = sci_ml_pl.do_cross_validation(cv=5, scoring='neg_mean_absolute_error')
# NOTE(review): the recorded output shows positive values, so do_cross_validation
# presumably flips the sign of the negated scorer -- confirm in MLPipeline.
print("scores:", scores)
print("Average MAE score:", scores.mean())

scores: [17867.9652226  17677.27378425 17968.04160959 16295.53832192
 19320.4890411 ]
Average MAE score: 17825.86159589041


Behind the scenes, the `prepare_data_for_training` and `prepare_data_for_cv` methods load your input data into Pandas DataFrames and separate X (features) from y (target).

Then the `prepare_data_for_training` method splits X and y into X_train, y_train, X_valid and y_valid DataFrames.
However, the `prepare_data_for_cv` method does not split the data itself; it leaves X and y whole so that cross validation can split them internally.

Both methods then set up preprocessing for all numerical and categorical data found in these DataFrames using scikit-learn pipelines. They bundle that preprocessing with your given model and return an MLPipeline object. This instance holds the dataframeloader, preprocessor and scikit-learn pipeline instances, so you can call fit on X_train and y_train, and predict on X_valid or X_test.

Please check the detailed documentation and source code for more details.

*NOTE: If you want to customize the data loading and preprocessing steps, you can do so by using the `DataFrameLoader` and `PreProcessor` classes. Check the detailed documentation of these classes for more options.*

#### Let's Use XGBoost on MLPipeline

*You can also use MLPipeline with an XGBoost model; just make sure to install XGBoost first, following the instructions for your OS.*

*After that, all the steps remain the same. Here is an example using XGBRegressor with the [Melbourne Home Sale price data](https://www.kaggle.com/estrotococo/home-data-for-ml-course).*

In [None]:
#!pip install -U xgboost

In [None]:
# The best way to install xgboost on a macOS or Windows machine is with conda
# !conda install -c conda-forge xgboost

In [None]:
#export
from xgboost import XGBRegressor

# XGBoost regressor with a fixed seed
xgb_model = XGBRegressor(
    n_estimators=250,
    learning_rate=0.05,
    random_state=42,
)

# Build the ML pipeline around the XGBoost model; valid_size sets the share of
# the training data held out for validation
xgb_ml_pl = MLPipeline().prepare_data_for_training(
    train_file_path=f"{DIRECTORY_PATH}{TRAIN_FILE}",
    test_file_path=f"{DIRECTORY_PATH}{TEST_FILE}",
    idx_col="Id",
    target="SalePrice",
    model=xgb_model,
    random_state=42,
    valid_size=0.2,
)

In [None]:
# for visualizing pipeline
from sklearn import set_config

# Ask scikit-learn to render estimators as diagrams instead of plain text.
set_config(display="diagram")
# Bare last expression: the notebook displays the XGBoost pipeline as the cell output.
xgb_ml_pl.scikit_pipeline

In [None]:
# Train the XGBoost pipeline on the training split
xgb_ml_pl.scikit_pipeline.fit(
    xgb_ml_pl.dataframeloader.X_train,
    xgb_ml_pl.dataframeloader.y_train,
)

# Predict on the validation split and report the mean absolute error
preds = xgb_ml_pl.scikit_pipeline.predict(
    xgb_ml_pl.dataframeloader.X_valid
)
print(
    'X_valid MAE:',
    mean_absolute_error(xgb_ml_pl.dataframeloader.y_valid, preds),
)

X_valid MAE: 15824.136571596746


#### Let's do Cross Validation for Scikit Model on our MLPipeline

In [None]:
# Seeded random forest regressor for the cross-validation run
scikit_model = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
)

# Build the cross-validation ML pipeline around the scikit-learn model
sci_ml_pl = MLPipeline().prepare_data_for_cv(
    train_file_path=f"{DIRECTORY_PATH}{TRAIN_FILE}",
    test_file_path=f"{DIRECTORY_PATH}{TEST_FILE}",
    idx_col="Id",
    target="SalePrice",
    model=scikit_model,
    random_state=42,
    # cv_cols_type accepts: all | num | cat
    cv_cols_type="all",
)

In [None]:
# Count the entries in the loader's cv_cols list
# (presumably the columns chosen via cv_cols_type -- confirm in DataFrameLoader).
print(len(sci_ml_pl.dataframeloader.cv_cols))
# sci_ml_pl.dataframeloader.cv_cols

# Preview the first rows of the X_cv frame as the cell output.
sci_ml_pl.dataframeloader.X_cv.head()

79


Unnamed: 0_level_0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,Condition2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Norm,Norm,...,548,0,61,0,0,0,0,0,2,2008
2,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Feedr,Norm,...,460,298,0,0,0,0,0,0,5,2007
3,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Norm,Norm,...,608,0,42,0,0,0,0,0,9,2008
4,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Norm,Norm,...,642,0,35,272,0,0,0,0,2,2006
5,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,Norm,Norm,...,836,192,84,0,0,0,0,0,12,2008


In [None]:
# for visualizing pipeline
from sklearn import set_config

# Ask scikit-learn to render estimators as diagrams instead of plain text.
set_config(display="diagram")
# Display the scikit-learn cross-validation pipeline that this section builds and
# cross-validates (sci_ml_pl), not the XGBoost pipeline left over from the
# previous section.
sci_ml_pl.scikit_pipeline

In [None]:
# Run cross validation (cv=5) on the scikit-learn pipeline, scored with
# scikit-learn's 'neg_mean_absolute_error' scorer.
scores = sci_ml_pl.do_cross_validation(cv=5, scoring='neg_mean_absolute_error')
# NOTE(review): the recorded output shows positive values, so do_cross_validation
# presumably flips the sign of the negated scorer -- confirm in MLPipeline.
print("scores:", scores)
print("Average MAE score:", scores.mean())

scores: [17867.9652226  17677.27378425 17968.04160959 16295.53832192
 19320.4890411 ]
Average MAE score: 17825.86159589041


#### Let's do Cross Validation for XGB Model on our MLPipeline

In [None]:
# Create the cross-validation ML pipeline for the XGBoost model.
# xgb_model is the XGBRegressor created in the earlier XGBoost training cell.
xgb_ml_pl = MLPipeline().prepare_data_for_cv(
    train_file_path= DIRECTORY_PATH+TRAIN_FILE,
    test_file_path= DIRECTORY_PATH+TEST_FILE,
    idx_col="Id", target="SalePrice",
    model=xgb_model,random_state=42,
    cv_cols_type = "all") #cv_cols_type = all|num|cat

In [None]:
# for visualizing pipeline
from sklearn import set_config

# Ask scikit-learn to render estimators as diagrams instead of plain text.
set_config(display="diagram")
# Bare last expression: the notebook displays the XGBoost cross-validation pipeline.
xgb_ml_pl.scikit_pipeline

In [None]:
# Run cross validation (cv=5) on the XGBoost pipeline, scored with
# scikit-learn's 'neg_mean_absolute_error' scorer.
scores = xgb_ml_pl.do_cross_validation(cv=5, scoring='neg_mean_absolute_error')
# Print the returned scores and their mean.
print("scores:", scores)
print("Average MAE score:", scores.mean())

#### Let's do Grid Search for Hyperparameter Tuning for Scikit Model on our MLPipeline

In [None]:
# Create a RandomForestRegressor with default hyperparameters; the grid search
# supplies the values to tune. The estimator is seeded like every other model in
# this notebook so that repeated runs of the search give the same result.
scikit_model = RandomForestRegressor(random_state=42)

# Create the cross-validation ML pipeline for the scikit-learn model
sci_ml_pl = MLPipeline().prepare_data_for_cv(
    train_file_path= DIRECTORY_PATH+TRAIN_FILE,
    test_file_path= DIRECTORY_PATH+TEST_FILE,
    idx_col="Id",
    target="SalePrice",
    model=scikit_model,
    random_state=42,
    cv_cols_type = "all") #cv_cols_type = all|num|cat

In [None]:
# for visualizing pipeline
from sklearn import set_config

# Ask scikit-learn to render estimators as diagrams instead of plain text.
set_config(display="diagram")
# Bare last expression: the notebook displays the pipeline used for the grid search.
sci_ml_pl.scikit_pipeline

In [None]:
# Candidate hyperparameters, keyed as "<pipeline step>__<parameter>".
# Only n_estimators is searched; uncomment other entries to widen the search.
param_grid = {
#     "preprocessor__num__imputer__strategy": ["constant", "mean", "median"],
#     "preprocessor__low_cad_cat__imputer__strategy": ["most_frequent", "constant"],
#     "model__max_depth": [40,80,100],
#     'model__max_features': [2, 3],
#     'model__min_samples_leaf': [3,4,5],
#     'model__min_samples_split': [8,10,12],
    "model__n_estimators": [250, 350, 450, 550],
}

# Search the grid with cv=10, scored by negated mean absolute error
grid_search = sci_ml_pl.do_grid_search(
    param_grid=param_grid,
    cv=10,
    scoring='neg_mean_absolute_error',
)

# Heading and winning parameter set, each on its own line
print("Best params:", grid_search.best_params_, sep="\n")

# The scorer is negated, so flip the sign to report a positive error
print(f"Internal CV Metrics score: {-grid_search.best_score_:.3f}")

Best params:
{'model__n_estimators': 450}
Internal CV Metrics score: 17335.794


#### Let's Use K-Fold Training with best params from grid search

In [None]:
# create scikit-learn RandomForestRegressor model with best params from grid search
# n_estimators=450 is the value reported as best in the recorded grid-search output;
# the commented-out arguments match grid entries that were not searched.
scikit_model = RandomForestRegressor(
    n_estimators=450,
#     max_depth = 80,
#     max_features = 3,
#     min_samples_leaf = 3,
#     min_samples_split = 8,
    random_state=42)

In [None]:
# Build the k-fold ML pipeline around the tuned scikit-learn model
sci_ml_pl = MLPipeline().prepare_data_for_k_fold(
    train_file_path=f"{DIRECTORY_PATH}{TRAIN_FILE}",
    test_file_path=f"{DIRECTORY_PATH}{TEST_FILE}",
    idx_col="Id",
    target="SalePrice",
    model=scikit_model,
    random_state=42,
)

In [None]:
# sci_ml_pl.dataframeloaderrocessor.

In [None]:
# for visualizing pipeline
from sklearn import set_config

# Ask scikit-learn to render estimators as diagrams instead of plain text.
set_config(display="diagram")
# Bare last expression: the notebook displays the k-fold pipeline as the cell output.
sci_ml_pl.scikit_pipeline

In [None]:
# Preview the first rows of the loader's X_cv frame as the cell output.
sci_ml_pl.dataframeloader.X_cv.head()

# sci_ml_pl.dataframeloader.y.head()

Unnamed: 0_level_0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,Condition2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Norm,Norm,...,548,0,61,0,0,0,0,0,2,2008
2,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Feedr,Norm,...,460,298,0,0,0,0,0,0,5,2007
3,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Norm,Norm,...,608,0,42,0,0,0,0,0,9,2008
4,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Norm,Norm,...,642,0,35,272,0,0,0,0,2,2006
5,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,Norm,Norm,...,836,192,84,0,0,0,0,0,12,2008


In [None]:
# Count the entries in the loader's final_cols list
# (presumably the columns fed to the pipeline -- confirm in DataFrameLoader).
print(len(sci_ml_pl.dataframeloader.final_cols))
# sci_ml_pl.dataframeloader.final_cols

79


In [None]:
# Run k-fold training with n_splits=10, scoring with mean_absolute_error.
# The call returns a k_fold object (reused by the prediction cell) and
# metrics_score, which is averaged below -- presumably one score per fold.
k_fold, metrics_score = sci_ml_pl.do_k_fold_training(n_splits=10, metrics=mean_absolute_error)
print("mean metrics score:", np.mean(metrics_score))



fold: 1 , mean_absolute_error: 15675.695038051752
fold: 2 , mean_absolute_error: 17127.49095890411
fold: 3 , mean_absolute_error: 18662.65322678843
fold: 4 , mean_absolute_error: 15114.152496194823
fold: 5 , mean_absolute_error: 14763.087503805176
fold: 6 , mean_absolute_error: 23143.34774733638
fold: 7 , mean_absolute_error: 19269.37409436834
fold: 8 , mean_absolute_error: 17272.28508371385
fold: 9 , mean_absolute_error: 14591.83619482496
fold: 10 , mean_absolute_error: 18392.702633181125
mean metrics score: 17401.262497716893


In [None]:
# Predict using the k_fold object returned by do_k_fold_training.
# NOTE(review): presumably these are predictions for the test file -- confirm in MLPipeline.
preds = sci_ml_pl.do_k_fold_prediction(k_fold=k_fold)

# Print the shape, then display the predictions as the cell output.
print(preds.shape)
preds

(1459,)


array([125830.22      , 154523.64444444, 180892.32888889, ...,
       160160.63333333, 109862.98444444, 230314.17555556])

In [None]:
# hide
# Export the project's notebooks to library modules with nbdev

from nbdev.export import notebook2script

notebook2script()

Converted 00_DataFrameLoader.ipynb.
Converted 01_PreProcessor.ipynb.
Converted 02_MLPipeline.ipynb.
Converted 03_Tutorial.ipynb.
Converted 04_Kaggle_TPS_Challenge_Nov_2021_XGB.ipynb.
Converted 05_Kaggle_TPS_Challenge_Nov_2021_SCIKIT_LEARN.ipynb.
Converted 06_Getting_Started_Kaggle_TPS_Challenge.ipynb.
Converted index.ipynb.
