In [None]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload modules automatically before running each cell.
%autoreload 2

In [None]:
# default_exp MLPipeline

# Training Pipeline

> An API to create a training pipeline for machine learning models on tabular or structured data

In [None]:
#hide
from nbdev.showdoc import *
from nbdev import *

In [None]:
# export
from tabular_ml_toolkit.DataFrameLoader import *
from tabular_ml_toolkit.PreProcessor import *

In [None]:
# export
# hide
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [None]:
# export

class MLPipeline:
    """
    Represent MLPipeline class
    
    Attributes:\n
    pipeline: The scikit-learn Pipeline that bundles the preprocessor and the model \n
    dataframeloader: A DataFrameLoader instance \n
    preprocessor: A PreProcessor Instance \n
    model: The given Model
    """

    # Attributes reported by __str__, in display order.
    _ATTRIBUTE_NAMES = ("pipeline", "dataframeloader", "preprocessor", "model")

    def __init__(self):
        # Everything starts empty; prepare_data_for_training() fills in all
        # four attributes, bundle_preproessor_model() only sets `pipeline`.
        self.pipeline = None
        self.dataframeloader = None
        self.preprocessor = None
        self.model = None

    @staticmethod
    def _describe(value):
        """Return 'None' for an unset attribute, otherwise the value's class name."""
        # Class names keep the summary on one line; the full repr of a fitted
        # pipeline or of a loaded DataFrame holder can span many lines.
        return "None" if value is None else type(value).__name__

    def __str__(self):
        """Returns human readable string representation"""
        summary = ", ".join(
            "{}={}".format(name, self._describe(getattr(self, name)))
            for name in self._ATTRIBUTE_NAMES
        )
        return "Training Pipeline object with attributes: " + summary

    def __repr__(self):
        return self.__str__()

    # core methods
    def bundle_preproessor_model(self, preprocessor:object, model:object):
        """
        Bundle preprocessing and modeling code in a training pipeline.

        Stores a two-step Pipeline ('preprocessor' followed by 'model') on
        `self.pipeline`. Only `self.pipeline` is updated; `self.preprocessor`
        and `self.model` are left untouched.
        """
        self.pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model),
        ])

    def prepare_data_for_training(self, train_file_path:str, test_file_path:str, idx_col:str, target:str, valid_size:float, model:object, random_state:int):
        """
        Load the data, build the preprocessor and bundle it with `model`.

        Parameters:\n
        train_file_path, test_file_path, idx_col, target, valid_size:
        forwarded unchanged, in this order, to `DataFrameLoader().from_csv` \n
        model: estimator used as the final 'model' step of the pipeline \n
        random_state: accepted but currently NOT used by this method; it is
        not forwarded to the DataFrameLoader.
        NOTE(review): confirm whether `from_csv` should receive it so the
        train/validation split is controlled by the caller.

        Returns:\n
        self, so the call can be chained directly after `MLPipeline()`.
        """
        self.model = model
        # call DataFrameLoader module
        self.dataframeloader = DataFrameLoader().from_csv(train_file_path,test_file_path,idx_col,target,valid_size)
        # call PreProcessor module, using the column groups found by the loader
        self.preprocessor = PreProcessor().preprocess_data(
            numerical_cols=self.dataframeloader.numerical_cols,
            low_card_cat_cols=self.dataframeloader.low_card_cat_cols,
            high_card_cat_cols=self.dataframeloader.high_card_cat_cols
        )

        # Bundle the preprocessor's column transformer with the model
        # (attribute name `columns_transfomer` is spelled as PreProcessor exposes it).
        self.bundle_preproessor_model(self.preprocessor.columns_transfomer, model)
        return self

In [None]:
# Render the generated documentation for the MLPipeline class.
show_doc(MLPipeline)

<h2 id="MLPipeline" class="doc_header"><code>class</code> <code>MLPipeline</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>MLPipeline</code>()

Represent MLPipeline class

Attributes:

pipeline: An MLPipeline instance 

dataframeloader: A DataFrameLoader instance 

preprocessor: A PreProcessor Instance 

model: The given Model

In [None]:
# Render the generated documentation (signature) for prepare_data_for_training.
show_doc(MLPipeline.prepare_data_for_training)

<h4 id="MLPipeline.prepare_data_for_training" class="doc_header"><code>MLPipeline.prepare_data_for_training</code><a href="__main__.py#L41" class="source_link" style="float:right">[source]</a></h4>

> <code>MLPipeline.prepare_data_for_training</code>(**`train_file_path`**:`str`, **`test_file_path`**:`str`, **`idx_col`**:`str`, **`target`**:`str`, **`valid_size`**:`float`, **`model`**:`object`, **`random_state`**:`int`)



#### Test MLPipeline Class with House Data

*You can use MLPipeline to train any model. Here we use RandomForestRegressor from scikit-learn on the [Melbourne Home Sale price data](https://www.kaggle.com/estrotococo/home-data-for-ml-course).*

In [None]:
# Build the scikit-learn estimator that becomes the pipeline's final step
scikit_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Create the ML pipeline for the scikit-learn model: load data, preprocess, bundle
sci_ml_pl = MLPipeline().prepare_data_for_training(
    train_file_path="input/home_data/train.csv",
    test_file_path="input/home_data/test.csv",
    idx_col="Id",
    target="SalePrice",
    valid_size=0.2,
    model=scikit_model,
    random_state=42,
)

# Fit on the training split, then score predictions on the validation split
sci_data = sci_ml_pl.dataframeloader
sci_ml_pl.pipeline.fit(sci_data.X_train, sci_data.y_train)
preds = sci_ml_pl.pipeline.predict(sci_data.X_valid)
print('X_valid MAE:', mean_absolute_error(sci_data.y_valid, preds))

X_valid MAE: 17582.46150684932


In [None]:
# hide
# Low-cardinality categorical columns held by the preprocessor: count, then names
low_card_cols = sci_ml_pl.preprocessor.low_card_cat_cols
print(len(low_card_cols))
low_card_cols

40


['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [None]:
# hide
# High-cardinality categorical columns held by the preprocessor: count, then names
high_card_cols = sci_ml_pl.preprocessor.high_card_cat_cols
print(len(high_card_cols))
high_card_cols

3


['Neighborhood', 'Exterior1st', 'Exterior2nd']

In [None]:
# hide
# Numerical columns held by the preprocessor: count, then names
numeric_cols = sci_ml_pl.preprocessor.numerical_cols
print(len(numeric_cols))
numeric_cols

36


['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

*You can also use MLPipeline with an XGBoost model. Make sure to install XGBoost in the way appropriate for your OS.*

*After that, all steps remain the same. Here is an example using XGBRegressor with the [Melbourne Home Sale price data](https://www.kaggle.com/estrotococo/home-data-for-ml-course).*

In [None]:
# The best way to install xgboost on a macOS or Windows machine is with conda:
# !conda install -c conda-forge xgboost

In [None]:
from xgboost import XGBRegressor

# Build the XGBoost estimator that becomes the pipeline's final step
xgb_model = XGBRegressor(n_estimators=250, learning_rate=0.05, random_state=42)

# Create the ML pipeline for the xgb model: load data, preprocess, bundle
xgb_ml_pl = MLPipeline().prepare_data_for_training(
    train_file_path="input/home_data/train.csv",
    test_file_path="input/home_data/test.csv",
    idx_col="Id",
    target="SalePrice",
    valid_size=0.2,
    model=xgb_model,
    random_state=42,
)

# Fit on the training split, then score predictions on the validation split
xgb_data = xgb_ml_pl.dataframeloader
xgb_ml_pl.pipeline.fit(xgb_data.X_train, xgb_data.y_train)
preds = xgb_ml_pl.pipeline.predict(xgb_data.X_valid)
print('X_valid MAE:', mean_absolute_error(xgb_data.y_valid, preds))

X_valid MAE: 15801.672262949487


In [None]:
# hide
# error to stop below cell from running when press run all!

In [None]:
# hide
# Build step: convert the project's notebooks into their Python modules
from nbdev.export import notebook2script

notebook2script()

Converted 00_DataFrameLoader.ipynb.
Converted 01_PreProcessor.ipynb.
Converted 02_MLPipeline.ipynb.
Converted index.ipynb.
