In [2]:
# default_exp pipeline

# Training Pipeline

> An API to create training pipelines for machine learning models on tabular or structured data

In [19]:
#hide
from nbdev.showdoc import *
from nbdev import *

In [20]:
# export
from tabular_ml_toolkit.dataframeloader import *
from tabular_ml_toolkit.preprocessor import *

In [21]:
# export
# hide
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [39]:
# export

class TrainingPipeline:
    """
    Bundle a preprocessor and a model into one scikit-learn ``Pipeline``.

    Attributes:
    pl: The most recently built ``Pipeline``. It is ``None`` until
        ``create_pipeline`` or ``bundle_preprocessor_model`` has been called.
    """

    def __init__(self):
        # No pipeline exists until one of the builder methods below is called.
        self.pl = None

    def __str__(self):
        """Returns human readable string representation"""
        return "Training Pipeline object with attributes: pl"

    def __repr__(self):
        return self.__str__()

    # core methods
    def bundle_preprocessor_model(self, preprocessor:object, model:object):
        """
        Build a two-step ``Pipeline`` and store it on ``self.pl``.

        The steps are named ``'preprocessor'`` and ``'model'``, in that order.
        Nothing is fitted here; the pipeline is only constructed.

        Args:
            preprocessor: Object used as the ``'preprocessor'`` step.
            model: Object used as the final ``'model'`` step.

        Returns:
            None. The pipeline is available afterwards as ``self.pl``.
        """
        self.pl = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('model', model)])

    def bundle_preproessor_model(self, preprocessor:object, model:object):
        """
        Misspelled legacy name, kept so existing callers keep working.

        Prefer ``bundle_preprocessor_model``; this method only delegates to it.
        """
        self.bundle_preprocessor_model(preprocessor, model)

    def create_pipeline(self, preprocessor:object, model:object):
        """
        Build the pipeline, store it on ``self.pl`` and return it.

        Args:
            preprocessor: Object used as the ``'preprocessor'`` step.
            model: Object used as the final ``'model'`` step.

        Returns:
            The newly built ``Pipeline`` (the same object as ``self.pl``).
        """
        # Go through the legacy name on purpose: a subclass that overrode the
        # old spelling is still honoured, and the legacy method itself
        # delegates to the correctly spelled one.
        self.bundle_preproessor_model(preprocessor, model)
        return self.pl

In [40]:
# show_doc(DataFrameLoader.from_csv)

#### Test TrainingPipeline Class with House Data

*Let's load the [Iowa home sale price raw data](https://www.kaggle.com/estrotococo/home-data-for-ml-course) from Kaggle.*

In [41]:
# Collect the loader arguments in one place, then load the data.
csv_options = dict(
    train_file_path="input/home_data/train.csv",
    test_file_path="input/home_data/test.csv",
    idx_col="Id",
    target="SalePrice",
    valid_size=0.2,
)
dfl = DataFrameLoader().from_csv(**csv_options)
dfl

DataFrameLoader object with attributes: X_full, X_test, X(features), y(target), X_train, X_valid, y_train and y_valid

In [42]:
# Build the preprocessor from the column lists exposed by the data loader.
preprocessor_builder = PreProcessor()
pp = preprocessor_builder.preprocess_data(
    numerical_cols=dfl.numerical_cols,
    categorical_cols=dfl.categorical_cols,
)
pp

PreProcessor object with attributes: numerical_transformer, categorical_transformer, preprocessor

In [43]:
# Define the model; hyper-parameters are named up front so they are easy to tune.
rf_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**rf_params)
model

RandomForestRegressor(random_state=42)

In [45]:
# Bundle the preprocessor and the model into a single pipeline object.
training_pipeline = TrainingPipeline()
pl = training_pipeline.create_pipeline(
    preprocessor=pp.preprocessor,
    model=model,
)
pl

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='constant'),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCond', 'YearBuilt',
                                                   'YearRemodAdd', 'MasVnrArea',
                                                   'BsmtFinSF1', 'BsmtFinSF2',
                                                   'BsmtUnfSF', 'TotalBsmtSF',
                                                   '1stFlrSF', '2ndFlrSF',
                                                   'LowQualFinSF', 'GrLivArea',
                                                   'BsmtFullBath',
                                                   'BsmtHalfBath', 'FullBa...
                                                 

In [46]:
# Fit the whole pipeline (preprocessing step, then model) on the training split.
X_train, y_train = dfl.X_train, dfl.y_train
pl.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='constant'),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCond', 'YearBuilt',
                                                   'YearRemodAdd', 'MasVnrArea',
                                                   'BsmtFinSF1', 'BsmtFinSF2',
                                                   'BsmtUnfSF', 'TotalBsmtSF',
                                                   '1stFlrSF', '2ndFlrSF',
                                                   'LowQualFinSF', 'GrLivArea',
                                                   'BsmtFullBath',
                                                   'BsmtHalfBath', 'FullBa...
                                                 

In [47]:
# Run the validation features through the fitted pipeline to get predictions.
preds = pl.predict(dfl.X_valid)

# Score the predictions against the held-out targets.
valid_mae = mean_absolute_error(dfl.y_valid, preds)
print('X_valid MAE:', valid_mae)

X_valid MAE: 17678.294143835617


In [None]:
# hide
# todo
# modify the pipeline class to expose three pipeline methods:
# create_data_pipeline
# create_preprocessor_pipeline
# create_training_pipeline
# so that in the end the user just creates a pipeline instance and runs three lines to train a model
# model = RandomForestRegressor(n_estimators=100, random_state=42)
# pl = prepare_data_for_training(train_path, test_path, valid_size, random_state, model)
    # internally call the methods below
    # pl.load_csv(train_path, test_path, valid_size, random_state)
    # pl.preprocessor()
    # pl.bundle(preprocessor,model)
# pl.fit(pl.X_train, pl.y_train)
# preds = pl.predict(pl.X_valid)
# print('X_valid MAE:', mean_absolute_error(pl.y_valid, preds))