In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# default_exp MLPipeline

# Training Pipeline

> An API to create training pipelines for machine learning models on tabular or structured data

In [3]:
#hide
from nbdev.showdoc import *
from nbdev import *

In [4]:
# export
from tabular_ml_toolkit.DataFrameLoader import *
from tabular_ml_toolkit.PreProcessor import *

In [5]:
# export
# hide
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

In [6]:
# export

class MLPipeline:
    """
    Bundle data loading, preprocessing and a model into one scikit-learn pipeline

    Attributes:\n
    pipeline: The scikit-learn Pipeline chaining the preprocessor and the model \n
    dataframeloader: A DataFrameLoader instance \n
    preprocessor: A PreProcessor Instance \n
    model: The given Model \n
    Every attribute is None until a prepare_data_for_* method (or, for
    pipeline, bundle_preproessor_model) has been called.
    """

    def __init__(self):
        self.pipeline = None
        self.dataframeloader = None
        self.preprocessor = None
        self.model = None

    def __str__(self):
        """Returns human readable string representation"""
        attr_str = "pipeline, dataframeloader, preprocessor, model"
        return "Training Pipeline object with attributes:" + attr_str

    def __repr__(self):
        return self.__str__()

    # core methods
    def bundle_preproessor_model(self, preprocessor:object, model:object):
        """
        Bundle preprocessing and modeling code in a training pipeline.

        Builds a two-step scikit-learn Pipeline ('preprocessor', 'model') and
        stores it on `self.pipeline`. Nothing is returned.

        preprocessor: object exposing a `columns_transfomer` attribute
            (spelling as read here); that attribute becomes the first step.
        model: estimator used as the final step.

        NOTE: the misspelled method name is kept so existing callers keep working.
        """
        self.pipeline = Pipeline(
            steps=[('preprocessor', preprocessor.columns_transfomer),
                   ('model', model)])

    def prepare_data_for_training(self, train_file_path:str,
                                  test_file_path:str,
                                  idx_col:str, target:str,
                                  random_state:int,
                                  valid_size:float,
                                  model:object):
        """
        Load the data, preprocess it for training and build the pipeline.

        train_file_path, test_file_path, idx_col, target, random_state and
        valid_size are forwarded unchanged to `DataFrameLoader().from_csv`.
        model: estimator placed as the last step of `self.pipeline`.

        Returns `self`, so calls can be chained.
        """
        self.model = model
        # call DataFrameLoader module
        self.dataframeloader = DataFrameLoader().from_csv(
            train_file_path=train_file_path,
            test_file_path=test_file_path,
            idx_col=idx_col,target=target,
            random_state=random_state,valid_size=valid_size)
        # call PreProcessor module
        self.preprocessor = PreProcessor().preprocess_data_for_training(
            dataframeloader=self.dataframeloader)

        # call bundle method
        self.bundle_preproessor_model(self.preprocessor, model)
        return self

    def prepare_data_for_cv(self, train_file_path:str, test_file_path:str,
                                          idx_col:str, target:str, model:object,
                                          random_state:int, cv_cols_type:str):
        """
        Load the data, preprocess it for cross validation and build the pipeline.

        train_file_path, test_file_path, idx_col, target, random_state and
        cv_cols_type are forwarded unchanged to `DataFrameLoader().from_csv`;
        cv_cols_type is also forwarded to `PreProcessor().preprocess_data_for_cv`.
        model: estimator placed as the last step of `self.pipeline`.

        Returns `self`, so calls can be chained.
        """
        self.model = model

        # call DataFrameLoader module
        self.dataframeloader = DataFrameLoader().from_csv(
            train_file_path=train_file_path,
            test_file_path=test_file_path,
            idx_col=idx_col, target=target,
            random_state=random_state,
            cv_cols_type=cv_cols_type)

        # call PreProcessor module
        self.preprocessor = PreProcessor().preprocess_data_for_cv(
            cv_cols_type = cv_cols_type,
            dataframeloader=self.dataframeloader)

        # call bundle method
        self.bundle_preproessor_model(self.preprocessor, model)
        return self

    def cross_validation(self, X:object=None, y:object=None, cv=5,
                         scoring='neg_mean_absolute_error'):
        """
        Cross validate `self.pipeline` with scikit-learn's `cross_val_score`.

        X: features to validate on; defaults to `self.dataframeloader.X_cv`.
        y: target values; defaults to `self.dataframeloader.y`.
        cv: forwarded to `cross_val_score`.
        scoring: forwarded to `cross_val_score`.

        Returns the scores from `cross_val_score`. When `scoring` is one of
        scikit-learn's negated scorers (a name starting with 'neg_') the sign
        is flipped so the plain error is reported; any other scorer is
        returned untouched.

        Raises RuntimeError when no pipeline has been built yet.
        """
        if self.pipeline is None:
            raise RuntimeError(
                "Pipeline is not built yet; call prepare_data_for_cv() "
                "or bundle_preproessor_model() first.")

        # Honour explicitly passed data, fall back to the loaded data otherwise.
        # `is None` is used because truth-testing a DataFrame raises.
        if X is None:
            X = self.dataframeloader.X_cv
        if y is None:
            y = self.dataframeloader.y

        scores = cross_val_score(
            estimator=self.pipeline,
            X=X,
            y=y,
            scoring=scoring,
            cv=cv)

        # Multiply by -1 only for sklearn's *negated* scorers (e.g. negative MAE);
        # flipping a score such as 'r2' would report a wrong sign.
        if isinstance(scoring, str) and scoring.startswith('neg_'):
            scores = -1 * scores
        return scores
        

In [7]:
show_doc(MLPipeline)

<h2 id="MLPipeline" class="doc_header"><code>class</code> <code>MLPipeline</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>MLPipeline</code>()

Represent MLPipeline class

Attributes:

pipeline: An MLPipeline instance 

dataframeloader: A DataFrameLoader instance 

preprocessor: A PreProcessor Instance 

model: The given Model

In [8]:
show_doc(MLPipeline.prepare_data_for_training)

<h4 id="MLPipeline.prepare_data_for_training" class="doc_header"><code>MLPipeline.prepare_data_for_training</code><a href="__main__.py#L42" class="source_link" style="float:right">[source]</a></h4>

> <code>MLPipeline.prepare_data_for_training</code>(**`train_file_path`**:`str`, **`test_file_path`**:`str`, **`idx_col`**:`str`, **`target`**:`str`, **`random_state`**:`int`, **`valid_size`**:`float`, **`model`**:`object`)



#### Test MLPipeline Class with House Data

*You can use MLPipeline to train any model. Here we are using RandomForestRegressor from Scikit-Learn on the house sale price data from Kaggle's [Home Data for ML Course](https://www.kaggle.com/estrotococo/home-data-for-ml-course).*

In [9]:
# # create scikit-learn ml model
# scikit_model = RandomForestRegressor(n_estimators=100, random_state=42)

# # createm ml pipeline for scikit-learn model
# sci_ml_pl = MLPipeline().prepare_data_for_training(
#     train_file_path= "input/home_data/train.csv",
#     test_file_path= "input/home_data/test.csv",
#     idx_col="Id",
#     target="SalePrice",
#     model=scikit_model,
#     random_state=42,
#     valid_size=0.2)

# # # Now fit and predict
# sci_ml_pl.pipeline.fit(sci_ml_pl.dataframeloader.X_train, sci_ml_pl.dataframeloader.y_train)
# preds = sci_ml_pl.pipeline.predict(sci_ml_pl.dataframeloader.X_valid)
# print('X_valid MAE:', mean_absolute_error(sci_ml_pl.dataframeloader.y_valid, preds))

In [10]:
# Build the scikit-learn estimator that will be the last pipeline step
scikit_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Create the ML pipeline for the scikit-learn model, prepared for cross validation
sci_ml_pl = MLPipeline().prepare_data_for_cv(
    train_file_path="input/home_data/train.csv",
    test_file_path="input/home_data/test.csv",
    idx_col="Id",
    target="SalePrice",
    model=scikit_model,
    random_state=42,
    cv_cols_type="num",
)

In [11]:
sci_ml_pl.dataframeloader.X.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [12]:
sci_ml_pl.dataframeloader.y.head()

Id
1    208500
2    181500
3    223500
4    140000
5    250000
Name: SalePrice, dtype: int64

In [13]:
# print(sci_ml_pl.preprocessor.cv_cols)

AttributeError: 'PreProcessor' object has no attribute 'cv_cols'

In [None]:
# hide
print(len(sci_ml_pl.preprocessor.cv_cols))
sci_ml_pl.preprocessor.cv_cols

In [None]:
# Preview the first rows with the rich notebook display instead of printing
# the whole frame as plain text.
# NOTE(review): assumes X_test is a DataFrame like X — confirm.
sci_ml_pl.dataframeloader.X_test.head()

In [None]:
# Cross validate the bundled pipeline and report the score of every fold
scores = sci_ml_pl.cross_validation(
    X=sci_ml_pl.dataframeloader.X,
    y=sci_ml_pl.dataframeloader.y,
    cv=3,
    scoring='neg_mean_absolute_error',
)
print("scores:", scores)
print("Average MAE score:", scores.mean())

In [None]:
# hide
print(len(sci_ml_pl.dataframeloader.final_cols))
sci_ml_pl.dataframeloader.final_cols

In [None]:
# hide
print(len(sci_ml_pl.dataframeloader.low_card_cat_cols))
sci_ml_pl.dataframeloader.low_card_cat_cols

In [None]:
# hide
print(len(sci_ml_pl.dataframeloader.high_card_cat_cols))
sci_ml_pl.dataframeloader.high_card_cat_cols

In [None]:
# hide
print(len(sci_ml_pl.dataframeloader.numerical_cols))
sci_ml_pl.dataframeloader.numerical_cols

#### Let's do Cross Validation on our pipeline

In [None]:
sci_ml_pl.dataframeloader.X.head()

In [None]:
sci_ml_pl.dataframeloader.X[sci_ml_pl.dataframeloader.numerical_cols]

In [None]:
# Hand-built reference pipeline: impute missing values, then fit a random forest
my_pipeline = Pipeline(
    steps=[
        ("preprocessor", SimpleImputer()),
        ("model", RandomForestRegressor(n_estimators=50, random_state=0)),
    ]
)

In [None]:
# Score the reference pipeline on the numerical columns only.
# sklearn reports *negative* MAE for this scorer, hence the multiplication by -1.
scores = -1 * cross_val_score(
    estimator=my_pipeline,
    X=sci_ml_pl.dataframeloader.X[sci_ml_pl.dataframeloader.numerical_cols],
    y=sci_ml_pl.dataframeloader.y,
    scoring="neg_mean_absolute_error",
    cv=3,
)
scores

In [None]:
# MLPipeline defines no `do_cross_validation` method; `cross_validation` is the
# cross validation entry point of the class, so call that one instead.
scores = sci_ml_pl.cross_validation(
    X=sci_ml_pl.dataframeloader.X[sci_ml_pl.dataframeloader.numerical_cols],
    y=sci_ml_pl.dataframeloader.y,
    cv=3,
    scoring='neg_mean_absolute_error',
)
scores

*You can also use MLPipeline with an XGBoost model. Make sure to install XGBoost in the way appropriate for your OS.*

*After that, all steps remain the same. Here is an example using XGBRegressor with the house sale price data from Kaggle's [Home Data for ML Course](https://www.kaggle.com/estrotococo/home-data-for-ml-course).*

In [None]:
# Best way to install xgboost if you are on macosx and windows machine is using conda
# !conda install -c conda-forge xgboost

In [None]:
# xgboost is an optional dependency, so it is imported only in this example cell
from xgboost import XGBRegressor

# Build the XGBoost estimator
xgb_model = XGBRegressor(n_estimators=250, learning_rate=0.05, random_state=42)

# Create the ML pipeline for the XGBoost model
xgb_ml_pl = MLPipeline().prepare_data_for_training(
    train_file_path="input/home_data/train.csv",
    test_file_path="input/home_data/test.csv",
    idx_col="Id",
    target="SalePrice",
    valid_size=0.2,
    model=xgb_model,
    random_state=42,
)

# Fit on the training split, then measure MAE on the validation split
xgb_ml_pl.pipeline.fit(
    xgb_ml_pl.dataframeloader.X_train,
    xgb_ml_pl.dataframeloader.y_train,
)
preds = xgb_ml_pl.pipeline.predict(xgb_ml_pl.dataframeloader.X_valid)
print('X_valid MAE:', mean_absolute_error(xgb_ml_pl.dataframeloader.y_valid, preds))

In [None]:
# hide
# error to stop below cell from running when press run all!

In [None]:
# hide
# run the script to build 

from nbdev.export import notebook2script; notebook2script()