In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# default_exp MLPipeline

# Training Pipeline

> An API to create training pipeline for machine learning models on tabular or strucuture data

In [3]:
#hide
from nbdev.showdoc import *
from nbdev import *

In [4]:
# export
from tabular_ml_toolkit.DataFrameLoader import *
from tabular_ml_toolkit.PreProcessor import *

In [70]:
# export
# hide
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, roc_auc_score,accuracy_score
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold


In [140]:
# export

class MLPipeline:
    """
    Represent MLPipeline class
    
    Attributes:\n
    pipeline: An MLPipeline instance \n
    dataframeloader: A DataFrameLoader instance \n
    preprocessor: A PreProcessor Instance \n
    model: The given Model
    """

    def __init__(self):
        self.pipeline = None
        self.dataframeloader = None
        self.preprocessor = None
        self.model = None
        self.scikit_pipeline = None
        self.transformer_type = None
    
    def __str__(self):
        """Returns human readable string reprsentation"""
        attr_str = ("pipeline, dataframeloader, preprocessor, model")
        return ("Training Pipeline object with attributes:"+attr_str)
    
    def __repr__(self):
        return self.__str__()
    
#     def __lt__(self):
#         """returns: boolean"""
#         return True
    
    # core methods
    
    # Bundle preprocessing and modeling code in a training pipeline
    def bundle_preproessor_model(self, transformer_type, model):
        self.scikit_pipeline = Pipeline(
            steps=[('preprocessor', transformer_type),
                   ('model', model)])
    
    # Core methods for Simple Training
    def prepare_data_for_training(self, train_file_path:str,
                                  test_file_path:str,
                                  idx_col:str, target:str,
                                  random_state:int,
                                  valid_size:float,
                                  model:object):
        self.model = model
        # call DataFrameLoader module
        self.dataframeloader = DataFrameLoader().from_csv(
            train_file_path=train_file_path,
            test_file_path=test_file_path,
            idx_col=idx_col,target=target,
            random_state=random_state,valid_size=valid_size)
        # call PreProcessor module
        self.preprocessor = PreProcessor().preprocess_all_cols_for_training(
            dataframeloader=self.dataframeloader)
        
        # call bundle method
        self.bundle_preproessor_model(transformer_type=self.preprocessor.transformer_type,
                                     model = model)
        return self
    
    
    # Core methods for Cross Validation
    def prepare_data_for_cv(self, train_file_path:str, test_file_path:str,
                                          idx_col:str, target:str, model:object,
                                          random_state:int, cv_cols_type:str):
        self.model = model
        
        # call DataFrameLoader module
        self.dataframeloader = DataFrameLoader().from_csv(
            train_file_path=train_file_path,
            test_file_path=test_file_path,
            idx_col=idx_col, target=target,
            random_state=random_state,
            cv_cols_type=cv_cols_type)
        
        # call PreProcessor module
        self.preprocessor = PreProcessor().preprocess_cols_for_cv(
            cv_cols_type = cv_cols_type,
            dataframeloader=self.dataframeloader)
        
        # call bundle method
        self.bundle_preproessor_model(transformer_type=self.preprocessor.transformer_type,
                                     model = model)
        return self
        
    
    def do_cross_validation(self,estimator:object, cv:int, scoring:str):
        scores = cross_val_score(
            estimator=estimator,
            X=self.dataframeloader.X_cv,
            y=self.dataframeloader.y,
            scoring=scoring,
            cv=cv)
        # Multiply by -1 since sklearn calculates *negative* scoring for some of the metrics
        if "neg_" in scoring:
            scores = -1 * scores
        return scores
        
    # Core methods for GridSearch
    def do_grid_search(self, estimator:object, param_grid:object, cv:int, scoring:str):
        
        # create GridSeachCV instance
        grid_search = GridSearchCV(estimator=estimator,
                                   param_grid=param_grid,
                                   cv=cv,
                                   scoring=scoring)
        # now call fit
        grid_search.fit(self.dataframeloader.X_cv, self.dataframeloader.y)
        return grid_search
    
    # core method for K-Fold training
    def prepare_data_for_k_fold(self, train_file_path:str, test_file_path:str,
                                          idx_col:str, target:str, model:object,
                                          random_state:int):
        
        return self.prepare_data_for_cv(train_file_path,
                                        test_file_path,
                                        idx_col,
                                        target,
                                        model,
                                        random_state,
                                        cv_cols_type="all")
    
    
#     def prepare_data_for_k_fold(self, train_file_path:str, test_file_path:str,
#                                           idx_col:str, target:str, model:object,
#                                           random_state:int):
#         self.model = model
        
#         # call DataFrameLoader module
#         self.dataframeloader = DataFrameLoader().from_csv(
#             train_file_path=train_file_path,
#             test_file_path=test_file_path,
#             idx_col=idx_col, target=target,
#             random_state=random_state)
        
#         # call PreProcessor module
#         self.preprocessor = PreProcessor().preprocess_all_cols_for_training(
#             dataframeloader=self.dataframeloader)
        
#         # call bundle method
#         self.bundle_preproessor_model(transformer_type=self.preprocessor.transformer_type,
#                                      model = model)
#         return self
    
    def do_k_fold_training(self, estimator:object, n_splits:int, metrics:object):
        
        #create stratified K Folds instance
        k_fold = StratifiedKFold(n_splits=n_splits,
                             random_state=48,
                             shuffle=True)
        
        # list contains metrics score for each fold
        metrics_score = []
        n=0
        for train_idx, valid_idx in k_fold.split(self.dataframeloader.X_cv, self.dataframeloader.y):
            # create X_train
            self.dataframeloader.X_train = self.dataframeloader.X_cv.iloc[train_idx]
            # create X_valid
            self.dataframeloader.X_valid = self.dataframeloader.X_cv.iloc[valid_idx] 
            # create y_train
            self.dataframeloader.y_train = self.dataframeloader.y.iloc[train_idx]
            # create y_valid
            self.dataframeloader.y_valid = self.dataframeloader.y.iloc[valid_idx]
            
            # fit
            self.scikit_pipeline.fit(self.dataframeloader.X_train, self.dataframeloader.y_train)
            
            #evaluate metrics
            metrics_score.append(metrics(sci_ml_pl.dataframeloader.y_valid,
                                               self.scikit_pipeline.predict(self.dataframeloader.X_valid)))
            
            print(f"fold: {n+1} , {str(metrics.__name__)}: {metrics_score[n]}")
            # increment fold counter label
            n += 1
        return k_fold
            
    def do_k_fold_prediction(self, k_fold:object):
        # create preds dataframe
        preds = np.zeros(self.dataframeloader.X_test_cv.shape[0])
        for _ in range(k_fold.n_splits):
            # predict
            preds += self.scikit_pipeline.predict(self.dataframeloader.X_test_cv) / k_fold.n_splits
        return preds

In [141]:
show_doc(MLPipeline)

<h2 id="MLPipeline" class="doc_header"><code>class</code> <code>MLPipeline</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>MLPipeline</code>()

Represent MLPipeline class

Attributes:

pipeline: An MLPipeline instance 

dataframeloader: A DataFrameLoader instance 

preprocessor: A PreProcessor Instance 

model: The given Model

In [142]:
show_doc(MLPipeline.prepare_data_for_training)

<h4 id="MLPipeline.prepare_data_for_training" class="doc_header"><code>MLPipeline.prepare_data_for_training</code><a href="__main__.py#L43" class="source_link" style="float:right">[source]</a></h4>

> <code>MLPipeline.prepare_data_for_training</code>(**`train_file_path`**:`str`, **`test_file_path`**:`str`, **`idx_col`**:`str`, **`target`**:`str`, **`random_state`**:`int`, **`valid_size`**:`float`, **`model`**:`object`)



#### Build MLPipeline Class with House Data

*You can use MLPipeline to train any model. Here we are using RandomForestRegressor from Scikit-Learn, on  [Melbourne Home Sale price data](https://www.kaggle.com/estrotococo/home-data-for-ml-course)*

In [123]:
# mean_absolute_error.__name__

In [9]:
# create scikit-learn ml model
scikit_model = RandomForestRegressor(n_estimators=100, random_state=42)

# createm ml pipeline for scikit-learn model
sci_ml_pl = MLPipeline().prepare_data_for_training(
    train_file_path= "input/home_data/train.csv",
    test_file_path= "input/home_data/test.csv",
    idx_col="Id",
    target="SalePrice",
    model=scikit_model,
    random_state=42,
    valid_size=0.2)

In [None]:
sci_ml_pl.dataframeloader.X.head()

In [None]:
sci_ml_pl.dataframeloader.y.head()

In [None]:
print(len(sci_ml_pl.dataframeloader.final_cols))
sci_ml_pl.dataframeloader.final_cols

In [None]:
print(len(sci_ml_pl.dataframeloader.low_card_cat_cols))
sci_ml_pl.dataframeloader.low_card_cat_cols

In [None]:
print(len(sci_ml_pl.dataframeloader.high_card_cat_cols))
sci_ml_pl.dataframeloader.high_card_cat_cols

In [None]:
print(len(sci_ml_pl.dataframeloader.numerical_cols))
sci_ml_pl.dataframeloader.numerical_cols

In [None]:
# # Now fit and predict
sci_ml_pl.scikit_pipeline.fit(sci_ml_pl.dataframeloader.X_train, sci_ml_pl.dataframeloader.y_train)

preds = sci_ml_pl.scikit_pipeline.predict(sci_ml_pl.dataframeloader.X_valid)
print('X_valid MAE:', mean_absolute_error(sci_ml_pl.dataframeloader.y_valid, preds))

#### Let's do Cross Validation for Scikit Model on our MLPipeline

In [None]:
# create scikit-learn ml model
scikit_model = RandomForestRegressor(n_estimators=200, random_state=42)
# createm ml pipeline for scikit-learn model
sci_ml_pl = MLPipeline().prepare_data_for_cv(train_file_path= "input/home_data/train.csv",
                                             test_file_path= "input/home_data/test.csv",
                                             idx_col="Id", target="SalePrice",
                                             model=scikit_model,random_state=42,
                                             cv_cols_type = "all") #cv_cols_type = all|num|cat

In [None]:
print(len(sci_ml_pl.dataframeloader.cv_cols))
sci_ml_pl.dataframeloader.cv_cols

In [None]:
sci_ml_pl.dataframeloader.X_cv.head()

In [None]:
# for visualizing pipeline
from sklearn import set_config

set_config(display="diagram")
sci_ml_pl.scikit_pipeline

In [None]:
# Now fit and predict
scores = sci_ml_pl.do_cross_validation(estimator=sci_ml_pl.scikit_pipeline, cv=10,
                                    scoring='neg_mean_absolute_error')
print("scores:", scores)
print("Average MAE score:", scores.mean())

#### Let's do Grid Search for HyperParameters Tunning for Scikit Model on our MLPipeline

In [31]:
# create blanket scikit-learn ml model

scikit_model = RandomForestRegressor()

In [32]:
# createm ml pipeline for scikit-learn model
sci_ml_pl = MLPipeline().prepare_data_for_cv(train_file_path= "input/home_data/train.csv",
                                             test_file_path= "input/home_data/test.csv",
                                             idx_col="Id",
                                             target="SalePrice",
                                             model=scikit_model,
                                             random_state=42,
                                             cv_cols_type = "all") #cv_cols_type = all|num|cat

In [41]:
param_grid = {
#     "preprocessor__num__imputer__strategy": ["mean", "median"],
    "model__max_depth": [80,100,110],
    'model__max_features': [2, 3],
    'model__min_samples_leaf': [3,4,5],
    'model__min_samples_split': [8,10,12],
    "model__n_estimators": [100,200,1000]
}

In [42]:
grid_search = sci_ml_pl.do_grid_search(estimator=sci_ml_pl.scikit_pipeline,
                                       param_grid=param_grid,
                                       cv=5,
                                       scoring='neg_mean_absolute_error')

print("Best params:")
print(grid_search.best_params_)

Best params:
{'model__max_depth': 80, 'model__min_samples_leaf': 3, 'model__min_samples_split': 8, 'model__n_estimators': 1000}


In [43]:
print(f"Internal CV MAE score: {-1*(grid_search.best_score_):.3f}")

Internal CV MAE score: 27492.755


#### Let's Use K-Fold Training

In [143]:
# create scikit-learn ml model
scikit_model = RandomForestRegressor(n_estimators=200, random_state=42)

# createm ml pipeline for scikit-learn model
sci_ml_pl = MLPipeline().prepare_data_for_k_fold(
    train_file_path= "input/home_data/train.csv",
    test_file_path= "input/home_data/test.csv",
    idx_col="Id",
    target="SalePrice",
    model=scikit_model,
    random_state=42)

In [144]:
sci_ml_pl.dataframeloader.X_cv.head()

Unnamed: 0_level_0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,Condition2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Norm,Norm,...,548,0,61,0,0,0,0,0,2,2008
2,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Feedr,Norm,...,460,298,0,0,0,0,0,0,5,2007
3,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Norm,Norm,...,608,0,42,0,0,0,0,0,9,2008
4,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Norm,Norm,...,642,0,35,272,0,0,0,0,2,2006
5,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,Norm,Norm,...,836,192,84,0,0,0,0,0,12,2008


In [145]:
sci_ml_pl.dataframeloader.y.head()

Id
1    208500
2    181500
3    223500
4    140000
5    250000
Name: SalePrice, dtype: int64

In [146]:
print(len(sci_ml_pl.dataframeloader.final_cols))
# sci_ml_pl.dataframeloader.final_cols

79


In [147]:
k_fold = sci_ml_pl.do_k_fold_training(estimator=sci_ml_pl.scikit_pipeline, n_splits=5, metrics=mean_absolute_error)



fold: 1 , mean_absolute_error: 18574.931164383564
fold: 2 , mean_absolute_error: 16330.733304794521
fold: 3 , mean_absolute_error: 18121.307705479452
fold: 4 , mean_absolute_error: 16604.494006849316
fold: 5 , mean_absolute_error: 17764.65690068493


In [148]:
preds = sci_ml_pl.do_k_fold_prediction(k_fold=k_fold)

In [149]:
print(len(preds))
preds

1459


array([125665.25 , 155381.625, 187048.84 , ..., 156165.9  , 109808.915,
       231453.865])

#### Let's Use XGBosst on MLPipeline

*You can also use MLPipeline with XGBoost model, Make sure to install XGBooost depending upon your OS.*

*After that all steps remains same. Here is example using XGBRegressor with [Melbourne Home Sale price data](https://www.kaggle.com/estrotococo/home-data-for-ml-course)*

In [None]:
# Best way to install xgboost if you are on macosx and windows machine is using conda
# !conda install -c conda-forge xgboost

In [None]:
from xgboost import XGBRegressor
# create xgb ml model
xgb_model = XGBRegressor(n_estimators=250,learning_rate=0.05, random_state=42)

# createm ml pipeline for xgb model
xgb_ml_pl = MLPipeline().prepare_data_for_training(
    train_file_path= "input/home_data/train.csv",
    test_file_path= "input/home_data/test.csv",
    idx_col="Id",
    target="SalePrice",
    model=xgb_model,
    random_state=42,
    valid_size=0.2)

# Now fit and predict
xgb_ml_pl.scikit_pipeline.fit(xgb_ml_pl.dataframeloader.X_train, xgb_ml_pl.dataframeloader.y_train)
preds = xgb_ml_pl.scikit_pipeline.predict(xgb_ml_pl.dataframeloader.X_valid)
print('X_valid MAE:', mean_absolute_error(xgb_ml_pl.dataframeloader.y_valid, preds))

#### Let's do Cross Validation for XGB Model on our MLPipeline

In [None]:
# createm ml pipeline for scikit-learn model
xgb_ml_pl = MLPipeline().prepare_data_for_cv(train_file_path= "input/home_data/train.csv",
                                             test_file_path= "input/home_data/test.csv",
                                             idx_col="Id", target="SalePrice",
                                             model=xgb_model,random_state=42,
                                             cv_cols_type = "all") #cv_cols_type = all|num|cat
# Now fit and predict
scores = xgb_ml_pl.do_cross_validation(estimator=xgb_ml_pl.scikit_pipeline, cv=5,
                                    scoring='neg_mean_absolute_error')
print("scores:", scores)
print("Average MAE score:", scores.mean())

In [None]:
# hide
# run the script to build 

from nbdev.export import notebook2script; notebook2script()