In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp PreProcessor

# PreProcessor

> An API to preprocess train, validation and test datasets for machine learning models based on tabular or structured data

In [None]:
#hide
from nbdev.showdoc import *
from nbdev import *

In [None]:
# export
from tabular_ml_toolkit.DataFrameLoader import *

In [None]:
# export
# hide
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [None]:
# export

class PreProcessor:
    """
    Builds scikit-learn preprocessing transformers for the column groups
    exposed by a DataFrameLoader.

    Attributes:
    numerical_transformer: Pipeline (imputer -> scaler) for numerical columns
    categorical_transformer: Reserved; never assigned by this class
    OHE_categorical_transformer: Pipeline (imputer -> OneHotEncoder) for categorical columns
    OE_categorical_transformer: Pipeline (imputer -> OrdinalEncoder); built but not
        wired into columns_transfomer
    ORE_categorical_transformer: Legacy attribute name kept for backward
        compatibility; never assigned
    columns_transfomer: ColumnTransformer bundling the per-column-group pipelines
    transformer_type: Transformer selected by preprocess_all_cols (either
        numerical_transformer or columns_transfomer)
    """

    def __init__(self):
        self.numerical_transformer = None
        self.categorical_transformer = None
        self.columns_transfomer = None
        self.transformer_type = None
        self.OHE_categorical_transformer = None
        # Name actually written by preprocess_OE_categorical_data; declared here
        # so the attribute exists before that method has been called.
        self.OE_categorical_transformer = None
        # Original (misspelled) declaration, kept so code reading it still works.
        self.ORE_categorical_transformer = None

    def __str__(self):
        """Returns human readable string representation"""
        attr_str = "numerical_transformer, categorical_transformer,columns_transfomer"
        return "PreProcessor object with attributes:" + attr_str

    def __repr__(self):
        return self.__str__()

    # PreProcessor core methods

    @staticmethod
    def _has_columns(cols):
        """Return True when `cols` is a non-empty collection of column names."""
        # len() rather than truthiness: a pandas Index raises on bool().
        return cols is not None and len(cols) > 0

    # Preprocessing for numerical data
    def preprocess_numerical_data(self, num_imputer, num_scaler):
        """Store an imputer -> scaler Pipeline in `numerical_transformer`."""
        self.numerical_transformer = Pipeline(steps=[
            ('imputer', num_imputer),
            ('scaler', num_scaler)
        ])

    # Preprocessing for categorical data
    def preprocess_OHE_categorical_data(self, cat_imputer):
        """Store an imputer -> OneHotEncoder Pipeline in `OHE_categorical_transformer`."""
        self.OHE_categorical_transformer = Pipeline(steps=[
            ('imputer', cat_imputer),
            # handle_unknown='ignore' so categories unseen during fit do not raise
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

    def preprocess_OE_categorical_data(self, cat_imputer):
        """Store an imputer -> OrdinalEncoder Pipeline in `OE_categorical_transformer`."""
        self.OE_categorical_transformer = Pipeline(steps=[
            ('imputer', cat_imputer),
            ('ordinal', OrdinalEncoder())
        ])

    #TO-DO
#     def reduce_dims(self):
#         pass

    # Bundle preprocessing pipelines based upon types of columns
    def preprocess_all_cols(self, dataframeloader,
                            num_imputer=None,
                            num_scaler=None,
                            cat_imputer=None):
        """
        Configure the transformers that match the column groups present in
        `dataframeloader` and record the result in `transformer_type`.

        Parameters:
        dataframeloader: object exposing `numerical_cols`, `low_card_cat_cols`
            and `high_card_cat_cols`
        num_imputer: imputer for numerical columns;
            defaults to SimpleImputer(strategy='median')
        num_scaler: scaler for numerical columns; defaults to StandardScaler()
        cat_imputer: imputer for categorical columns;
            defaults to SimpleImputer(strategy='constant')

        Returns:
        self, so the call can be chained.
        """
        # Defaults are created per call: estimator instances used as default
        # argument values would be shared by every call and every PreProcessor.
        if num_imputer is None:
            num_imputer = SimpleImputer(strategy='median')
        if num_scaler is None:
            num_scaler = StandardScaler()
        if cat_imputer is None:
            cat_imputer = SimpleImputer(strategy='constant')

        numerical_cols = dataframeloader.numerical_cols
        low_card_cat_cols = dataframeloader.low_card_cat_cols
        high_card_cat_cols = dataframeloader.high_card_cat_cols

        has_numerical = self._has_columns(numerical_cols)
        has_low_card = self._has_columns(low_card_cat_cols)
        has_high_card = self._has_columns(high_card_cat_cols)

        # No categorical columns at all: the bare numerical pipeline is the
        # preprocessor and no ColumnTransformer is built.
        if not (has_low_card or has_high_card):
            self.preprocess_numerical_data(num_imputer, num_scaler)
            self.transformer_type = self.numerical_transformer
            return self

        # Otherwise add one ColumnTransformer entry per non-empty column group,
        # so every combination of groups is handled (numeric + only one kind of
        # categorical column used to leave transformer_type as None).
        transformers = []
        if has_numerical:
            self.preprocess_numerical_data(num_imputer, num_scaler)
            transformers.append(
                ('num_cols', self.numerical_transformer, numerical_cols))

        self.preprocess_OHE_categorical_data(cat_imputer)
        if has_low_card:
            transformers.append(
                ('low_card_cat_cols', self.OHE_categorical_transformer,
                 low_card_cat_cols))
        if has_high_card:
            # NOTE(review): the ordinal pipeline is built and exposed, but
            # high-cardinality columns are still one-hot encoded so existing
            # results do not change — confirm which encoder is intended.
            self.preprocess_OE_categorical_data(cat_imputer)
            transformers.append(
                ('high_card_cat_cols', self.OHE_categorical_transformer,
                 high_card_cat_cols))

        self.columns_transfomer = ColumnTransformer(transformers=transformers)
        self.transformer_type = self.columns_transfomer

        return self

In [None]:
# showdoc(PreProcessor.preprocess_all_cols)

#### Test PreProcessor with House Data

In [None]:
# Load the house-price train/test CSVs, using "Id" as the index column and
# "SalePrice" as the target.
csv_options = {
    "train_file_path": "input/home_data/train.csv",
    "test_file_path": "input/home_data/test.csv",
    "idx_col": "Id",
    "target": "SalePrice",
    "random_state": 42,
}
dfl = DataFrameLoader().from_csv(**csv_options)
dfl

DataFrameLoader object with attributes: X_full, X_test, X(features), y(target), X_train, X_valid, y_train and y_valid

In [None]:
# Create the train/validation split with valid_size=0.2 and a fixed random_state,
# so the X_train/X_valid/y_train/y_valid used below are reproducible.
dfl.create_train_valid(valid_size=0.2, random_state = 42)

In [None]:
# Build the preprocessing transformers for the column groups found by the loader.
# preprocess_all_cols configures the instance in place and returns it.
pp = PreProcessor()
pp.preprocess_all_cols(dataframeloader=dfl)
pp

PreProcessor object with attributes:numerical_transformer, categorical_transformer,columns_transfomer

In [None]:
# pp.columns_transfomer

In [None]:
# Number of numerical columns detected by the loader.
numerical_col_count = len(dfl.numerical_cols)
print(numerical_col_count)
# dfl.numerical_cols

36


In [None]:
# Number of high-cardinality categorical columns detected by the loader.
high_card_col_count = len(dfl.high_card_cat_cols)
print(high_card_col_count)
# dfl.high_card_cat_cols

3


In [None]:
# Number of low-cardinality categorical columns detected by the loader.
low_card_col_count = len(dfl.low_card_cat_cols)
print(low_card_col_count)
# dfl.low_card_cat_cols

40


In [None]:
# Define the model: a random forest regressor with a fixed random_state so the
# reported validation score is reproducible. n_estimators=100 is spelled out
# even though the repr below omits it (it shows only random_state=42).
model = RandomForestRegressor(n_estimators=100, random_state=42)
model

RandomForestRegressor(random_state=42)

In [None]:
# Chain the column preprocessor and the estimator into a single pipeline so
# both are fitted and applied together.
pipeline_steps = [
    ('preprocessor', pp.columns_transfomer),
    ('model', model),
]
pl = Pipeline(steps=pipeline_steps)

In [None]:
# Switch scikit-learn's estimator display to "diagram" so that evaluating `pl`
# below renders the pipeline as a diagram instead of plain text.
from sklearn import set_config

set_config(display="diagram")
pl

In [None]:
# Fit the whole pipeline on the training split: the preprocessor is fitted and
# applied to X_train, then the model is trained on the transformed features.
pl.fit(dfl.X_train, dfl.y_train)

In [None]:
# Run the fitted pipeline on the validation features, then score the
# predictions against the validation target with mean absolute error.
preds = pl.predict(dfl.X_valid)
valid_mae = mean_absolute_error(dfl.y_valid, preds)

print('X_valid MAE:', valid_mae)

X_valid MAE: 17634.989965753426


In [None]:
# X_valid MAE: 17582.46150684932

In [None]:
# hide
# Run nbdev's notebook-to-script export to build the library modules.

from nbdev.export import notebook2script

notebook2script()

Converted 00_DataFrameLoader.ipynb.
Converted 01_PreProcessor.ipynb.
Converted 02_MLPipeline.ipynb.
Converted 03_Tutorial.ipynb.
Converted 07_Kaggle_TPS_Tutorial.ipynb.
Converted Optuna_Tutorial.ipynb.
Converted automl_in_sklearn_pipeline.ipynb.
Converted index.ipynb.
