In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# default_exp PreProcessor

# PreProcessor

> An API to preprocess train, validation and test datasets for machine learning models based on tabular or structured data

In [None]:
#hide
from nbdev.showdoc import *
from nbdev import *

In [None]:
# export
from tabular_ml_toolkit.DataFrameLoader import *
from tabular_ml_toolkit.Logger import *

In [None]:
# export
# hide
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [None]:
# export

class PreProcessor:
    """
    Builds scikit-learn preprocessing pipelines for the columns of a DataFrameLoader

    Attributes:
    numerical_transformer: Pipeline (imputer, then scaler) for numerical columns
    categorical_transformer: Pipeline (imputer, then encoder) for categorical columns
    columns_transfomer: ColumnTransformer bundling the pipelines above; the
        attribute keeps its existing spelling because callers read it by this name
    transformer_type: the same object as columns_transfomer

    Every attribute is None until the corresponding method has built it.
    """

    def __init__(self):
        self.numerical_transformer = None
        self.categorical_transformer = None
        self.columns_transfomer = None
        self.transformer_type = None

    def __str__(self):
        """Returns human readable string representation"""
        attr_str = "numerical_transformer, categorical_transformer,columns_transfomer"
        return "PreProcessor object with attributes:" + attr_str

    def __repr__(self):
        return self.__str__()

    # PreProcessor core methods

    def preprocess_numerical_data(self, num_cols__imputer, num_cols__scaler):
        """
        Build, store and return the numerical pipeline.

        Args:
            num_cols__imputer: estimator used as the 'imputer' step
            num_cols__scaler: estimator used as the 'scaler' step

        Returns:
            The Pipeline, which is also kept in self.numerical_transformer
        """
        self.numerical_transformer = Pipeline(steps=[
            ('imputer', num_cols__imputer),
            ('scaler', num_cols__scaler)
        ])
        return self.numerical_transformer

    def preprocess_categorical_data(self, cat_cols__imputer, cat_cols__encoder):
        """
        Build, store and return the categorical pipeline.

        Args:
            cat_cols__imputer: estimator used as the 'imputer' step
            cat_cols__encoder: estimator used as the 'encoder' step

        Returns:
            The Pipeline, which is also kept in self.categorical_transformer
        """
        self.categorical_transformer = Pipeline(steps=[
            ('imputer', cat_cols__imputer),
            ('encoder', cat_cols__encoder)
        ])
        return self.categorical_transformer

    def preprocess_all_cols(self, dataframeloader,
                            num_cols__imputer=None,
                            num_cols__scaler=None,
                            cat_cols__imputer=None,
                            cat_cols__encoder=None):
        """
        Bundle the preprocessing pipelines according to the column types found.

        Args:
            dataframeloader: object exposing `numerical_cols` and
                `categorical_cols` (both must support len())
            num_cols__imputer: numerical imputer;
                None means SimpleImputer(strategy='median')
            num_cols__scaler: numerical scaler; None means StandardScaler()
            cat_cols__imputer: categorical imputer;
                None means SimpleImputer(strategy='constant')
            cat_cols__encoder: categorical encoder;
                None means OneHotEncoder(handle_unknown='ignore')

        Returns:
            self, with columns_transfomer and transformer_type set to the new
            ColumnTransformer, so calls can be chained
        """
        # Defaults are created per call: estimator instances used as default
        # arguments would be built once and shared by every PreProcessor.
        if num_cols__imputer is None:
            num_cols__imputer = SimpleImputer(strategy='median')
        if num_cols__scaler is None:
            num_cols__scaler = StandardScaler()
        if cat_cols__imputer is None:
            cat_cols__imputer = SimpleImputer(strategy='constant')
        if cat_cols__encoder is None:
            cat_cols__encoder = OneHotEncoder(handle_unknown='ignore')

        numerical_cols = dataframeloader.numerical_cols
        categorical_cols = dataframeloader.categorical_cols
        has_numerical = len(numerical_cols) >= 1
        has_categorical = len(categorical_cols) >= 1

        transformers = []

        # The numerical pipeline is also the fallback when there are no
        # categorical columns at all, even if numerical_cols is empty too.
        if has_numerical or not has_categorical:
            self.numerical_transformer = self.preprocess_numerical_data(
                num_cols__imputer, num_cols__scaler)
            transformers.append(
                ('num_cols', self.numerical_transformer, numerical_cols))

        if has_categorical:
            self.categorical_transformer = self.preprocess_categorical_data(
                cat_cols__imputer, cat_cols__encoder)
            transformers.append(
                ('cat_cols', self.categorical_transformer, categorical_cols))

        self.columns_transfomer = ColumnTransformer(transformers=transformers)
        self.transformer_type = self.columns_transfomer

        return self

In [None]:
# show_doc(PreProcessor.preprocess_all_cols)

#### Test PreProcessor with House Data

In [None]:
# Load the home_data train and test CSVs through DataFrameLoader.from_csv,
# passing "Id" as idx_col and "SalePrice" as target.
dfl = DataFrameLoader().from_csv(
    train_file_path="input/home_data/train.csv",
    test_file_path="input/home_data/test.csv",
    idx_col="Id", target="SalePrice",
    random_state=42)
# bare expression: show the loader's repr as the cell output
dfl

2021-11-07 00:57:23,322 INFO DataFrame Memory usage decreased to 0.58 Mb (35.5% reduction)
2021-11-07 00:57:23,367 INFO DataFrame Memory usage decreased to 0.58 Mb (34.8% reduction)


DataFrameLoader object with attributes: X_full, X_test, X(features), y(target), X_train, X_valid, y_train and y_valid

In [None]:
# Create the train/validation split on the loader. valid_size=0.2 presumably
# holds out 20% of the rows for validation -- confirm against
# DataFrameLoader.create_train_valid. random_state is pinned to 42.
dfl.create_train_valid(valid_size=0.2, random_state = 42)

In [None]:
# Build the preprocessing for the columns of dfl using the default imputers,
# scaler and encoder. preprocess_all_cols returns the PreProcessor itself,
# so pp.columns_transfomer holds the resulting ColumnTransformer.
pp = PreProcessor().preprocess_all_cols(dataframeloader=dfl)
pp

PreProcessor object with attributes:numerical_transformer, categorical_transformer,columns_transfomer

In [None]:
# pp.columns_transfomer

In [None]:
# Number of columns the loader lists in numerical_cols
print(len(dfl.numerical_cols))
# dfl.numerical_cols

36


In [None]:
# Number of columns the loader lists in high_card_cat_cols
# (presumably the high-cardinality categorical columns -- confirm in DataFrameLoader)
print(len(dfl.high_card_cat_cols))
# dfl.high_card_cat_cols

3


In [None]:
# Number of columns the loader lists in low_card_cat_cols
# (presumably the low-cardinality categorical columns -- confirm in DataFrameLoader)
print(len(dfl.low_card_cat_cols))
# dfl.low_card_cat_cols

40


In [None]:
# Define the model: a random forest regressor with 100 trees and a fixed
# random_state; the bare `model` shows its repr as the cell output
model = RandomForestRegressor(n_estimators=100, random_state=42)
model

RandomForestRegressor(random_state=42)

In [None]:
# Bundle preprocessor and model in a pipeline: the ColumnTransformer built by
# PreProcessor is the first step and the random forest is the final step
pl = Pipeline(steps=[('preprocessor', pp.columns_transfomer),
                      ('model', model)
                     ])
# pl

In [None]:
# for visualizing pipeline: switch scikit-learn's estimator display to the
# diagram form, then show the pipeline as the cell output
from sklearn import set_config

set_config(display="diagram")
pl

In [None]:
# Fit the whole pipeline on the training split: the preprocessing step is
# fitted on X_train and the model is trained on the transformed features
pl.fit(dfl.X_train, dfl.y_train)

In [None]:
# Run the validation features through the fitted pipeline to get predictions
preds = pl.predict(dfl.X_valid)

# Report the mean absolute error of those predictions against y_valid
print('X_valid MAE:', mean_absolute_error(dfl.y_valid, preds))

X_valid MAE: 17634.989965753426


In [None]:
# X_valid MAE: 17582.46150684932

In [None]:
# hide
# run the script to build: notebook2script converts the project's notebooks
# (see the "Converted ..." lines it prints) -- presumably into the package's
# Python modules; confirm against the nbdev version in use

from nbdev.export import notebook2script; notebook2script()

Converted 00_DataFrameLoader.ipynb.
Converted 01_PreProcessor.ipynb.
Converted 02_MLPipeline.ipynb.
Converted 03_Tutorial.ipynb.
Converted 07_Kaggle_TPS_Tutorial.ipynb.
Converted Logger.ipynb.
Converted index.ipynb.
