In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# default_exp PreProcessor

# PreProcessor

> An API to preprocess train, validation and test datasets for machine learning models based on tabular or structured data

In [3]:
#hide
from nbdev.showdoc import *
from nbdev import *

In [4]:
# export
from tabular_ml_toolkit.DataFrameLoader import *

In [5]:
# export
# hide
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [6]:
# export

class PreProcessor:
    """
    Build (unfitted) scikit-learn preprocessing transformers for tabular data.

    The class only constructs transformers; nothing is fitted here. Fitting
    happens when `columns_transfomer` is used, e.g. as a step of a
    scikit-learn Pipeline.

    Attributes:
    numerical_transformer: SimpleImputer used for the numerical columns
    OHE_categorical_transformer: impute + OneHotEncoder Pipeline, applied to
        `dataframeloader.low_card_cat_cols`
    ORE_categorical_transformer: impute + OrdinalEncoder Pipeline, applied to
        `dataframeloader.high_card_cat_cols`
    categorical_transformer: never populated by this class; kept only so that
        existing attribute access keeps working
    columns_transfomer: ColumnTransformer bundling the transformers above
        (the spelling is kept as-is because callers read this attribute)
    """

    def __init__(self):
        self.numerical_transformer = None
        self.categorical_transformer = None
        # Declared up front so they exist (as None) before any preprocess_* call.
        self.OHE_categorical_transformer = None
        self.ORE_categorical_transformer = None
        self.columns_transfomer = None

    def __str__(self):
        """Returns human readable string representation"""
        # Text intentionally unchanged: it is what notebooks/logs already display.
        attr_str = "numerical_transformer, categorical_transformer,columns_transfomer"
        return "PreProcessor object with attributes:" + attr_str

    def __repr__(self):
        return self.__str__()

    # PreProcess core methods
    def preprocess_numerical_data(self):
        """Create the numerical transformer and store it on `numerical_transformer`.

        Uses SimpleImputer(strategy='constant') without an explicit fill_value,
        so scikit-learn's default constant is used for missing values.
        """
        self.numerical_transformer = SimpleImputer(strategy='constant')

    def preprocess_OHE_categorical_data(self):
        """Create the impute + one-hot Pipeline and store it on `OHE_categorical_transformer`."""
        self.OHE_categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            # Categories unseen during fit are ignored instead of raising.
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

    def preprocess_ORE_categorical_data(self):
        """Create the impute + ordinal Pipeline and store it on `ORE_categorical_transformer`."""
        self.ORE_categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            # The step is named 'onehot' although it wraps an OrdinalEncoder; the
            # name is kept because step names are part of the pipeline's
            # parameter paths and renaming would break existing users.
            # NOTE(review): handle_unknown='error' makes categories unseen during
            # fit raise at transform time - confirm this is intended.
            ('onehot', OrdinalEncoder(handle_unknown='error'))
        ])

    def preprocess_data_for_training(self, dataframeloader):
        """Bundle numerical and categorical preprocessing into one ColumnTransformer.

        Rebuilds all three transformers, then stores a ColumnTransformer over
        them on `columns_transfomer`.

        Args:
            dataframeloader: object exposing `numerical_cols`,
                `low_card_cat_cols` and `high_card_cat_cols`.

        Returns:
            self, so the call can be chained.
        """
        # create scikit-learn pipelines instances
        self.preprocess_numerical_data()
        self.preprocess_OHE_categorical_data()
        self.preprocess_ORE_categorical_data()
        # convert to Scikit-learn ColumnTransformer
        self.columns_transfomer = ColumnTransformer(
            transformers=[
                ('num', self.numerical_transformer, dataframeloader.numerical_cols),
                ('low_cad_cat', self.OHE_categorical_transformer,
                 dataframeloader.low_card_cat_cols),
                ('high_cad_cat', self.ORE_categorical_transformer,
                 dataframeloader.high_card_cat_cols)
            ])
        return self

    def preprocess_data_for_cv(self, cv_cols_type, dataframeloader):
        """Build `columns_transfomer` for a chosen subset of column types.

        Args:
            cv_cols_type: "all" (numerical + categorical, same as
                `preprocess_data_for_training`), "num" (numerical columns only)
                or "cat" (low and high cardinality categorical columns only).
            dataframeloader: object exposing `numerical_cols`,
                `low_card_cat_cols` and `high_card_cat_cols`.

        Returns:
            self, so the call can be chained.

        Raises:
            ValueError: if `cv_cols_type` is not one of "all", "num", "cat".
        """
        if cv_cols_type == "all":
            # The call stores the ColumnTransformer on self; its return value is
            # the PreProcessor itself and must not be assigned to columns_transfomer.
            self.preprocess_data_for_training(dataframeloader)

        elif cv_cols_type == "num":
            # Build the transformer on demand so this works on a fresh instance;
            # an already existing transformer is reused untouched.
            if self.numerical_transformer is None:
                self.preprocess_numerical_data()
            self.columns_transfomer = ColumnTransformer(
                transformers=[
                    ('num', self.numerical_transformer, dataframeloader.numerical_cols)
                ])

        elif cv_cols_type == "cat":
            if self.OHE_categorical_transformer is None:
                self.preprocess_OHE_categorical_data()
            if self.ORE_categorical_transformer is None:
                self.preprocess_ORE_categorical_data()
            self.columns_transfomer = ColumnTransformer(
                transformers=[
                    ('low_cad_cat', self.OHE_categorical_transformer,
                     dataframeloader.low_card_cat_cols),
                    ('high_cad_cat', self.ORE_categorical_transformer,
                     dataframeloader.high_card_cat_cols)
                ])
        else:
            raise ValueError("Bad cv_cols_type, Only 'num','cat','all' are allowed!")
        return self

In [7]:
# showdoc(PreProcessor.preprocess_data_for_training)

#### Test PreProcessor with House Data

In [8]:
# Load the home_data train/test CSVs through the project's DataFrameLoader,
# using "Id" as the index column and "SalePrice" as the target.
# NOTE(review): valid_size=0.2 and random_state=42 presumably drive a
# train/validation split inside from_csv - confirm against DataFrameLoader.
dfl = DataFrameLoader().from_csv(
    train_file_path="input/home_data/train.csv",
    test_file_path="input/home_data/test.csv",
    idx_col="Id", target="SalePrice",
    random_state=42, valid_size=0.2)
dfl

DataFrameLoader object with attributes: X_full, X_test, X(features), y(target), X_train, X_valid, y_train and y_valid

In [9]:
# Build the (unfitted) ColumnTransformer from the column groups exposed by dfl.
# preprocess_data_for_training returns the PreProcessor itself, so pp is the
# PreProcessor instance, not the ColumnTransformer.
pp = PreProcessor().preprocess_data_for_training(dataframeloader=dfl)
pp

PreProcessor object with attributes:numerical_transformer, categorical_transformer,columns_transfomer

In [10]:
# Inspect the bundled ColumnTransformer (the attribute is spelled "transfomer" in the class).
pp.columns_transfomer

ColumnTransformer(transformers=[('num', SimpleImputer(strategy='constant'),
                                 ['MSSubClass', 'LotFrontage', 'LotArea',
                                  'OverallQual', 'OverallCond', 'YearBuilt',
                                  'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
                                  'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                                  '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
                                  'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
                                  'FullBath', 'HalfBath', 'BedroomAbvGr',
                                  'K...
                                  'RoofMatl', 'MasVnrType', 'ExterQual',
                                  'ExterCond', 'Foundation', 'BsmtQual',
                                  'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                                  'BsmtFinType2', 'Heating', 'HeatingQC',
                                  'CentralAir', 'Elect

In [11]:
# Numerical columns exposed by the loader; these are routed to the 'num' transformer.
print(len(dfl.numerical_cols))
dfl.numerical_cols

36


['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [12]:
# High cardinality categorical columns; these are routed to the ordinal-encoding pipeline.
print(len(dfl.high_card_cat_cols))
dfl.high_card_cat_cols

3


['Neighborhood', 'Exterior1st', 'Exterior2nd']

In [13]:
# Low cardinality categorical columns; these are routed to the one-hot-encoding pipeline.
print(len(dfl.low_card_cat_cols))
dfl.low_card_cat_cols

40


['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [14]:
# Define model
# random_state is fixed so repeated runs give the same forest; n_estimators=100
# does not show up in the repr below because it equals the estimator's default.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model

RandomForestRegressor(random_state=42)

In [15]:
# Bundle preprocessor and model in a pipeline, so the same preprocessing is
# applied on both fit and predict.
pl = Pipeline(steps=[('preprocessor', pp.columns_transfomer),
                      ('model', model)
                     ])
pl

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='constant'),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCond', 'YearBuilt',
                                                   'YearRemodAdd', 'MasVnrArea',
                                                   'BsmtFinSF1', 'BsmtFinSF2',
                                                   'BsmtUnfSF', 'TotalBsmtSF',
                                                   '1stFlrSF', '2ndFlrSF',
                                                   'LowQualFinSF', 'GrLivArea',
                                                   'BsmtFullBath',
                                                   'BsmtHalfBath', 'FullBa...
                                                 

In [16]:
# Preprocess the training data, then fit the model on it
pl.fit(dfl.X_train, dfl.y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='constant'),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCond', 'YearBuilt',
                                                   'YearRemodAdd', 'MasVnrArea',
                                                   'BsmtFinSF1', 'BsmtFinSF2',
                                                   'BsmtUnfSF', 'TotalBsmtSF',
                                                   '1stFlrSF', '2ndFlrSF',
                                                   'LowQualFinSF', 'GrLivArea',
                                                   'BsmtFullBath',
                                                   'BsmtHalfBath', 'FullBa...
                                                 

In [17]:
# Preprocess the validation data with the fitted pipeline, then predict
preds = pl.predict(dfl.X_valid)

# Mean absolute error between validation targets and predictions
print('X_valid MAE:', mean_absolute_error(dfl.y_valid, preds))

X_valid MAE: 17582.46150684932


In [18]:
# X_valid MAE: 17582.46150684932

In [19]:
# hide
# run the script to build 

# Export every notebook's `# export` cells to the library modules.
from nbdev.export import notebook2script

notebook2script()

Converted 00_DataFrameLoader.ipynb.
Converted 01_PreProcessor.ipynb.
Converted 02_MLPipeline.ipynb.
Converted Tutorial.ipynb.
Converted index.ipynb.
