In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# default_exp PreProcessor

# PreProcessor

> An API to preprocess train, validation and test datasets for Machine Learning models based on tabular or structured data

In [3]:
#hide
from nbdev.showdoc import *
from nbdev import *

In [4]:
# export
from tabular_ml_toolkit.DataFrameLoader import *

In [None]:
# export
# hide
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [None]:
# export

class PreProcessor:
    """
    Build (unfitted) scikit-learn preprocessing transformers for tabular data.

    The methods of this class only construct transformer objects; nothing is
    fitted here. Fitting happens when the selected transformer is used, for
    example as a step of a scikit-learn ``Pipeline``.

    Attributes:
    numerical_transformer: Pipeline (constant imputer + StandardScaler) for numerical columns
    categorical_transformer: Declared for compatibility; never assigned by this class
    OHE_categorical_transformer: Pipeline (most-frequent imputer + OneHotEncoder)
    OE_categorical_transformer: Pipeline (most-frequent imputer + OrdinalEncoder)
    ORE_categorical_transformer: Legacy name kept for compatibility; mirrors OE_categorical_transformer
    columns_transfomer: ColumnTransformer bundling the pipelines above (spelling kept, it is public API)
    transformer_type: The transformer selected by the most recent preprocess_*_for_* call
    """

    def __init__(self):
        """Create an empty PreProcessor; every transformer starts as None."""
        self.numerical_transformer = None
        self.categorical_transformer = None
        self.columns_transfomer = None
        self.transformer_type = None
        self.OHE_categorical_transformer = None
        # This is the attribute preprocess_OE_categorical_data() assigns; it was
        # previously missing here, so reading it before that call raised
        # AttributeError.
        self.OE_categorical_transformer = None
        # Legacy spelling that used to be declared instead of the one above.
        self.ORE_categorical_transformer = None

    def __str__(self):
        """Returns human readable string representation"""
        attr_str = "numerical_transformer, categorical_transformer,columns_transfomer"
        return "PreProcessor object with attributes:" + attr_str

    def __repr__(self):
        """Same text as __str__, so the object displays readably as a cell output."""
        return self.__str__()

    # PreProcessor core methods

    def preprocess_numerical_data(self):
        """Create the numerical pipeline: constant imputation, then standard scaling.

        No ``fill_value`` is passed to the imputer, so the scikit-learn default
        for the 'constant' strategy applies.
        """
        self.numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant')),
            ('scaler', StandardScaler())
        ])

    def preprocess_OHE_categorical_data(self):
        """Create the one-hot pipeline: most-frequent imputation, then OneHotEncoder.

        ``handle_unknown='ignore'`` is passed to the encoder so that categories
        not seen during fit do not raise at transform time.
        """
        self.OHE_categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

    def preprocess_OE_categorical_data(self):
        """Create the ordinal pipeline: most-frequent imputation, then OrdinalEncoder.

        NOTE(review): OrdinalEncoder is built with default arguments; confirm
        how the installed scikit-learn version treats categories that were not
        present during fit (relevant for high-cardinality columns).
        """
        self.OE_categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ordinal', OrdinalEncoder())
        ])
        # Keep the legacy attribute name pointing at the same pipeline.
        self.ORE_categorical_transformer = self.OE_categorical_transformer

    # TODO: add a dimensionality-reduction step (reduce_dims).

    def _bundle_column_transformers(self, transformers):
        """Wrap (name, transformer, columns) triples in a ColumnTransformer and select it."""
        self.columns_transfomer = ColumnTransformer(transformers=transformers)
        self.transformer_type = self.columns_transfomer

    def preprocess_all_cols_for_training(self, dataframeloader):
        """Bundle numerical, one-hot and ordinal preprocessing for training.

        Args:
            dataframeloader: object exposing ``numerical_cols``,
                ``low_card_cat_cols`` and ``high_card_cat_cols``.

        Returns:
            self, with ``columns_transfomer`` and ``transformer_type`` set to
            the new ColumnTransformer.
        """
        # create scikit-learn pipeline instances
        self.preprocess_numerical_data()
        self.preprocess_OHE_categorical_data()
        self.preprocess_OE_categorical_data()
        # convert to scikit-learn ColumnTransformer
        self._bundle_column_transformers([
            ('num', self.numerical_transformer, dataframeloader.numerical_cols),
            ('low_cad_cat', self.OHE_categorical_transformer,
             dataframeloader.low_card_cat_cols),
            ('high_cad_cat', self.OE_categorical_transformer,
             dataframeloader.high_card_cat_cols)
        ])
        return self

    def preprocess_cols_for_cv(self, cv_cols_type, dataframeloader):
        """Select the preprocessing used for cross-validation.

        Args:
            cv_cols_type: 'all' (numerical + low-cardinality categorical),
                'num' (numerical pipeline only) or 'cat' (low-cardinality
                categorical only).
            dataframeloader: object exposing ``numerical_cols`` and
                ``low_card_cat_cols``; not read for 'num'.

        Returns:
            self, with ``transformer_type`` set to the selected transformer.

        Raises:
            ValueError: if ``cv_cols_type`` is not one of 'num', 'cat', 'all'.
        """
        # High-cardinality columns are left out of every branch: they do not
        # get enough representation of their categories in each k-fold split.
        if cv_cols_type == "all":
            self.preprocess_numerical_data()
            self.preprocess_OHE_categorical_data()
            self._bundle_column_transformers([
                ('num', self.numerical_transformer, dataframeloader.numerical_cols),
                ('low_cad_cat', self.OHE_categorical_transformer,
                 dataframeloader.low_card_cat_cols)
            ])

        elif cv_cols_type == "num":
            # The bare pipeline is selected here, with no column selection
            # around it; columns_transfomer is left untouched.
            self.preprocess_numerical_data()
            self.transformer_type = self.numerical_transformer

        elif cv_cols_type == "cat":
            # One-hot encode the low-cardinality categorical columns only.
            self.preprocess_OHE_categorical_data()
            self._bundle_column_transformers([
                ('low_cad_cat', self.OHE_categorical_transformer,
                 dataframeloader.low_card_cat_cols)
            ])

        else:
            raise ValueError("Bad cv_cols_type, Only 'num','cat','all' are allowed!")

        return self

In [None]:
# showdoc(PreProcessor.preprocess_all_cols_for_training)

#### Test PreProcessor with House Data

In [None]:
# Load the house-price train/test CSVs through the project's DataFrameLoader,
# using "Id" as the index column and "SalePrice" as the target.
# NOTE(review): valid_size=0.2 / random_state=42 presumably control a seeded
# 80/20 train/validation split inside the loader - confirm against DataFrameLoader.
dfl = DataFrameLoader().from_csv(
    train_file_path="input/home_data/train.csv",
    test_file_path="input/home_data/test.csv",
    idx_col="Id", target="SalePrice",
    random_state=42, valid_size=0.2)
dfl

In [None]:
# Build the full training preprocessor (numerical + one-hot + ordinal pipelines)
# from the column lists held by the loader; the method returns the PreProcessor itself.
pp = PreProcessor().preprocess_all_cols_for_training(dataframeloader=dfl)
pp

In [None]:
# Display the (still unfitted) ColumnTransformer assembled in the previous cell.
pp.columns_transfomer

In [None]:
# Count and contents of the loader's numerical_cols (the columns routed to the 'num' transformer).
print(len(dfl.numerical_cols))
dfl.numerical_cols

In [None]:
# Count and contents of the loader's high_card_cat_cols (the columns routed to the ordinal-encoding pipeline).
print(len(dfl.high_card_cat_cols))
dfl.high_card_cat_cols

In [None]:
# Count and contents of the loader's low_card_cat_cols (the columns routed to the one-hot pipeline).
print(len(dfl.low_card_cat_cols))
dfl.low_card_cat_cols

In [None]:
# Define the model: a random forest regressor with 100 trees and a fixed
# random_state so repeated runs give the same forest.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model

In [None]:
# Bundle preprocessor and model in a single scikit-learn pipeline, so the same
# preprocessing is applied in both fit() and predict().
pl = Pipeline(steps=[('preprocessor', pp.columns_transfomer),
                      ('model', model)
                     ])
pl

In [None]:
# Switch scikit-learn's estimator display to the "diagram" mode and show the
# pipeline again to visualize its steps.
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell.
from sklearn import set_config

set_config(display="diagram")
pl

In [None]:
# Fit the whole pipeline on the training split: the preprocessor is fitted on
# X_train and the model is then fitted on the preprocessed output.
pl.fit(dfl.X_train, dfl.y_train)

In [None]:
# Run the fitted pipeline on the validation split to get predictions.
preds = pl.predict(dfl.X_valid)

# Report the mean absolute error of the predictions against the validation targets.
print('X_valid MAE:', mean_absolute_error(dfl.y_valid, preds))

In [None]:
# X_valid MAE: 17582.46150684932

In [None]:
# hide
# Run nbdev's notebook2script to build the library code from the notebooks.
# NOTE(review): presumably writes the cells tagged for export into the module
# named by the default_exp directive at the top of this notebook - confirm.

from nbdev.export import notebook2script; notebook2script()