In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# default_exp preprocessor

# PreProcessor

> An API to preprocess train, validation and test datasets for machine learning models based on tabular or structured data

In [3]:
#hide
from nbdev.showdoc import *
from nbdev import *

In [4]:
# export
from tabular_ml_toolkit.dataframeloader import *
from tabular_ml_toolkit.logger import *

In [5]:
# export
# hide
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import numpy as np

In [6]:
# export

class PreProcessor:
    """
    Build scikit-learn preprocessing pipelines for tabular data.

    `preprocess_all_cols` reads the `numerical_cols` and `categorical_cols`
    lists of a dataframeloader and assembles a `ColumnTransformer` that
    imputes/scales numerical columns and imputes/encodes categorical columns.

    Attributes:
    columns_transfomer: ColumnTransformer assembled by `preprocess_all_cols` (None until then)
    transformer_type: same object as `columns_transfomer`, set at the end of `preprocess_all_cols`
    target_cols_transformer: encoder fitted on the target when problem_type is "classification"
    target_cols_pl: Pipeline for target columns, set by `create_target_cols_pp_pl`
    cat_cols_pl: Pipeline (imputer -> encoder) for categorical columns
    num_cols_pl: Pipeline (imputer -> scaler) for numerical columns
    """

    def __init__(self):
        self.columns_transfomer = None
        self.transformer_type = None
        self.target_cols_transformer = None
        self.target_cols_pl = None
        self.cat_cols_pl = None
        self.num_cols_pl = None

    def __str__(self):
        """Returns human readable string reprsentation"""
        # NOTE: the names in this text do not all match the attributes set in
        # __init__; the text is kept unchanged because it is user-visible output.
        attr_str = "numerical_transformer, categorical_transformer,columns_transfomer"
        return "PreProcessor object with attributes:" + attr_str

    def __repr__(self):
        return self.__str__()

    # PreProcessor Pipeline core methods

    def create_num_cols_pp_pl(self, num_cols__imputer, num_cols__scaler):
        """Create the preprocessing pipeline for numerical data (imputer, then scaler)."""
        self.num_cols_pl = Pipeline(steps=[
            ('imputer', num_cols__imputer),
            ('scaler', num_cols__scaler)
        ])

    def create_cat_cols_pp_pl(self, cat_cols__imputer, cat_cols__encoder):
        """Create the preprocessing pipeline for categorical data (imputer, then encoder)."""
        self.cat_cols_pl = Pipeline(steps=[
            ('imputer', cat_cols__imputer),
            ('encoder', cat_cols__encoder)
        ])

    def create_target_cols_pp_pl(self, target_cols__encoder):
        """Create the preprocessing pipeline for target columns (encoder only)."""
        self.target_cols_pl = Pipeline(steps=[
            ('encoder', target_cols__encoder)
        ])

    def preprocess_all_cols(self, dataframeloader, problem_type="regression",
                            num_cols__imputer=None,
                            num_cols__scaler=None,
                            cat_cols__imputer=None,
                            cat_cols__encoder=None,
                            target_cols__encoder=None):
        """
        Bundle the preprocessing pipelines based upon the types of columns found.

        Args:
            dataframeloader: object exposing `numerical_cols`, `categorical_cols`
                and `y`; `y` is re-assigned when problem_type is "classification".
            problem_type: "classification" triggers target encoding; any other
                value leaves the target untouched.
            num_cols__imputer: imputer for numerical columns.
                Defaults to a new `SimpleImputer(strategy='constant')`.
            num_cols__scaler: scaler for numerical columns.
                Defaults to a new `StandardScaler()`.
            cat_cols__imputer: imputer for categorical columns.
                Defaults to a new `SimpleImputer(strategy='constant')`.
            cat_cols__encoder: encoder for categorical columns.
                Defaults to a new `OneHotEncoder(handle_unknown='ignore')`.
                (Alternative considered in the original code:
                `OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)`.)
            target_cols__encoder: encoder fitted on the target for classification.
                Defaults to a new `LabelEncoder()`.

        Returns:
            self, with `columns_transfomer` / `transformer_type` populated.
        """
        # Defaults are created per call. Estimator instances used as default
        # argument values would be built once at definition time and shared
        # (including any fitted state) by every call and every PreProcessor.
        if num_cols__imputer is None:
            num_cols__imputer = SimpleImputer(strategy='constant')
        if num_cols__scaler is None:
            num_cols__scaler = StandardScaler()
        if cat_cols__imputer is None:
            cat_cols__imputer = SimpleImputer(strategy='constant')
        if cat_cols__encoder is None:
            cat_cols__encoder = OneHotEncoder(handle_unknown='ignore')

        # encode target based upon problem type
        if problem_type == "classification":
            if target_cols__encoder is None:
                target_cols__encoder = LabelEncoder()
            logger.info("PreProcessing will include target(s) encoding!")
            # Only dataframeloader.y is re-assigned here; anything derived from
            # y before this call is not touched by this method.
            dataframeloader.y = target_cols__encoder.fit_transform(dataframeloader.y)
            # Keep the fitted encoder so callers can map predictions back to labels.
            self.target_cols_transformer = target_cols__encoder

        # Decide which column groups take part. The order of the checks matters:
        # when there are no categorical columns the numerical-only path is taken,
        # even if the numerical list is empty as well.
        if len(dataframeloader.categorical_cols) < 1:
            logger.info("categorical columns are None, Preprocessing will done accordingly!")
            use_num_cols, use_cat_cols = True, False
        elif len(dataframeloader.numerical_cols) < 1:
            logger.info("numerical columns are None, Preprocessing will done accordingly!")
            use_num_cols, use_cat_cols = False, True
        else:
            logger.info("Both Numerical & Categorical columns found, Preprocessing will done accordingly!")
            use_num_cols, use_cat_cols = True, True

        # Create only the pipelines that are needed; numerical goes first.
        transformers = []
        if use_num_cols:
            self.create_num_cols_pp_pl(num_cols__imputer, num_cols__scaler)
            transformers.append(
                ('num_cols', self.num_cols_pl, dataframeloader.numerical_cols))
        if use_cat_cols:
            self.create_cat_cols_pp_pl(cat_cols__imputer, cat_cols__encoder)
            transformers.append(
                ('cat_cols', self.cat_cols_pl, dataframeloader.categorical_cols))

        # now setup columns transformer and the final transformer type
        self.columns_transfomer = ColumnTransformer(transformers=transformers)
        self.transformer_type = self.columns_transfomer

        return self

In [7]:
# showdoc(PreProcessor.preprocess_all_cols)

#### Test PreProcessor with House Data

In [8]:
# Load the house-data train/test CSVs through DataFrameLoader, passing "Id" as
# the index column and "SalePrice" as the target; random_state is pinned to 42.
dfl = DataFrameLoader().from_csv(
    train_file_path="input/home_data/train.csv",
    test_file_path="input/home_data/test.csv",
    idx_col="Id", target="SalePrice",
    random_state=42)
# Show the loader's repr as the cell output
dfl

2021-11-20 01:10:37,554 INFO DataFrame Memory usage decreased to 0.58 Mb (35.5% reduction)
2021-11-20 01:10:37,589 INFO DataFrame Memory usage decreased to 0.58 Mb (34.8% reduction)


DataFrameLoader object with attributes: X_full, X_test, X(features), y(target), X_train, X_valid, y_train and y_valid

In [9]:
# Create the train/valid split on the loader.
# valid_size=0.2 presumably holds out 20% of the rows for validation — confirm against DataFrameLoader.
dfl.create_train_valid(valid_size=0.2, random_state = 42)

In [10]:
# Build the preprocessing ColumnTransformer for the loaded data. With
# problem_type="regression" the target is not encoded. The call returns the
# PreProcessor itself, so the transformer is available as pp.columns_transfomer.
pp = PreProcessor().preprocess_all_cols(dataframeloader=dfl, problem_type="regression")
pp

2021-11-20 01:10:37,672 INFO Both Numerical & Categorical columns found, Preprocessing will done accordingly!


PreProcessor object with attributes:numerical_transformer, categorical_transformer,columns_transfomer

In [11]:
# pp.columns_transfomer

In [12]:
# Number of entries in the loader's numerical_cols list
print(len(dfl.numerical_cols))
# dfl.numerical_cols

36


In [13]:
# Number of entries in the loader's high_card_cat_cols list
# (presumably high-cardinality categorical columns — confirm against DataFrameLoader)
print(len(dfl.high_card_cat_cols))
# dfl.high_card_cat_cols

3


In [14]:
# Number of entries in the loader's low_card_cat_cols list
# (presumably low-cardinality categorical columns — confirm against DataFrameLoader)
print(len(dfl.low_card_cat_cols))
# dfl.low_card_cat_cols

40


In [15]:
# Define model: a random forest regressor with 100 estimators and a fixed
# random_state; the bare `model` below displays it as the cell output
model = RandomForestRegressor(n_estimators=100, random_state=42)
model

RandomForestRegressor(random_state=42)

In [16]:
# Bundle preprocessor and model in a pipeline: the 'preprocessor' step is the
# ColumnTransformer built above and the 'model' step is the estimator
pl = Pipeline(steps=[('preprocessor', pp.columns_transfomer),
                      ('model', model)
                     ])
# pl

In [17]:
# for visualizing pipeline: switch scikit-learn's estimator display to
# "diagram", then show the pipeline as the cell output
from sklearn import set_config

set_config(display="diagram")
pl

In [18]:
# Fit the whole pipeline (preprocessing step, then model) on the training split
pl.fit(dfl.X_train, dfl.y_train)

In [19]:
# Run the fitted pipeline on the validation features to get predictions
preds = pl.predict(dfl.X_valid)

# Report the mean absolute error of those predictions against the validation target
print('X_valid MAE:', mean_absolute_error(dfl.y_valid, preds))

X_valid MAE: 17588.3151369863


In [20]:
# X_valid MAE: 17582.46150684932

In [21]:
# hide
# Run nbdev's notebook2script to build the library modules from the notebooks

from nbdev.export import notebook2script; notebook2script()

Converted 00_dataframeloader.ipynb.
Converted 01_preprocessor.ipynb.
Converted 02_tmlt.ipynb.
Converted 03_Tutorial.ipynb.
Converted 04_optuna_objective.ipynb.
Converted 07_Kaggle_TPS_Tutorial.ipynb.
Converted do_optuna_opt_tutorial.ipynb.
Converted index.ipynb.
Converted logger.ipynb.
