In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# default_exp preprocessor

# PreProcessor

> An API to Preprocess train, valid and test dataset for Machine Learning models based on tabular or strucuture data

In [3]:
#hide
from nbdev.showdoc import *
from nbdev import *

In [4]:
# export
from tabular_ml_toolkit.dataframeloader import *
from tabular_ml_toolkit.logger import *

In [5]:
# export
# hide
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import numpy as np
import pandas as pd

In [6]:
# export

class PreProcessor:
    """
    Represent PreProcessor class
    
    Attributes:
    numerical_transformer: Numerical Columns Tranformer
    categorical_transformer: Categorical Columns Transformer
    preprocessor: Preprocessor for Columns Tranformer
    """

    def __init__(self):
        self.columns_transfomer = None
        self.target_cols_pl = None
        self.cat_cols_pl = None
        self.num_cols_pl = None
        
    
    def __str__(self):
        """Returns human readable string reprsentation"""
        attr_str = "numerical_transformer, categorical_transformer,columns_transfomer"
        return "PreProcessor object with attributes:" + attr_str
    
    def __repr__(self):
        return self.__str__()
    
    # PreProcessor Pipeline core methods
    
    # Create preprocessing pipeline for numerical data
    def create_num_cols_pp_pl(self, num_cols__imputer, num_cols__scaler):
        self.num_cols_pl = Pipeline(steps=[('imputer', num_cols__imputer), ('scaler',  num_cols__scaler)],
                                    #memory="pipeline_cache_dir"
                                   )
        
    # Create Preprocessing pipeline for categorical data
    def create_cat_cols_pp_pl(self, cat_cols__imputer, cat_cols__encoder):
        self.cat_cols_pl = Pipeline(steps=[('imputer', cat_cols__imputer), ('encoder', cat_cols__encoder)],
                                   #memory="pipeline_cache_dir"
                                   )
    
    # Create Preprocessing pipeline for target cols
    def create_target_cols_pp_pl(self, target_cols__encoder):
        self.target_cols_pl = Pipeline(steps=[('encoder', target_cols__encoder)],
                                      #memory="pipeline_cache_dir"
                                      )
    
    # Bundle preprocessing pipelines based upon types of columns
    def preprocess_all_cols(self, dataframeloader, problem_type="regression",
                            num_cols__imputer=SimpleImputer(strategy='constant'),
                            num_cols__scaler=StandardScaler(),
                            cat_cols__imputer=SimpleImputer(strategy='constant'),
                            cat_cols__encoder=OneHotEncoder(handle_unknown='ignore'),
                            target_cols__encoder=LabelEncoder()):
                            #cat_cols__encoder=OrdinalEncoder(handle_unknown='use_encoded_value',
                                                             #unknown_value=np.nan)):
        
        #if problem type classification encode target
        # encode target based upon problem type
        if "classification" in problem_type:
            logger.info("PreProcessing will include target(s) encoding!")
            #now just call fit tranform on y
            dataframeloader.y = target_cols__encoder.fit_transform(dataframeloader.y)
            #logger.info("Encoded dataframeloader.y:", dataframeloader.y)
        
        #TODO: REALLY NOT HAPPY WITH THIS DETERMINISTIC REPEATED FLOW
        # change preprocessor according to type of column found
        if len(dataframeloader.categorical_cols) < 1:
            logger.info("categorical columns are None, Preprocessing will done accordingly!")
            # create scikit-learn pipelines instance
            self.create_num_cols_pp_pl(num_cols__imputer, num_cols__scaler)
            #now setup columns tranformer
            self.columns_transfomer  = make_column_transformer(
                (self.num_cols_pl, dataframeloader.numerical_cols),
                remainder='passthrough', sparse_threshold=0
            )   
            
        elif len(dataframeloader.numerical_cols) < 1:
            logger.info("numerical columns are None, Preprocessing will done accordingly!")
            # create sklearn pipelines instance
            self.create_cat_cols_pp_pl(cat_cols__imputer, cat_cols__encoder)
            #now setup columns tranformer
            self.columns_transfomer = make_column_transformer(
                (self.cat_cols_pl, dataframeloader.categorical_cols),
                remainder='passthrough', sparse_threshold=0
            )
                
        
        else:
            # create scikit-learn pipelines instance
            logger.info("Both Numerical & Categorical columns found, Preprocessing will done accordingly!")
            self.create_num_cols_pp_pl(num_cols__imputer, num_cols__scaler)
            self.create_cat_cols_pp_pl(cat_cols__imputer, cat_cols__encoder)
            #now setup columns tranformer
            self.columns_transfomer  = make_column_transformer(
                (self.num_cols_pl, dataframeloader.numerical_cols),
                (self.cat_cols_pl, dataframeloader.categorical_cols),
                remainder='passthrough', sparse_threshold=0
            )
        #logger.info(f"self.transformer_type: {self.transformer_type}")
        
        return self.columns_transfomer

In [7]:
# showdoc(PreProcessor.preprocess_data)

#### Test PreProcessor with House Data

In [8]:
dfl = DataFrameLoader().from_csv(
    train_file_path="input/home_data/train.csv",
    test_file_path="input/home_data/test.csv",
    idx_col="Id", target="SalePrice",
    random_state=42)
dfl

2021-11-30 21:17:46,942 INFO DataFrame Memory usage decreased to 0.58 Mb (35.5% reduction)
2021-11-30 21:17:46,980 INFO DataFrame Memory usage decreased to 0.58 Mb (34.8% reduction)


DataFrameLoader object with attributes: X_full, X_test, X(features), y(target), X_train, X_valid, y_train and y_valid

In [9]:
dfl.X.shape
dfl.X_test.shape

(1459, 79)

In [10]:
# let's create train, valid split
dfl.create_train_valid(valid_size=0.2, random_state = 42)

In [11]:
pp = PreProcessor().preprocess_all_cols(dataframeloader=dfl, problem_type="regression")
pp

2021-11-30 21:17:47,084 INFO Both Numerical & Categorical columns found, Preprocessing will done accordingly!


ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                  transformers=[('pipeline-1',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='constant')),
                                                 ('scaler', StandardScaler())]),
                                 ['MSSubClass', 'LotFrontage', 'LotArea',
                                  'OverallQual', 'OverallCond', 'YearBuilt',
                                  'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
                                  'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                                  '1stFlrSF'...
                                 ['MSZoning', 'Street', 'Alley', 'LotShape',
                                  'LandContour', 'Utilities', 'LotConfig',
                                  'LandSlope', 'Condition1', 'Condition2',
                                  'BldgType', 'HouseStyle', 'RoofStyle',
        

In [12]:
# pp.columns_transfomer

In [13]:
print(len(dfl.numerical_cols))
# dfl.numerical_cols

36


In [14]:
print(len(dfl.high_card_cat_cols))
# dfl.high_card_cat_cols

3


In [15]:
print(len(dfl.low_card_cat_cols))
# dfl.low_card_cat_cols

40


In [16]:
pp

ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                  transformers=[('pipeline-1',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='constant')),
                                                 ('scaler', StandardScaler())]),
                                 ['MSSubClass', 'LotFrontage', 'LotArea',
                                  'OverallQual', 'OverallCond', 'YearBuilt',
                                  'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
                                  'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                                  '1stFlrSF'...
                                 ['MSZoning', 'Street', 'Alley', 'LotShape',
                                  'LandContour', 'Utilities', 'LotConfig',
                                  'LandSlope', 'Condition1', 'Condition2',
                                  'BldgType', 'HouseStyle', 'RoofStyle',
        

In [17]:
dfl.X.shape

(1460, 79)

In [18]:
X_np = pp.fit_transform(dfl.X)
print(type(X_np))
X_np.shape

<class 'numpy.ndarray'>


(1460, 304)

In [19]:
X_df = pd.DataFrame(X_np)
X_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,294,295,296,297,298,299,300,301,302,303
0,0.073375,0.212877,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.514104,0.575425,-0.288653,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-0.872563,0.645747,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57075,1.171992,-0.288653,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.073375,0.299451,0.07348,0.651479,-0.5172,0.984752,0.830215,0.325915,0.092907,-0.288653,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.309859,0.068587,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57075,-0.499274,-0.288653,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.073375,0.761179,0.375148,1.374795,-0.5172,0.951632,0.733308,1.366489,0.463568,-0.288653,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [20]:
X_test_np = pp.transform(dfl.X_test)
print(type(X_test_np))
X_test_np.shape

<class 'numpy.ndarray'>


(1459, 304)

In [21]:
X_train_np = pp.transform(dfl.X_train)
print(type(X_train_np))
X_train_np.shape

<class 'numpy.ndarray'>


(1168, 304)

In [22]:
X_valid_np = pp.transform(dfl.X_valid)
print(type(X_valid_np))
X_valid_np.shape

<class 'numpy.ndarray'>


(292, 304)

In [23]:
y_train = dfl.y_train
print(type(y_train))
y_valid = dfl.y_valid
print(type(y_valid))

In [24]:
# Define model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model

RandomForestRegressor(random_state=42)

In [25]:
# Preprocessing of training data and then fit model 
# model.fit(dfl.X_train, dfl.y_train)
model.fit(X_train, y_train)

RandomForestRegressor(random_state=42)

In [26]:
# Preprocessing of validation data and then get predictions
preds = model.predict(X_valid)

print('X_valid MAE:', mean_absolute_error(y_valid, preds))

X_valid MAE: 17585.421472602742


In [27]:
# X_valid MAE: 17582.46150684932

In [28]:
# hide
# run the script to build 

from nbdev.export import notebook2script; notebook2script()

Converted 00_dataframeloader.ipynb.
Converted 01_preprocessor.ipynb.
Converted 02_tmlt.ipynb.
Converted 04_xgb_optuna_objective.ipynb.
Converted 13_Kaggle_TPS_Tutorial.ipynb.
Converted Logger.ipynb.
Converted index.ipynb.
Converted kaggle_tps_fe_tutorial.ipynb.
Converted utility.ipynb.
Converted xgb_tabular_ml_toolkit.ipynb.
