In [1]:
# using %matplotlib inline to immediatly draw plot after run
%matplotlib inline 

# imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import dotenv
import os
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.compose import ColumnTransformer


"""prepare environment variable"""
project_dir = os.path.join(os.path.abspath(''), os.pardir)
dotenv_path = os.path.join(project_dir, '.env')
dotenv.load_dotenv(dotenv_path)

True

In [2]:
# functions
def check_path_existance(path):
    return os.path.exists(path)

In [43]:
dataset_folder = os.path.join(project_dir, os.getenv('RAW_FOLDER'), 'house-prices-advanced-regression-techniques')
train_df = None
test_df = None

if check_path_existance(dataset_folder):   
    train_df = pd.read_csv(os.path.join(dataset_folder, 'train.csv'))
    test_df = pd.read_csv(os.path.join(dataset_folder, 'test.csv'))

In [101]:
train_df.loc[:, ('MSSubClass')] = train_df.loc[:, ('MSSubClass')].astype(str)
test_df.loc[:, ('MSSubClass')] = test_df.loc[:, ('MSSubClass')].astype(str)

numerical_columns = train_df.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = train_df.select_dtypes(include=['object']).columns

# Data Preparation

## Train Data Preprocessing

In [116]:
"""Numerical Data Preprocessing"""
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns) -> None:
        super().__init__()
        self.columns = columns
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.loc[:, self.columns]
    

class DFSimpleImputer(SimpleImputer):
    def __init__(self, **kwargs):
        super(DFSimpleImputer, self).__init__(**kwargs)
    
    def transform(self, X):
        X_imputed = super().transform(X)
        X_imputed = pd.DataFrame(X_imputed, index=X.index, columns=X.columns)
        return X_imputed



numerical_pipeline = Pipeline([
    ('column_selector', ColumnSelector(columns=['LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', 
            '2ndFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF'])),
    ('simple_imputer', SimpleImputer(strategy='mean')),
    ('logger', FunctionTransformer(np.log1p))
])


(1460, 17)

In [117]:
"""Categorical Data Preprocessing"""
class MissingCategoricalOnPurposeImputer(BaseEstimator, TransformerMixin):
    def __init__(self, columns) -> None:
        super().__init__()
        self.columns = columns
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        imputer = SimpleImputer(strategy='constant', fill_value='Missing')
        X[self.columns] = imputer.fit_transform(X[self.columns])
        return X
    

# def replace_missing_categorical_column(df: pd.DataFrame) -> pd.DataFrame:
#     try:
#         mode_categorical_columns = {}
#         for c in df.select_dtypes(include=['object']).columns:
#             if c not in categorical_columns_nan_on_purpose:
#                 mode_ = df[c].mode()[0]
#                 df[c].fillna(mode_, inplace=True)
#                 mode_categorical_columns[c] = mode_  
#         return (df, mode_categorical_columns)
#     except AttributeError:
#         print("Make sure to input Pandas DataFrame")
#         return None
# X_categorical, mode_categorical_columns = replace_missing_categorical_column(X_categorical)
    

class ModifiedOrdinalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, ordinal_categorical_columns) -> None:
        super().__init__()
        self.ordinal_categorical_columns = ordinal_categorical_columns
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        for k, v in self.ordinal_categorical_columns.items():
            ordinal_encoder = OrdinalEncoder(categories=[v])
            X[k] = ordinal_encoder.fit_transform(X[[k]]).astype(int)
        return X
    
class ModifiedOneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, one_hot_categorical_columns) -> None:
        super().__init__()
        self.one_hot_categorical_columns = one_hot_categorical_columns
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        for k, v in self.one_hot_categorical_columns.items():
            one_hot_encoder = OneHotEncoder(categories=[v], sparse_output=False, handle_unknown='ignore')
            X_encoded = pd.DataFrame(one_hot_encoder.fit_transform(X[[k]]))
            X_encoded.columns = one_hot_encoder.get_feature_names_out([k])
            X.drop(columns=[k], inplace=True)
            X = pd.concat([X, X_encoded], axis=1)
        return X


ordinal_categorical_columns ={
    "ExterQual": ["Ex", "Gd", "TA", "Fa", "Po"], 
    "ExterCond": ["Ex", "Gd", "TA", "Fa", "Po"], 
    "BsmtQual":  ["Ex", "Gd", "TA", "Fa", "Po", "Missing"],
    "BsmtCond": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"],
    "BsmtExposure": ["Gd", "Av", "Mn", "No", "Missing"],
    "BsmtFinType1": ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "Missing"],
    "BsmtFinType2": ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "Missing"],
    "HeatingQC": ["Ex", "Gd", "TA", "Fa", "Po"], 
    "KitchenQual": ["Ex", "Gd", "TA", "Fa", "Po"], 
    "FireplaceQu": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"],
    "GarageFinish": ["Fin", "RFn", "Unf", "Missing"], 
    "GarageQual": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"], 
    "GarageCond": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"], 
    "PoolQC": ["Ex", "Gd", "TA", "Fa", "Missing"], 
    "Fence": ["GdPrv", "MnPrv", "GdWo", "MnWw", "Missing"]
} # gather ordinal categorical column

one_hot_categorical_columns = {
    "MSSubClass": ["20", "30", "40", "45",	"50", "60", "70", "75", "80", "85", "90", "120", "150", "160", "180", "190"],
    "MSZoning": ["A", "C", "FV", "I", "RH", "RL", "RP", "RM"],
    "Street": ["Pave", "Grvl"],
    "Alley": ["Missing", "Grvl", "Pave"],
    "LotShape": ["Reg", "IR1", "IR2", "IR3"],
    "LandContour": ["Lvl", "Bnk", "Low", "HLS"],
    "Utilities": ["AllPub", "NoSewr", "NoSeWa", "ELO"],
    "LotConfig": ["Inside", "FR2", "Corner", "CulDSac", "FR3"],
    "LandSlope": ["Gtl", "Mod", "Sev"],
    "Neighborhood": ["CollgCr", "Veenker", "Crawfor", "NoRidge", "Mitchel", "Somerst", "NWAmes", "OldTown", "BrkSide", "Sawyer", "NridgHt", "NAmes", "SawyerW", "IDOTRR", "MeadowV", "Edwards", "Timber", "Gilbert", "StoneBr", "ClearCr", "NPkVill", "Blmngtn", "BrDale", "SWISU", "Blueste"],
    "Condition1": ["Norm", "Feedr", "PosN", "Artery", "RRAe", "RRNn", "RRAn", "PosA", "RRNe"],
    "Condition2": ["Norm", "Artery", "RRNn", "Feedr", "PosN", "PosA", "RRAn", "RRAe", "RRNe"],
    "BldgType": ["1Fam", "2fmCon", "Duplex", "TwnhsE", "Twnhs"],
    "HouseStyle": ["2Story", "1Story", "1.5Fin", "1.5Unf", "SFoyer", "SLvl", "2.5Unf", "2.5Fin"],
    "RoofStyle": ["Gable", "Hip", "Gambrel", "Mansard", "Flat", "Shed"],
    "RoofMatl": ["CompShg", "WdShngl", "Metal", "WdShake", "Membran", "Tar&Grv", "Roll", "ClyTile"],
    "Exterior1st": ["AsbShng", "AsphShn", "BrkComm", "BrkFace", "CBlock", "CemntBd", "HdBoard",	"ImStucc", "MetalSd", "Other", "Plywood", "PreCast", "Stone", "Stucco", "VinylSd", "Wd Sdng", "WdShing"],
    "Exterior2nd": ["AsbShng", "AsphShn", "BrkComm", "BrkFace", "CBlock", "CemntBd", "HdBoard",	"ImStucc", "MetalSd", "Other", "Plywood", "PreCast", "Stone", "Stucco", "VinylSd", "Wd Sdng", "WdShing"],
    "MasVnrType": ["BrkFace", "Missing", "Stone", "BrkCmn", "CBlock"],
    "Foundation": ["PConc", "CBlock", "BrkTil", "Wood", "Slab", "Stone"],
    "Heating": ["GasA", "GasW", "Grav", "Wall", "OthW", "Floor"],
    "CentralAir": ["Y", "N"],
    "Electrical": ["SBrkr", "FuseF", "FuseA", "FuseP", "Mix", "Missing"],
    "Functional": ["Typ", "Min1", "Maj1", "Min2", "Mod", "Maj2", "Sev", "Sal"],
    "GarageType": ["Attchd", "Detchd", "BuiltIn", "CarPort", "Missing", "Basment", "2Types"],
    "PavedDrive": ["Y", "N", "P"],
    "MiscFeature": ["Missing", "Shed", "Gar2", "Othr", "TenC", "Elev"],
    "SaleType": ["WD", "New", "COD", "ConLD", "ConLI", "CWD", "ConLw", "Con", "Oth", "VWD"],
    "SaleCondition": ["Normal", "Abnorml", "Partial", "AdjLand", "Alloca", "Family"]
} # one hot encoder column



categorical_pipeline = Pipeline([
    ('column_selector', ColumnSelector(columns=categorical_columns)),
    ('missing_categorical_on_purpose_imputer', MissingCategoricalOnPurposeImputer(columns=[
        'Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
        'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual','GarageCond', 'PoolQC', 'Fence', 'MiscFeature'])),
    ('missing_categorical_imputer', DFSimpleImputer(strategy='most_frequent')),
    ('categorical_encoder', ColumnTransformer(
        [
            ('ordinal_encoder', ModifiedOrdinalEncoder(ordinal_categorical_columns), list(ordinal_categorical_columns.keys())),
            ('one_hot_encoder', ModifiedOneHotEncoder(one_hot_categorical_columns), list(one_hot_categorical_columns.keys()))
        ], remainder="passthrough"
    )),
])

(1460, 233)

In [None]:
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('numerical', numerical_pipeline),
        ('categorical', categorical_pipeline),
    ])),
    ('scale', StandardScaler())  #  scale all features
])

In [59]:
pipeline=Pipeline([
    ('features', FeatureUnion([
        ('numerical', Pipeline([
            ('column_selector', ColumnSelector(columns=['LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', 
                    '2ndFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF'])),
            ('simple_imputer', SimpleImputer(strategy='mean')),
            ('logger', FunctionTransformer(np.log1p))
        ])),
        ('categorical', Pipeline([
            ('column_selector', ColumnSelector(columns=categorical_columns)),
            ('missing_categorical_on_purpose_imputer', MissingCategoricalOnPurposeImputer(columns=[
                'Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual','GarageCond', 'PoolQC', 'Fence', 'MiscFeature'])),
            ('categorical_encoder', ColumnTransformer(
                [
                    ('ordinal_encoder', ModifiedOrdinalEncoder(ordinal_categorical_columns), list(ordinal_categorical_columns.keys())),
                    ('one_hot_encoder', ModifiedOneHotEncoder(one_hot_categorical_columns), list(one_hot_categorical_columns.keys()))
                ], remainder="passthrough"
            )),
        ])),
    ])),
    ('scale', MinMaxScaler())
])
pipeline.fit_transform(train_df)

array([[0.41326841, 0.81319602, 0.95095099, ..., 0.        , 0.        ,
        0.        ],
       [0.49030656, 0.73486691, 0.76014346, ..., 0.        , 0.        ,
        0.        ],
       [0.42998996, 0.81319602, 0.93690552, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.41892525, 0.81319602, 0.50888447, ..., 0.        , 0.        ,
        0.        ],
       [0.42998996, 0.6444426 , 0.57392381, ..., 0.        , 0.        ,
        0.        ],
       [0.46633838, 0.6444426 , 0.68165888, ..., 0.        , 0.        ,
        0.        ]])

In [91]:
def train_data_feature_preprocessing(X):
    # Convert MSSubClass and MoSold into categorical column
    X.loc[:, ('MSSubClass', 'MoSold')] = X.loc[:, ('MSSubClass', 'MoSold')].astype(str)

    # Remove unwanted column
    X.drop(columns=['Id'], inplace=True)


    """Preprocess Numerical Columns"""
    X_numerical = X.select_dtypes(include=['int64', 'float64'])
    # Numerical Feature Selection
    selected_numerical_columns = [
        'LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 
        'MasVnrArea', 'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', 
        '2ndFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 
        'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF'
    ]
    dropped_numerical_columns = [c for c in X_numerical.columns if c not in selected_numerical_columns]
    X_numerical.drop(columns=dropped_numerical_columns, inplace=True)


    # Fill missing numerical column using the mean of data
    class DFSimpleImputer(SimpleImputer):
        def __init__(self, **kwargs):
            super(DFSimpleImputer, self).__init__(**kwargs)
        
        def transform(self, X):
            X_imputed = super().transform(X)
            X_imputed = pd.DataFrame(X_imputed, index=X.index, columns=X.columns)
            return X_imputed

    num_imputer = DFSimpleImputer(strategy='mean')
    X_numerical = num_imputer.fit_transform(X_numerical) 

    # log transform
    X_numerical = np.log1p(X_numerical)

    """Preprocess Categorical Data"""
    X_categorical = X.select_dtypes(include=['object'])

    # Fill NA categorical column on purpose
    categorical_columns_nan_on_purpose = [
        'Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
        'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 
        'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
    ]
    cat_imputer = DFSimpleImputer(strategy='constant', fill_value='Missing')
    X_categorical[categorical_columns_nan_on_purpose] = cat_imputer.fit_transform(X_categorical[categorical_columns_nan_on_purpose])

    # Fill NA categorical column not on purpose
    def replace_missing_categorical_column(df: pd.DataFrame) -> pd.DataFrame:
        try:
            mode_categorical_columns = {}
            for c in df.select_dtypes(include=['object']).columns:
                if c not in categorical_columns_nan_on_purpose:
                    mode_ = df[c].mode()[0]
                    df[c].fillna(mode_, inplace=True)
                    mode_categorical_columns[c] = mode_  
            return (df, mode_categorical_columns)
        except AttributeError:
            print("Make sure to input Pandas DataFrame")
            return None
    X_categorical, mode_categorical_columns = replace_missing_categorical_column(X_categorical)

    # Perform Ordinal Encoding
    ordinal_categorical_columns = {
        "ExterQual": ["Ex", "Gd", "TA", "Fa", "Po"], 
        "ExterCond": ["Ex", "Gd", "TA", "Fa", "Po"], 
        "BsmtQual":  ["Ex", "Gd", "TA", "Fa", "Po", "Missing"],
        "BsmtCond": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"],
        "BsmtExposure": ["Gd", "Av", "Mn", "No", "Missing"],
        "BsmtFinType1": ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "Missing"],
        "BsmtFinType2": ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "Missing"],
        "HeatingQC": ["Ex", "Gd", "TA", "Fa", "Po"], 
        "KitchenQual": ["Ex", "Gd", "TA", "Fa", "Po"], 
        "FireplaceQu": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"],
        "GarageFinish": ["Fin", "RFn", "Unf", "Missing"], 
        "GarageQual": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"], 
        "GarageCond": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"], 
        "PoolQC": ["Ex", "Gd", "TA", "Fa", "Missing"], 
        "Fence": ["GdPrv", "MnPrv", "GdWo", "MnWw", "Missing"]
    } # gather ordinal categorical column
    for k, v in ordinal_categorical_columns.items():
        ordinal_encoder = OrdinalEncoder(categories=[v])
        X_categorical[k] = ordinal_encoder.fit_transform(X_categorical[[k]]).astype(int)

    # Perform One Hot Encoding
    one_hot_categorical_columns = {
        "MSSubClass": ["20", "30", "40", "45",	"50", "60", "70", "75", "80", "85", "90", "120", "150", "160", "180", "190"],
        "MSZoning": ["A", "C", "FV", "I", "RH", "RL", "RP", "RM"],
        "Street": ["Pave", "Grvl"],
        "Alley": ["Missing", "Grvl", "Pave"],
        "LotShape": ["Reg", "IR1", "IR2", "IR3"],
        "LandContour": ["Lvl", "Bnk", "Low", "HLS"],
        "Utilities": ["AllPub", "NoSewr", "NoSeWa", "ELO"],
        "LotConfig": ["Inside", "FR2", "Corner", "CulDSac", "FR3"],
        "LandSlope": ["Gtl", "Mod", "Sev"],
        "Neighborhood": ["CollgCr", "Veenker", "Crawfor", "NoRidge", "Mitchel", "Somerst", "NWAmes", "OldTown", "BrkSide", "Sawyer", "NridgHt", "NAmes", "SawyerW", "IDOTRR", "MeadowV", "Edwards", "Timber", "Gilbert", "StoneBr", "ClearCr", "NPkVill", "Blmngtn", "BrDale", "SWISU", "Blueste"],
        "Condition1": ["Norm", "Feedr", "PosN", "Artery", "RRAe", "RRNn", "RRAn", "PosA", "RRNe"],
        "Condition2": ["Norm", "Artery", "RRNn", "Feedr", "PosN", "PosA", "RRAn", "RRAe", "RRNe"],
        "BldgType": ["1Fam", "2fmCon", "Duplex", "TwnhsE", "Twnhs"],
        "HouseStyle": ["2Story", "1Story", "1.5Fin", "1.5Unf", "SFoyer", "SLvl", "2.5Unf", "2.5Fin"],
        "RoofStyle": ["Gable", "Hip", "Gambrel", "Mansard", "Flat", "Shed"],
        "RoofMatl": ["CompShg", "WdShngl", "Metal", "WdShake", "Membran", "Tar&Grv", "Roll", "ClyTile"],
        "Exterior1st": ["AsbShng", "AsphShn", "BrkComm", "BrkFace", "CBlock", "CemntBd", "HdBoard",	"ImStucc", "MetalSd", "Other", "Plywood", "PreCast", "Stone", "Stucco", "VinylSd", "Wd Sdng", "WdShing"],
        "Exterior2nd": ["AsbShng", "AsphShn", "BrkComm", "BrkFace", "CBlock", "CemntBd", "HdBoard",	"ImStucc", "MetalSd", "Other", "Plywood", "PreCast", "Stone", "Stucco", "VinylSd", "Wd Sdng", "WdShing"],
        "MasVnrType": ["BrkFace", "Missing", "Stone", "BrkCmn", "CBlock"],
        "Foundation": ["PConc", "CBlock", "BrkTil", "Wood", "Slab", "Stone"],
        "Heating": ["GasA", "GasW", "Grav", "Wall", "OthW", "Floor"],
        "CentralAir": ["Y", "N"],
        "Electrical": ["SBrkr", "FuseF", "FuseA", "FuseP", "Mix", "Missing"],
        "Functional": ["Typ", "Min1", "Maj1", "Min2", "Mod", "Maj2", "Sev", "Sal"],
        "GarageType": ["Attchd", "Detchd", "BuiltIn", "CarPort", "Missing", "Basment", "2Types"],
        "PavedDrive": ["Y", "N", "P"],
        "MiscFeature": ["Missing", "Shed", "Gar2", "Othr", "TenC", "Elev"],
        "SaleType": ["WD", "New", "COD", "ConLD", "ConLI", "CWD", "ConLw", "Con", "Oth", "VWD"],
        "SaleCondition": ["Normal", "Abnorml", "Partial", "AdjLand", "Alloca", "Family"]
    } # one hot encoder column

    for k, v in one_hot_categorical_columns.items():
        one_hot_encoder = OneHotEncoder(categories=[v], sparse_output=False, handle_unknown='ignore')
        X_encoded = pd.DataFrame(one_hot_encoder.fit_transform(X_categorical[[k]]))
        X_encoded.columns = one_hot_encoder.get_feature_names_out([k])
        X_categorical.drop(columns=[k], inplace=True)
        X_categorical = pd.concat([X_categorical, X_encoded], axis=1)

    combined_X = pd.concat([X_categorical, X_numerical], axis=1)
    """Perform Feature Scaling for All Columns"""
    X_scalers = {}
    for c in combined_X.columns:
        X_scaler = MinMaxScaler()
        combined_X[c] = X_scaler.fit_transform(combined_X[[c]])
        X_scalers[c] = X_scaler

    return (combined_X, num_imputer, mode_categorical_columns, X_scalers)


def train_data_target_preprocessing(y):
    y = np.log1p(y)
    y_scaler = MinMaxScaler()
    y = y_scaler.fit_transform(y)
    return (y, y_scaler)


In [238]:
# """Copy Data for Reuse"""
# dataset = train_df.copy()

# """Convert MSSubClass and MoSold into categorical column"""
# dataset.loc[:, ('MSSubClass', 'MoSold')] = dataset.loc[:, ('MSSubClass', 'MoSold')].astype(str)

# """Remove Unwanted column"""
# dataset.drop(columns=['Id'], inplace=True)


# """Perform Numerical Feature Selection"""
# selected_numerical_columns = [
#     'LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 
#     'MasVnrArea', 'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', 
#     '2ndFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 
#     'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'SalePrice'
# ]
# dropped_numerical_columns = [c for c in dataset.select_dtypes(include=['int64', 'float64']).columns if c not in selected_numerical_columns]
# dataset.drop(columns=dropped_numerical_columns, inplace=True)


# """Fill NA MasVnrArea with 0"""
# dataset['MasVnrArea'].fillna(0, inplace=True)


# """Fill NA Numerical Column Not on Purpose"""
# numerical_columns_nan_on_purpose = ['MasVnrArea']
# def replace_missing_numerical_column(df: pd.DataFrame) -> pd.DataFrame:
#     try:
#         df = df.copy()
#         for f in df.select_dtypes(include=['int64', 'float64']):
#             if f not in numerical_columns_nan_on_purpose:
#                 df[f].fillna(df[f].mean(), inplace=True)
#         return df
#     except AttributeError:
#         print("Make sure to input Pandas DataFrame")
#         return None
# dataset = replace_missing_numerical_column(dataset)

# """Perform Log Transformation on Numerical Columns"""
# log_transformed_column = ('LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '2ndFlrSF', 'LowQualFinSF' , 'SalePrice')
# for c in dataset.select_dtypes(include=['int64', 'float64']):
#     if c in log_transformed_column:
#         dataset.loc[:, c] = dataset.loc[:, c].apply(lambda x: np.log(x+1))


# """Fill NA Categorical Column on Purpose"""
# categorical_columns_nan_on_purpose = [
#     'Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 
#     'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
#     'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 
#     'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
# ]
# def replace_missing_categorical_column_on_purpose(df: pd.DataFrame) -> pd.DataFrame:
#     try:
#         df = df.copy()
#         for f in categorical_columns_nan_on_purpose:
#             df[f].fillna('Missing', inplace=True)
#         return df
#     except AttributeError:
#         print("Make sure to input Pandas DataFrame")
#         return None
# dataset = replace_missing_categorical_column_on_purpose(dataset)


# """Fill NA Electrical Column with Mode"""
# mode_electrical = 'SBrkr'
# dataset['Electrical'].fillna(mode_electrical, inplace=True)


# """Fill NA Categorical Column Not on Purpose"""
# def replace_missing_categorical_column(df: pd.DataFrame) -> pd.DataFrame:
#     try:
#         df = df.copy()
#         for f in df.select_dtypes(include=['object']):
#             if f not in categorical_columns_nan_on_purpose:
#                 df[f].fillna(df[f].mode()[0], inplace=True)
#         return df
#     except AttributeError:
#         print("Make sure to input Pandas DataFrame")
#         return None
# dataset = replace_missing_categorical_column(dataset)


# """Perform Numerical Feature Scaling"""
# selected_numerical_columns = [
#     'LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 
#     'MasVnrArea', 'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', 
#     '2ndFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 
#     'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'SalePrice'
# ]
# scaler_objects = {}
# for c in selected_numerical_columns:
#     scaler = MinMaxScaler()
#     dataset.loc[:, c] = scaler.fit_transform(dataset.loc[:, [c]])
#     scaler_objects[c] = scaler


# """Perform Ordinal Encoding"""
# ordinal_categorical_columns = {
#     "ExterQual": ["Ex", "Gd", "TA", "Fa", "Po"], 
#     "ExterCond": ["Ex", "Gd", "TA", "Fa", "Po"], 
#     "BsmtQual":  ["Ex", "Gd", "TA", "Fa", "Po", "Missing"],
#     "BsmtCond": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"],
#     "BsmtExposure": ["Gd", "Av", "Mn", "No", "Missing"],
#     "BsmtFinType1": ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "Missing"],
#     "BsmtFinType2": ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "Missing"],
#     "HeatingQC": ["Ex", "Gd", "TA", "Fa", "Po"], 
#     "KitchenQual": ["Ex", "Gd", "TA", "Fa", "Po"], 
#     "FireplaceQu": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"],
#     "GarageFinish": ["Fin", "RFn", "Unf", "Missing"], 
#     "GarageQual": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"], 
#     "GarageCond": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"], 
#     "PoolQC": ["Ex", "Gd", "TA", "Fa", "Missing"], 
#     "Fence": ["GdPrv", "MnPrv", "GdWo", "MnWw", "Missing"]
# } # gather ordinal categorical column
# for k, v in ordinal_categorical_columns.items():
#     ordinal_encoder = OrdinalEncoder(categories=[v])
#     dataset[k] = ordinal_encoder.fit_transform(dataset[[k]]).astype(int)


# """Perform One Hot Encoding"""
# ordinal_categorical_columns = {
#     "MSSubClass": ["20", "30", "40", "45",	"50", "60", "70", "75", "80", "85", "90", "120", "150", "160", "180", "190"],
#     "MSZoning": ["A", "C", "FV", "I", "RH", "RL", "RP", "RM"],
#     "Street": ["Pave", "Grvl"],
#     "Alley": ["Missing", "Grvl", "Pave"],
#     "LotShape": ["Reg", "IR1", "IR2", "IR3"],
#     "LandContour": ["Lvl", "Bnk", "Low", "HLS"],
#     "Utilities": ["AllPub", "NoSewr", "NoSeWa", "ELO"],
#     "LotConfig": ["Inside", "FR2", "Corner", "CulDSac", "FR3"],
#     "LandSlope": ["Gtl", "Mod", "Sev"],
#     "Neighborhood": ["CollgCr", "Veenker", "Crawfor", "NoRidge", "Mitchel", "Somerst", "NWAmes", "OldTown", "BrkSide", "Sawyer", "NridgHt", "NAmes", "SawyerW", "IDOTRR", "MeadowV", "Edwards", "Timber", "Gilbert", "StoneBr", "ClearCr", "NPkVill", "Blmngtn", "BrDale", "SWISU", "Blueste"],
#     "Condition1": ["Norm", "Feedr", "PosN", "Artery", "RRAe", "RRNn", "RRAn", "PosA", "RRNe"],
#     "Condition2": ["Norm", "Artery", "RRNn", "Feedr", "PosN", "PosA", "RRAn", "RRAe", "RRNe"],
#     "BldgType": ["1Fam", "2fmCon", "Duplex", "TwnhsE", "Twnhs"],
#     "HouseStyle": ["2Story", "1Story", "1.5Fin", "1.5Unf", "SFoyer", "SLvl", "2.5Unf", "2.5Fin"],
#     "RoofStyle": ["Gable", "Hip", "Gambrel", "Mansard", "Flat", "Shed"],
#     "RoofMatl": ["CompShg", "WdShngl", "Metal", "WdShake", "Membran", "Tar&Grv", "Roll", "ClyTile"],
#     "Exterior1st": ["AsbShng", "AsphShn", "BrkComm", "BrkFace", "CBlock", "CemntBd", "HdBoard",	"ImStucc", "MetalSd", "Other", "Plywood", "PreCast", "Stone", "Stucco", "VinylSd", "Wd Sdng", "WdShing"],
#     "Exterior2nd": ["AsbShng", "AsphShn", "BrkComm", "BrkFace", "CBlock", "CemntBd", "HdBoard",	"ImStucc", "MetalSd", "Other", "Plywood", "PreCast", "Stone", "Stucco", "VinylSd", "Wd Sdng", "WdShing"],
#     "MasVnrType": ["BrkFace", "Missing", "Stone", "BrkCmn", "CBlock"],
#     "Foundation": ["PConc", "CBlock", "BrkTil", "Wood", "Slab", "Stone"],
#     "Heating": ["GasA", "GasW", "Grav", "Wall", "OthW", "Floor"],
#     "CentralAir": ["Y", "N"],
#     "Electrical": ["SBrkr", "FuseF", "FuseA", "FuseP", "Mix", "Missing"],
#     "Functional": ["Typ", "Min1", "Maj1", "Min2", "Mod", "Maj2", "Sev", "Sal"],
#     "GarageType": ["Attchd", "Detchd", "BuiltIn", "CarPort", "Missing", "Basment", "2Types"],
#     "PavedDrive": ["Y", "N", "P"],
#     "MiscFeature": ["Missing", "Shed", "Gar2", "Othr", "TenC", "Elev"],
#     "SaleType": ["WD", "New", "COD", "ConLD", "ConLI", "CWD", "ConLw", "Con", "Oth", "VWD"],
#     "SaleCondition": ["Normal", "Abnorml", "Partial", "AdjLand", "Alloca", "Family"]
# } # one hot encoder column
# for k, v in ordinal_categorical_columns.items():
#     one_hot_encoder = OneHotEncoder(categories=[v], sparse_output=False, handle_unknown='ignore')
#     X_encoded = pd.DataFrame(one_hot_encoder.fit_transform(dataset[[k]]).astype(int))
#     X_encoded.columns = one_hot_encoder.get_feature_names_out([k])
#     dataset.drop(columns=[k], axis=1, inplace=True)
#     dataset = pd.concat([dataset, X_encoded], axis=1)


# train_dataset = dataset

## Test Data Preprocessing

In [45]:
def test_data_feature_preprocessing(X, num_imputer,  mode_categorical_columns, X_scalers):
    # Convert MSSubClass and MoSold into categorical column
    X.loc[:, ('MSSubClass', 'MoSold')] = X.loc[:, ('MSSubClass', 'MoSold')].astype(str)

    # Remove unwanted column
    X.drop(columns=['Id'], inplace=True)


    """Preprocess Numerical Columns"""
    X_numerical = X.select_dtypes(include=['int64', 'float64'])
    # Numerical Feature Selection
    selected_numerical_columns = [
        'LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 
        'MasVnrArea', 'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', 
        '2ndFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 
        'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF'
    ]
    dropped_numerical_columns = [c for c in X_numerical.columns if c not in selected_numerical_columns]
    X_numerical.drop(columns=dropped_numerical_columns, inplace=True)


    # Fill missing numerical column using the mean of data
    X_numerical = num_imputer.transform(X_numerical) 

    # log transform
    X_numerical = np.log1p(X_numerical)


    """Preprocess Categorical Data"""
    X_categorical = X.select_dtypes(include=['object'])

    # Fill NA categorical column on purpose
    categorical_columns_nan_on_purpose = [
        'Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
        'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 
        'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
    ]
    cat_imputer = DFSimpleImputer(strategy='constant', fill_value='Missing')
    X_categorical[categorical_columns_nan_on_purpose] = cat_imputer.fit_transform(X_categorical[categorical_columns_nan_on_purpose])

    # Fill NA categorical column not on purpose
    def replace_missing_categorical_column(df: pd.DataFrame) -> pd.DataFrame:
        try:
            for c in df.select_dtypes(include=['object']).columns:
                if c not in categorical_columns_nan_on_purpose:
                    mode_ = mode_categorical_columns[c]
                    df[c].fillna(mode_, inplace=True)
            return df
        except AttributeError:
            print("Make sure to input Pandas DataFrame")
            return None
    X_categorical = replace_missing_categorical_column(X_categorical)

    # Perform Ordinal Encoding
    ordinal_categorical_columns = {
        "ExterQual": ["Ex", "Gd", "TA", "Fa", "Po"], 
        "ExterCond": ["Ex", "Gd", "TA", "Fa", "Po"], 
        "BsmtQual":  ["Ex", "Gd", "TA", "Fa", "Po", "Missing"],
        "BsmtCond": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"],
        "BsmtExposure": ["Gd", "Av", "Mn", "No", "Missing"],
        "BsmtFinType1": ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "Missing"],
        "BsmtFinType2": ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "Missing"],
        "HeatingQC": ["Ex", "Gd", "TA", "Fa", "Po"], 
        "KitchenQual": ["Ex", "Gd", "TA", "Fa", "Po"], 
        "FireplaceQu": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"],
        "GarageFinish": ["Fin", "RFn", "Unf", "Missing"], 
        "GarageQual": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"], 
        "GarageCond": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"], 
        "PoolQC": ["Ex", "Gd", "TA", "Fa", "Missing"], 
        "Fence": ["GdPrv", "MnPrv", "GdWo", "MnWw", "Missing"]
    } # gather ordinal categorical column
    for k, v in ordinal_categorical_columns.items():
        ordinal_encoder = OrdinalEncoder(categories=[v])
        X_categorical[k] = ordinal_encoder.fit_transform(X_categorical[[k]]).astype(int)

    # X_categorical = X_categorical[ordinal_categorical_columns.keys()]
    # Perform One Hot Encoding
    one_hot_categorical_columns = {
        "MSSubClass": ["20", "30", "40", "45",	"50", "60", "70", "75", "80", "85", "90", "120", "150", "160", "180", "190"],
        "MSZoning": ["A", "C", "FV", "I", "RH", "RL", "RP", "RM"],
        "Street": ["Pave", "Grvl"],
        "Alley": ["Missing", "Grvl", "Pave"],
        "LotShape": ["Reg", "IR1", "IR2", "IR3"],
        "LandContour": ["Lvl", "Bnk", "Low", "HLS"],
        "Utilities": ["AllPub", "NoSewr", "NoSeWa", "ELO"],
        "LotConfig": ["Inside", "FR2", "Corner", "CulDSac", "FR3"],
        "LandSlope": ["Gtl", "Mod", "Sev"],
        "Neighborhood": ["CollgCr", "Veenker", "Crawfor", "NoRidge", "Mitchel", "Somerst", "NWAmes", "OldTown", "BrkSide", "Sawyer", "NridgHt", "NAmes", "SawyerW", "IDOTRR", "MeadowV", "Edwards", "Timber", "Gilbert", "StoneBr", "ClearCr", "NPkVill", "Blmngtn", "BrDale", "SWISU", "Blueste"],
        "Condition1": ["Norm", "Feedr", "PosN", "Artery", "RRAe", "RRNn", "RRAn", "PosA", "RRNe"],
        "Condition2": ["Norm", "Artery", "RRNn", "Feedr", "PosN", "PosA", "RRAn", "RRAe", "RRNe"],
        "BldgType": ["1Fam", "2fmCon", "Duplex", "TwnhsE", "Twnhs"],
        "HouseStyle": ["2Story", "1Story", "1.5Fin", "1.5Unf", "SFoyer", "SLvl", "2.5Unf", "2.5Fin"],
        "RoofStyle": ["Gable", "Hip", "Gambrel", "Mansard", "Flat", "Shed"],
        "RoofMatl": ["CompShg", "WdShngl", "Metal", "WdShake", "Membran", "Tar&Grv", "Roll", "ClyTile"],
        "Exterior1st": ["AsbShng", "AsphShn", "BrkComm", "BrkFace", "CBlock", "CemntBd", "HdBoard",	"ImStucc", "MetalSd", "Other", "Plywood", "PreCast", "Stone", "Stucco", "VinylSd", "Wd Sdng", "WdShing"],
        "Exterior2nd": ["AsbShng", "AsphShn", "BrkComm", "BrkFace", "CBlock", "CemntBd", "HdBoard",	"ImStucc", "MetalSd", "Other", "Plywood", "PreCast", "Stone", "Stucco", "VinylSd", "Wd Sdng", "WdShing"],
        "MasVnrType": ["BrkFace", "Missing", "Stone", "BrkCmn", "CBlock"],
        "Foundation": ["PConc", "CBlock", "BrkTil", "Wood", "Slab", "Stone"],
        "Heating": ["GasA", "GasW", "Grav", "Wall", "OthW", "Floor"],
        "CentralAir": ["Y", "N"],
        "Electrical": ["SBrkr", "FuseF", "FuseA", "FuseP", "Mix", "Missing"],
        "Functional": ["Typ", "Min1", "Maj1", "Min2", "Mod", "Maj2", "Sev", "Sal"],
        "GarageType": ["Attchd", "Detchd", "BuiltIn", "CarPort", "Missing", "Basment", "2Types"],
        "PavedDrive": ["Y", "N", "P"],
        "MiscFeature": ["Missing", "Shed", "Gar2", "Othr", "TenC", "Elev"],
        "SaleType": ["WD", "New", "COD", "ConLD", "ConLI", "CWD", "ConLw", "Con", "Oth", "VWD"],
        "SaleCondition": ["Normal", "Abnorml", "Partial", "AdjLand", "Alloca", "Family"]
    } # one hot encoder column

    for k, v in one_hot_categorical_columns.items():
        one_hot_encoder = OneHotEncoder(categories=[v], sparse_output=False, handle_unknown='ignore')
        X_encoded = pd.DataFrame(one_hot_encoder.fit_transform(X_categorical[[k]]))
        X_encoded.columns = one_hot_encoder.get_feature_names_out([k])
        X_categorical.drop(columns=[k], inplace=True)
        X_categorical = pd.concat([X_categorical, X_encoded], axis=1)

    combined_X = pd.concat([X_categorical, X_numerical], axis=1)
    """Perform Feature Scaling for All Columns"""
    for c in combined_X.columns:
        combined_X[c] = X_scalers[c].fit_transform(combined_X[[c]])

    return combined_X



def test_data_target_preprocessing(y, y_scaler):
    y = np.log1p(y)
    y = y_scaler.transform(y)
    return y


# Data Modelling

In [151]:
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


def tts(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)
    return (X_train, X_test, y_train, y_test)  

def train(X, y, model):
    return model.fit(X, y)

def pred(X, model):
    return model.predict(X)

def undo_scaling_log_transform(results, name):
    return np.expm1(y_scaler.inverse_transform(results.reshape(1, -1)))



"""Split Data"""
X_train, y_train = train_df.drop(columns=['SalePrice']), train_df[['SalePrice']]
X_test = test_df


"""Data Preprocessing"""
X_train, num_imputer, mode_categorical_columns, X_scalers = train_data_feature_preprocessing(X_train.reset_index(drop=True))
y_train, y_scaler =  train_data_target_preprocessing(y_train.reset_index(drop=True))
X_test = test_data_feature_preprocessing(X_test.reset_index(drop=True), num_imputer, mode_categorical_columns, X_scalers)
# y_test =  test_data_target_preprocessing(y_test.reset_index(drop=True), y_scaler)


"""Model Training and Prediction"""
for model_object in [LinearRegression(), Ridge(), Lasso(), DecisionTreeRegressor(), RandomForestRegressor(), ExtraTreesRegressor()]:
    start = time.perf_counter()
    model = train(X_train, y_train.ravel(), model_object) # train model
    y_pred = pred(X_test, model) # predict
    y_pred = undo_scaling_log_transform(y_pred, 'SalePrice') # transform into normal format
    pd.DataFrame({
        'Id': test_df['Id'],
        'SalePrice': y_pred.ravel()
    }).to_csv(os.path.join(project_dir, os.getenv('PROCESSED_FOLDER'), f'20230827_submission_{model_object.__class__.__name__}.csv'), index=False) # save
    print(f"{model_object.__class__.__name__} model training and prediction done in {time.perf_counter()-start} seconds")

# """Process Prediction into Interpretable Manner"""
# y_pred = undo_scaling_log_transform(y_pred, 'SalePrice')
# y_test = undo_scaling_log_transform(y_test, 'SalePrice')
# """Evaluate Against Real Data"""
# rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# mae = mean_absolute_error(y_test, y_pred)
# r2 = r2_score(y_test.ravel(), y_pred.ravel())

# print('rmse: ', rmse)
# print('mae: ', mae)
# print('r2: ', r2)

  return np.expm1(y_scaler.inverse_transform(results.reshape(1, -1)))


LinearRegression model training and prediction done in 0.10577230001217686 seconds
Ridge model training and prediction done in 0.0352277000201866 seconds
Lasso model training and prediction done in 0.02923799998825416 seconds
DecisionTreeRegressor model training and prediction done in 0.10343129999819212 seconds
RandomForestRegressor model training and prediction done in 6.145862899982603 seconds
ExtraTreesRegressor model training and prediction done in 6.134990699996706 seconds


In [146]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR

# prepare data pipeline
feature_preprocessing_pipeline=Pipeline([
    ('features', FeatureUnion([
        ('numerical', Pipeline([
            ('column_selector', ColumnSelector(columns=['LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', 
                    '2ndFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF'])),
            ('simple_imputer', SimpleImputer(strategy='mean')),
            ('logger', FunctionTransformer(np.log1p))
        ])),
        ('categorical', Pipeline([
            ('column_selector', ColumnSelector(columns=categorical_columns)),
            ('missing_categorical_on_purpose_imputer', MissingCategoricalOnPurposeImputer(columns=[
                'Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual','GarageCond', 'PoolQC', 'Fence', 'MiscFeature'])),
            ('missing_categorical_imputer', DFSimpleImputer(strategy='most_frequent')),
            ('categorical_encoder', ColumnTransformer(
                [
                    ('ordinal_encoder', ModifiedOrdinalEncoder(ordinal_categorical_columns), list(ordinal_categorical_columns.keys())),
                    ('one_hot_encoder', ModifiedOneHotEncoder(one_hot_categorical_columns), list(one_hot_categorical_columns.keys()))
                ], remainder="passthrough"
            )),
        ])),
    ])),
    ('scale', MinMaxScaler())
])


def target_preprocessing_pipeline(y):
    y = np.log1p(y)
    y_scaler = MinMaxScaler()
    y = y_scaler.fit_transform(y)
    return (y, y_scaler)

def target_revert_pipeline(y, scaler):
    return np.expm1(scaler.inverse_transform(y.reshape(1, -1)))


# prepare X and y
X_train, y_train = train_df.drop(columns=['SalePrice']), train_df[['SalePrice']]
X_test = test_df
# preprocess y

models = [LinearRegression(), Ridge(), Lasso(), SVR(), DecisionTreeRegressor(), RandomForestRegressor(), ExtraTreesRegressor()]
for model in models:
    # train model
    X_train_preprocessed = feature_preprocessing_pipeline.fit_transform(X_train)
    y_train_preprocessed, scaler = target_preprocessing_pipeline(y_train)
    model.fit(X_train_preprocessed, y_train_preprocessed.ravel())

    # predict
    X_test_preprocessed = feature_preprocessing_pipeline.transform(X_test)
    y_pred = model.predict(X_test_preprocessed)
    y_pred = target_revert_pipeline(y_pred, scaler)

    # save to submit
    pd.DataFrame({
        'Id': test_df['Id'],
        'SalePrice': y_pred.ravel()
    }).to_csv(os.path.join(project_dir, os.getenv('PROCESSED_FOLDER'), f'20230828_submission_{model.__class__.__name__}.csv'), index=False) # save

  return np.expm1(scaler.inverse_transform(y.reshape(1, -1)))


# Evaluation
1. We still exclude handling so many zeros value on selected numerical column, and multicollinearity as well.

In [64]:
y

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000
...,...
1455,175000
1456,210000
1457,266500
1458,142125
