In [1]:
# Use %matplotlib inline to draw plots immediately after a cell runs
%matplotlib inline 

# imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import dotenv
import datetime
import os
import pickle
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from abc import ABC, abstractmethod



"""prepare environment variable"""
# Parent of the current working directory (os.path.abspath('') resolves to the cwd);
# every data/model path below is built relative to it.
project_dir = os.path.join(os.path.abspath(''), os.pardir)
# Load the variables from the .env file located in project_dir so that the
# os.getenv(...) calls in later cells can read them.
dotenv_path = os.path.join(project_dir, '.env')
dotenv.load_dotenv(dotenv_path)

True

# Get Data

In [4]:
def check_path_existance(path):
    """Tell whether ``path`` exists on disk.

    Args:
        path: File or directory path to probe.

    Returns:
        bool: The result of ``os.path.exists(path)``.
    """
    path_exists = os.path.exists(path)
    return path_exists

# Folder holding the raw competition files; RAW_FOLDER is read from the environment.
dataset_folder = os.path.join(project_dir, os.getenv('RAW_FOLDER'), 'house-prices-advanced-regression-techniques')

# Both frames stay None when the dataset folder is not present.
train_df = test_df = None
if check_path_existance(dataset_folder):
    train_df = pd.read_csv(os.path.join(dataset_folder, 'train.csv'))
    test_df = pd.read_csv(os.path.join(dataset_folder, 'test.csv'))

In [5]:
# MSSubClass is handled as a categorical feature further down (its one-hot
# categories are the strings "20", "30", ...). Cast it to str here so that it
# lands in `categorical_columns`; with the cast left out, a numeric MSSubClass
# is excluded from `categorical_columns` and the categorical branch of the
# pipeline cannot find the column it is asked to encode.
# Plain column assignment replaces the column, so the dtype change is explicit.
train_df['MSSubClass'] = train_df['MSSubClass'].astype(str)
test_df['MSSubClass'] = test_df['MSSubClass'].astype(str)

# Split the training columns by dtype.
numerical_columns = train_df.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = train_df.select_dtypes(include=['object']).columns

# Data Preprocessing Pipeline

In [6]:
class DataLoader:
    """Namespace for CSV loading helpers."""

    @staticmethod
    def load_data(path):
        """Read the CSV file at ``path`` into a DataFrame.

        Args:
            path: Location of the CSV file.

        Returns:
            pandas.DataFrame with the file contents, or ``None`` when ``path``
            does not exist.
        """
        if not os.path.exists(path):
            return None
        return pd.read_csv(path)
# Paths of the raw train/test CSV files (RAW_FOLDER comes from the environment)
train_data_path = os.path.join(project_dir, os.getenv('RAW_FOLDER'), 'house-prices-advanced-regression-techniques', 'train.csv')
test_data_path = os.path.join(project_dir, os.getenv('RAW_FOLDER'), 'house-prices-advanced-regression-techniques', 'test.csv')

# Sanity check: print (rows, columns) of each file, train first.
for data_path in (train_data_path, test_data_path):
    print(DataLoader.load_data(data_path).shape)

(1460, 81)
(1459, 80)


In [63]:
# Scratch check: evaluating the bare name only displays the NumPy ufunc object.
np.log1p

<ufunc 'log1p'>

In [16]:
class BaseDataPreprocessor(ABC):
    """Abstract interface for the preprocessing stages applied to a feature frame.

    Concrete subclasses must implement all four stages. The stage signatures
    take the frame to work on so that they match how the concrete
    preprocessor in this notebook defines and calls them; the ``None``
    defaults keep the previous zero-argument form callable.
    """

    def __init__(self, X: pd.DataFrame):
        """Store the raw feature frame as ``self.X``."""
        self.X = X

    @abstractmethod
    def clean_data(self, X: "pd.DataFrame | None" = None):
        """Clean ``X`` (e.g. handle missing values). The base version does nothing."""

    @abstractmethod
    def transform_data(self, X: "pd.DataFrame | None" = None):
        """Transform/encode ``X``. The base version does nothing."""

    @abstractmethod
    def scale_data(self, X: "pd.DataFrame | None" = None, scaler="standard"):
        """Scale ``X`` with the scaler identified by ``scaler``. The base version does nothing."""

    @abstractmethod
    def select_column_data(self, X: "pd.DataFrame | None" = None):
        """Select the columns of ``X`` to keep. The base version does nothing."""

In [12]:
class t(BaseDataPreprocessor):
    """Scratch subclass that overrides only the constructor."""

    def __init__(self):
        """Deliberately does nothing; ``self.X`` is not set."""

# class TrainDataPreprocessor(BaseDataPreprocessor):
#     def __init__(self):
#         pass

TypeError: function() argument 'code' must be code, not str

In [41]:
# for data preprocessing
class TrainDataPreprocessor(BaseDataPreprocessor):
    """Run the training-set preprocessing stages on a feature frame.

    On construction the frame goes through, in order: cleaning
    (``clean_data``), column selection (``select_column_data``), log
    transformation and encoding (``transform_data``) and min-max scaling
    (``scale_data``). Each intermediate result is kept as an attribute:
    ``X_cleaned``, ``X_column_selected``, ``X_transformed`` and ``X_scaled``.

    Every imputer, encoder and scaler is fitted on the frame it is applied to.
    None of the stages modifies the frame it receives; each returns a new one.
    """

    # Categorical columns whose missing values are filled with the literal
    # 'Missing' instead of the most frequent value.
    PURPOSELY_MISSING_COLUMNS = [
        'Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu',
        'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
        'PoolQC', 'Fence', 'MiscFeature'
    ]

    # Numerical columns kept by select_column_data.
    SELECTED_NUMERICAL_COLUMNS = [
        'LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd',
        'MasVnrArea', 'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF',
        '2ndFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd',
        'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF'
    ]

    # Ordinal columns -> category order; the position in the list becomes the
    # encoded integer. NOTE(review): the lists look ordered from best to worst
    # with 'Missing' last - confirm against the dataset description.
    ORDINAL_CATEGORIES = {
        "ExterQual": ["Ex", "Gd", "TA", "Fa", "Po"],
        "ExterCond": ["Ex", "Gd", "TA", "Fa", "Po"],
        "BsmtQual": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"],
        "BsmtCond": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"],
        "BsmtExposure": ["Gd", "Av", "Mn", "No", "Missing"],
        "BsmtFinType1": ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "Missing"],
        "BsmtFinType2": ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "Missing"],
        "HeatingQC": ["Ex", "Gd", "TA", "Fa", "Po"],
        "KitchenQual": ["Ex", "Gd", "TA", "Fa", "Po"],
        "FireplaceQu": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"],
        "GarageFinish": ["Fin", "RFn", "Unf", "Missing"],
        "GarageQual": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"],
        "GarageCond": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"],
        "PoolQC": ["Ex", "Gd", "TA", "Fa", "Missing"],
        "Fence": ["GdPrv", "MnPrv", "GdWo", "MnWw", "Missing"]
    }

    # Nominal columns -> full list of categories used for one-hot encoding.
    ONE_HOT_CATEGORIES = {
        "MSSubClass": ["20", "30", "40", "45", "50", "60", "70", "75", "80", "85", "90", "120", "150", "160", "180", "190"],
        "MSZoning": ["A", "C", "FV", "I", "RH", "RL", "RP", "RM"],
        "Street": ["Pave", "Grvl"],
        "Alley": ["Missing", "Grvl", "Pave"],
        "LotShape": ["Reg", "IR1", "IR2", "IR3"],
        "LandContour": ["Lvl", "Bnk", "Low", "HLS"],
        "Utilities": ["AllPub", "NoSewr", "NoSeWa", "ELO"],
        "LotConfig": ["Inside", "FR2", "Corner", "CulDSac", "FR3"],
        "LandSlope": ["Gtl", "Mod", "Sev"],
        "Neighborhood": ["CollgCr", "Veenker", "Crawfor", "NoRidge", "Mitchel", "Somerst", "NWAmes", "OldTown", "BrkSide", "Sawyer", "NridgHt", "NAmes", "SawyerW", "IDOTRR", "MeadowV", "Edwards", "Timber", "Gilbert", "StoneBr", "ClearCr", "NPkVill", "Blmngtn", "BrDale", "SWISU", "Blueste"],
        "Condition1": ["Norm", "Feedr", "PosN", "Artery", "RRAe", "RRNn", "RRAn", "PosA", "RRNe"],
        "Condition2": ["Norm", "Artery", "RRNn", "Feedr", "PosN", "PosA", "RRAn", "RRAe", "RRNe"],
        "BldgType": ["1Fam", "2fmCon", "Duplex", "TwnhsE", "Twnhs"],
        "HouseStyle": ["2Story", "1Story", "1.5Fin", "1.5Unf", "SFoyer", "SLvl", "2.5Unf", "2.5Fin"],
        "RoofStyle": ["Gable", "Hip", "Gambrel", "Mansard", "Flat", "Shed"],
        "RoofMatl": ["CompShg", "WdShngl", "Metal", "WdShake", "Membran", "Tar&Grv", "Roll", "ClyTile"],
        "Exterior1st": ["AsbShng", "AsphShn", "BrkComm", "BrkFace", "CBlock", "CemntBd", "HdBoard", "ImStucc", "MetalSd", "Other", "Plywood", "PreCast", "Stone", "Stucco", "VinylSd", "Wd Sdng", "WdShing"],
        "Exterior2nd": ["AsbShng", "AsphShn", "BrkComm", "BrkFace", "CBlock", "CemntBd", "HdBoard", "ImStucc", "MetalSd", "Other", "Plywood", "PreCast", "Stone", "Stucco", "VinylSd", "Wd Sdng", "WdShing"],
        "MasVnrType": ["BrkFace", "Missing", "Stone", "BrkCmn", "CBlock"],
        "Foundation": ["PConc", "CBlock", "BrkTil", "Wood", "Slab", "Stone"],
        "Heating": ["GasA", "GasW", "Grav", "Wall", "OthW", "Floor"],
        "CentralAir": ["Y", "N"],
        "Electrical": ["SBrkr", "FuseF", "FuseA", "FuseP", "Mix", "Missing"],
        "Functional": ["Typ", "Min1", "Maj1", "Min2", "Mod", "Maj2", "Sev", "Sal"],
        "GarageType": ["Attchd", "Detchd", "BuiltIn", "CarPort", "Missing", "Basment", "2Types"],
        "PavedDrive": ["Y", "N", "P"],
        "MiscFeature": ["Missing", "Shed", "Gar2", "Othr", "TenC", "Elev"],
        "SaleType": ["WD", "New", "COD", "ConLD", "ConLI", "CWD", "ConLw", "Con", "Oth", "VWD"],
        "SaleCondition": ["Normal", "Abnorml", "Partial", "AdjLand", "Alloca", "Family"]
    }

    def __init__(self, X: pd.DataFrame):
        """Store ``X`` and run the four stages, keeping every intermediate frame.

        Args:
            X: Feature frame. It must contain the 'Id' and 'MSSubClass'
                columns, which ``clean_data`` accesses directly.
        """
        super().__init__(X)
        self.X_cleaned = self.clean_data(self.X)  # cleaning data
        # feature engineering
        self.X_column_selected = self.select_column_data(self.X_cleaned)  # feature selection
        self.X_transformed = self.transform_data(self.X_column_selected)  # transform data
        self.X_scaled = self.scale_data(self.X_transformed, "minmax")  # scale data

    def clean_data(self, X):
        """Drop 'Id', cast 'MSSubClass' to str and impute missing values.

        Numerical columns are imputed with the column mean, the columns in
        ``PURPOSELY_MISSING_COLUMNS`` with the constant 'Missing', and the
        remaining categorical columns with their most frequent value.

        Args:
            X: Raw feature frame; it is left unmodified.

        Returns:
            A new, cleaned DataFrame.
        """
        # drop() without inplace returns a new frame, so the caller's frame
        # (and self.X) keeps its original content.
        X = X.drop(columns=['Id'])  # drop irrelevant columns
        # Plain column assignment replaces the column, making the int -> str
        # dtype change explicit instead of relying on .loc's cast fallback.
        X['MSSubClass'] = X['MSSubClass'].astype(str)  # fix incorrect data type

        ### Clean numerical columns
        numerical_columns = X.select_dtypes(exclude=['object']).columns
        imputer = SimpleImputer(strategy='mean')
        X.loc[:, numerical_columns] = imputer.fit_transform(X.loc[:, numerical_columns])

        ### Clean categorical columns
        categorical_columns = X.select_dtypes(include=['object']).columns
        imputer = SimpleImputer(strategy='constant', fill_value='Missing')
        X.loc[:, self.PURPOSELY_MISSING_COLUMNS] = imputer.fit_transform(X.loc[:, self.PURPOSELY_MISSING_COLUMNS])

        # Whatever is still missing after the constant fill gets the mode.
        imputer = SimpleImputer(strategy='most_frequent')
        X.loc[:, categorical_columns] = imputer.fit_transform(X.loc[:, categorical_columns])

        return X

    def transform_data(self, X):
        """Log-transform numerical columns and encode categorical columns.

        Non-object columns get ``np.log1p``. Columns listed in
        ``ORDINAL_CATEGORIES`` are replaced by integer codes, and columns
        listed in ``ONE_HOT_CATEGORIES`` are replaced by indicator columns
        appended after the remaining columns. Configured columns that are
        absent from ``X`` are skipped.

        Args:
            X: Cleaned feature frame; it is left unmodified.

        Returns:
            A new DataFrame with the transformed and encoded columns.
        """
        X = X.copy()

        ### Numerical columns transformation
        # Determined before the ordinal encoding so that the integer codes
        # produced below are not log-transformed.
        numerical_columns = X.select_dtypes(exclude=['object']).columns
        # Column assignment (not .loc) so integer columns can become float.
        X[numerical_columns] = np.log1p(X[numerical_columns])

        ### Categorical columns transformation
        for column, categories in self.ORDINAL_CATEGORIES.items():
            if column in X.columns:
                ordinal_encoder = OrdinalEncoder(categories=[categories])
                X[column] = ordinal_encoder.fit_transform(X[[column]]).astype(int)

        encoded_source_columns = []
        encoded_frames = []
        for column, categories in self.ONE_HOT_CATEGORIES.items():
            if column in X.columns:
                one_hot_encoder = OneHotEncoder(categories=[categories], sparse_output=False, handle_unknown='ignore')
                encoded_frames.append(pd.DataFrame(
                    one_hot_encoder.fit_transform(X[[column]]),
                    columns=one_hot_encoder.get_feature_names_out([column]),
                    index=X.index,  # keep rows aligned with X when concatenating
                ))
                encoded_source_columns.append(column)

        # Single concat: remaining columns first, then the indicator blocks in
        # dictionary order.
        X = X.drop(columns=encoded_source_columns)
        return pd.concat([X, *encoded_frames], axis=1)

    def scale_data(self, X, scaler="standard"):
        """Scale every column of ``X``.

        Args:
            X: Fully numeric frame.
            scaler: 'standard', 'minmax' or 'robust'; any non-string value is
                used as the scaler object itself.

        Returns:
            DataFrame with the scaled values and the columns and index of ``X``.

        Raises:
            ValueError: If ``scaler`` is a string other than the three names above.
        """
        scaler_classes = {
            "standard": StandardScaler,
            "minmax": MinMaxScaler,
            "robust": RobustScaler,
        }
        if isinstance(scaler, str):
            if scaler not in scaler_classes:
                raise ValueError(
                    f"Unknown scaler {scaler!r}; expected one of {sorted(scaler_classes)}"
                )
            scaler = scaler_classes[scaler]()

        X_scaled = scaler.fit_transform(X)
        return pd.DataFrame(data=X_scaled, columns=X.columns, index=X.index)

    def select_column_data(self, X):
        """Keep ``SELECTED_NUMERICAL_COLUMNS`` plus every object-dtype column.

        Args:
            X: Cleaned feature frame.

        Returns:
            DataFrame with the selected numerical columns followed by the
            categorical columns.
        """
        selected_categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
        selected_columns = self.SELECTED_NUMERICAL_COLUMNS + selected_categorical_columns
        return X.loc[:, selected_columns]
# # for modelling
# class ModelTrainer():
#     pass

# # for evaluation
# class ModelEvaluator():
#     pass

In [42]:
# Run the preprocessing on the training features (target column removed) and
# display the scaled result. The class defined in this notebook is
# TrainDataPreprocessor; no `DataPreprocessor` name exists on a fresh kernel.
data_preprocessor = TrainDataPreprocessor(train_df.drop(columns=['SalePrice']))
data_preprocessor.X_scaled

Unnamed: 0,LotFrontage,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,...,SaleType_ConLw,SaleType_Con,SaleType_Oth,SaleType_VWD,SaleCondition_Normal,SaleCondition_Abnorml,SaleCondition_Partial,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family
0,0.413268,0.813196,0.950951,0.884882,0.716038,0.759508,0.774668,0.355839,0.884420,0.577436,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.490307,0.734867,0.760143,0.437057,0.000000,0.797188,0.819153,0.502753,0.000000,0.469935,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.429990,0.813196,0.936906,0.868404,0.690361,0.716357,0.782930,0.383123,0.886245,0.592825,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.383633,0.813196,0.319286,0.336710,0.000000,0.622780,0.760436,0.399623,0.868471,0.578882,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.508439,0.882287,0.929878,0.835423,0.794318,0.750841,0.808002,0.465926,0.911832,0.666289,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.395769,0.734867,0.922846,0.835423,0.000000,0.000000,0.786968,0.396460,0.857277,0.564152,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1456,0.512839,0.734867,0.774366,0.636841,0.648854,0.772504,0.842121,0.690649,0.000000,0.645566,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1457,0.418925,0.813196,0.508884,0.934267,0.000000,0.650620,0.808700,0.479880,0.923592,0.688447,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1458,0.429990,0.644443,0.573924,0.769361,0.000000,0.452858,0.801091,0.443104,0.000000,0.414179,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [11]:
"""Numerical Data Preprocessing"""
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the configured columns of a DataFrame."""

    def __init__(self, columns) -> None:
        super().__init__()
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to ``self.columns`` (label-based selection)."""
        selected = X.loc[:, self.columns]
        return selected
    

class DFSimpleImputer(SimpleImputer):
    """``SimpleImputer`` whose ``transform`` returns a DataFrame.

    The constructor is inherited unchanged from ``SimpleImputer``. A
    ``**kwargs``-only ``__init__`` would hide every parameter from
    ``get_params()``, so ``clone()`` would silently rebuild the imputer with
    default settings (for example losing ``strategy='most_frequent'``).
    """

    def transform(self, X):
        """Impute ``X`` and wrap the result with the index and columns of ``X``.

        Args:
            X: DataFrame to impute (``X.index`` and ``X.columns`` are read).

        Returns:
            DataFrame holding the imputed values.
        """
        imputed_values = super().transform(X)
        return pd.DataFrame(imputed_values, index=X.index, columns=X.columns)

"""Categorical Data Preprocessing"""
class MissingCategoricalOnPurposeImputer(BaseEstimator, TransformerMixin):
    """Fill missing values of the configured columns with the constant 'Missing'."""

    def __init__(self, columns) -> None:
        super().__init__()
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``self.columns`` imputed.

        Args:
            X: DataFrame containing ``self.columns``; it is left unmodified.

        Returns:
            New DataFrame in which missing entries of ``self.columns`` are
            replaced by 'Missing'.
        """
        # Work on a copy so the frame handed in by the caller is not mutated.
        X = X.copy()
        imputer = SimpleImputer(strategy='constant', fill_value='Missing')
        X[self.columns] = imputer.fit_transform(X[self.columns])
        return X

class ModifiedOrdinalEncoder(BaseEstimator, TransformerMixin):
    """Ordinal-encode columns using an explicit category order per column.

    ``ordinal_categorical_columns`` maps a column name to its ordered list of
    categories; the position of a category in that list is its integer code.
    """

    def __init__(self, ordinal_categorical_columns) -> None:
        super().__init__()
        self.ordinal_categorical_columns = ordinal_categorical_columns

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every configured column replaced by int codes.

        Args:
            X: DataFrame containing every configured column; it is left
                unmodified.

        Returns:
            New DataFrame with the encoded columns.
        """
        # Work on a copy so the frame handed in by the caller is not mutated.
        X = X.copy()
        for column, categories in self.ordinal_categorical_columns.items():
            ordinal_encoder = OrdinalEncoder(categories=[categories])
            X[column] = ordinal_encoder.fit_transform(X[[column]]).astype(int)
        return X
    
class ModifiedOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode columns using an explicit category list per column.

    ``one_hot_categorical_columns`` maps a column name to the list of
    categories that get an indicator column.
    """

    def __init__(self, one_hot_categorical_columns) -> None:
        super().__init__()
        self.one_hot_categorical_columns = one_hot_categorical_columns

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Replace every configured column by its indicator columns.

        Args:
            X: DataFrame containing every configured column; it is left
                unmodified.

        Returns:
            New DataFrame: the non-encoded columns of ``X`` followed by the
            indicator blocks in the order of the configuration dictionary.
        """
        encoded_frames = []
        for column, categories in self.one_hot_categorical_columns.items():
            one_hot_encoder = OneHotEncoder(categories=[categories], sparse_output=False, handle_unknown='ignore')
            encoded_frames.append(pd.DataFrame(
                one_hot_encoder.fit_transform(X[[column]]),
                columns=one_hot_encoder.get_feature_names_out([column]),
                index=X.index,  # keep rows aligned with X when concatenating
            ))
        # Non-inplace drop leaves the caller's frame intact; one concat
        # instead of one per column.
        remaining = X.drop(columns=list(self.one_hot_categorical_columns))
        return pd.concat([remaining, *encoded_frames], axis=1)


# Category scales shared by several columns. Each dictionary entry below gets
# its own list copy. NOTE(review): the ordinal lists look ordered from best to
# worst with "Missing" last - confirm against the dataset description.
_QUALITY_SCALE = ["Ex", "Gd", "TA", "Fa", "Po"]
_QUALITY_SCALE_WITH_MISSING = _QUALITY_SCALE + ["Missing"]
_BASEMENT_FINISH_SCALE = ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "Missing"]
_EXTERIOR_MATERIALS = [
    "AsbShng", "AsphShn", "BrkComm", "BrkFace", "CBlock", "CemntBd",
    "HdBoard", "ImStucc", "MetalSd", "Other", "Plywood", "PreCast",
    "Stone", "Stucco", "VinylSd", "Wd Sdng", "WdShing",
]

# Ordinal categorical columns -> ordered category list.
ordinal_categorical_columns = {
    "ExterQual": list(_QUALITY_SCALE),
    "ExterCond": list(_QUALITY_SCALE),
    "BsmtQual": list(_QUALITY_SCALE_WITH_MISSING),
    "BsmtCond": list(_QUALITY_SCALE_WITH_MISSING),
    "BsmtExposure": ["Gd", "Av", "Mn", "No", "Missing"],
    "BsmtFinType1": list(_BASEMENT_FINISH_SCALE),
    "BsmtFinType2": list(_BASEMENT_FINISH_SCALE),
    "HeatingQC": list(_QUALITY_SCALE),
    "KitchenQual": list(_QUALITY_SCALE),
    "FireplaceQu": list(_QUALITY_SCALE_WITH_MISSING),
    "GarageFinish": ["Fin", "RFn", "Unf", "Missing"],
    "GarageQual": list(_QUALITY_SCALE_WITH_MISSING),
    "GarageCond": list(_QUALITY_SCALE_WITH_MISSING),
    "PoolQC": ["Ex", "Gd", "TA", "Fa", "Missing"],
    "Fence": ["GdPrv", "MnPrv", "GdWo", "MnWw", "Missing"],
}

# One-hot encoded columns -> list of categories that get an indicator column.
one_hot_categorical_columns = {
    "MSSubClass": [
        "20", "30", "40", "45", "50", "60", "70", "75",
        "80", "85", "90", "120", "150", "160", "180", "190",
    ],
    "MSZoning": ["A", "C", "FV", "I", "RH", "RL", "RP", "RM"],
    "Street": ["Pave", "Grvl"],
    "Alley": ["Missing", "Grvl", "Pave"],
    "LotShape": ["Reg", "IR1", "IR2", "IR3"],
    "LandContour": ["Lvl", "Bnk", "Low", "HLS"],
    "Utilities": ["AllPub", "NoSewr", "NoSeWa", "ELO"],
    "LotConfig": ["Inside", "FR2", "Corner", "CulDSac", "FR3"],
    "LandSlope": ["Gtl", "Mod", "Sev"],
    "Neighborhood": [
        "CollgCr", "Veenker", "Crawfor", "NoRidge", "Mitchel", "Somerst",
        "NWAmes", "OldTown", "BrkSide", "Sawyer", "NridgHt", "NAmes",
        "SawyerW", "IDOTRR", "MeadowV", "Edwards", "Timber", "Gilbert",
        "StoneBr", "ClearCr", "NPkVill", "Blmngtn", "BrDale", "SWISU",
        "Blueste",
    ],
    "Condition1": ["Norm", "Feedr", "PosN", "Artery", "RRAe", "RRNn", "RRAn", "PosA", "RRNe"],
    "Condition2": ["Norm", "Artery", "RRNn", "Feedr", "PosN", "PosA", "RRAn", "RRAe", "RRNe"],
    "BldgType": ["1Fam", "2fmCon", "Duplex", "TwnhsE", "Twnhs"],
    "HouseStyle": ["2Story", "1Story", "1.5Fin", "1.5Unf", "SFoyer", "SLvl", "2.5Unf", "2.5Fin"],
    "RoofStyle": ["Gable", "Hip", "Gambrel", "Mansard", "Flat", "Shed"],
    "RoofMatl": ["CompShg", "WdShngl", "Metal", "WdShake", "Membran", "Tar&Grv", "Roll", "ClyTile"],
    "Exterior1st": list(_EXTERIOR_MATERIALS),
    "Exterior2nd": list(_EXTERIOR_MATERIALS),
    "MasVnrType": ["BrkFace", "Missing", "Stone", "BrkCmn", "CBlock"],
    "Foundation": ["PConc", "CBlock", "BrkTil", "Wood", "Slab", "Stone"],
    "Heating": ["GasA", "GasW", "Grav", "Wall", "OthW", "Floor"],
    "CentralAir": ["Y", "N"],
    "Electrical": ["SBrkr", "FuseF", "FuseA", "FuseP", "Mix", "Missing"],
    "Functional": ["Typ", "Min1", "Maj1", "Min2", "Mod", "Maj2", "Sev", "Sal"],
    "GarageType": ["Attchd", "Detchd", "BuiltIn", "CarPort", "Missing", "Basment", "2Types"],
    "PavedDrive": ["Y", "N", "P"],
    "MiscFeature": ["Missing", "Shed", "Gar2", "Othr", "TenC", "Elev"],
    "SaleType": ["WD", "New", "COD", "ConLD", "ConLI", "CWD", "ConLw", "Con", "Oth", "VWD"],
    "SaleCondition": ["Normal", "Abnorml", "Partial", "AdjLand", "Alloca", "Family"],
}

# Data Modelling 

In [116]:
# Variant of the preprocessing pipeline without the most-frequent imputation
# step for categorical columns; it is fitted on the full training frame below.
numerical_steps = [
    ('column_selector', ColumnSelector(columns=[
        'LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
        'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd',
        'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
    ])),
    ('simple_imputer', SimpleImputer(strategy='mean')),
    ('logger', FunctionTransformer(np.log1p)),
]
categorical_steps = [
    ('column_selector', ColumnSelector(columns=categorical_columns)),
    ('missing_categorical_on_purpose_imputer', MissingCategoricalOnPurposeImputer(columns=[
        'Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
        'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
        'MiscFeature',
    ])),
    ('categorical_encoder', ColumnTransformer(
        [
            ('ordinal_encoder', ModifiedOrdinalEncoder(ordinal_categorical_columns), list(ordinal_categorical_columns.keys())),
            ('one_hot_encoder', ModifiedOneHotEncoder(one_hot_categorical_columns), list(one_hot_categorical_columns.keys())),
        ],
        remainder="passthrough",
    )),
]
feature_union = FeatureUnion([
    ('numerical', Pipeline(numerical_steps)),
    ('categorical', Pipeline(categorical_steps)),
])
pipeline = Pipeline([
    ('features', feature_union),
    ('scale', MinMaxScaler()),
])
pipeline.fit_transform(train_df)

# prepare data pipeline
# Numerical branch: select columns -> mean imputation -> log1p.
numerical_branch = Pipeline([
    ('column_selector', ColumnSelector(columns=[
        'LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
        'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd',
        'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
    ])),
    ('simple_imputer', SimpleImputer(strategy='mean')),
    ('logger', FunctionTransformer(np.log1p)),
])

# Categorical branch: select columns -> 'Missing' fill -> mode fill -> encoding.
categorical_branch = Pipeline([
    ('column_selector', ColumnSelector(columns=categorical_columns)),
    ('missing_categorical_on_purpose_imputer', MissingCategoricalOnPurposeImputer(columns=[
        'Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
        'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
        'MiscFeature',
    ])),
    ('missing_categorical_imputer', DFSimpleImputer(strategy='most_frequent')),
    ('categorical_encoder', ColumnTransformer(
        [
            ('ordinal_encoder', ModifiedOrdinalEncoder(ordinal_categorical_columns), list(ordinal_categorical_columns.keys())),
            ('one_hot_encoder', ModifiedOneHotEncoder(one_hot_categorical_columns), list(one_hot_categorical_columns.keys())),
        ],
        remainder="passthrough",
    )),
])

# Both branches side by side, then min-max scaling of the combined features.
feature_preprocessing_pipeline = Pipeline([
    ('features', FeatureUnion([
        ('numerical', numerical_branch),
        ('categorical', categorical_branch),
    ])),
    ('scale', MinMaxScaler()),
])

# prepare X and y
X_train, y_train = train_df.drop(columns=['SalePrice']), train_df['SalePrice'].to_numpy()
X_test = test_df

# Date stamp used in the names of the saved models and submission files.
date = datetime.datetime.now().strftime('%Y%m%d')

models = [
    LinearRegression(), Ridge(), Lasso(),
    SVR(), DecisionTreeRegressor(), RandomForestRegressor(),
    AdaBoostRegressor(), GradientBoostingRegressor(), ExtraTreesRegressor(),
    XGBRegressor(), LGBMRegressor()
]
model_names = [model.__class__.__name__ for model in models]

# The preprocessing does not depend on the model, so fit it once on the
# training features and reuse the transformed matrices for every model.
X_train_preprocessed = feature_preprocessing_pipeline.fit_transform(X_train)
X_test_preprocessed = feature_preprocessing_pipeline.transform(X_test)

rmse = []
for model in models:
    # train model
    model.fit(X_train_preprocessed, y_train.ravel())

    # calculate rmse
    # NOTE: measured on the same rows the model was fitted on, so this is a
    # training error, not an estimate of generalisation error.
    y_train_pred = model.predict(X_train_preprocessed)
    rmse.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))

    # predict
    y_pred = model.predict(X_test_preprocessed)

    model_slug = model.__class__.__name__.lower()

    # save model (the context manager closes the file handle)
    model_path = os.path.join(project_dir, os.getenv('MODELS_FOLDER'), f'{date}_{model_slug}_model.pkl')
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    # save to submit
    submission_path = os.path.join(project_dir, os.getenv('PROCESSED_FOLDER'), f'{date}_submission_{model_slug}.csv')
    pd.DataFrame({
        'Id': test_df['Id'],
        'SalePrice': y_pred.ravel()
    }).to_csv(submission_path, index=False)

# Training RMSE per model, in the order of `models`.
pd.DataFrame({
    'model': model_names,
    'rmse': rmse
})

  model = cd_fast.enet_coordinate_descent(


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2625
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 147
[LightGBM] [Info] Start training from score 180921.195890


Unnamed: 0,model,rmse
0,LinearRegression,26495.236987
1,Ridge,27262.305356
2,Lasso,26497.301357
3,SVR,81367.76245
4,DecisionTreeRegressor,138.793727
5,RandomForestRegressor,10542.930446
6,AdaBoostRegressor,28445.321706
7,GradientBoostingRegressor,15347.648268
8,ExtraTreesRegressor,138.793727
9,XGBRegressor,2171.875789


In [114]:
# Scratch check: display the flattened target array that is passed to model.fit.
y_train.ravel()

array([208500, 181500, 223500, ..., 266500, 142125, 147500], dtype=int64)

The best model according to the submission score is the Random Forest model, with a score of 0.1462. Some things we can still improve:
1. Handling multicollinearity
2. Hyperparameter Tuning

# Evaluation
1. We have not yet handled the large number of zero values in the selected numerical columns, nor multicollinearity.

# Next Step

# Feature Engineering

In [107]:
# Scratch check: exponentiate the product of two natural logarithms.
x, y = np.log(10), np.log(11)
np.exp(x * y)

249.97424928733278