In [42]:
# use %matplotlib inline so plots are drawn immediately after a cell runs
%matplotlib inline 

# imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import dotenv
import datetime
import os
import pickle
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR



"""prepare environment variable"""
# os.path.abspath('') expands to the current working directory; the project
# root is taken to be its parent, and the .env file is read from that root.
working_dir = os.path.abspath('')
project_dir = os.path.join(working_dir, os.pardir)
dotenv_path = os.path.join(project_dir, '.env')
# Kept as a bare final expression so the cell displays load_dotenv's return value.
dotenv.load_dotenv(dotenv_path)

True

In [2]:
# functions
def check_path_existance(path):
    """Report whether ``path`` exists on the filesystem.

    Thin wrapper around ``os.path.exists``.

    Args:
        path: Filesystem path to probe.

    Returns:
        bool: The result of ``os.path.exists(path)``.
    """
    path_found = os.path.exists(path)
    return path_found

In [3]:
# Locate the raw dataset folder from the RAW_FOLDER environment variable.
raw_folder = os.getenv('RAW_FOLDER')
if raw_folder is None:
    # os.path.join would otherwise fail with an unhelpful TypeError on None.
    raise EnvironmentError("Environment variable RAW_FOLDER is not set; check the project's .env file")

dataset_folder = os.path.join(project_dir, raw_folder, 'house-prices-advanced-regression-techniques')

# Fail here with a clear message instead of leaving train_df/test_df as None,
# which would only surface later as an unrelated error in the next cell.
if not check_path_existance(dataset_folder):
    raise FileNotFoundError(f"Dataset folder not found: {dataset_folder}")

train_df = pd.read_csv(os.path.join(dataset_folder, 'train.csv'))
test_df = pd.read_csv(os.path.join(dataset_folder, 'test.csv'))

In [4]:
# MSSubClass holds numeric codes that are one-hot encoded later on, so cast it
# to string to make select_dtypes classify it as categorical.
# Plain column assignment replaces the column (and its dtype). The previous
# `df.loc[:, col] = ...` form writes into the existing integer column in place,
# which newer pandas versions deprecate as an incompatible-dtype assignment.
train_df['MSSubClass'] = train_df['MSSubClass'].astype(str)
test_df['MSSubClass'] = test_df['MSSubClass'].astype(str)

# Column groups are derived from the training frame only.
numerical_columns = train_df.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = train_df.select_dtypes(include=['object']).columns

# Data Preprocessing Pipeline

In [5]:
"""Numerical Data Preprocessing"""
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only a chosen set of DataFrame columns.

    Args:
        columns: Column labels handed to ``DataFrame.loc`` in ``transform``.
    """

    def __init__(self, columns) -> None:
        super().__init__()
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return every row of ``X`` restricted to ``self.columns``."""
        selected = X.loc[:, self.columns]
        return selected
    

class DFSimpleImputer(SimpleImputer):
    """``SimpleImputer`` whose ``transform`` returns a DataFrame instead of an array.

    The constructor is inherited unchanged from ``SimpleImputer`` (same keyword
    arguments as before). A ``__init__(self, **kwargs)`` override would hide
    every parameter from scikit-learn's signature introspection, so
    ``get_params()`` would return nothing and ``clone()`` would silently
    produce an imputer with default settings.
    """

    def transform(self, X):
        """Impute ``X`` and re-wrap the result with ``X``'s index and column labels.

        Args:
            X: DataFrame to impute (``X.index`` and ``X.columns`` are read).

        Returns:
            pandas.DataFrame: Imputed values, indexed like ``X``.
        """
        X_imputed = super().transform(X)
        if X_imputed.shape[1] == len(X.columns):
            columns = X.columns
        else:
            # The imputer changed the number of columns (e.g. dropped an
            # all-missing column or appended indicator columns), so the input
            # labels no longer fit; ask the imputer for its output names.
            columns = self.get_feature_names_out()
        return pd.DataFrame(X_imputed, index=X.index, columns=columns)

"""Categorical Data Preprocessing"""
class MissingCategoricalOnPurposeImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in selected columns with the literal ``'Missing'``.

    Args:
        columns: Column labels whose missing values are replaced.
    """

    def __init__(self, columns) -> None:
        super().__init__()
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned (the fill value is a constant); return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``self.columns`` imputed.

        Args:
            X: DataFrame containing ``self.columns``.

        Returns:
            pandas.DataFrame: Copy of ``X``; the caller's frame is left untouched.
        """
        # Work on a copy so the transformer never mutates its input.
        X = X.copy()
        imputer = SimpleImputer(strategy='constant', fill_value='Missing')
        X[self.columns] = imputer.fit_transform(X[self.columns])
        return X

class ModifiedOrdinalEncoder(BaseEstimator, TransformerMixin):
    """Ordinal-encode DataFrame columns using explicit, per-column category orders.

    Args:
        ordinal_categorical_columns: Mapping of column label to its ordered
            list of categories. Each list is passed to ``OrdinalEncoder`` as
            the category order for that column.
    """

    def __init__(self, ordinal_categorical_columns) -> None:
        super().__init__()
        self.ordinal_categorical_columns = ordinal_categorical_columns

    def fit(self, X, y=None):
        """Nothing is learned (category orders are supplied up front); return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every configured column replaced by integer codes.

        Args:
            X: DataFrame containing every key of ``ordinal_categorical_columns``.

        Returns:
            pandas.DataFrame: Copy of ``X``; the caller's frame is left untouched.
        """
        # Work on a copy so the transformer never mutates its input.
        X = X.copy()
        for column, categories in self.ordinal_categorical_columns.items():
            # A fresh encoder per column, refit on each call with the fixed order.
            ordinal_encoder = OrdinalEncoder(categories=[categories])
            X[column] = ordinal_encoder.fit_transform(X[[column]]).astype(int)
        return X
    
class ModifiedOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode DataFrame columns using explicit, per-column category lists.

    Args:
        one_hot_categorical_columns: Mapping of column label to the list of
            categories that get an indicator column. The encoder is created
            with ``handle_unknown='ignore'``.
    """

    def __init__(self, one_hot_categorical_columns) -> None:
        super().__init__()
        self.one_hot_categorical_columns = one_hot_categorical_columns

    def fit(self, X, y=None):
        """Nothing is learned (category lists are supplied up front); return ``self``."""
        return self

    def transform(self, X):
        """Replace each configured column by its indicator columns.

        The untouched columns keep their original order and come first; the
        indicator blocks follow in the mapping's iteration order.

        Args:
            X: DataFrame containing every key of ``one_hot_categorical_columns``.

        Returns:
            pandas.DataFrame: New frame indexed like ``X``; the caller's frame
            is left untouched.
        """
        encoded_frames = []
        for column, categories in self.one_hot_categorical_columns.items():
            one_hot_encoder = OneHotEncoder(categories=[categories], sparse_output=False, handle_unknown='ignore')
            encoded_frames.append(pd.DataFrame(
                one_hot_encoder.fit_transform(X[[column]]),
                # Reuse X's index: a default RangeIndex would be aligned by label
                # in the concat below and misplace rows whenever X is not
                # indexed 0..n-1.
                index=X.index,
                columns=one_hot_encoder.get_feature_names_out([column]),
            ))
        # Non in-place drop and a single concat instead of one per column.
        remaining = X.drop(columns=list(self.one_hot_categorical_columns))
        return pd.concat([remaining, *encoded_frames], axis=1)


# Ordinal categorical columns mapped to their ordered category lists.
# "Missing" is the placeholder written for intentionally absent values.
_QUALITY_SCALE = ["Ex", "Gd", "TA", "Fa", "Po"]
_QUALITY_SCALE_WITH_MISSING = _QUALITY_SCALE + ["Missing"]
_BASEMENT_FINISH_SCALE = ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "Missing"]

# list(...) gives every column its own list object, as with separate literals.
ordinal_categorical_columns = {
    "ExterQual": list(_QUALITY_SCALE),
    "ExterCond": list(_QUALITY_SCALE),
    "BsmtQual": list(_QUALITY_SCALE_WITH_MISSING),
    "BsmtCond": list(_QUALITY_SCALE_WITH_MISSING),
    "BsmtExposure": ["Gd", "Av", "Mn", "No", "Missing"],
    "BsmtFinType1": list(_BASEMENT_FINISH_SCALE),
    "BsmtFinType2": list(_BASEMENT_FINISH_SCALE),
    "HeatingQC": list(_QUALITY_SCALE),
    "KitchenQual": list(_QUALITY_SCALE),
    "FireplaceQu": list(_QUALITY_SCALE_WITH_MISSING),
    "GarageFinish": ["Fin", "RFn", "Unf", "Missing"],
    "GarageQual": list(_QUALITY_SCALE_WITH_MISSING),
    "GarageCond": list(_QUALITY_SCALE_WITH_MISSING),
    "PoolQC": ["Ex", "Gd", "TA", "Fa", "Missing"],
    "Fence": ["GdPrv", "MnPrv", "GdWo", "MnWw", "Missing"],
}

# Nominal categorical columns mapped to the categories that get an indicator column.
# NOTE(review): the one-hot encoder in this notebook is built with
# handle_unknown='ignore', so any value whose spelling is not listed here is
# encoded as an all-zero row without an error. Verify these lists against the
# values actually present in the raw CSV files.
_MS_SUBCLASS_CODES = (20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180, 190)
_EXTERIOR_MATERIALS = [
    "AsbShng", "AsphShn", "BrkComm", "BrkFace", "CBlock", "CemntBd",
    "HdBoard", "ImStucc", "MetalSd", "Other", "Plywood", "PreCast",
    "Stone", "Stucco", "VinylSd", "Wd Sdng", "WdShing",
]

one_hot_categorical_columns = {
    "MSSubClass": [str(code) for code in _MS_SUBCLASS_CODES],
    "MSZoning": ["A", "C", "FV", "I", "RH", "RL", "RP", "RM"],
    "Street": ["Pave", "Grvl"],
    "Alley": ["Missing", "Grvl", "Pave"],
    "LotShape": ["Reg", "IR1", "IR2", "IR3"],
    "LandContour": ["Lvl", "Bnk", "Low", "HLS"],
    "Utilities": ["AllPub", "NoSewr", "NoSeWa", "ELO"],
    "LotConfig": ["Inside", "FR2", "Corner", "CulDSac", "FR3"],
    "LandSlope": ["Gtl", "Mod", "Sev"],
    "Neighborhood": [
        "CollgCr", "Veenker", "Crawfor", "NoRidge", "Mitchel", "Somerst",
        "NWAmes", "OldTown", "BrkSide", "Sawyer", "NridgHt", "NAmes",
        "SawyerW", "IDOTRR", "MeadowV", "Edwards", "Timber", "Gilbert",
        "StoneBr", "ClearCr", "NPkVill", "Blmngtn", "BrDale", "SWISU",
        "Blueste",
    ],
    "Condition1": ["Norm", "Feedr", "PosN", "Artery", "RRAe", "RRNn", "RRAn", "PosA", "RRNe"],
    "Condition2": ["Norm", "Artery", "RRNn", "Feedr", "PosN", "PosA", "RRAn", "RRAe", "RRNe"],
    "BldgType": ["1Fam", "2fmCon", "Duplex", "TwnhsE", "Twnhs"],
    "HouseStyle": ["2Story", "1Story", "1.5Fin", "1.5Unf", "SFoyer", "SLvl", "2.5Unf", "2.5Fin"],
    "RoofStyle": ["Gable", "Hip", "Gambrel", "Mansard", "Flat", "Shed"],
    "RoofMatl": ["CompShg", "WdShngl", "Metal", "WdShake", "Membran", "Tar&Grv", "Roll", "ClyTile"],
    # list(...) gives each exterior column its own list object.
    "Exterior1st": list(_EXTERIOR_MATERIALS),
    "Exterior2nd": list(_EXTERIOR_MATERIALS),
    "MasVnrType": ["BrkFace", "Missing", "Stone", "BrkCmn", "CBlock"],
    "Foundation": ["PConc", "CBlock", "BrkTil", "Wood", "Slab", "Stone"],
    "Heating": ["GasA", "GasW", "Grav", "Wall", "OthW", "Floor"],
    "CentralAir": ["Y", "N"],
    "Electrical": ["SBrkr", "FuseF", "FuseA", "FuseP", "Mix", "Missing"],
    "Functional": ["Typ", "Min1", "Maj1", "Min2", "Mod", "Maj2", "Sev", "Sal"],
    "GarageType": ["Attchd", "Detchd", "BuiltIn", "CarPort", "Missing", "Basment", "2Types"],
    "PavedDrive": ["Y", "N", "P"],
    "MiscFeature": ["Missing", "Shed", "Gar2", "Othr", "TenC", "Elev"],
    "SaleType": ["WD", "New", "COD", "ConLD", "ConLI", "CWD", "ConLw", "Con", "Oth", "VWD"],
    "SaleCondition": ["Normal", "Abnorml", "Partial", "AdjLand", "Alloca", "Family"],
}

# Data Modelling

In [43]:

# Numerical columns fed to the model (mean-imputed, then log1p-transformed).
NUMERICAL_FEATURES = [
    'LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF',
    '2ndFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
]

# Categorical columns whose missing values are filled with the literal 'Missing'.
PURPOSELY_MISSING_COLUMNS = [
    'Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
]


def build_feature_pipeline(impute_most_frequent=True):
    """Build a fresh, unfitted feature-preprocessing pipeline.

    The two pipelines in this cell were copy-pasted and differed only by one
    step; building both from this function keeps them in sync.

    Args:
        impute_most_frequent: When True, add the ``missing_categorical_imputer``
            step (``DFSimpleImputer(strategy='most_frequent')``) before the
            categorical encoders.

    Returns:
        Pipeline: numerical and categorical branches joined by a FeatureUnion,
        followed by a MinMaxScaler. Every call creates new transformer objects.
    """
    categorical_steps = [
        ('column_selector', ColumnSelector(columns=categorical_columns)),
        ('missing_categorical_on_purpose_imputer',
         MissingCategoricalOnPurposeImputer(columns=list(PURPOSELY_MISSING_COLUMNS))),
    ]
    if impute_most_frequent:
        categorical_steps.append(('missing_categorical_imputer', DFSimpleImputer(strategy='most_frequent')))
    categorical_steps.append(('categorical_encoder', ColumnTransformer(
        [
            ('ordinal_encoder', ModifiedOrdinalEncoder(ordinal_categorical_columns), list(ordinal_categorical_columns.keys())),
            ('one_hot_encoder', ModifiedOneHotEncoder(one_hot_categorical_columns), list(one_hot_categorical_columns.keys())),
        ], remainder="passthrough"
    )))

    return Pipeline([
        ('features', FeatureUnion([
            ('numerical', Pipeline([
                ('column_selector', ColumnSelector(columns=list(NUMERICAL_FEATURES))),
                ('simple_imputer', SimpleImputer(strategy='mean')),
                ('logger', FunctionTransformer(np.log1p)),
            ])),
            ('categorical', Pipeline(categorical_steps)),
        ])),
        ('scale', MinMaxScaler()),
    ])


# Trial run on the training frame without the most-frequent imputer; the
# transformed result is discarded.
pipeline = build_feature_pipeline(impute_most_frequent=False)
pipeline.fit_transform(train_df)

# prepare data pipeline
feature_preprocessing_pipeline = build_feature_pipeline(impute_most_frequent=True)


def target_preprocessing_pipeline(y):
    """Log-transform the target and scale it with a freshly fitted ``MinMaxScaler``.

    Args:
        y: Target values as a NumPy array (``reshape`` is called on the
            ``np.log1p`` result).

    Returns:
        tuple: ``(scaled, scaler)`` where ``scaled`` is the transformed target
        as a single-column 2-D array and ``scaler`` is the fitted scaler,
        needed later to invert the transformation.
    """
    y_scaler = MinMaxScaler()
    log_target = np.log1p(y)
    scaled_target = y_scaler.fit_transform(log_target.reshape(-1, 1))
    return (scaled_target, y_scaler)

def target_revert_pipeline(y, scaler):
    """Undo the target preprocessing: inverse scaling first, then ``expm1``.

    Args:
        y: Transformed target values as a NumPy array; reshaped to one column.
        scaler: Object exposing ``inverse_transform`` for a 2-D array.

    Returns:
        numpy.ndarray: ``np.expm1`` of the inverse-scaled values, shaped like
        the scaler's output.
    """
    as_column = y.reshape(-1, 1)
    log_target = scaler.inverse_transform(as_column)
    return np.expm1(log_target)


# prepare X and y
X_train, y_train = train_df.drop(columns=['SalePrice']), train_df['SalePrice'].to_numpy()
X_test = test_df

date = datetime.datetime.now().strftime('%Y%m%d')

# Preprocessing does not depend on the model, so fit/apply it once instead of
# refitting the whole pipeline on every loop iteration.
X_train_preprocessed = feature_preprocessing_pipeline.fit_transform(X_train)
y_train_preprocessed, scaler = target_preprocessing_pipeline(y_train)
X_test_preprocessed = feature_preprocessing_pipeline.transform(X_test)

models = [
    LinearRegression(), Ridge(), Lasso(),
    SVR(), DecisionTreeRegressor(), RandomForestRegressor(),
    AdaBoostRegressor(), GradientBoostingRegressor(), ExtraTreesRegressor(),
    XGBRegressor(), LGBMRegressor()
]
model_names = [model.__class__.__name__ for model in models]
rmse = []
for model in models:
    model_slug = model.__class__.__name__.lower()

    # train model
    model.fit(X_train_preprocessed, y_train_preprocessed.ravel())

    # calculate rmse
    # NOTE: this is computed on the training rows themselves, in the original
    # (reverted) target scale; it is a training error, not a validation score.
    y_train_pred = target_revert_pipeline(model.predict(X_train_preprocessed), scaler)
    rmse.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))

    # predict
    y_pred = model.predict(X_test_preprocessed)
    y_pred = target_revert_pipeline(y_pred.ravel(), scaler)

    # save model; the context manager closes the file handle, which the bare
    # open(...) passed straight to pickle.dump never did
    model_path = os.path.join(project_dir, os.getenv('MODELS_FOLDER'), f'{date}_{model_slug}_model.pkl')
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    # save to submit
    submission_path = os.path.join(project_dir, os.getenv('PROCESSED_FOLDER'), f'{date}_submission_{model_slug}.csv')
    pd.DataFrame({
        'Id': test_df['Id'],
        'SalePrice': y_pred.ravel()
    }).to_csv(submission_path, index=False)

# Bare final expression so the cell displays the per-model summary table.
pd.DataFrame({
    'model': model_names,
    'rmse': rmse
})

  return np.expm1(scaler.inverse_transform(y.reshape(-1, 1)))


According to the submission scores, the best model is the Random Forest model, with a score of 0.1462. Several things could still be improved, such as:
1. Handling multicollinearity
2. Hyperparameter Tuning

# Evaluation
1. We have not yet handled the large number of zero values in the selected numerical columns, nor the multicollinearity between features.