In [2]:
# using %matplotlib inline to immediatly draw plot after run
%matplotlib inline 

# imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import datetime
import os
import pickle
from dotenv import load_dotenv, find_dotenv
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from typing import Optional
from sklearn.model_selection import ShuffleSplit
from sklearn.base import clone
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


"""prepare environment variable"""
load_dotenv(find_dotenv())

True

# Data Preprocessing Pipeline

In [3]:
class DataLoader:
    @staticmethod
    def load_data(path: str) -> Optional[pd.DataFrame]:
        if os.path.exists(path):
            return pd.read_csv(path)
        return None

## Data Cleaner Class 

In [4]:
class DataCleaner:
    @staticmethod
    def drop_feature(dataset: pd.DataFrame, columns: list) -> pd.DataFrame:
        dataset = dataset.copy()
        dataset = dataset.drop(columns=columns)
        return dataset

    @staticmethod
    def change_feature_type(dataset: pd.DataFrame, mapper: dict) -> pd.DataFrame:
        dataset = dataset.copy()
        dataset = dataset.astype(mapper)
        return dataset
    
    @staticmethod
    def impute_missing_numerical_feature(dataset: pd.DataFrame, dataset_type: str) -> pd.DataFrame:
        numerical_features = dataset.select_dtypes(include=['int', 'float']).columns.tolist()        
        if dataset_type == 'train':
            # fill missing numerical features with mean
            imputer = SimpleImputer(strategy='mean')
            dataset.loc[:, numerical_features] = imputer.fit_transform(dataset.loc[:, numerical_features])
            pickle.dump(imputer, open(os.getenv('NUMERICAL_IMPUTER_PATH'), "wb"))
            
        elif dataset_type == 'test':
            # fill missing numerical features with saved mean imputer
            imputer = pickle.load(open(os.getenv('NUMERICAL_IMPUTER_PATH'), "rb"))
            dataset.loc[:, numerical_features] = imputer.fit_transform(dataset.loc[:, numerical_features])
        else:
            print("dataset_type is neither train or test")
        return dataset

    @staticmethod
    def impute_missing_categorical_feature(
        dataset: pd.DataFrame, dataset_type: str, 
        categorical_features_missing_on_purpose: list = [
            'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 
            'BsmtFinType2','GarageType', 'GarageFinish', 'GarageQual',
            'GarageCond','Alley', 'MasVnrType', 'Fence', 'FireplaceQu', 
            'MiscFeature', 'PoolQC'
        ]) -> pd.DataFrame:

        categorical_features = dataset.select_dtypes(include=['object']).columns.tolist()

        # fill expected missing categorical values
        imputer = SimpleImputer(strategy='constant', fill_value='Missing')
        dataset.loc[:, categorical_features_missing_on_purpose] = imputer.fit_transform(dataset.loc[:, categorical_features_missing_on_purpose])

        if dataset_type == 'train':
            # fill missing categorical features with mode
            imputer = SimpleImputer(strategy='most_frequent')
            dataset.loc[:, categorical_features] = imputer.fit_transform(dataset.loc[:, categorical_features])
            pickle.dump(imputer, open(os.getenv('CATEGORICAL_IMPUTER_PATH'), 'wb'))
            
        elif dataset_type == 'test':
            # fill missing categorical features with saved mode imputer
            imputer = pickle.load(open(os.getenv('CATEGORICAL_IMPUTER_PATH'), "rb"))
            dataset.loc[:, categorical_features] = imputer.fit_transform(dataset.loc[:, categorical_features])
        else:
            print("dataset_type is neither train or test")
            
        return dataset

## Data Feature Generator Class

In [5]:
class DataFeatureGenerator:
    @staticmethod
    def generate_feature_by_sum(dataset: pd.DataFrame, features: list, new_feature_name: str) -> pd.DataFrame:
        dataset=dataset.copy()
        dataset.loc[:, new_feature_name] = dataset.loc[:, features].sum(axis=1)
        return dataset 
        
    @staticmethod
    def generate_feature_by_mean(dataset: pd.DataFrame, features: list, new_feature_name: str) -> pd.DataFrame:
        dataset=dataset.copy()
        dataset.loc[:, new_feature_name] = dataset.loc[:, features].mean(axis=1).astype(int)
        return dataset 

# Data Feature Selector

In [6]:
class DataFeatureSelector:
    @staticmethod
    def select_feature(dataset: pd.DataFrame, features: list) -> pd.DataFrame:
        dataset = dataset.copy()
        try:
            dataset = dataset.loc[:, features]
        except Exception as e:
            print(e)
        return dataset 

# Data Feature Transformer

In [7]:
class DataFeatureTransformer:

    def log_transform_feature(dataset: pd.DataFrame) -> pd.DataFrame:
        dataset = dataset.copy()
        try:
            numerical_columns = dataset.select_dtypes(include=['int', 'float']).columns
            dataset.loc[:, numerical_columns] = np.log1p(dataset.loc[:, numerical_columns])
        except Exception as e:
            print(e)
        return dataset

    def ordinal_encoding(dataset: pd.DataFrame) -> pd.DataFrame:
        dataset = dataset.copy()
        ordinal_categorical_columns ={
            "ExterQual": ["Ex", "Gd", "TA", "Fa", "Po"], 
            "ExterCond": ["Ex", "Gd", "TA", "Fa", "Po"], 
            "BsmtQual":  ["Ex", "Gd", "TA", "Fa", "Po", "Missing"],
            "BsmtCond": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"],
            "BsmtExposure": ["Gd", "Av", "Mn", "No", "Missing"],
            "BsmtFinType1": ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "Missing"],
            "BsmtFinType2": ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "Missing"],
            "HeatingQC": ["Ex", "Gd", "TA", "Fa", "Po"], 
            "KitchenQual": ["Ex", "Gd", "TA", "Fa", "Po"], 
            "FireplaceQu": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"],
            "GarageFinish": ["Fin", "RFn", "Unf", "Missing"], 
            "GarageQual": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"], 
            "GarageCond": ["Ex", "Gd", "TA", "Fa", "Po", "Missing"], 
            "PoolQC": ["Ex", "Gd", "TA", "Fa", "Missing"], 
            "Fence": ["GdPrv", "MnPrv", "GdWo", "MnWw", "Missing"]
        } # gather ordinal categorical column
       
        ### Categorical columns transformation
        for f, v in ordinal_categorical_columns.items():
            if f in dataset.columns:
                ordinal_encoder = OrdinalEncoder(categories=[v]) # define ordinal encoder
                dataset[f] = ordinal_encoder.fit_transform(dataset[[f]]).astype(int) # ordinal encoding
        return dataset
    
    def one_hot_encoding(dataset: pd.DataFrame) -> pd.DataFrame:
        one_hot_categorical_columns = {
            "MSSubClass": ["20", "30", "40", "45",	"50", "60", "70", "75", "80", "85", "90", "120", "150", "160", "180", "190"],
            "MSZoning": ["A", "C", "FV", "I", "RH", "RL", "RP", "RM"],
            "Street": ["Pave", "Grvl"],
            "Alley": ["Missing", "Grvl", "Pave"],
            "LotShape": ["Reg", "IR1", "IR2", "IR3"],
            "LandContour": ["Lvl", "Bnk", "Low", "HLS"],
            "Utilities": ["AllPub", "NoSewr", "NoSeWa", "ELO"],
            "LotConfig": ["Inside", "FR2", "Corner", "CulDSac", "FR3"],
            "LandSlope": ["Gtl", "Mod", "Sev"],
            "Neighborhood": ["CollgCr", "Veenker", "Crawfor", "NoRidge", "Mitchel", "Somerst", "NWAmes", "OldTown", "BrkSide", "Sawyer", "NridgHt", "NAmes", "SawyerW", "IDOTRR", "MeadowV", "Edwards", "Timber", "Gilbert", "StoneBr", "ClearCr", "NPkVill", "Blmngtn", "BrDale", "SWISU", "Blueste"],
            "Condition1": ["Norm", "Feedr", "PosN", "Artery", "RRAe", "RRNn", "RRAn", "PosA", "RRNe"],
            "Condition2": ["Norm", "Artery", "RRNn", "Feedr", "PosN", "PosA", "RRAn", "RRAe", "RRNe"],
            "BldgType": ["1Fam", "2fmCon", "Duplex", "TwnhsE", "Twnhs"],
            "HouseStyle": ["2Story", "1Story", "1.5Fin", "1.5Unf", "SFoyer", "SLvl", "2.5Unf", "2.5Fin"],
            "RoofStyle": ["Gable", "Hip", "Gambrel", "Mansard", "Flat", "Shed"],
            "RoofMatl": ["CompShg", "WdShngl", "Metal", "WdShake", "Membran", "Tar&Grv", "Roll", "ClyTile"],
            "Exterior1st": ["AsbShng", "AsphShn", "BrkComm", "BrkFace", "CBlock", "CemntBd", "HdBoard",	"ImStucc", "MetalSd", "Other", "Plywood", "PreCast", "Stone", "Stucco", "VinylSd", "Wd Sdng", "WdShing"],
            "Exterior2nd": ["AsbShng", "AsphShn", "BrkComm", "BrkFace", "CBlock", "CemntBd", "HdBoard",	"ImStucc", "MetalSd", "Other", "Plywood", "PreCast", "Stone", "Stucco", "VinylSd", "Wd Sdng", "WdShing"],
            "MasVnrType": ["BrkFace", "Missing", "Stone", "BrkCmn", "CBlock"],
            "Foundation": ["PConc", "CBlock", "BrkTil", "Wood", "Slab", "Stone"],
            "Heating": ["GasA", "GasW", "Grav", "Wall", "OthW", "Floor"],
            "CentralAir": ["Y", "N"],
            "Electrical": ["SBrkr", "FuseF", "FuseA", "FuseP", "Mix", "Missing"],
            "Functional": ["Typ", "Min1", "Maj1", "Min2", "Mod", "Maj2", "Sev", "Sal"],
            "GarageType": ["Attchd", "Detchd", "BuiltIn", "CarPort", "Missing", "Basment", "2Types"],
            "PavedDrive": ["Y", "N", "P"],
            "MiscFeature": ["Missing", "Shed", "Gar2", "Othr", "TenC", "Elev"],
            "SaleType": ["WD", "New", "COD", "ConLD", "ConLI", "CWD", "ConLw", "Con", "Oth", "VWD"],
            "SaleCondition": ["Normal", "Abnorml", "Partial", "AdjLand", "Alloca", "Family"]
        } # one hot encoder column

        for f, v in one_hot_categorical_columns.items():
            if f in dataset.columns:
                one_hot_encoder = OneHotEncoder(categories=[v], sparse_output=False, handle_unknown='ignore') # one hot encoder
                dataset_encoded = pd.DataFrame(one_hot_encoder.fit_transform(dataset[[f]])) # one hot encoding
                dataset_encoded.columns = one_hot_encoder.get_feature_names_out([f])
                dataset.drop(columns=[f], inplace=True)
                dataset = pd.concat([dataset, dataset_encoded], axis=1)
        return dataset
    
    def scale_data(dataset: pd.DataFrame, scaler_name: str, dataset_type: str) -> pd.DataFrame:
        dataset = dataset.copy()
        if dataset_type == "train":
            if scaler_name == "standard":
                scaler = StandardScaler()
            elif scaler_name == "minmax":
                scaler = MinMaxScaler()
            elif scaler_name == "robust":
                scaler = RobustScaler()

            dataset_scaled = scaler.fit_transform(dataset) # scale the data    
            dataset = pd.DataFrame(data=dataset_scaled, columns=dataset.columns) # create pandas dataframe
            pickle.dump(scaler, open(os.getenv('SCALER_PATH'), 'wb')) # save the scaler

        elif dataset_type == "test":
            scaler = pickle.load(open(os.getenv('SCALER_PATH'), 'rb'))
            dataset_scaled = scaler.fit_transform(dataset) # scale the data    
            dataset = pd.DataFrame(data=dataset_scaled, columns=dataset.columns) # create pandas dataframe
        return dataset

# Data Modelling 

## Data Modelling without Feature Engineering

In [34]:
# ========================================================================================
# ============================== CROSS VALIDATION ========================================
# ========================================================================================

def preprocess_data(df: pd.DataFrame, dataset_type: str) -> pd.DataFrame:

    # ========================================================================================
    # Clean data
    # ========================================================================================
    cleaned_df = DataCleaner.drop_feature(dataset=df.copy(), columns=['Id'])
    cleaned_df = DataCleaner.change_feature_type(dataset=cleaned_df, mapper={'MSSubClass': str})
    cleaned_df = DataCleaner.impute_missing_numerical_feature(dataset=cleaned_df, dataset_type=dataset_type)
    cleaned_df = DataCleaner.impute_missing_categorical_feature(dataset=cleaned_df, dataset_type=dataset_type)

    # ========================================================================================
    # Feature selection
    # ========================================================================================
    feature_selected_df = DataFeatureSelector.select_feature(cleaned_df, 
        [
            'LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 
            'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'FullBath', 
            'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF'
        ] 
        + cleaned_df.select_dtypes(include=['object']).columns.tolist()
    )

    # ========================================================================================
    # Data transformation (encoding & scaling)
    # ========================================================================================
    feature_transformed_df = DataFeatureTransformer.log_transform_feature(feature_selected_df)
    feature_transformed_df = DataFeatureTransformer.ordinal_encoding(feature_transformed_df)
    feature_transformed_df = DataFeatureTransformer.one_hot_encoding(feature_transformed_df)
    feature_transformed_df = DataFeatureTransformer.scale_data(dataset=feature_transformed_df, scaler_name='standard', dataset_type=dataset_type)
    
    return feature_transformed_df

models = [
    LinearRegression(), Ridge(random_state=42), Lasso(random_state=42), 
    SVR(), DecisionTreeRegressor(random_state=42), RandomForestRegressor(random_state=42), 
    AdaBoostRegressor(random_state=42), GradientBoostingRegressor(random_state=42), 
    ExtraTreesRegressor(random_state=42), XGBRegressor(random_state=42), LGBMRegressor(random_state=42),
    CatBoostRegressor(random_state=42)
]

date = datetime.datetime.now().strftime('%Y%m%d')

model_names = [model.__class__.__name__ for model in models]
rmses = []
maes = []
r2s = []

for model in models:
    # Load data
    # ========================================================================================
    train_df = DataLoader.load_data(os.getenv('TRAIN_PATH'))

    # prepare X and y
    # ========================================================================================
    X_train, y_train = train_df.drop(columns=['SalePrice']), train_df['SalePrice'].to_numpy()

    # perform shuffle split validation
    # ========================================================================================
    ss = ShuffleSplit(n_splits=5, random_state=42)
    rmse, mae, r2 = [], [], []

    for train_index, val_index in ss.split(X_train):
        # Split based on generated index of folding
        # ========================================================================================
        X_train_fold, X_val_fold = X_train.iloc[train_index].reset_index(drop=True), X_train.iloc[train_index].reset_index(drop=True)
        y_train_fold, y_val_fold = y_train[train_index], y_train[train_index]

        # Preprocess training and validation data
        # ========================================================================================
        X_train_fold_preprocessed = preprocess_data(X_train_fold, dataset_type="train")
        X_val_fold_preprocessed = preprocess_data(X_val_fold, dataset_type="test")

        # Train
        # ========================================================================================
        cv_model = clone(model)
        cv_model.fit(X_train_fold_preprocessed, y_train_fold)

        # Predict & evaluate
        # ========================================================================================
        y_val_fold_pred = cv_model.predict(X_val_fold_preprocessed)
        rmse.append(np.sqrt(mean_squared_error(y_val_fold, y_val_fold_pred)))
        mae.append(mean_absolute_error(y_val_fold, y_val_fold_pred))
        r2.append(r2_score(y_val_fold, y_val_fold_pred))

    # Record for final summary
    # ========================================================================================
    rmses.append(np.mean(rmse))
    maes.append(np.mean(mae))
    r2s.append(np.mean(r2))
    print(f"{model.__class__.__name__} done")

pd.DataFrame({
    'model': model_names,
    'rmse': rmses,
    'mae': maes,
    'r2': r2s
}).sort_values(by=['rmse'])

LinearRegression done
Ridge done


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso done
SVR done
DecisionTreeRegressor done
RandomForestRegressor done
AdaBoostRegressor done
GradientBoostingRegressor done
ExtraTreesRegressor done
XGBRegressor done
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2685
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 143
[LightGBM] [Info] Start training from score 180704.734399
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2674
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 143
[LightGBM] [Info] Start training from score 179888.022831
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2683
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 144
[LightGBM] [Info] Start training from score 181533.502283
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2680
[LightGB

Unnamed: 0,model,rmse,mae,r2
4,DecisionTreeRegressor,117.041147,4.56621,0.999997
8,ExtraTreesRegressor,117.041147,4.56621,0.999997
9,XGBRegressor,2143.365665,1511.768416,0.999266
11,CatBoostRegressor,6932.921513,5322.31997,0.99232
5,RandomForestRegressor,11163.791439,6543.612521,0.9801
10,LGBMRegressor,12464.915056,6438.08686,0.975217
7,GradientBoostingRegressor,14821.01068,10976.676039,0.96492
2,Lasso,26089.944912,17831.56771,0.89145
1,Ridge,26091.14581,17825.448919,0.89144
0,LinearRegression,26096.157058,17856.727164,0.891398


We don't pick Decision Tree Regressor and Extra Tree Regressor because it is a clear indication of overfitting.

In [38]:
# ========================================================================================
# =========================== TRAIN MODEL FOR KAGGLE =====================================
# ========================================================================================

def preprocess_data(df: pd.DataFrame, dataset_type: str) -> pd.DataFrame:

    # ========================================================================================
    # Clean data
    # ========================================================================================
    cleaned_df = DataCleaner.drop_feature(dataset=df.copy(), columns=['Id'])
    cleaned_df = DataCleaner.change_feature_type(dataset=cleaned_df, mapper={'MSSubClass': str})
    cleaned_df = DataCleaner.impute_missing_numerical_feature(dataset=cleaned_df, dataset_type=dataset_type)
    cleaned_df = DataCleaner.impute_missing_categorical_feature(dataset=cleaned_df, dataset_type=dataset_type)

    # ========================================================================================
    # Feature selection
    # ========================================================================================
    feature_selected_df = DataFeatureSelector.select_feature(cleaned_df, 
        [
            'LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 
            'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'FullBath', 
            'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF'
        ] 
        + cleaned_df.select_dtypes(include=['object']).columns.tolist()
    )

    # ========================================================================================
    # Data transformation (encoding & scaling)
    # ========================================================================================
    feature_transformed_df = DataFeatureTransformer.log_transform_feature(feature_selected_df)
    feature_transformed_df = DataFeatureTransformer.ordinal_encoding(feature_transformed_df)
    feature_transformed_df = DataFeatureTransformer.one_hot_encoding(feature_transformed_df)
    feature_transformed_df = DataFeatureTransformer.scale_data(dataset=feature_transformed_df, scaler_name='standard', dataset_type=dataset_type)
    
    return feature_transformed_df

models = [
    RandomForestRegressor(random_state=42), GradientBoostingRegressor(random_state=42), 
    XGBRegressor(random_state=42), LGBMRegressor(random_state=42),
    CatBoostRegressor(random_state=42)
]

model_names = [model.__class__.__name__ for model in models]
rmses = []
maes = []
r2s = []

for model in models:
    # Load data
    # ========================================================================================
    train_df = DataLoader.load_data(os.getenv('TRAIN_PATH'))

    # prepare X and y
    # ========================================================================================
    X_train, y_train = train_df.drop(columns=['SalePrice']), train_df['SalePrice'].to_numpy()

    # perform shuffle split validation
    # ========================================================================================
    ss = ShuffleSplit(n_splits=5, random_state=42)
    rmse, mae, r2 = [], [], []

    for train_index, val_index in ss.split(X_train):
        # Load data
        # ========================================================================================
        train_df = DataLoader.load_data(os.getenv('TRAIN_PATH'))
        test_df = DataLoader.load_data(os.getenv('TEST_PATH'))

        # Prepare X and y
        # ========================================================================================
        X_train, y_train = train_df.drop(columns=['SalePrice']), train_df['SalePrice'].to_numpy()
        X_test = test_df

        # Preprocess training and validation data
        # ========================================================================================
        X_train_preprocessed = preprocess_data(X_train, dataset_type="train")

        # Train
        # ========================================================================================
        model.fit(X_train_preprocessed, y_train)

        # Predict & evaluate
        # ========================================================================================
        X_test_preprocessed = preprocess_data(X_test, dataset_type="test")
        y_pred = model.predict(X_test_preprocessed)

        # Save model
        # ========================================================================================
        pickle.dump(model, open(os.path.join(os.getenv('MODELS_FOLDER'), f'{date}_{model.__class__.__name__.lower()}_model.pkl'), 'wb'))

        # Save to submit
        # ========================================================================================
        pd.DataFrame({
            'Id': test_df['Id'],
            'SalePrice': y_pred.ravel()
        }).to_csv(os.path.join(os.getenv('PROCESSED_FOLDER'), f'{date}_submission_{model.__class__.__name__.lower()}.csv'), index=False) # save

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2765
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 147
[LightGBM] [Info] Start training from score 180921.195890
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2765
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 147
[LightGBM] [Info] Start training from score 180921.195890
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2765
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 147
[LightGBM] [Info] Start training from score 180921.195890
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2765
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 147
[LightGBM] [Info] Star

# Data Modelling with Feature Engineering
Feature engineering might be an alternative to tackle multicollinearity inside each independend features. We will be using 3 best models:
1. CatBoostRegressor
2. GradientBoostingRegressor
3. LGBMRegressor

In [14]:
# ========================================================================================
# =========================== TRAIN MODEL FOR KAGGLE =====================================
# ========================================================================================

def preprocess_data(df: pd.DataFrame, dataset_type: str) -> pd.DataFrame:

    # ========================================================================================
    # Clean data
    # ========================================================================================
    cleaned_df = DataCleaner.drop_feature(dataset=df.copy(), columns=['Id'])
    cleaned_df = DataCleaner.change_feature_type(dataset=cleaned_df, mapper={'MSSubClass': str})
    cleaned_df = DataCleaner.impute_missing_numerical_feature(dataset=cleaned_df, dataset_type=dataset_type)
    cleaned_df = DataCleaner.impute_missing_categorical_feature(dataset=cleaned_df, dataset_type=dataset_type)

    # ========================================================================================
    # Feature Engineering
    # ========================================================================================
    feature_engineered_df = DataFeatureGenerator.generate_feature_by_mean(cleaned_df, ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt'], 'OverallYear')
    feature_engineered_df = DataFeatureGenerator.generate_feature_by_mean(feature_engineered_df, ['1stFlrSF',	'2ndFlrSF'], 'FloorSF')
    # feature_engineered_df['OveralQualCond'] = (feature_engineered_df['OverallQual'] + feature_engineered_df['OverallCond'])/2
    # feature_engineered_df['Bath'] = (feature_engineered_df['FullBath'] + feature_engineered_df['HalfBath'])/2
    # feature_engineered_df = DataFeatureGenerator.generate_feature_by_mean(feature_engineered_df, ['GrLivArea', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd'], 'TotalAbvGrd')
    # feature_engineered_df = DataFeatureGenerator.generate_feature_by_mean(feature_engineered_df, ['BsmtFullBath', 'BsmtHalfBath'], 'BsmtBath')


    # ========================================================================================
    # Feature selection
    # ========================================================================================
    feature_selected_df = DataFeatureSelector.select_feature(feature_engineered_df, 
        [
            'LotFrontage', 'OverallQual', 'MasVnrArea', 'OverallYear', 'FloorSF',
            'BsmtFinSF1', 'TotalBsmtSF', 'GrLivArea', 'FullBath', 
            'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF'
        ] 
        + cleaned_df.select_dtypes(include=['object']).columns.tolist()
    ) # 'YearBuilt', 'YearRemodAdd', '1stFlrSF', '2ndFlrSF', 

    # ========================================================================================
    # Data transformation (encoding & scaling)
    # ========================================================================================
    feature_transformed_df = DataFeatureTransformer.log_transform_feature(feature_selected_df)
    feature_transformed_df = DataFeatureTransformer.ordinal_encoding(feature_transformed_df)
    feature_transformed_df = DataFeatureTransformer.one_hot_encoding(feature_transformed_df)
    feature_transformed_df = DataFeatureTransformer.scale_data(dataset=feature_transformed_df, scaler_name='standard', dataset_type=dataset_type)
    
    return feature_transformed_df

models = [
    GradientBoostingRegressor(random_state=42), 
    LGBMRegressor(random_state=42),
    CatBoostRegressor(random_state=42)
]

date = datetime.datetime.now().strftime('%Y%m%d')

model_names = [model.__class__.__name__ for model in models]
rmses = []
maes = []
r2s = []

for model in models:
    # Load data
    # ========================================================================================
    train_df = DataLoader.load_data(os.getenv('TRAIN_PATH'))

    # prepare X and y
    # ========================================================================================
    X_train, y_train = train_df.drop(columns=['SalePrice']), train_df['SalePrice'].to_numpy()

    # perform shuffle split validation
    # ========================================================================================
    ss = ShuffleSplit(n_splits=5, random_state=42)
    rmse, mae, r2 = [], [], []

    for train_index, val_index in ss.split(X_train):
        # Split based on generated index of folding
        # ========================================================================================
        X_train_fold, X_val_fold = X_train.iloc[train_index].reset_index(drop=True), X_train.iloc[train_index].reset_index(drop=True)
        y_train_fold, y_val_fold = y_train[train_index], y_train[train_index]

        # Preprocess training and validation data
        # ========================================================================================
        X_train_fold_preprocessed = preprocess_data(X_train_fold, dataset_type="train")
        X_val_fold_preprocessed = preprocess_data(X_val_fold, dataset_type="test")

        # Train
        # ========================================================================================
        cv_model = clone(model)
        cv_model.fit(X_train_fold_preprocessed, y_train_fold)

        # Predict & evaluate
        # ========================================================================================
        y_val_fold_pred = cv_model.predict(X_val_fold_preprocessed)
        rmse.append(np.sqrt(mean_squared_error(y_val_fold, y_val_fold_pred)))
        mae.append(mean_absolute_error(y_val_fold, y_val_fold_pred))
        r2.append(r2_score(y_val_fold, y_val_fold_pred))

    # Record for final summary
    # ========================================================================================
    rmses.append(np.mean(rmse))
    maes.append(np.mean(mae))
    r2s.append(np.mean(r2))
    print(f"{model.__class__.__name__} done")

pd.DataFrame({
    'model': model_names,
    'rmse': rmses,
    'mae': maes,
    'r2': r2s
}).sort_values(by=['rmse'])

GradientBoostingRegressor done
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2413
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 141
[LightGBM] [Info] Start training from score 180704.734399
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2402
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 141
[LightGBM] [Info] Start training from score 179888.022831
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2412
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 142
[LightGBM] [Info] Start training from score 181533.502283
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2410
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 143
[LightGBM] [Info] Start training from score 179971.8652

Unnamed: 0,model,rmse,mae,r2
2,CatBoostRegressor,7050.801774,5453.279185,0.992062
1,LGBMRegressor,12675.557673,6619.930799,0.974369
0,GradientBoostingRegressor,15176.482862,11249.795435,0.963214


In [16]:
# ========================================================================================
# =========================== TRAIN MODEL FOR KAGGLE =====================================
# ========================================================================================

def preprocess_data(df: pd.DataFrame, dataset_type: str) -> pd.DataFrame:

    # ========================================================================================
    # Clean data
    # ========================================================================================
    cleaned_df = DataCleaner.drop_feature(dataset=df.copy(), columns=['Id'])
    cleaned_df = DataCleaner.change_feature_type(dataset=cleaned_df, mapper={'MSSubClass': str})
    cleaned_df = DataCleaner.impute_missing_numerical_feature(dataset=cleaned_df, dataset_type=dataset_type)
    cleaned_df = DataCleaner.impute_missing_categorical_feature(dataset=cleaned_df, dataset_type=dataset_type)
    
    # ========================================================================================
    # Feature Engineering
    # ========================================================================================
    feature_engineered_df = DataFeatureGenerator.generate_feature_by_mean(cleaned_df, ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt'], 'OverallYear')
    feature_engineered_df = DataFeatureGenerator.generate_feature_by_mean(feature_engineered_df, ['1stFlrSF',	'2ndFlrSF'], 'FloorSF')
    feature_engineered_df['OveralQualCond'] = (feature_engineered_df['OverallQual'] + feature_engineered_df['OverallCond'])/2
    # feature_engineered_df['Bath'] = (feature_engineered_df['FullBath'] + feature_engineered_df['HalfBath'])/2
    # feature_engineered_df = DataFeatureGenerator.generate_feature_by_mean(feature_engineered_df, ['GrLivArea', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd'], 'TotalAbvGrd')
    # feature_engineered_df = DataFeatureGenerator.generate_feature_by_mean(feature_engineered_df, ['BsmtFullBath', 'BsmtHalfBath'], 'BsmtBath')


    # ========================================================================================
    # Feature selection
    # ========================================================================================
    feature_selected_df = DataFeatureSelector.select_feature(feature_engineered_df, 
        [
            'LotFrontage', 'OveralQualCond', 'MasVnrArea', 'OverallYear', 'FloorSF',
            'BsmtFinSF1', 'TotalBsmtSF', 'GrLivArea', 'FullBath', 
            'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF'
        ] 
        + cleaned_df.select_dtypes(include=['object']).columns.tolist()
    ) # 'YearBuilt', 'YearRemodAdd', '1stFlrSF', '2ndFlrSF', 'OverallQual'


    # ========================================================================================
    # Data transformation (encoding & scaling)
    # ========================================================================================
    feature_transformed_df = DataFeatureTransformer.log_transform_feature(feature_selected_df)
    feature_transformed_df = DataFeatureTransformer.ordinal_encoding(feature_transformed_df)
    feature_transformed_df = DataFeatureTransformer.one_hot_encoding(feature_transformed_df)
    feature_transformed_df = DataFeatureTransformer.scale_data(dataset=feature_transformed_df, scaler_name='standard', dataset_type=dataset_type)
    
    return feature_transformed_df

models = [
    GradientBoostingRegressor(random_state=42), 
    LGBMRegressor(random_state=42),
    CatBoostRegressor(random_state=42)
]

model_names = [model.__class__.__name__ for model in models]
rmses = []
maes = []
r2s = []

for model in models:
    # Load data
    # ========================================================================================
    train_df = DataLoader.load_data(os.getenv('TRAIN_PATH'))
    test_df = DataLoader.load_data(os.getenv('TEST_PATH'))

    # Prepare X and y
    # ========================================================================================
    X_train, y_train = train_df.drop(columns=['SalePrice']), train_df['SalePrice'].to_numpy()
    X_test = test_df

    # Preprocess training and validation data
    # ========================================================================================
    X_train_preprocessed = preprocess_data(X_train, dataset_type="train")

    # Train
    # ========================================================================================
    model.fit(X_train_preprocessed, y_train)

    # Predict & evaluate
    # ========================================================================================
    X_test_preprocessed = preprocess_data(X_test, dataset_type="test")
    y_pred = model.predict(X_test_preprocessed)

    # Save model
    # ========================================================================================
    pickle.dump(model, open(os.path.join(os.getenv('MODELS_FOLDER'), f'{date}_{model.__class__.__name__.lower()}_fe_model.pkl'), 'wb'))

    # Save to submit
    # ========================================================================================
    pd.DataFrame({
        'Id': test_df['Id'],
        'SalePrice': y_pred.ravel()
    }).to_csv(os.path.join(os.getenv('PROCESSED_FOLDER'), f'{date}_submission_{model.__class__.__name__.lower()}_fe.csv'), index=False) # save

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2479
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 145
[LightGBM] [Info] Start training from score 180921.195890
Learning rate set to 0.043466
0:	learn: 77154.9209441	total: 2.94ms	remaining: 2.94s
1:	learn: 75081.0781578	total: 5.93ms	remaining: 2.96s
2:	learn: 72973.3976697	total: 8.41ms	remaining: 2.79s
3:	learn: 70922.7016940	total: 10.8ms	remaining: 2.68s
4:	learn: 69031.0601961	total: 14.1ms	remaining: 2.8s
5:	learn: 67279.6019931	total: 16.6ms	remaining: 2.74s
6:	learn: 65532.3398500	total: 19.6ms	remaining: 2.78s
7:	learn: 63807.0359420	total: 22ms	remaining: 2.73s
8:	learn: 62119.8187222	total: 24.3ms	remaining: 2.67s
9:	learn: 60706.8982979	total: 27.2ms	remaining: 2.69s
10:	learn: 59108.4310568	total: 29.9ms	remaining: 2.69s
11:	learn: 57645.1043438	total: 33.1ms	remaining: 2.73s
12:	learn: 56409.7364286	total: 36ms	remaining: 2.73s
13:	learn: 551

1. Using `OverallYear`, `FloorSF`, `OveralQualCond` Improve CatBoost and LGBM except Gradient

The best model according to submission is the Random Forest Model with score of 0.1462. There are some things that we can improve such as:
1. Handling multicollinearity
2. Hyperparameter Tuning

# Evaluation
1. We still exclude handling so many zeros value on selected numerical column, and multicollinearity as well.

# Next Step

In [None]:
def preprocess_data(df: pd.DataFrame, dataset_type: str) -> pd.DataFrame:

    # ========================================================================================
    # Clean data
    # ========================================================================================
    cleaned_df = DataCleaner.drop_feature(dataset=df.copy(), columns=['Id'])
    cleaned_df = DataCleaner.change_feature_type(dataset=cleaned_df, mapper={'MSSubClass': str})
    cleaned_df = DataCleaner.impute_missing_numerical_feature(dataset=cleaned_df, dataset_type=dataset_type)
    cleaned_df = DataCleaner.impute_missing_categorical_feature(dataset=cleaned_df, dataset_type=dataset_type)
    cleaned_df

    # ========================================================================================
    # Feature engineering
    # ========================================================================================
    feature_engineered_df = DataFeatureGenerator.generate_feature_by_mean(cleaned_df, ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt'], 'OverallYear')
    feature_engineered_df = DataFeatureGenerator.generate_feature_by_mean(feature_engineered_df, ['1stFlrSF',	'2ndFlrSF'], 'FloorSF')
    feature_engineered_df['OveralQualCond'] = (feature_engineered_df['OverallQual'] + feature_engineered_df['OverallCond'])/2
    feature_engineered_df['Bath'] = (feature_engineered_df['FullBath'] + feature_engineered_df['HalfBath'])/2
    feature_engineered_df = DataFeatureGenerator.generate_feature_by_mean(feature_engineered_df, ['GrLivArea', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd'], 'TotalAbvGrd')
    feature_engineered_df = DataFeatureGenerator.generate_feature_by_mean(feature_engineered_df, ['BsmtFullBath', 'BsmtHalfBath'], 'BsmtBath')
    feature_engineered_df

    # ========================================================================================
    # Feature selection
    # ========================================================================================
    feature_selected_df = DataFeatureSelector.select_feature(feature_engineered_df, 
        ['OverallQual', 'MasVnrArea', 'Fireplaces', 'GarageArea', 'OverallYear', 'FloorSF', 'TotalAbvGrd', 'BsmtBath'] 
        + feature_engineered_df.select_dtypes(include=['object']).columns.tolist()
    )
    feature_selected_df

    # ========================================================================================
    # Data transformation (encoding & scaling)
    # ========================================================================================
    feature_transformed_df = DataFeatureTransformer.log_transform_feature(feature_selected_df)
    feature_transformed_df = DataFeatureTransformer.ordinal_encoding(feature_transformed_df)
    feature_transformed_df = DataFeatureTransformer.one_hot_encoding(feature_transformed_df)
    feature_transformed_df = DataFeatureTransformer.scale_data(dataset=feature_transformed_df, scaler_name='standard', dataset_type=dataset_type)
    
    return feature_transformed_df


# Feature Engineering

In [None]:
class DataLoader:
    @staticmethod
    def load_data(path):
        if os.path.exists(path):
            return pd.read_csv(path)
        return None
# data
train_data_path = os.path.join(project_dir, os.getenv('RAW_FOLDER'), 'house-prices-advanced-regression-techniques', 'train.csv')
test_data_path = os.path.join(project_dir, os.getenv('RAW_FOLDER'), 'house-prices-advanced-regression-techniques', 'test.csv')

train_df = DataLoader.load_data(train_data_path)
test_df = DataLoader.load_data(test_data_path)

In [None]:
"""Plot Correlation Map"""
def show_correlation_map(data):
    numerical_columns = data.select_dtypes(include=['int', 'float']).columns
    corr_matrix = data.loc[:, numerical_columns].corr()

    plt.figure(figsize=(20, 20), facecolor='w', edgecolor='red')
    plt.title("Correlation HeatMap")
    sns.set(font_scale=0.7)
    sns.heatmap(
        corr_matrix,
        cmap='coolwarm',
        center = 0, 
        annot=True,
        fmt='.2f',
        square=True,
        linewidths=0.5,
        
    )
    plt.show()

    # print('Features with Correlation Value above 0.3', ', '.join(corr_matrix.loc[corr_matrix['SalePrice'] > 0.3].index.values))

In [None]:
show_correlation_map(train_df)

In [None]:
train_dataset = train_df.copy()

# LotArea and LotFrantage
train_dataset.loc[:, 'TotalLotArea'] = train_dataset.loc[:, 'LotArea'] + train_dataset.loc[:, 'LotFrontage']

# YearBuilt, YearRemodAdd, GarageYrBlt
train_dataset.loc[:, 'OverallYear'] = train_dataset.loc[:, ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']].mean(axis=1).astype(int)

# BsmtFinSF1	BsmtFinSF2	BsmtUnfSF	TotalBsmtSF	1stFlrSF	2ndFlrSF	LowQualFinSF	WoodDeckSF	OpenPorchSF
train_dataset.loc[:, 'TotalSF'] = train_dataset.loc[:, ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',	'1stFlrSF',	
    '2ndFlrSF',	'LowQualFinSF',	'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']].sum(axis=1)

# pool area
train_dataset.loc[:, 'Pool'] = train_dataset.loc[:, 'PoolArea'].apply(lambda x: 1 if x > 0 else 0)

# total above
train_dataset.loc[:, 'TotalAbvGrd'] = train_dataset.loc[:, ['GrLivArea', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd']].sum(axis=1)

# BsmtFullBath, BsmtHalfBath
train_dataset.loc[:, 'TotalBsmtBath'] = train_dataset.loc[:, ['BsmtFullBath', 'BsmtHalfBath']].sum(axis=1)


train_dataset = train_dataset.drop(columns=[
    'LotArea', 'LotFrontage', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt',
    'BsmtFinSF1', 'BsmtFinSF2',	'BsmtUnfSF', 'TotalBsmtSF',	'1stFlrSF',	
    '2ndFlrSF',	'LowQualFinSF',	'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 
    '3SsnPorch', 'ScreenPorch', 'FullBath', 'HalfBath', 'GarageCars', 'PoolArea',
    'GrLivArea', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
    'BsmtFullBath', 'BsmtHalfBath'
])

show_correlation_map(train_dataset)

In [None]:
train_df['2ndFlrSF']

In [None]:
train.LotFrontage

In [None]:
train_dataset.loc[:, ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']]

In [None]:
train_df['TotRmsAbvGrd'].hist()

In [None]:
train_df['BedroomAbvGr'].hist()

In [None]:
train_df['KitchenAbvGr'].hist()

In [None]:
train_df[['FullBath', 'HalfBath']]