# House price regression

# 1. Import libraries

In [None]:
import numpy as np
import pandas as pd

from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import KFold, cross_validate

from scipy.stats import skew, boxcox_normmax
from scipy.special import boxcox1p, inv_boxcox1p

# Show up to 500 columns when a DataFrame is displayed.
pd.options.display.max_columns = 500

In [None]:
# Load the competition's train and test splits.
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# The regression target is log1p(SalePrice); the features are every other
# training column.
y = np.log1p(train['SalePrice'].to_numpy())
X = train.drop('SalePrice', axis=1)

# Frame to predict on. This is the same object as `test`, not a copy.
X_pred = test

# Shuffled 10-fold split with a fixed seed.
folds = KFold(n_splits=10, random_state=0, shuffle=True)

# 2. Preprocessing pipeline

Steps to include in the preprocessing pipeline:
- Drop features
- Convert columns to the correct data type
- Impute missing data
- Feature engineering
- Encode categorical data

In [None]:
def reformat_categorical_data(df):
    """Return a copy of ``df`` with numeric-coded categoricals cast to ``str``.

    ``MSSubClass`` and ``MoSold`` are stored as numbers, so they are cast to
    strings here; the one-hot encoder only picks up object/category columns.

    The input frame is left untouched (``DataFrame.astype`` returns a new
    frame), matching ``drop_features``.

    Args:
        df: DataFrame containing the ``MSSubClass`` and ``MoSold`` columns.

    Returns:
        A new DataFrame with both columns converted to strings.

    Raises:
        KeyError: if either column is missing from ``df``.
    """
    return df.astype({'MSSubClass': str, 'MoSold': str})

def drop_features(df):
    """Return a new frame without the columns excluded from modelling.

    Args:
        df: DataFrame containing 'Utilities', 'Street', 'PoolQC' and 'Id'.

    Returns:
        A new DataFrame with those four columns removed; ``df`` itself is
        not modified.

    Raises:
        KeyError: if any of the four columns is missing from ``df``.
    """
    unused_columns = ['Utilities', 'Street', 'PoolQC', 'Id']
    return df.drop(labels=unused_columns, axis=1)

In [None]:
class custom_FT:
    """Stateless pipeline step that wraps a plain function.

    ``fit`` learns nothing and ``transform`` simply applies the wrapped
    function to its input.
    """

    def __init__(self, func):
        # Callable applied to the data on every ``transform`` call.
        self.func = func

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; all arguments are ignored."""
        return self

    def transform(self, input_df, **transform_params):
        """Return ``self.func(input_df)``; extra keyword arguments are ignored."""
        result = self.func(input_df)
        return result

In [None]:
class custom_imputer:
    """Impute missing values with a per-column strategy.

    Args:
        imp_0: columns whose missing values are replaced by ``0``.
        imp_none: columns whose missing values are replaced by ``'none'``.
        imp_med: columns whose missing values are replaced by the median
            learned in ``fit``.
        imp_freq: columns whose missing values are replaced by the most
            frequent value learned in ``fit``.
    """

    def __init__(self, imp_0, imp_none, imp_med, imp_freq):
        self.imp_0 = imp_0
        self.imp_none = imp_none
        self.imp_med = imp_med
        self.imp_freq = imp_freq

        # Statistics learned in ``fit`` and reused to transform other data.
        self.median_values = {}
        self.most_frequent_values = {}

    def fit(self, X, y=None):
        """Learn the median / most frequent value of the configured columns.

        The dictionaries are rebuilt on every call so a refit never keeps
        statistics from a previous dataset.

        Raises:
            KeyError: if a configured column is missing from ``X``.
            IndexError: if an ``imp_freq`` column has no non-missing value.
        """
        # Use the real median here: the mode was previously stored by mistake.
        self.median_values = {col: X[col].median() for col in self.imp_med}
        self.most_frequent_values = {
            col: X[col].mode().iloc[0] for col in self.imp_freq
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns imputed.

        The input frame is not modified. Rules are applied in the order
        0 -> 'none' -> median -> most frequent.

        Raises:
            KeyError: if a configured column is missing from ``X`` or was not
                seen by ``fit``.
        """
        filled = X.copy()
        for col in self.imp_0:
            filled[col] = filled[col].fillna(0)
        for col in self.imp_none:
            filled[col] = filled[col].fillna('none')
        for col in self.imp_med:
            filled[col] = filled[col].fillna(self.median_values[col])
        for col in self.imp_freq:
            filled[col] = filled[col].fillna(self.most_frequent_values[col])
        return filled

In [None]:
def feature_engineering(df):
    """Return a copy of ``df`` extended with engineered features.

    Added columns, in this order:
        Has_shed, Remodeled, Recent_remodel, Very_new_house, Has_2nd_floor,
        Has_pool, Has_Wood_deck (0/1 indicator flags), then TotalSF, age and
        Total_bath.

    The input frame is left untouched (``DataFrame.assign`` returns a new
    frame), matching ``drop_features``.

    Args:
        df: DataFrame holding the raw columns referenced below.

    Returns:
        A new DataFrame with the ten extra columns appended.

    Raises:
        KeyError: if one of the referenced columns is missing.
    """
    return df.assign(
        Has_shed=(df["MiscFeature"] == "Shed").astype(int),
        Remodeled=(df["YearRemodAdd"] != df["YearBuilt"]).astype(int),
        Recent_remodel=(df["YearRemodAdd"] == df["YrSold"]).astype(int),
        Very_new_house=(df["YearBuilt"] == df["YrSold"]).astype(int),
        Has_2nd_floor=(df["2ndFlrSF"] != 0).astype(int),
        Has_pool=(df["PoolArea"] != 0).astype(int),
        Has_Wood_deck=(df["WoodDeckSF"] != 0).astype(int),
        TotalSF=df['1stFlrSF'] + df['2ndFlrSF'] + df['TotalBsmtSF'],
        age=df['YrSold'] - df['YearBuilt'],
        # Half baths count as 0.5, both above ground and in the basement.
        Total_bath=(
            df['FullBath'] + 0.5 * df['HalfBath']
            + df['BsmtFullBath'] + 0.5 * df['BsmtHalfBath']
        ),
    )

In [None]:
class DataFrameOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the object/category columns of a DataFrame.

    One ``OneHotEncoder`` is fitted per categorical column. ``transform``
    returns a DataFrame made of the untouched remaining columns followed by
    the encoded columns, named ``<column>_<category>``.

    Args:
        drop: forwarded to ``OneHotEncoder(drop=...)``.
        handle_unknown: forwarded to ``OneHotEncoder(handle_unknown=...)``.

    Attributes set by ``fit``:
        cat_cols_: names of the encoded columns, in encoding order.
        onehotencoders_: fitted encoders, aligned with ``cat_cols_``.
        col_names: output column names per encoder, aligned with ``cat_cols_``.
    """

    def __init__(self, drop=None, handle_unknown="error"):
        # Only ``drop`` and ``handle_unknown`` are configurable; the output
        # dtype is fixed and every other encoder option keeps its default.
        self.drop = drop
        self.dtype = np.float64
        self.handle_unknown = handle_unknown

    @staticmethod
    def _feature_names(ohe, column):
        """Return the ``<column>_<category>`` output names of a fitted encoder."""
        # ``get_feature_names`` was replaced by ``get_feature_names_out`` in
        # newer scikit-learn releases; support whichever one is available.
        if hasattr(ohe, "get_feature_names_out"):
            names = ohe.get_feature_names_out([column])
        else:
            names = ohe.get_feature_names([column])
        return list(names)

    def fit(self, X, y=None):
        """Fit one encoder per object/category column of ``X``."""
        # Remember which columns were encoded so ``transform`` uses exactly
        # the same columns, in the same order, whatever the dtypes of the
        # frame it receives.
        self.cat_cols_ = list(
            X.select_dtypes(include=["object", "category"]).columns
        )
        self.onehotencoders_ = []
        self.col_names = []
        for c in self.cat_cols_:
            # The sparse/dense switch was renamed across scikit-learn
            # versions, so keep the default output and densify in transform.
            ohe = OneHotEncoder(drop=self.drop,
                                dtype=self.dtype,
                                handle_unknown=self.handle_unknown)
            ohe.fit(X.loc[:, [c]])
            self.onehotencoders_.append(ohe)
            self.col_names.append(self._feature_names(ohe, c))
        return self

    def transform(self, X):
        """Return ``X`` with the fitted categorical columns one-hot encoded.

        Raises:
            KeyError: if a column seen during ``fit`` is missing from ``X``.
        """
        encoded = []
        for c, ohe, names in zip(self.cat_cols_, self.onehotencoders_,
                                 self.col_names):
            values = ohe.transform(X.loc[:, [c]])
            if hasattr(values, "toarray"):
                # Sparse output: convert to a dense array for the DataFrame.
                values = values.toarray()
            # Reuse X's index so the final concat aligns row by row even
            # when X does not carry a default RangeIndex.
            encoded.append(pd.DataFrame(values, columns=names, index=X.index))

        num_df = X.drop(columns=self.cat_cols_)
        # Also valid when there is no categorical column at all.
        return pd.concat([num_df, *encoded], axis=1)

In [None]:
# Columns whose missing values are filled with 0.
imp_0 = [
    'BsmtFullBath', 'BsmtHalfBath', 'TotalBsmtSF', 'BsmtFinSF2',
    'GarageYrBlt', 'GarageArea', 'GarageCars', 'MasVnrArea',
    'BsmtUnfSF', 'BsmtFinSF1',
]

# Columns whose missing values are filled with the string 'none'.
# NOTE(review): 'MSSubClass' is cast to str before the imputer runs, so it may
# no longer contain NaN at that point -- confirm this entry has any effect.
imp_none = [
    'MSSubClass', 'HeatingQC', 'MiscFeature', 'Alley', 'Fence',
    'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
    'GarageCond', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtFinType1',
    'BsmtExposure', 'BsmtFinType2', 'MasVnrType', 'KitchenQual',
]

# Columns passed to the imputer as its median-imputed group.
imp_med = ['LotFrontage']

# Columns filled with their most frequent value.
imp_freq = [
    'SaleType', 'Electrical', 'Exterior1st', 'Exterior2nd',
    'MSZoning', 'Functional',
]

In [None]:
# Preprocessing steps, applied in this order: drop columns, cast
# numeric-coded categoricals to str, impute, add engineered features, one-hot
# encode.
pipeline = Pipeline(steps=[
    ('drop', custom_FT(drop_features)),
    ('reformat data type', custom_FT(reformat_categorical_data)),
    ('imputer', custom_imputer(imp_0, imp_none, imp_med, imp_freq)),
    ('feature_engineering', custom_FT(feature_engineering)),
    ('encoding', DataFrameOneHotEncoder(handle_unknown="ignore")),
])

# Fit on the training features only, then apply the fitted steps to both the
# training frame and the frame to predict on.
X_preprocessed = pipeline.fit(X).transform(X)
X_pred_preprocessed = pipeline.transform(X_pred)

In [None]:
# Display the raw training features (the training frame without 'SalePrice').
X

In [None]:
# Display the training features after the preprocessing pipeline.
X_preprocessed

In [None]:
# Shapes of the preprocessed training frame and the preprocessed frame to predict on.
X_preprocessed.shape, X_pred_preprocessed.shape

# 3. Prediction pipeline

In [None]:
# Candidate regressors, keyed by the name used in the report and in the
# predictions table.
models_list = {
    'Ridge': Ridge(),
    'SVR': SVR(),
    'LGBMRegressor': LGBMRegressor(verbosity=0, force_row_wise=True),
    'Lasso': Lasso(alpha=0.0005),
    # NOTE(review): l1_ratio=1 makes the ElasticNet penalty pure L1 --
    # confirm this is meant to differ from the Lasso entry only by alpha.
    'ElasticNet': ElasticNet(alpha=0.0006, l1_ratio=1),
}

scoring = {'r2': 'r2'}
columns = ['Model', 'Median fit time', 'Mean r2', 'Std r2']

# Convert both frames to arrays once, outside the model loop.
x = X_preprocessed.to_numpy()
x_pred = X_pred_preprocessed.to_numpy()

model_perf_matrix = []
predictions = pd.DataFrame()
for model_name, model in models_list.items():
    # NOTE: this rebinds the global name `pipeline`, which previously held
    # the preprocessing pipeline.
    pipeline = Pipeline([
        ('model', model)
    ])

    # Cross-validated r2 on the log1p-transformed target.
    cv_score = cross_validate(pipeline, x, y, cv=folds, scoring=scoring,
                              verbose=0, error_score="raise")
    model_perf_matrix.append([
        model_name,
        # Report the median so the value matches the 'Median fit time' label.
        round(float(np.median(cv_score['fit_time'])), 3),
        round(cv_score['test_r2'].mean(), 4),
        round(cv_score['test_r2'].std(), 4),
    ])

    # Refit on all training rows, then map predictions back from the log1p
    # scale with expm1 and round down.
    pipeline.fit(x, y)
    predictions[model_name] = np.floor(np.expm1(pipeline.predict(x_pred)))

df_model_perf = pd.DataFrame(model_perf_matrix, columns=columns)
df_model_perf

In [None]:
# Preview the first 20 rows of the per-model predictions.
predictions.head(20)