# House Prices Prediction

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler


from pycaret.regression import setup, compare_models
from sklearn.model_selection import KFold, cross_val_score
import lightgbm as lgb

from sklearn.linear_model import BayesianRidge, HuberRegressor, Ridge, OrthogonalMatchingPursuit

### Read Data

In [2]:
training_set_path = '../dataset/train.csv'
train0 = pd.read_csv(training_set_path)

testing_set_path = '../dataset/test.csv'
test0 = pd.read_csv(testing_set_path)

## Cleaning

### Drop Outlier

In [3]:
train1 = train0[train0.GrLivArea < 4500]
train1.dropna(axis=0, subset=['MasVnrArea', 'Electrical'], inplace=True)
train1.reset_index(drop=True, inplace=True)

### Concrat train and test set

In [4]:
y = train1['SalePrice']
test_ids = test0['Id']

train1.drop(['Id', 'SalePrice'], axis=1, inplace=True)
test0.drop(['Id'], axis=1, inplace=True)

data1 = pd.concat([train1, test0], axis=0).reset_index(drop=True)
data1

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2903,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
2904,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
2905,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
2906,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


### Set data type as str

In [5]:
data2 = data1.copy()

In [6]:
data2['MSSubClass'] = data2['MSSubClass'].astype(str)

### Fill Missing Value (Catagorical)

In [7]:
#For these columns, missing value means the feature is not available. So 'None' is filled, which is easier for future features engineering.
for column in [
    'Alley',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PoolQC',
    'Fence',
    'MiscFeature'
]:
    data2[column] = data2[column].fillna("None")

#In the data description document, these columns are not supposed to contain missing value.
#Therefore, we are filling the mode of the column as the missing value
for column in [
    'MSZoning',
    'Utilities',
    'Exterior1st',
    'Exterior2nd',
    'MasVnrType',
    'Electrical',
    'KitchenQual',
    'Functional',
    'SaleType'
]:
    data2[column] = data2[column].fillna(data2[column].mode()[0])

### Fill Missing Value (Numeric)

In [8]:
for column in [
    'LotFrontage',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'GarageYrBlt',
    'GarageCars',
    'GarageArea'
]:
    data2[column] = data2[column].fillna(0)

## Features Engineering

### Binary Transformation

In [9]:
data3 = data2.copy()

In [10]:
#Convert the columns 'CentralAir' from (Y/N) to (1/0)
data3['CentralAir'] = data3['CentralAir'].replace({'Y': 1, 'N': 0})

#Convert the data in the column 'Street' to binary, where 'Pave' to 1 and 'Grvl' to 0.
data3['Street'] = data3['Street'].replace({'Pave': 1, 'Grvl': 0})

#Convert the data in the column 'Alley' to binary.
#The way of doing it is to group 'Pave' and 'Grvl'
#If the property has an alley access, then 1; 
#Otherwise 0.
data3['Alley'] = data3['Alley'].replace({'Pave': 1, 'Grvl': 1, 'None': 0})

#Convert the data in the column 'LotShape' to binary, where 'Reg' to 1 and grouping 'IR1', 'IR2' and 'IR3' to 0.
#If the shape of the property is regular, then 1;
#Otherwise 0.
data3['LotShape'] = data3['LotShape'].replace({'Reg': 1, 'IR1': 0, 'IR2': 0, 'IR3': 0})

#Convert the data in the column 'LandContour' to binary, where 'Lvl' to 1 and grouping 'Bnk', 'HLS' and 'Low' to 0.
#If the property is near flat/level, then 1;
#Otherwise 0.
data3['LandContour'] = data3['LandContour'].replace({'Lvl': 1, 'Bnk': 0, 'HLS': 0, 'Low': 0})

#Convert the data in the column 'Utilities' to binary, where 'AllPub' to 1 and grouping 'NoSewr', 'NoSeWa' and 'ELO' to 0.
#If all type of utilities is available, then 1;
#Otherwise 0.
data3['Utilities'] = data3['Utilities'].replace({'AllPub': 1, 'NoSewr': 0, 'NoSeWa': 0, 'ELO': 0})

### Ordinal Encoding

In [11]:
data4 = data3.copy()

In [12]:
#Define the desired order for each categorical feature
LandSlope_order = ['Sev', 'Mod', 'Gtl']
ExterQual_order = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
ExterCond_order = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
BsmtQual_order = ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
BsmtCond_order = ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
BsmtExposure_order = ['None', 'No', 'Mn', 'Av', 'Gd']
BsmtFinType1_order = ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
BsmtFinType2_order = ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
HeatingQC_order = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
KitchenQual_order = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
Functional_order = ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ']
FireplaceQu_order = ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
GarageFinish_order = ['None', 'Unf', 'RFn', 'Fin']
GarageQual_order = ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
GarageCond_order = ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
PavedDrive_order = ['N', 'P', 'Y']
PoolQC_order = ['None', 'Fa', 'TA', 'Gd', 'Ex']
Fence_order = ['None', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']

# List of columns to transform
columns_to_transform_OE = [
    'LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
    'KitchenQual', 'Functional', 'FireplaceQu', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence'
]

# Convert specified columns to strings
data4[columns_to_transform_OE] = data4[columns_to_transform_OE].astype(str)

# Create an instance of OrdinalEncoder with specified categories
ordinal_encoder = OrdinalEncoder(categories=[
    LandSlope_order,
    ExterQual_order,
    ExterCond_order,
    BsmtQual_order,
    BsmtCond_order,
    BsmtExposure_order,
    BsmtFinType1_order,
    BsmtFinType2_order,
    HeatingQC_order,
    KitchenQual_order,
    Functional_order,
    FireplaceQu_order,
    GarageFinish_order,
    GarageQual_order,
    GarageCond_order,
    PavedDrive_order,
    PoolQC_order,
    Fence_order
])

# Apply the encoder to the specified columns
encoded_columns_OE = ordinal_encoder.fit_transform(data4[columns_to_transform_OE])

# Create a new DataFrame with encoded values
OE_df = pd.DataFrame(encoded_columns_OE, columns=columns_to_transform_OE)

clean_set_dropped = data4.drop(columns=columns_to_transform_OE)

data4 = pd.concat([clean_set_dropped, OE_df], axis=1)

### Catagorical Variables

In [13]:
data5 = data4.copy()

In [14]:
data5 = pd.get_dummies(data5)

### Date Columns

### Cyclical Encoding (MoSold)

In [15]:
data6 = data5.copy()

In [16]:
data6['MoSold'] = -np.cos((2 * np.pi / 12) * data6['MoSold'])

### Year Difference (YrSold)

In [17]:
data6['YrSold'] = 2010 - data6['YrSold']

In [18]:
data7 = data6.copy()

## Split the data

In [19]:
train_split = data7.loc[:train1.index.max(), :].copy()
test_split = data7.loc[train1.index.max() + 1:, :].reset_index(drop=True).copy()

## Scaling

Scaling after the split to prevent data leakage

In [20]:
scaler = StandardScaler()
scaler.fit(train_split)

train_final = pd.DataFrame(scaler.transform(train_split), index = train_split.index, columns = train_split.columns)
test_final = pd.DataFrame(scaler.transform(test_split), index = test_split.index, columns = test_split.columns)

## Y Transformation

In [21]:
log_y = np.log(y)

## Model Selection

In [22]:
_ = setup(data = pd.concat([train_final, log_y], axis = 1), target = 'SalePrice')

Unnamed: 0,Description,Value
0,Session id,2109
1,Target,SalePrice
2,Target type,Regression
3,Original data shape,"(1449, 232)"
4,Transformed data shape,"(1449, 232)"
5,Transformed train set shape,"(1014, 232)"
6,Transformed test set shape,"(435, 232)"
7,Numeric features,231
8,Preprocess,True
9,Imputation type,simple


In [23]:
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
br,Bayesian Ridge,0.0833,0.0141,0.1182,0.9133,0.0092,0.007,0.014
omp,Orthogonal Matching Pursuit,0.0881,0.0148,0.1213,0.9093,0.0094,0.0074,0.014
gbr,Gradient Boosting Regressor,0.0888,0.0156,0.1244,0.9037,0.0097,0.0074,0.086
ridge,Ridge Regression,0.0892,0.0166,0.1279,0.8977,0.01,0.0075,0.014
lightgbm,Light Gradient Boosting Machine,0.091,0.0172,0.1302,0.8934,0.0102,0.0076,0.159
et,Extra Trees Regressor,0.0951,0.0188,0.1364,0.8839,0.0106,0.008,0.175
rf,Random Forest Regressor,0.0969,0.0198,0.1399,0.8779,0.0109,0.0081,0.174
ada,AdaBoost Regressor,0.1314,0.0295,0.1713,0.8177,0.0133,0.011,0.056
knn,K Neighbors Regressor,0.1439,0.0409,0.2017,0.747,0.0156,0.012,0.024
dt,Decision Tree Regressor,0.1522,0.0462,0.2138,0.7173,0.0166,0.0127,0.015


br
omp
gbr
ridge
lightgbm
et
rf

## Bagging Ensemble

In [24]:
models = {
    "gbr": GradientBoostingRegressor(verbose=0),
    "br": BayesianRidge(),
    "omp": OrthogonalMatchingPursuit(),
    "et": ExtraTreesRegressor(),
    "rf": RandomForestRegressor()
}

In [25]:
for name, model in models.items():
    model.fit(train_final, log_y)
    print(name + " trained.")

gbr trained.
br trained.
omp trained.
et trained.
rf trained.


## Evaluate

In [29]:
results = {}

kf = KFold(n_splits = 10)

for name, model in models.items():
    result = np.exp(np.sqrt(- cross_val_score(model, train_final, log_y, scoring = 'neg_mean_squared_error', cv = kf)))
    results[name] = result

In [37]:
results

{'gbr': array([1.12841874, 1.10473032, 1.11904937, 1.1473761 , 1.15786149,
        1.10804903, 1.13872306, 1.11726715, 1.10980109, 1.14666161]),
 'br': array([1.12436258, 1.10824122, 1.12375813, 1.13539103, 1.15492506,
        1.10307636, 1.13471608, 1.10474388, 1.10060412, 1.13475217]),
 'omp': array([1.13098381, 1.11008719, 1.13000372, 1.13008093, 1.15787519,
        1.1101573 , 1.12678807, 1.10815925, 1.10451802, 1.1478073 ]),
 'et': array([1.13540617, 1.12174685, 1.1417994 , 1.14879697, 1.18837702,
        1.10931617, 1.14023221, 1.12098472, 1.1242646 , 1.15100972]),
 'rf': array([1.1556519 , 1.13038153, 1.14241387, 1.15133669, 1.17866509,
        1.1138091 , 1.1543454 , 1.12560095, 1.1324369 , 1.16434483])}

In [36]:
for name, model in results.items():
    print(name)
    print(np.mean(result))
    print(np.std(result))

gbr
1.1448986257120486
0.018643565709715138
br
1.1448986257120486
0.018643565709715138
omp
1.1448986257120486
0.018643565709715138
et
1.1448986257120486
0.018643565709715138
rf
1.1448986257120486
0.018643565709715138


## Combine Predictions

In [38]:
final_predictions = (
    0.2 * np.exp(models['gbr'].predict(test_final)) +
    0.2 * np.exp(models['br'].predict(test_final)) +
    0.2 * np.exp(models['omp'].predict(test_final)) +
    0.2 * np.exp(models['et'].predict(test_final)) +
    0.2 * np.exp(models['rf'].predict(test_final)) 
)

## Submission

In [39]:
submission2 = pd.concat([test_ids, pd.Series(final_predictions, name = 'SalePrice')], axis = 1)

In [41]:
submission2.to_csv('./submission2.csv', index = False, header = True)

In [30]:
#01: 0.14094
#02: 0.13341