# Prediction of housing prices using Random Forest Regression

## Importing libraries

In [1]:
import copy

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt

## Reading datasets

In [2]:
TRAINING_SET_PATH = '../dataset/train.csv'
training_set = pd.read_csv(TRAINING_SET_PATH)

TESTING_SET_PATH = '../dataset/test.csv'
testing_set = pd.read_csv(TESTING_SET_PATH)

## Data Preprocessing

In [3]:
# Combining both sets to process together
print(f'Training set length: {len(training_set)}; Testing set lenth: {len(testing_set)}')
total_set = pd.concat([training_set, testing_set]).reset_index()
print(f'Total set length: {len(total_set)}')
testing_min_id = 1461

Training set length: 1460; Testing set lenth: 1459
Total set length: 2919


### Handling missing values

In [4]:
def display_missing_values(df):
    missing_values = df.isnull().sum().to_frame()
    missing_values.columns = ['Missing_Values']
    missing_values = missing_values.query('Missing_Values > 0')
    print(missing_values)

display_missing_values(total_set)

              Missing_Values
MSZoning                   4
LotFrontage              486
Alley                   2721
Utilities                  2
Exterior1st                1
Exterior2nd                1
MasVnrType              1766
MasVnrArea                23
BsmtQual                  81
BsmtCond                  82
BsmtExposure              82
BsmtFinType1              79
BsmtFinSF1                 1
BsmtFinType2              80
BsmtFinSF2                 1
BsmtUnfSF                  1
TotalBsmtSF                1
Electrical                 1
BsmtFullBath               2
BsmtHalfBath               2
KitchenQual                1
Functional                 2
FireplaceQu             1420
GarageType               157
GarageYrBlt              159
GarageFinish             159
GarageCars                 1
GarageArea                 1
GarageQual               159
GarageCond               159
PoolQC                  2909
Fence                   2348
MiscFeature             2814
SaleType      

In [12]:
cleaned_set = copy.deepcopy(total_set)

#### Handling Bsmt

Upon further inspection in the dataset, rows with NA *BsmtQual* have NA values for the rest of *BsmtColumns*

It would be suitable to create a new column of binary value to indicate whether the row has a *Bsmt*

In [11]:
# Inspecting rows with NA BsmtQual
mask = total_set['BsmtQual'].isna()
cols = ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF')
total_set.loc[mask, cols]

Unnamed: 0,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtFinSF1,BsmtFinType2.1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF
17,,,,,,0.0,,0.0,0.0,0.0
39,,,,,,0.0,,0.0,0.0,0.0
90,,,,,,0.0,,0.0,0.0,0.0
102,,,,,,0.0,,0.0,0.0,0.0
156,,,,,,0.0,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
2803,,,,,,0.0,,0.0,0.0,0.0
2804,,,,,,0.0,,0.0,0.0,0.0
2824,,,,,,0.0,,0.0,0.0,0.0
2891,,,,,,0.0,,0.0,0.0,0.0


In [13]:
# Inspecting rows with non-NA BsmtQual
mask = total_set['BsmtQual'].notna()
cols = ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF')
total_set.loc[mask, cols]

Unnamed: 0,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtFinSF1,BsmtFinType2.1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF
0,Gd,TA,No,GLQ,Unf,706.0,Unf,0.0,150.0,856.0
1,Gd,TA,Gd,ALQ,Unf,978.0,Unf,0.0,284.0,1262.0
2,Gd,TA,Mn,GLQ,Unf,486.0,Unf,0.0,434.0,920.0
3,TA,Gd,No,ALQ,Unf,216.0,Unf,0.0,540.0,756.0
4,Gd,TA,Av,GLQ,Unf,655.0,Unf,0.0,490.0,1145.0
...,...,...,...,...,...,...,...,...,...,...
2914,TA,TA,No,Unf,Unf,0.0,Unf,0.0,546.0,546.0
2915,TA,TA,No,Rec,Unf,252.0,Unf,0.0,294.0,546.0
2916,TA,TA,No,ALQ,Unf,1224.0,Unf,0.0,0.0,1224.0
2917,Gd,TA,Av,GLQ,Unf,337.0,Unf,0.0,575.0,912.0


In [16]:
# Creating a indicator column
cleaned_set = cleaned_set.assign(BsmtIncluded=training_set['BsmtQual'].notna())

# Replaceing NA with an arbitrary value
mask = cleaned_set['BsmtQual'].isna()
cat_cols = ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2')
num_cols = ('BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF')

cleaned_set.loc[mask, cat_cols] = str('NA')
cleaned_set.loc[mask, num_cols] = 0

display_missing_values(cleaned_set)

              Missing_Values
MSZoning                   4
LotFrontage              486
Alley                   2721
Utilities                  2
Exterior1st                1
Exterior2nd                1
MasVnrType              1766
MasVnrArea                23
BsmtCond                   3
BsmtExposure               3
BsmtFinSF1                 1
BsmtFinType2               1
Electrical                 1
BsmtFullBath               2
BsmtHalfBath               2
KitchenQual                1
Functional                 2
FireplaceQu             1420
GarageType               157
GarageYrBlt              159
GarageFinish             159
GarageCars                 1
GarageArea                 1
GarageQual               159
GarageCond               159
PoolQC                  2909
Fence                   2348
MiscFeature             2814
SaleType                   1
SalePrice               1459
BsmtIncluded            1459


In [29]:
# cleanup BsmtCond
mask = cleaned_set['BsmtCond'].isna()
cols = ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF')

# Replaceing NA with an arbitrary value
cat_cols = ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2')
num_cols = ('BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF')

cleaned_set.loc[mask, cat_cols] = str('NA')
cleaned_set.loc[mask, num_cols] = 0

display_missing_values(cleaned_set)

              Missing_Values
MSZoning                   4
LotFrontage              486
Alley                   2721
Utilities                  2
Exterior1st                1
Exterior2nd                1
MasVnrType              1766
MasVnrArea                23
BsmtFinSF1                 1
BsmtFinType2               1
Electrical                 1
BsmtFullBath               2
BsmtHalfBath               2
KitchenQual                1
Functional                 2
FireplaceQu             1420
GarageType               157
GarageYrBlt              159
GarageFinish             159
GarageCars                 1
GarageArea                 1
GarageQual               159
GarageCond               159
PoolQC                  2909
Fence                   2348
MiscFeature             2814
SaleType                   1
SalePrice               1459
BsmtIncluded            1459


In [30]:
# cleanup BsmtExposure
mask = cleaned_set['BsmtExposure'].isna()
cols = ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF')

# Replaceing NA with an arbitrary value
cat_cols = ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2')
num_cols = ('BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF')

cleaned_set.loc[mask, cat_cols] = str('NA')
cleaned_set.loc[mask, num_cols] = 0

display_missing_values(cleaned_set)

              Missing_Values
MSZoning                   4
LotFrontage              486
Alley                   2721
Utilities                  2
Exterior1st                1
Exterior2nd                1
MasVnrType              1766
MasVnrArea                23
BsmtFinSF1                 1
BsmtFinType2               1
Electrical                 1
BsmtFullBath               2
BsmtHalfBath               2
KitchenQual                1
Functional                 2
FireplaceQu             1420
GarageType               157
GarageYrBlt              159
GarageFinish             159
GarageCars                 1
GarageArea                 1
GarageQual               159
GarageCond               159
PoolQC                  2909
Fence                   2348
MiscFeature             2814
SaleType                   1
SalePrice               1459
BsmtIncluded            1459


In [32]:
# cleanup BsmtExposure
mask = cleaned_set['BsmtFinType2'].isna()
cols = ('Id','BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF')

cleaned_set.loc[mask, cols]

Unnamed: 0,Id,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF
332,333,Gd,TA,No,GLQ,,1124.0,479.0,1603.0,3206.0


#### Handling Garage

Upon further inspection in the dataset, rows with NA *GarageType* have NA values for the rest of *GarageColumns*

It would be suitable to create a new column of binary value to indicate whether the row has a garage

In [8]:
# Inspecting rows with NA GarageType
mask = cleaned_set['GarageType'].isna()
cols = ('GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond')
cleaned_set.loc[mask, cols]

Unnamed: 0,GarageType,GarageYrBlt,GarageFinish,GarageQual,GarageCond
39,,,,,
48,,,,,
78,,,,,
88,,,,,
89,,,,,
...,...,...,...,...,...
2893,,,,,
2909,,,,,
2913,,,,,
2914,,,,,


In [9]:
# Inspecting rows with NA YrBlt
mask = cleaned_set['GarageYrBlt'].isna()
cols = ('GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'GarageCars', 'GarageArea')
cleaned_set.loc[mask, cols]

Unnamed: 0,GarageType,GarageYrBlt,GarageFinish,GarageQual,GarageCond,GarageCars,GarageArea
39,,,,,,0.0,0.0
48,,,,,,0.0,0.0
78,,,,,,0.0,0.0
88,,,,,,0.0,0.0
89,,,,,,0.0,0.0
...,...,...,...,...,...,...,...
2893,,,,,,0.0,0.0
2909,,,,,,0.0,0.0
2913,,,,,,0.0,0.0
2914,,,,,,0.0,0.0


In [10]:
# Creating a indicator column
cleaned_set = cleaned_set.assign(GarageIncluded=training_set['GarageType'].notna())

# For NA Garage Type

#   Replaceing NA with an arbitrary value
mask = cleaned_set['GarageType'].isna()

#   Handling categorical features
cat_cols = ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond')
cleaned_set.loc[mask, cat_cols] = 'NA'

#   Handling numerical feature
num_cols = ('GarageYrBlt', 'GarageCars', 'GarageArea')
cleaned_set.loc[mask, num_cols] = 0

# For NA GarageYrBlt

#   Replaceing NA with an arbitrary value
mask = cleaned_set['GarageYrBlt'].isna()

#   Handling categorical features
cat_cols = ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond')
cleaned_set.loc[mask, cat_cols] = 'NA'

#   Handling numerical feature
num_cols = ('GarageYrBlt', 'GarageCars', 'GarageArea')
cleaned_set.loc[mask, num_cols] = 0

display_missing_values(cleaned_set)

                Missing_Values
MSZoning                     4
LotFrontage                486
Alley                     2721
Utilities                    2
Exterior1st                  1
Exterior2nd                  1
MasVnrType                1766
MasVnrArea                  23
BsmtCond                     3
BsmtExposure                 3
BsmtFinSF1                   1
BsmtFinType2                 1
BsmtFinSF2                   1
BsmtUnfSF                    1
TotalBsmtSF                  1
Electrical                   1
BsmtFullBath                 2
BsmtHalfBath                 2
KitchenQual                  1
Functional                   2
FireplaceQu               1420
PoolQC                    2909
Fence                     2348
MiscFeature               2814
SaleType                     1
SalePrice                 1459
BsmtIncluded              1459
GarageIncluded            1459


#### Handling Alley

It is observed that *Alley* is a nominal feature. NAs in *Alley* are replaced with *'No_Alley'* for now, which will be further processed and encoded.

In [11]:
cleaned_set['Alley'].unique()

array([nan, 'Grvl', 'Pave'], dtype=object)

In [12]:
# Creating a indicator column
cleaned_set = cleaned_set.assign(AlleyIncluded=training_set['Alley'].notna())

mask = cleaned_set['Alley'].isna()
cleaned_set.loc[mask, 'Alley'] = 'No_Alley'

display_missing_values(cleaned_set)

              Missing_Values
MSZoning                   4
LotFrontage              486
Utilities                  2
Exterior1st                1
Exterior2nd                1
MasVnrType              1766
MasVnrArea                23
BsmtCond                   3
BsmtExposure               3
BsmtFinSF1                 1
BsmtFinType2               1
BsmtFinSF2                 1
BsmtUnfSF                  1
TotalBsmtSF                1
Electrical                 1
BsmtFullBath               2
BsmtHalfBath               2
KitchenQual                1
Functional                 2
FireplaceQu             1420
GarageYrBlt                2
GarageFinish               2
GarageCars                 1
GarageArea                 1
GarageQual                 2
GarageCond                 2
PoolQC                  2909
Fence                   2348
MiscFeature             2814
SaleType                   1
SalePrice               1459


#### Handling MasVnrType

Again, *MasVnrType* is a nominal feature, NAs are replaced with *'No_MasVnrType'* for further encoding.

In [13]:
cleaned_set['MasVnrType'].unique()

array(['BrkFace', nan, 'Stone', 'BrkCmn'], dtype=object)

In [14]:
# Creating a indicator column
cleaned_set = cleaned_set.assign(MasVnrIncluded=training_set['MasVnrType'].notna())

mask = cleaned_set['MasVnrType'].isna()
cleaned_set.loc[mask, 'MasVnrType'] = 'No_MasVnrType'

display_missing_values(cleaned_set)

              Missing_Values
MSZoning                   4
LotFrontage              486
Utilities                  2
Exterior1st                1
Exterior2nd                1
MasVnrArea                23
BsmtCond                   3
BsmtExposure               3
BsmtFinSF1                 1
BsmtFinType2               1
BsmtFinSF2                 1
BsmtUnfSF                  1
TotalBsmtSF                1
Electrical                 1
BsmtFullBath               2
BsmtHalfBath               2
KitchenQual                1
Functional                 2
FireplaceQu             1420
GarageYrBlt                2
GarageFinish               2
GarageCars                 1
GarageArea                 1
GarageQual                 2
GarageCond                 2
PoolQC                  2909
Fence                   2348
MiscFeature             2814
SaleType                   1
SalePrice               1459


#### Handling MasVnrArea

In [15]:
mask = cleaned_set['MasVnrArea'].isna()
cleaned_set.loc[mask, 'MasVnrArea'] = 0
display_missing_values(cleaned_set)

              Missing_Values
MSZoning                   4
LotFrontage              486
Utilities                  2
Exterior1st                1
Exterior2nd                1
BsmtCond                   3
BsmtExposure               3
BsmtFinSF1                 1
BsmtFinType2               1
BsmtFinSF2                 1
BsmtUnfSF                  1
TotalBsmtSF                1
Electrical                 1
BsmtFullBath               2
BsmtHalfBath               2
KitchenQual                1
Functional                 2
FireplaceQu             1420
GarageYrBlt                2
GarageFinish               2
GarageCars                 1
GarageArea                 1
GarageQual                 2
GarageCond                 2
PoolQC                  2909
Fence                   2348
MiscFeature             2814
SaleType                   1
SalePrice               1459


#### Handling FireplaceQu

Same approach for *FireplaceQu*

In [16]:
cleaned_set['FireplaceQu'].unique()

array([nan, 'TA', 'Gd', 'Fa', 'Ex', 'Po'], dtype=object)

In [17]:
# Creating a indicator column
cleaned_set = cleaned_set.assign(FireplaceIncluded=training_set['FireplaceQu'].notna())

mask = cleaned_set['FireplaceQu'].isna()
cleaned_set.loc[mask, 'FireplaceQu'] = 'No_FireplaceQu'

display_missing_values(cleaned_set)

              Missing_Values
MSZoning                   4
LotFrontage              486
Utilities                  2
Exterior1st                1
Exterior2nd                1
BsmtCond                   3
BsmtExposure               3
BsmtFinSF1                 1
BsmtFinType2               1
BsmtFinSF2                 1
BsmtUnfSF                  1
TotalBsmtSF                1
Electrical                 1
BsmtFullBath               2
BsmtHalfBath               2
KitchenQual                1
Functional                 2
GarageYrBlt                2
GarageFinish               2
GarageCars                 1
GarageArea                 1
GarageQual                 2
GarageCond                 2
PoolQC                  2909
Fence                   2348
MiscFeature             2814
SaleType                   1
SalePrice               1459


#### Handling PoolQC

Same approach for *PoolQC*

In [18]:
cleaned_set['PoolQC'].unique()

array([nan, 'Ex', 'Fa', 'Gd'], dtype=object)

In [19]:
# Creating a indicator column
cleaned_set = cleaned_set.assign(PoolIncluded=training_set['PoolQC'].notna())

mask = cleaned_set['PoolQC'].isna()
cleaned_set.loc[mask, 'PoolQC'] = 'No_PoolQC'

display_missing_values(cleaned_set)

              Missing_Values
MSZoning                   4
LotFrontage              486
Utilities                  2
Exterior1st                1
Exterior2nd                1
BsmtCond                   3
BsmtExposure               3
BsmtFinSF1                 1
BsmtFinType2               1
BsmtFinSF2                 1
BsmtUnfSF                  1
TotalBsmtSF                1
Electrical                 1
BsmtFullBath               2
BsmtHalfBath               2
KitchenQual                1
Functional                 2
GarageYrBlt                2
GarageFinish               2
GarageCars                 1
GarageArea                 1
GarageQual                 2
GarageCond                 2
Fence                   2348
MiscFeature             2814
SaleType                   1
SalePrice               1459


#### Handling Fence

Same approach for *Fence*

In [20]:
cleaned_set['Fence'].unique()

array([nan, 'MnPrv', 'GdWo', 'GdPrv', 'MnWw'], dtype=object)

In [21]:
# Creating a indicator column
cleaned_set = cleaned_set.assign(FenceIncluded=training_set['Fence'].notna())

mask = cleaned_set['Fence'].isna()
cleaned_set.loc[mask, 'Fence'] = 'No_Fence'

display_missing_values(cleaned_set)

              Missing_Values
MSZoning                   4
LotFrontage              486
Utilities                  2
Exterior1st                1
Exterior2nd                1
BsmtCond                   3
BsmtExposure               3
BsmtFinSF1                 1
BsmtFinType2               1
BsmtFinSF2                 1
BsmtUnfSF                  1
TotalBsmtSF                1
Electrical                 1
BsmtFullBath               2
BsmtHalfBath               2
KitchenQual                1
Functional                 2
GarageYrBlt                2
GarageFinish               2
GarageCars                 1
GarageArea                 1
GarageQual                 2
GarageCond                 2
MiscFeature             2814
SaleType                   1
SalePrice               1459


#### Handling MiscFeature

Same approach for *MiscFeature*

In [22]:
cleaned_set['MiscFeature'].unique()

array([nan, 'Shed', 'Gar2', 'Othr', 'TenC'], dtype=object)

In [23]:
# Creating a indicator column
cleaned_set = cleaned_set.assign(MiscFeatureIncluded=training_set['MiscFeature'].notna())

mask = cleaned_set['MiscFeature'].isna()
cleaned_set.loc[mask, 'MiscFeature'] = str('NA')

display_missing_values(cleaned_set)

              Missing_Values
MSZoning                   4
LotFrontage              486
Utilities                  2
Exterior1st                1
Exterior2nd                1
BsmtCond                   3
BsmtExposure               3
BsmtFinSF1                 1
BsmtFinType2               1
BsmtFinSF2                 1
BsmtUnfSF                  1
TotalBsmtSF                1
Electrical                 1
BsmtFullBath               2
BsmtHalfBath               2
KitchenQual                1
Functional                 2
GarageYrBlt                2
GarageFinish               2
GarageCars                 1
GarageArea                 1
GarageQual                 2
GarageCond                 2
SaleType                   1
SalePrice               1459


#### Handling LotFrontage

Since not all housing properties have lot frontage, such as apartments, it would be better to preserve the implication in the original dataset, instead of making up an approximate value.

In [52]:
mask = cleaned_set['LotFrontage'].isna()
cleaned_set.loc[mask, 'LotFrontage'] = 0

display_missing_values(cleaned_set)

              Missing_Values
MSZoning                   4
Utilities                  2
Exterior1st                1
Exterior2nd                1
BsmtCond                   3
BsmtExposure               3
BsmtFinSF1                 1
BsmtFinType2               1
BsmtFinSF2                 1
BsmtUnfSF                  1
TotalBsmtSF                1
Electrical                 1
BsmtFullBath               2
BsmtHalfBath               2
KitchenQual                1
Functional                 2
GarageYrBlt                2
GarageFinish               2
GarageCars                 1
GarageArea                 1
GarageQual                 2
GarageCond                 2
SaleType                   1
SalePrice               1459


#### Handling MSZoning

In [51]:
mask = cleaned_set['MSZoning'].isna()
print(f'{cleaned_set.loc[mask, 'Id']}')

455     1916
756     2217
790     2251
1444    2905
Name: Id, dtype: int64


In [33]:
# mask = cleaned_set['MSZoning'].isna()

# # Seperating training and testing to preserve index
# training = copy.deepcopy(cleaned_set.iloc[0:training_len])
# mask = training['MSZoning'].isna()
# training.drop(training.loc[mask].index, inplace=True)
# training_len = len(training)    # Update new training set length 

# # Combining two sets 
# testing = cleaned_set.iloc[training_len:len(cleaned_set)] 
# cleaned_set = pd.concat([training, testing])

# print(f'Training set length: {training_len}; Total length: {len(cleaned_set)}')

All Belong to training set? Index([455, 756, 790, 1444], dtype='int64')
Training set length: 1460; Total length: 2919


### Standardising numerical features

In [None]:
# Getting numerical columns
num_features = cleaned_set.select_dtypes(include=['number']).columns.drop('Id')
num_features

#### Plotting distributions of numerical features

In [None]:
def plot_dfs(features):
    # Setting up subplots
    plot_per_row = 4
    rows = len(features) // plot_per_row
    fig, axes = plt.subplots(rows+1, plot_per_row, figsize=(20, 6*rows))
    axes = axes.flatten()

    # Plotting each subplot
    for i, feature in enumerate(features):
        ax = axes[i]
        ax.hist(cleaned_set[feature], bins=len(cleaned_set[feature].unique()))
        ax.set_title(f'Distribution of {feature}')
        ax.set_xlabel(feature)
        ax.set_ylabel('Frequency')

    # Turning off unused subplot
    for j in range(i+1, len(axes)):
        axes[j].axis('off')

plot_dfs(num_features)

#### Standardising suitable features

Obsered in the plot of each feature's distribution, and it seems the following features should be standardised:
*LotArea*, *BsmtFinSF1*, *TotalBsmtSF*, *1stFlrSF*, *GrLivArea*

In [None]:
standardised_features = ['LotArea', 'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea']

cleaned_set.loc[:, standardised_features] = StandardScaler().fit_transform(cleaned_set[standardised_features])
testing_set.loc[:, standardised_features] = StandardScaler().fit_transform(testing_set[standardised_features])

plot_dfs(standardised_features)

#### Standardising target

In [None]:
cleaned_set.loc[:, 'SalePrice'] = np.log1p(cleaned_set[['SalePrice']])
plt.plot(cleaned_set.loc[:, 'SalePrice'])

### Encoding categorical features

In [None]:
# Getting categorical columns
cat_cols = cleaned_set.select_dtypes(include=['object']).columns
cat_cols

#### Encoding categorical columns using OrdinalEncoder

It is noted that not all categorical columns are ordinal features.

Despite the added implication, it doesn't really matter that nominal values are treated as such.

In [None]:
ord_enc = OrdinalEncoder()
ord_enc.fit(cleaned_set.loc[:, cat_cols])
encoded_cat = ord_enc.transform(cleaned_set.loc[:, cat_cols])
encoded_cat

In [None]:
# Decoding categorical columns
ord_enc.inverse_transform(encoded_cat)

#### Subtituting encoded categorical features into the dataset

In [None]:
cleaned_set.loc[:, cat_cols] = encoded_cat
cleaned_set.loc[:, cat_cols]

In [None]:
testing_set.loc[:, cat_cols] = ord_enc.transform(testing_set.loc[:, cat_cols])
testing_set.loc[:, cat_cols]

## Building model

In [None]:
max_depth = 10
random_state = 0
regr = RandomForestRegressor(max_depth=max_depth, random_state=random_state)
regr = regr.fit(X=cleaned_set.drop(['Id', 'SalePrice'], axis=1), y=cleaned_set.loc[:, 'SalePrice'])
regr.score(X=cleaned_set.drop(['Id', 'SalePrice'], axis=1), y=cleaned_set.loc[:, 'SalePrice'])

## Converting testing set

In [None]:
y_pred = np.expm1(regr.predict(testing_set.drop('Id', axis=1)))
y_true = pd.read_csv('../dataset/sample_submission.csv').drop('Id', axis=1)

mean_squared_error(y_true, y_pred)