<img style="float: right;" src="img/openhouse_logo.png" width="200" height="200"/><br>

# <center> <ins> House Price Prediction Coding Test <br><br> 4. Feature Selection, Encoding, Scaling </ins> </center>
### <center>by: Daniel Lachner-Piza, PhD <br> for: OpenHouse.ai </center>




In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import joblib
import openhouse.etl.data_loader as dl
import openhouse.eda.eda_engine as eda

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from pathlib import Path

# Notebook-wide settings.
# EDA_FIG_SIZE is a (width, height) tuple; it is not referenced in the cells shown here.
EDA_FIG_SIZE = (3, 3)

# Folder that receives the fitted StandardScaler; created up front if it does not exist yet.
models_path = Path("models")
models_path.mkdir(parents=True, exist_ok=True)

# <ins> 1. Data Loading </ins>

In [2]:
# Import the dataset using the DataLoader class, which wraps several file types into one single data reader.
# The resulting clean_df is the single source for every feature and target selection made below.
data_loader = dl.DataLoader("data/clean_dataset.csv")
clean_df = data_loader.load_data()

# <ins> 2. Feature selection </ins>
Using the data exploration from "03.Explore_Clean_Data", we select which features to keep according to two criteria:
- Correlation with the target
- Low correlation with other features

## 2.1. Numerical Features

- All the features show a correlation with the target
- The inter-feature correlation is not too high

Therefore, all numerical features are kept

In [3]:
# Continuous numerical features kept for modelling (all of them, per section 2.1).
contin_num_cols = [
    'LotArea',
    'GrLivArea',
    'YearBuilt',
    'TotalBsmtSF',
    'GarageArea',
]

## 2.2. Ordinal Features

- The YearSold feature doesn't show any correlation with the target, so it is removed
- All other ordinal features are kept

In [4]:
# Ordinal features kept for modelling; these are rank-encoded in section 3.1.
ordinal_cols = [
    'OverallQuality',
    'OverallCondition',
    'FullBath',
    'HalfBath',
    'GarageCars',
]

## 2.3. Categorical Features

- All the features show a correlation with the target

Therefore, all categorical features are kept

In [5]:
# Categorical features kept for modelling; these are one-hot encoded in section 3.2.
categorical_columns = [
    'Street',
    'LotType',
    'BldgType',
    'HouseStyle',
    'Foundation',
    'CentralAir',
    'GarageType',
    'SaleType',
    'SaleCondition',
]

# <ins> 3. Feature Encoding </ins>

In [6]:
# Slice the clean table into one independent frame per feature family, plus the target.
# Every slice is copied, so later encoding steps never write back into clean_df.
df_numerical = clean_df.loc[:, contin_num_cols].copy()
df_ordinal = clean_df.loc[:, ordinal_cols].copy()
df_categorical = clean_df.loc[:, categorical_columns].copy()

# Selecting with a one-element list keeps the target as a one-column DataFrame, not a Series.
y_df = clean_df.loc[:, ['SalePrice']].copy()

## 3.1. Ordinal Encoding

In [7]:
# Category order for each ordinal column: its unique values sorted ascending,
# so a larger original value always maps to a larger integer code (0, 1, 2, ...).
ordinal_categories = [
    np.sort(df_ordinal[col_name].unique()) for col_name in df_ordinal.columns
]

# One encoder handles all ordinal columns at once (categories[i] applies to column i).
# Keeping the fitted encoder in `enc` lets the same mapping be applied to other rows later;
# values not seen during fitting are then encoded as -1.
enc = OrdinalEncoder(
    categories=ordinal_categories,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)

# fit_transform returns a float array; cast to int and restore column labels and row index.
encoded_ordinal_df = pd.DataFrame(
    enc.fit_transform(df_ordinal).astype(int),
    columns=df_ordinal.columns,
    index=df_ordinal.index,
)

## 3.2 Categorical Encoding

In [8]:
# One-hot encode every categorical column; drop_first=False keeps one indicator column per category level.
encoded_categorical_df = pd.get_dummies(df_categorical, drop_first=False)

In [9]:
# Summarize the one-hot encoded table: column names, non-null counts and dtypes.
encoded_categorical_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1336 entries, 0 to 1335
Data columns (total 51 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   Street_Grvl            1336 non-null   bool 
 1   Street_Pave            1336 non-null   bool 
 2   LotType_Corner         1336 non-null   bool 
 3   LotType_CulDSac        1336 non-null   bool 
 4   LotType_FR2            1336 non-null   bool 
 5   LotType_FR3            1336 non-null   bool 
 6   LotType_Inside         1336 non-null   bool 
 7   BldgType_1Fam          1336 non-null   bool 
 8   BldgType_2fmCon        1336 non-null   bool 
 9   BldgType_Duplex        1336 non-null   bool 
 10  BldgType_Twnhs         1336 non-null   bool 
 11  BldgType_TwnhsE        1336 non-null   bool 
 12  HouseStyle_1.5Fin      1336 non-null   bool 
 13  HouseStyle_1.5Unf      1336 non-null   bool 
 14  HouseStyle_1Fam        1336 non-null   bool 
 15  HouseStyle_1Story      1336 non-null  

# <ins> 4. Join all column types </ins>

In [10]:
# Place the numerical, ordinal-encoded and one-hot-encoded frames side by side (aligned on the row index).
feature_frames = [df_numerical, encoded_ordinal_df, encoded_categorical_df]
all_features_df = pd.concat(feature_frames, axis="columns")

# Report the combined table size, then preview the first five rows.
print(f"Table Shape: {all_features_df.shape}")
all_features_df.head()

Table Shape: (1336, 61)


Unnamed: 0,LotArea,GrLivArea,YearBuilt,TotalBsmtSF,GarageArea,OverallQuality,OverallCondition,FullBath,HalfBath,GarageCars,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,8910,1194,1959,655,539.0,4,4,1,0,1,...,False,False,False,True,False,False,False,False,True,False
1,1526,630,1970,630,286.0,2,6,1,0,0,...,False,False,False,True,False,False,False,False,True,False
2,14598,1933,2007,894,668.0,4,3,2,1,2,...,False,False,False,True,False,False,False,False,True,False
3,7200,1040,1949,0,420.0,2,3,2,0,1,...,False,False,False,True,False,False,False,False,True,False
4,8100,1559,1948,1559,812.0,3,4,1,0,1,...,False,False,False,False,False,False,False,False,True,False


# <ins> 5. Standardize all features (z-score) </ins>

In [11]:
# Fit a z-score scaler on the complete feature table and transform it in the same pass.
# NOTE(review): the scaler statistics are computed from every row of all_features_df;
# if a train/test split happens downstream, confirm that fitting on all rows is intended.
feat_scaler = StandardScaler()
scaled_data = feat_scaler.fit_transform(all_features_df)

# Wrap the scaled array in a DataFrame that reuses the original column labels and row index
# (built directly, instead of copying the frame and overwriting it one column at a time).
clean_scaled_data_df = pd.DataFrame(
    scaled_data,
    columns=all_features_df.columns,
    index=all_features_df.index,
)

In [12]:
# Display the standardized feature table as the cell output.
clean_scaled_data_df

Unnamed: 0,LotArea,GrLivArea,YearBuilt,TotalBsmtSF,GarageArea,OverallQuality,OverallCondition,FullBath,HalfBath,GarageCars,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-0.174981,-0.357075,-0.469580,-0.954556,0.210489,-0.130274,0.389367,-1.056176,-0.785264,0.209788,...,-0.0548,-0.303967,-0.027369,0.388242,-0.257373,-0.027369,-0.091115,-0.123278,0.45722,-0.308354
1,-0.888561,-0.868571,-0.096071,-1.011578,-1.154649,-1.624231,2.230762,-1.056176,-0.785264,-1.382694,...,-0.0548,-0.303967,-0.027369,0.388242,-0.257373,-0.027369,-0.091115,-0.123278,0.45722,-0.308354
2,0.374700,0.313130,1.160276,-0.409426,0.906547,-0.130274,-0.531331,0.771610,1.205462,1.802270,...,-0.0548,-0.303967,-0.027369,0.388242,-0.257373,-0.027369,-0.091115,-0.123278,0.45722,-0.308354
3,-0.340233,-0.496739,-0.809133,-2.448532,-0.431612,-1.624231,-0.531331,0.771610,-0.785264,0.209788,...,-0.0548,-0.303967,-0.027369,0.388242,-0.257373,-0.027369,-0.091115,-0.123278,0.45722,-0.308354
4,-0.253258,-0.026053,-0.843088,1.107358,1.683542,-0.877252,0.389367,-1.056176,-0.785264,0.209788,...,-0.0548,-0.303967,-0.027369,-2.575711,-0.257373,-0.027369,-0.091115,-0.123278,0.45722,-0.308354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1331,-0.456199,-0.569291,-1.793837,-0.258888,-1.035941,-1.624231,-1.452028,-1.056176,-0.785264,-1.382694,...,-0.0548,-0.303967,-0.027369,0.388242,-0.257373,-0.027369,-0.091115,-0.123278,0.45722,-0.308354
1332,0.138129,0.187977,-0.639356,-0.053609,-0.798526,-0.877252,-0.531331,0.771610,-0.785264,-1.382694,...,-0.0548,-0.303967,-0.027369,0.388242,-0.257373,-0.027369,-0.091115,-0.123278,0.45722,-0.308354
1333,-0.050317,-0.455021,-0.639356,0.028502,-0.053905,-0.877252,1.310064,-1.056176,-0.785264,0.209788,...,-0.0548,-0.303967,-0.027369,0.388242,-0.257373,-0.027369,-0.091115,-0.123278,0.45722,-0.308354
1334,-0.224267,-0.677213,-0.775177,-0.804018,-1.111482,-1.624231,-1.452028,-1.056176,-0.785264,-1.382694,...,-0.0548,-0.303967,-0.027369,0.388242,-0.257373,-0.027369,-0.091115,-0.123278,0.45722,-0.308354


## 5.1 Save clean and scaled table

In [13]:
# Write the standardized features to disk; index=False leaves the row index out of the CSV.
clean_scaled_data_df.to_csv("data/clean_scaled_dataset.csv", index=False)

## 5.2 Save target dataframe

In [14]:
# Write the SalePrice target column to its own CSV; index=False leaves the row index out of the file.
y_df.to_csv("data/target.csv", index=False)

## 5.3 Save StandardScaler

In [16]:
# Serialize the fitted StandardScaler with joblib (compression enabled) into the models folder.
std_scaler_fpath = models_path / "StandardScaler.bin"
joblib.dump(feat_scaler, std_scaler_fpath, compress=True)

['models/StandardScaler.bin']