# Analysing whether the garage is important to the SalePrice of a house

## Gradient Boosting with CatBoost
### Pre-process data

In [85]:
#Import required libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [86]:
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from math import sqrt

In [87]:
from catboost import CatBoostRegressor

In [88]:
# Path to the garage CSV. Set the GARAGE_CSV environment variable to point the notebook at
# the file on another machine; when it is unset, the original local path is used unchanged.
data = os.environ.get("GARAGE_CSV", r'/Users/OliverPan/Desktop/house/garage.csv')

In [89]:
# Load the CSV located at `data` into a DataFrame.
garage_df = pd.read_csv(filepath_or_buffer=data)

In [90]:
# Preview the first five rows of the loaded frame.
garage_df.head(n=5)

Unnamed: 0,SalePrice,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea
0,208500,Attchd,2003.0,RFn,2,548
1,181500,Attchd,1976.0,RFn,2,460
2,223500,Attchd,2001.0,RFn,2,608
3,140000,Detchd,1998.0,Unf,3,642
4,250000,Attchd,2000.0,RFn,3,836


In [91]:
# Keep only houses whose GarageCars value is not 4
# (presumably 4-car garages are treated as outliers — TODO confirm).
# .copy() detaches the filtered frame from the one it was sliced from, so later column
# assignments on it cannot trigger pandas' SettingWithCopyWarning.
garage_df = garage_df[garage_df["GarageCars"] != 4].copy()

### Additional pre-processing

In [92]:
# Drop incomplete rows BEFORE encoding: `.cat.codes` maps a missing value to -1, so encoding
# first would turn NaN into an ordinary-looking category that dropna() can no longer see.
garage_df = garage_df.dropna()

# Replace the three categorical garage columns with integer category codes.
# The codes are derived from the categories present after the incomplete rows were removed.
# assign() builds a new frame instead of writing into a possibly sliced one.
garage_df = garage_df.assign(
    GarageType=garage_df["GarageType"].astype("category").cat.codes,
    GarageFinish=garage_df["GarageFinish"].astype("category").cat.codes,
    GarageCars=garage_df["GarageCars"].astype("category").cat.codes,
)

### CatBoost with all variables

In [93]:
# Target: SalePrice, kept as a one-column DataFrame.
y = garage_df.loc[:, ["SalePrice"]]
# Features: every remaining column.
X = garage_df.drop(columns=["SalePrice"])

In [94]:
# Names of the columns handed to CatBoost as categorical features.
category = [
    "GarageType",
    "GarageFinish",
    "GarageCars",
]

In [95]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [96]:
# Small gradient-boosted regressor (50 trees of depth 3) optimising RMSE.
# verbose=10 prints the learning curve every 10th iteration instead of on every iteration,
# keeping the downward trend visible without flooding the cell output.
cat_boost = CatBoostRegressor(
    iterations=50,
    depth=3,
    learning_rate=0.1,
    loss_function='RMSE',
    verbose=10,
)
# `category` lists the columns CatBoost must treat as categorical rather than numeric.
# The trailing semicolon stops the notebook from echoing the estimator repr that fit() returns.
cat_boost.fit(X_train, y_train, cat_features=category);

0:	learn: 78580.2497440	total: 719us	remaining: 35.3ms
1:	learn: 75729.6640953	total: 1.52ms	remaining: 36.6ms
2:	learn: 72818.2791896	total: 3.78ms	remaining: 59.3ms
3:	learn: 70572.9639516	total: 6.18ms	remaining: 71.1ms
4:	learn: 68773.3798415	total: 7.59ms	remaining: 68.3ms
5:	learn: 66973.6782170	total: 8.22ms	remaining: 60.3ms
6:	learn: 65188.2850673	total: 8.83ms	remaining: 54.2ms
7:	learn: 63691.8740222	total: 9.39ms	remaining: 49.3ms
8:	learn: 62379.6832947	total: 10.4ms	remaining: 47.3ms
9:	learn: 60705.9750250	total: 11.2ms	remaining: 45ms
10:	learn: 59519.2239829	total: 11.9ms	remaining: 42.1ms
11:	learn: 58294.6395396	total: 12.4ms	remaining: 39.3ms
12:	learn: 57257.5609077	total: 13ms	remaining: 37.1ms
13:	learn: 56521.0564712	total: 13.6ms	remaining: 35ms
14:	learn: 55887.3863978	total: 14.2ms	remaining: 33.1ms
15:	learn: 55321.0488998	total: 14.8ms	remaining: 31.4ms
16:	learn: 54810.5783296	total: 15.3ms	remaining: 29.8ms
17:	learn: 54436.7225397	total: 15.9ms	remaining

<catboost.core.CatBoostRegressor at 0x1a1a488d50>

In [97]:
# Predict the target for the held-out test rows.
y_pred = cat_boost.predict(data=X_test)

In [98]:
# Side-by-side table of true and predicted sale prices for the test rows.
df = pd.DataFrame(
    {
        'Actual': y_test["SalePrice"].tolist(),
        'Predicted': y_pred.ravel().tolist(),
    }
)
df.head(n=5)

Unnamed: 0,Actual,Predicted
0,161000,122720.45682
1,153000,136285.985763
2,270000,292789.761674
3,274900,367430.190476
4,151000,157422.324563


In [99]:
## Mean absolute error on the held-out test set (original note: "Error is ok, slowly lowering")
mae = metrics.mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error:', mae)

Mean Absolute Error: 29663.523250014856


In [100]:
## Root-mean-squared error on the test set; squaring penalises large misses more than MAE does
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
rmse

43527.27728829656

In [102]:
## Normalised RMSE: test-set RMSE divided by the range (max - min) of the test-set sale prices
test_prices = y_test["SalePrice"]
price_range = test_prices.max() - test_prices.min()
sqrt(mean_squared_error(y_test, y_pred)) / price_range

0.08874062647970755