# Analysing whether garage features are important to the SalePrice of a house

## Gradient Boosting with CatBoost
### Pre-process data

In [22]:
#Import required libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [23]:
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from math import sqrt

In [24]:
from catboost import CatBoostRegressor, Pool

In [25]:
# Location of the garage-features CSV. Set the GARAGE_CSV environment variable to
# run this notebook on another machine; the original author's path is the fallback,
# so behaviour is unchanged when the variable is not set.
data = os.environ.get("GARAGE_CSV", r'/Users/OliverPan/Desktop/house/garage.csv')

In [26]:
# Load the raw garage table from the CSV path configured in `data`.
garage_df = pd.read_csv(filepath_or_buffer=data)

In [27]:
# Preview the first five rows to sanity-check the columns that were loaded.
garage_df.head(n=5)

Unnamed: 0,SalePrice,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea
0,208500,Attchd,2003.0,RFn,2,548
1,181500,Attchd,1976.0,RFn,2,460
2,223500,Attchd,2001.0,RFn,2,608
3,140000,Detchd,1998.0,Unf,3,642
4,250000,Attchd,2000.0,RFn,3,836


In [28]:
# Exclude houses recorded with 4 garage cars.
# NOTE(review): presumably these are treated as outliers — confirm the rationale.
# .copy() makes the result an independent frame, so the column assignments in the
# following pre-processing cell do not raise pandas' SettingWithCopyWarning.
garage_df = garage_df[garage_df["GarageCars"] != 4].copy()

### Additional pre-processing

In [29]:
# Remove incomplete rows first: `.cat.codes` encodes a missing category as -1,
# so calling dropna() after encoding would silently keep rows whose GarageType
# or GarageFinish was missing.
garage_df = garage_df.dropna()

# Integer-encode the columns that are later passed to CatBoost as categorical
# features. assign() builds a new frame instead of writing into a filtered
# slice, which avoids SettingWithCopyWarning.
garage_df = garage_df.assign(
    GarageType=garage_df["GarageType"].astype("category").cat.codes,
    GarageFinish=garage_df["GarageFinish"].astype("category").cat.codes,
    GarageCars=garage_df["GarageCars"].astype("category").cat.codes,
)

### CatBoost with all variables

In [30]:
# Target: SalePrice, kept as a one-column DataFrame.
y = garage_df[["SalePrice"]]
# Features: every remaining column of the frame.
X = garage_df.drop(columns=["SalePrice"])

In [31]:
# Columns handed to CatBoost as categorical features.
category = [
    "GarageType",
    "GarageFinish",
    "GarageCars",
]

In [32]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [33]:
# Gradient-boosted regressor: depth-3 trees, learning rate 0.1, RMSE objective.
cat_boost = CatBoostRegressor(
    loss_function='RMSE',
    learning_rate=0.1,
    depth=3,
)
# Fit on the training split, declaring which columns are categorical.
cat_boost.fit(X_train, y_train, cat_features=category)

0:	learn: 78093.7258778	total: 7.13ms	remaining: 7.12s
1:	learn: 74760.2330205	total: 8.78ms	remaining: 4.38s
2:	learn: 71624.0899923	total: 10.2ms	remaining: 3.39s
3:	learn: 69002.8570222	total: 11.7ms	remaining: 2.9s
4:	learn: 66476.3949341	total: 13ms	remaining: 2.59s
5:	learn: 64449.6420161	total: 14.3ms	remaining: 2.36s
6:	learn: 62859.3627151	total: 15.9ms	remaining: 2.25s
7:	learn: 61518.5340504	total: 17.2ms	remaining: 2.14s
8:	learn: 60360.9845521	total: 18.9ms	remaining: 2.08s
9:	learn: 59355.8390198	total: 19.8ms	remaining: 1.96s
10:	learn: 58313.4118873	total: 21.6ms	remaining: 1.94s
11:	learn: 57586.3595548	total: 22.9ms	remaining: 1.89s
12:	learn: 56853.7498459	total: 24.9ms	remaining: 1.89s
13:	learn: 56105.0592803	total: 26.7ms	remaining: 1.88s
14:	learn: 55669.9249747	total: 27.9ms	remaining: 1.83s
15:	learn: 55262.1673098	total: 33.5ms	remaining: 2.06s
16:	learn: 54749.9788291	total: 34.8ms	remaining: 2.01s
17:	learn: 54095.0015052	total: 36.3ms	remaining: 1.98s
18:	l

276:	learn: 41317.2168059	total: 536ms	remaining: 1.4s
277:	learn: 41236.8591809	total: 537ms	remaining: 1.4s
278:	learn: 41236.0872343	total: 538ms	remaining: 1.39s
279:	learn: 41193.7505662	total: 539ms	remaining: 1.39s
280:	learn: 41171.2552141	total: 540ms	remaining: 1.38s
281:	learn: 41135.9438009	total: 541ms	remaining: 1.38s
282:	learn: 41134.5802319	total: 541ms	remaining: 1.37s
283:	learn: 41118.7052835	total: 543ms	remaining: 1.37s
284:	learn: 41082.6108039	total: 544ms	remaining: 1.36s
285:	learn: 41069.0965303	total: 545ms	remaining: 1.36s
286:	learn: 41064.6362820	total: 545ms	remaining: 1.35s
287:	learn: 41062.0761852	total: 546ms	remaining: 1.35s
288:	learn: 41046.1453780	total: 547ms	remaining: 1.34s
289:	learn: 41035.0041689	total: 548ms	remaining: 1.34s
290:	learn: 40990.3364100	total: 551ms	remaining: 1.34s
291:	learn: 40976.6065373	total: 552ms	remaining: 1.34s
292:	learn: 40950.7542831	total: 552ms	remaining: 1.33s
293:	learn: 40930.6860172	total: 553ms	remaining: 

453:	learn: 38767.8305190	total: 707ms	remaining: 850ms
454:	learn: 38758.8596719	total: 708ms	remaining: 848ms
455:	learn: 38748.7441105	total: 709ms	remaining: 846ms
456:	learn: 38748.6964088	total: 710ms	remaining: 843ms
457:	learn: 38721.5691972	total: 711ms	remaining: 841ms
458:	learn: 38703.0955159	total: 712ms	remaining: 839ms
459:	learn: 38692.7431232	total: 712ms	remaining: 836ms
460:	learn: 38689.1141605	total: 713ms	remaining: 834ms
461:	learn: 38686.0638187	total: 714ms	remaining: 832ms
462:	learn: 38683.6444435	total: 715ms	remaining: 830ms
463:	learn: 38677.3354826	total: 716ms	remaining: 827ms
464:	learn: 38667.1543141	total: 717ms	remaining: 825ms
465:	learn: 38650.6869752	total: 720ms	remaining: 825ms
466:	learn: 38638.6678315	total: 721ms	remaining: 823ms
467:	learn: 38626.5522699	total: 722ms	remaining: 820ms
468:	learn: 38612.6980746	total: 723ms	remaining: 818ms
469:	learn: 38610.2616372	total: 723ms	remaining: 816ms
470:	learn: 38574.2377606	total: 725ms	remaining

696:	learn: 36429.6532486	total: 1.04s	remaining: 453ms
697:	learn: 36425.6131638	total: 1.04s	remaining: 452ms
698:	learn: 36424.8309704	total: 1.04s	remaining: 450ms
699:	learn: 36414.9453151	total: 1.05s	remaining: 448ms
700:	learn: 36407.4040182	total: 1.05s	remaining: 447ms
701:	learn: 36406.7091583	total: 1.05s	remaining: 445ms
702:	learn: 36402.0467744	total: 1.05s	remaining: 443ms
703:	learn: 36397.3627134	total: 1.05s	remaining: 441ms
704:	learn: 36395.3633834	total: 1.05s	remaining: 439ms
705:	learn: 36388.7043235	total: 1.05s	remaining: 438ms
706:	learn: 36383.0806734	total: 1.05s	remaining: 436ms
707:	learn: 36381.4100803	total: 1.05s	remaining: 434ms
708:	learn: 36375.4612716	total: 1.05s	remaining: 432ms
709:	learn: 36373.0461051	total: 1.05s	remaining: 431ms
710:	learn: 36352.1697771	total: 1.05s	remaining: 429ms
711:	learn: 36347.9709542	total: 1.06s	remaining: 427ms
712:	learn: 36340.9967771	total: 1.06s	remaining: 425ms
713:	learn: 36332.7103750	total: 1.06s	remaining

857:	learn: 35278.2026927	total: 1.21s	remaining: 201ms
858:	learn: 35274.5440846	total: 1.21s	remaining: 199ms
859:	learn: 35269.7574803	total: 1.22s	remaining: 198ms
860:	learn: 35257.7782588	total: 1.22s	remaining: 196ms
861:	learn: 35257.3200098	total: 1.22s	remaining: 195ms
862:	learn: 35253.8371819	total: 1.22s	remaining: 193ms
863:	learn: 35251.7466707	total: 1.22s	remaining: 192ms
864:	learn: 35248.9606576	total: 1.22s	remaining: 191ms
865:	learn: 35245.4890993	total: 1.22s	remaining: 189ms
866:	learn: 35241.1493445	total: 1.22s	remaining: 188ms
867:	learn: 35240.1658353	total: 1.23s	remaining: 186ms
868:	learn: 35237.7075866	total: 1.23s	remaining: 185ms
869:	learn: 35235.7740207	total: 1.23s	remaining: 183ms
870:	learn: 35231.9696779	total: 1.23s	remaining: 182ms
871:	learn: 35227.5463645	total: 1.23s	remaining: 180ms
872:	learn: 35221.7340867	total: 1.23s	remaining: 179ms
873:	learn: 35220.1806468	total: 1.23s	remaining: 177ms
874:	learn: 35218.4793768	total: 1.23s	remaining

<catboost.core.CatBoostRegressor at 0x1a1da61910>

In [34]:
# Predicted sale prices for the held-out test rows.
y_pred = cat_boost.predict(data=X_test)

In [35]:
# Put actual and predicted prices side by side for a quick visual comparison.
actual_prices = y_test["SalePrice"].to_list()
predicted_prices = list(y_pred.flatten())
df = pd.DataFrame({'Actual': actual_prices, 'Predicted': predicted_prices})
df.head()

Unnamed: 0,Actual,Predicted
0,161000,137953.434636
1,153000,148108.9935
2,270000,261879.631429
3,274900,378160.44381
4,151000,166294.89982


In [36]:
# Mean absolute error on the test split. Author's note: the error is acceptable
# and has been slowly decreasing.
mae = metrics.mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error:', mae)

Mean Absolute Error: 30300.186947530558


In [37]:
# Root mean squared error on the test split (square root of the MSE).
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
rmse

46745.630777548504

In [38]:
# RMSE divided by the range (max - min) of the test-set SalePrice values.
price_range = y_test["SalePrice"].max() - y_test["SalePrice"].min()
sqrt(mean_squared_error(y_test, y_pred)) / price_range

0.09530199954648012

In [39]:
# Feature importances evaluated on the test split; values follow the column order of X_test.
importance_pool = Pool(X_test, label=y_test, cat_features=category)
list(cat_boost.get_feature_importance(importance_pool))

[7.242071667920951,
 15.336670878091667,
 16.17213830229763,
 24.4770090320926,
 36.77211011959715]

In [40]:
# Feature names, in the same order as the model's input columns.
garage_df.drop(columns=["SalePrice"]).columns.values

array(['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars',
       'GarageArea'], dtype=object)