# Analysing the main_house data to predict house prices

### Pre-process data

In [32]:
#Import required libraries
import os

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [33]:
from sklearn.model_selection import train_test_split 
from sklearn import metrics

from catboost import CatBoostRegressor, Pool

from sklearn.metrics import mean_squared_error
from math import sqrt

In [34]:
# Location of the training CSV. Set the HOUSE_TRAIN_CSV environment variable to
# point at your own copy of the file; when it is unset, the original author's
# path is used so existing runs behave exactly as before.
data = os.environ.get("HOUSE_TRAIN_CSV", r'/Users/OliverPan/Desktop/house/train.csv')

In [35]:
# Load the training CSV found at `data` into a DataFrame.
df = pd.read_csv(filepath_or_buffer=data)

In [36]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [37]:
# Keep only the target (SalePrice) and the sale-related columns used as
# predictors in the model below.
misc_df = df[
    [
        "SalePrice",
        "MoSold",
        "YrSold",
        "SaleType",
        "SaleCondition",
        "Condition1",
        "Condition2",
    ]
]

In [38]:
# Preview the first five rows of the reduced frame.
misc_df.head(n=5)

Unnamed: 0,SalePrice,MoSold,YrSold,SaleType,SaleCondition,Condition1,Condition2
0,208500,2,2008,WD,Normal,Norm,Norm
1,181500,5,2007,WD,Normal,Feedr,Norm
2,223500,9,2008,WD,Normal,Norm,Norm
3,140000,2,2006,WD,Abnorml,Norm,Norm
4,250000,12,2008,WD,Normal,Norm,Norm


### CatBoost

In [39]:
# Separate the target from the predictors. `y` is kept as a one-column
# DataFrame (double brackets) because later cells index it by "SalePrice".
y = misc_df[["SalePrice"]]
X = misc_df.drop(columns=["SalePrice"])

In [40]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [41]:
# Columns passed to CatBoost as categorical features (every predictor in X).
category = [
    "MoSold",
    "YrSold",
    "SaleType",
    "SaleCondition",
    "Condition1",
    "Condition2",
]

In [42]:
# Small CatBoost regressor: 50 boosting iterations of depth-3 trees,
# optimising RMSE.
cat_boost = CatBoostRegressor(
    iterations=50,
    depth=3,
    learning_rate=0.1,
    loss_function='RMSE',
)

# Fit on the training split, treating the columns in `category` as categorical.
# fit() is the cell's last expression, so the notebook displays its return value.
cat_boost.fit(X_train, y_train, cat_features=category)

0:	learn: 77467.3897514	total: 1.25ms	remaining: 61.1ms
1:	learn: 76663.0486887	total: 2.31ms	remaining: 55.4ms
2:	learn: 76007.7865310	total: 3.25ms	remaining: 51ms
3:	learn: 75439.8907667	total: 4.11ms	remaining: 47.3ms
4:	learn: 74972.1826098	total: 5.51ms	remaining: 49.6ms
5:	learn: 74565.2160190	total: 6.29ms	remaining: 46.1ms
6:	learn: 74285.5914561	total: 7.15ms	remaining: 44ms
7:	learn: 74006.9250175	total: 8.18ms	remaining: 42.9ms
8:	learn: 73743.4875110	total: 10.1ms	remaining: 46.2ms
9:	learn: 73554.7617977	total: 11.2ms	remaining: 44.6ms
10:	learn: 73414.7108501	total: 12.1ms	remaining: 42.7ms
11:	learn: 73265.1658372	total: 12.8ms	remaining: 40.6ms
12:	learn: 73236.1065625	total: 14ms	remaining: 40ms
13:	learn: 73097.2809634	total: 15.6ms	remaining: 40ms
14:	learn: 72947.5872093	total: 16.7ms	remaining: 39.1ms
15:	learn: 72744.2525831	total: 17.7ms	remaining: 37.6ms
16:	learn: 72550.0855808	total: 18.6ms	remaining: 36ms
17:	learn: 72437.8451690	total: 19.4ms	remaining: 34.

<catboost.core.CatBoostRegressor at 0x109d653d0>

In [43]:
# Predict sale prices for the held-out rows.
y_pred = cat_boost.predict(data=X_test)

In [44]:
# Side-by-side comparison of actual and predicted sale prices on the test split.
# Stored under its own name so the raw training frame `df` is not overwritten;
# rebinding `df` here would break any earlier cell that is re-run afterwards
# (e.g. the one that builds `misc_df` from `df`).
results_df = pd.DataFrame(
    {
        'Actual': y_test["SalePrice"].to_list(),
        'Predicted': y_pred.flatten(),
    }
)
results_df.head()

Unnamed: 0,Actual,Predicted
0,200624,176356.23589
1,133000,152925.374775
2,110000,174700.676342
3,192000,174783.339276
4,88000,176500.320568


In [45]:
# Root-mean-squared error of the predictions on the test split: the square
# root of the mean squared error between y_test and y_pred.
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
rmse

78819.7867468342

In [46]:
# Normalised RMSE: the test RMSE divided by the range (max - min) of the
# actual sale prices in the test split.
actual_prices = y_test["SalePrice"]
price_range = actual_prices.max() - actual_prices.min()
sqrt(mean_squared_error(y_test, y_pred)) / price_range

0.1162533727829413

In [47]:
# Feature importances evaluated on the test split. The values are returned as
# a plain list with no feature names attached.
test_pool = Pool(X_test, label=y_test, cat_features=category)
list(cat_boost.get_feature_importance(test_pool))

[0.673250258242363,
 1.163618892900523,
 10.767204900486272,
 73.54024102192115,
 13.85568492644969,
 0.0]

In [49]:
# Show the first five rows of the modelling frame again for reference.
misc_df.head(n=5)

Unnamed: 0,SalePrice,MoSold,YrSold,SaleType,SaleCondition,Condition1,Condition2
0,208500,2,2008,WD,Normal,Norm,Norm
1,181500,5,2007,WD,Normal,Feedr,Norm
2,223500,9,2008,WD,Normal,Norm,Norm
3,140000,2,2006,WD,Abnorml,Norm,Norm
4,250000,12,2008,WD,Normal,Norm,Norm
