# Analysing the utilities data to predict house prices

### Pre-process data

In [61]:
#Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [62]:
from sklearn.model_selection import train_test_split 
from sklearn import metrics

from sklearn.metrics import mean_squared_error
from math import sqrt

from catboost import CatBoostRegressor, Pool

In [63]:
data = r'/Users/OliverPan/Desktop/house/train.csv'

In [64]:
df = pd.read_csv(data)

In [65]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [66]:
utilities = df[["SalePrice", "Utilities", "Heating", "HeatingQC", "CentralAir", "Electrical", "Fireplaces", "FireplaceQu"]]

In [67]:
utilities.head()

Unnamed: 0,SalePrice,Utilities,Heating,HeatingQC,CentralAir,Electrical,Fireplaces,FireplaceQu
0,208500,AllPub,GasA,Ex,Y,SBrkr,0,
1,181500,AllPub,GasA,Ex,Y,SBrkr,1,TA
2,223500,AllPub,GasA,Ex,Y,SBrkr,1,TA
3,140000,AllPub,GasA,Gd,Y,SBrkr,1,Gd
4,250000,AllPub,GasA,Ex,Y,SBrkr,1,TA


In [68]:
utilities["Utilities"].value_counts()

AllPub    1459
NoSeWa       1
Name: Utilities, dtype: int64

#### Utilities not helpful for model

In [69]:
utilities = utilities.drop(["Utilities"], axis = 1)
utilities["Heating"] = utilities["Heating"].astype(str)
utilities["HeatingQC"] = utilities["HeatingQC"].astype(str)
utilities["CentralAir"] = utilities["CentralAir"].astype(str)
utilities["Electrical"] = utilities["Electrical"].astype(str)
utilities["FireplaceQu"] = utilities["FireplaceQu"].astype(str)

### CatBoost

In [70]:
X = utilities.drop(["SalePrice"], axis = 1)
y = utilities[["SalePrice"]]

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0)

In [72]:
category = ["Heating", "HeatingQC", "CentralAir", "Electrical", "Fireplaces", "FireplaceQu"]

In [73]:
cat_boost = CatBoostRegressor(depth=3, learning_rate=0.1, loss_function='RMSE')
cat_boost.fit(X_train, y_train, cat_features=category)

0:	learn: 75698.5945107	total: 3.76ms	remaining: 3.75s
1:	learn: 73314.2549182	total: 5.69ms	remaining: 2.84s
2:	learn: 71278.3451229	total: 6.92ms	remaining: 2.3s
3:	learn: 69694.6311375	total: 7.99ms	remaining: 1.99s
4:	learn: 68285.1897017	total: 9.05ms	remaining: 1.8s
5:	learn: 67130.5934195	total: 10ms	remaining: 1.66s
6:	learn: 66183.6445434	total: 10.9ms	remaining: 1.54s
7:	learn: 65398.1449560	total: 11.9ms	remaining: 1.47s
8:	learn: 64669.6367034	total: 13.5ms	remaining: 1.48s
9:	learn: 64193.7750103	total: 14.7ms	remaining: 1.46s
10:	learn: 63560.3652319	total: 15.7ms	remaining: 1.41s
11:	learn: 63182.0899198	total: 16.7ms	remaining: 1.37s
12:	learn: 62964.6178977	total: 17.7ms	remaining: 1.34s
13:	learn: 62653.2677683	total: 18.6ms	remaining: 1.31s
14:	learn: 62361.0424125	total: 19.6ms	remaining: 1.28s
15:	learn: 62034.5637989	total: 20.5ms	remaining: 1.26s
16:	learn: 61794.3618540	total: 21.7ms	remaining: 1.25s
17:	learn: 61488.6640554	total: 22.6ms	remaining: 1.23s
18:	le

171:	learn: 57246.2458659	total: 177ms	remaining: 854ms
172:	learn: 57245.9861825	total: 180ms	remaining: 859ms
173:	learn: 57245.5086963	total: 181ms	remaining: 860ms
174:	learn: 57220.2496133	total: 183ms	remaining: 862ms
175:	learn: 57219.4134019	total: 186ms	remaining: 873ms
176:	learn: 57218.6030401	total: 188ms	remaining: 874ms
177:	learn: 57216.9504820	total: 190ms	remaining: 875ms
178:	learn: 57206.3533434	total: 191ms	remaining: 876ms
179:	learn: 57188.4882450	total: 192ms	remaining: 876ms
180:	learn: 57186.8598879	total: 194ms	remaining: 877ms
181:	learn: 57186.5374627	total: 195ms	remaining: 878ms
182:	learn: 57186.2314745	total: 197ms	remaining: 880ms
183:	learn: 57173.6448253	total: 199ms	remaining: 881ms
184:	learn: 57166.5919199	total: 200ms	remaining: 881ms
185:	learn: 57166.3206142	total: 202ms	remaining: 884ms
186:	learn: 57164.0563678	total: 203ms	remaining: 884ms
187:	learn: 57163.8068803	total: 205ms	remaining: 887ms
188:	learn: 57161.5864099	total: 207ms	remaining

452:	learn: 55637.3595762	total: 520ms	remaining: 628ms
453:	learn: 55626.5688522	total: 522ms	remaining: 628ms
454:	learn: 55625.5417962	total: 524ms	remaining: 627ms
455:	learn: 55596.1499174	total: 525ms	remaining: 626ms
456:	learn: 55589.2126737	total: 526ms	remaining: 626ms
457:	learn: 55589.0501089	total: 528ms	remaining: 625ms
458:	learn: 55586.9677218	total: 530ms	remaining: 624ms
459:	learn: 55582.9635665	total: 531ms	remaining: 623ms
460:	learn: 55582.0192964	total: 533ms	remaining: 623ms
461:	learn: 55581.9768868	total: 534ms	remaining: 622ms
462:	learn: 55580.7440147	total: 536ms	remaining: 621ms
463:	learn: 55580.2547890	total: 537ms	remaining: 621ms
464:	learn: 55568.0566237	total: 540ms	remaining: 621ms
465:	learn: 55567.6251954	total: 541ms	remaining: 620ms
466:	learn: 55561.1735790	total: 543ms	remaining: 619ms
467:	learn: 55557.8947197	total: 544ms	remaining: 618ms
468:	learn: 55545.2481946	total: 545ms	remaining: 617ms
469:	learn: 55544.6792573	total: 546ms	remaining

619:	learn: 54889.3124988	total: 696ms	remaining: 426ms
620:	learn: 54886.4396410	total: 698ms	remaining: 426ms
621:	learn: 54885.0248146	total: 699ms	remaining: 425ms
622:	learn: 54878.2988701	total: 701ms	remaining: 424ms
623:	learn: 54876.1516750	total: 702ms	remaining: 423ms
624:	learn: 54874.8515766	total: 704ms	remaining: 422ms
625:	learn: 54870.7570061	total: 705ms	remaining: 421ms
626:	learn: 54868.7872088	total: 707ms	remaining: 420ms
627:	learn: 54866.2220542	total: 709ms	remaining: 420ms
628:	learn: 54864.8423002	total: 711ms	remaining: 419ms
629:	learn: 54863.4888801	total: 712ms	remaining: 418ms
630:	learn: 54859.4143788	total: 713ms	remaining: 417ms
631:	learn: 54829.8833838	total: 714ms	remaining: 416ms
632:	learn: 54825.0824834	total: 716ms	remaining: 415ms
633:	learn: 54825.0667735	total: 717ms	remaining: 414ms
634:	learn: 54818.2843462	total: 718ms	remaining: 413ms
635:	learn: 54814.3311016	total: 719ms	remaining: 411ms
636:	learn: 54806.8789601	total: 720ms	remaining

789:	learn: 54301.5085182	total: 874ms	remaining: 232ms
790:	learn: 54297.1613307	total: 875ms	remaining: 231ms
791:	learn: 54296.2627931	total: 876ms	remaining: 230ms
792:	learn: 54293.8931512	total: 877ms	remaining: 229ms
793:	learn: 54292.0654530	total: 878ms	remaining: 228ms
794:	learn: 54262.4700389	total: 879ms	remaining: 227ms
795:	learn: 54261.8371945	total: 880ms	remaining: 226ms
796:	learn: 54257.6223846	total: 881ms	remaining: 224ms
797:	learn: 54254.3900334	total: 882ms	remaining: 223ms
798:	learn: 54253.9919982	total: 883ms	remaining: 222ms
799:	learn: 54250.7836762	total: 884ms	remaining: 221ms
800:	learn: 54248.4808194	total: 885ms	remaining: 220ms
801:	learn: 54248.0852185	total: 886ms	remaining: 219ms
802:	learn: 54247.1972804	total: 887ms	remaining: 218ms
803:	learn: 54246.7006535	total: 888ms	remaining: 216ms
804:	learn: 54243.1857176	total: 889ms	remaining: 215ms
805:	learn: 54242.7415883	total: 891ms	remaining: 214ms
806:	learn: 54241.7100702	total: 893ms	remaining

952:	learn: 53713.8260213	total: 1.05s	remaining: 51.7ms
953:	learn: 53712.4351791	total: 1.05s	remaining: 50.7ms
954:	learn: 53706.5801412	total: 1.05s	remaining: 49.6ms
955:	learn: 53706.5335957	total: 1.05s	remaining: 48.5ms
956:	learn: 53704.0050086	total: 1.06s	remaining: 47.5ms
957:	learn: 53703.7593523	total: 1.06s	remaining: 46.4ms
958:	learn: 53697.9114855	total: 1.06s	remaining: 45.3ms
959:	learn: 53697.0256513	total: 1.06s	remaining: 44.3ms
960:	learn: 53695.1806051	total: 1.06s	remaining: 43.2ms
961:	learn: 53694.7404175	total: 1.06s	remaining: 42.1ms
962:	learn: 53694.0922116	total: 1.07s	remaining: 41ms
963:	learn: 53692.6448792	total: 1.07s	remaining: 39.9ms
964:	learn: 53692.6375518	total: 1.07s	remaining: 38.7ms
965:	learn: 53688.3804877	total: 1.07s	remaining: 37.6ms
966:	learn: 53686.1477373	total: 1.07s	remaining: 36.5ms
967:	learn: 53684.8434537	total: 1.07s	remaining: 35.4ms
968:	learn: 53678.3613325	total: 1.07s	remaining: 34.3ms
969:	learn: 53677.9010516	total: 

<catboost.core.CatBoostRegressor at 0x1a1dff2950>

In [74]:
y_pred = cat_boost.predict(X_test)

In [75]:
#Results using regressor
df = pd.DataFrame({'Actual': y_test["SalePrice"].to_list(), 'Predicted': list(y_pred.flatten())})
df.head()

Unnamed: 0,Actual,Predicted
0,200624,223165.906347
1,133000,181616.392464
2,110000,168682.622711
3,192000,223987.654366
4,88000,127736.317128


In [76]:
## We should also look at RMSE
rmse = sqrt(mean_squared_error(y_test, y_pred))
rmse

65753.54700238386

In [77]:
## Normalised RMSE
sqrt(mean_squared_error(y_test, y_pred)) / (y_test["SalePrice"].max() - y_test["SalePrice"].min())

0.09698163274687885

In [78]:
list(cat_boost.get_feature_importance(Pool(X_test, label = y_test, cat_features=category)))

[1.4827008116357552,
 39.684940887098975,
 2.595176412575824,
 5.262203287408806,
 11.978141353196063,
 38.99683724808456]

In [79]:
utilities.drop(["SalePrice"], axis = 1).columns.values

array(['Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'Fireplaces',
       'FireplaceQu'], dtype=object)