# Aggregating columns together based on subcategory

### Pre-process data

#### Note: Importance values are from other notebooks

In [96]:
#Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [97]:
from sklearn.model_selection import train_test_split 
from sklearn import metrics

from catboost import CatBoostRegressor, Pool

from sklearn.metrics import mean_squared_error
from math import sqrt

In [98]:
data = r'/Users/OliverPan/Desktop/house/train.csv'

In [99]:
df = pd.read_csv(data)

In [100]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### Subcategory 1: Main Factors

In [101]:
new_main_df = df[['Id', "SalePrice", "OverallQual", "OverallCond", "LotArea", "YearBuilt", "Street", "LotShape"]]

In [102]:
new_main_df.head()

Unnamed: 0,Id,SalePrice,OverallQual,OverallCond,LotArea,YearBuilt,Street,LotShape
0,1,208500,7,5,8450,2003,Pave,Reg
1,2,181500,6,8,9600,1976,Pave,Reg
2,3,223500,7,5,11250,2001,Pave,IR1
3,4,140000,7,5,9550,1915,Pave,IR1
4,5,250000,8,5,14260,2000,Pave,IR1


In [103]:
#Feature importance from main_analysis_df
main_importance = pd.DataFrame()
main_importance["variable"] = ['OverallQual', 'LotArea', 'YearBuilt', 'LotShape']
main_importance["importance"] = [76.95991136103147, 18.75558088809967, 3.0246606935231912, 1.2598470573456726]
main_importance

Unnamed: 0,variable,importance
0,OverallQual,76.959911
1,LotArea,18.755581
2,YearBuilt,3.024661
3,LotShape,1.259847


##### Catboost Normalised RMSE: 0.06657866598719948

### Subcategory 2: Garage Details

In [104]:
garage_df = df[["Id", "SalePrice", "GarageType", "GarageYrBlt", "GarageFinish", "GarageCars", "GarageArea", "GarageQual", "GarageCond"]]

In [105]:
garage_df.head()

Unnamed: 0,Id,SalePrice,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond
0,1,208500,Attchd,2003.0,RFn,2,548,TA,TA
1,2,181500,Attchd,1976.0,RFn,2,460,TA,TA
2,3,223500,Attchd,2001.0,RFn,2,608,TA,TA
3,4,140000,Detchd,1998.0,Unf,3,642,TA,TA
4,5,250000,Attchd,2000.0,RFn,3,836,TA,TA


In [106]:
garage_importance = pd.DataFrame()
garage_importance["variable"] = ['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond']
garage_importance["importance"] = [7.0372203818727534, 15.101076654321679, 16.594485367397123, 26.732025422742815, 33.54529737715993, 0.3450221588943238, 0.6448726376113748]
garage_importance

Unnamed: 0,variable,importance
0,GarageType,7.03722
1,GarageYrBlt,15.101077
2,GarageFinish,16.594485
3,GarageCars,26.732025
4,GarageArea,33.545297
5,GarageQual,0.345022
6,GarageCond,0.644873


##### Catboost Normalised RMSE: 0.09055755195766964

### Subcategory 3: Basement Details

In [107]:
basement_df = df[['Id', "SalePrice", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinSF1", "BsmtFinType2", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF"]]

In [108]:
basement_df.head()

Unnamed: 0,Id,SalePrice,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF
0,1,208500,Gd,TA,No,GLQ,706,Unf,0,150,856
1,2,181500,Gd,TA,Gd,ALQ,978,Unf,0,284,1262
2,3,223500,Gd,TA,Mn,GLQ,486,Unf,0,434,920
3,4,140000,TA,Gd,No,ALQ,216,Unf,0,540,756
4,5,250000,Gd,TA,Av,GLQ,655,Unf,0,490,1145


In [109]:
basement_importance = pd.DataFrame()
basement_importance["variable"] = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1','TotalBsmtSF']
basement_importance["importance"] = [38.90071168327252, 1.5007923822496567, 8.373343396151819, 7.687886066966812, 43.537266471359175]
basement_importance

Unnamed: 0,variable,importance
0,BsmtQual,38.900712
1,BsmtCond,1.500792
2,BsmtExposure,8.373343
3,BsmtFinType1,7.687886
4,TotalBsmtSF,43.537266


##### Catboost Normalised RMSE: 0.07761983722416106

### Subcategory 4: Utilities

In [110]:
utilities_df = df[["SalePrice", "Utilities", "Heating", "HeatingQC", "CentralAir", "Electrical", "Fireplaces", "FireplaceQu"]]

In [111]:
utilities_df.head()

Unnamed: 0,SalePrice,Utilities,Heating,HeatingQC,CentralAir,Electrical,Fireplaces,FireplaceQu
0,208500,AllPub,GasA,Ex,Y,SBrkr,0,
1,181500,AllPub,GasA,Ex,Y,SBrkr,1,TA
2,223500,AllPub,GasA,Ex,Y,SBrkr,1,TA
3,140000,AllPub,GasA,Gd,Y,SBrkr,1,Gd
4,250000,AllPub,GasA,Ex,Y,SBrkr,1,TA


In [112]:
utilities_importance = pd.DataFrame()
utilities_importance["variable"] = ['Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'Fireplaces', 'FireplaceQu']
utilities_importance["importance"] = [1.4827008116357552, 39.684940887098975, 2.595176412575824, 5.262203287408806, 11.978141353196063, 38.99683724808456]
utilities_importance

Unnamed: 0,variable,importance
0,Heating,1.482701
1,HeatingQC,39.684941
2,CentralAir,2.595176
3,Electrical,5.262203
4,Fireplaces,11.978141
5,FireplaceQu,38.996837


##### Catboost Normalised RMSE: 0.09698163274687885

### Subcategory 5: Miscellaneous

In [113]:
misc_df = df[["SalePrice", "MoSold", "YrSold", "SaleType", "SaleCondition", "Condition1", "Condition2"]]

In [114]:
misc_df.head()

Unnamed: 0,SalePrice,MoSold,YrSold,SaleType,SaleCondition,Condition1,Condition2
0,208500,2,2008,WD,Normal,Norm,Norm
1,181500,5,2007,WD,Normal,Feedr,Norm
2,223500,9,2008,WD,Normal,Norm,Norm
3,140000,2,2006,WD,Abnorml,Norm,Norm
4,250000,12,2008,WD,Normal,Norm,Norm


In [115]:
misc_importance = pd.DataFrame()
misc_importance["variable"] = ["MoSold", "YrSold", "SaleType", "SaleCondition", "Condition1", "Condition2"]
misc_importance["importance"] = [0.673250258242363, 1.163618892900523, 10.767204900486272, 73.54024102192115, 13.85568492644969, 0.0]
misc_importance

Unnamed: 0,variable,importance
0,MoSold,0.67325
1,YrSold,1.163619
2,SaleType,10.767205
3,SaleCondition,73.540241
4,Condition1,13.855685
5,Condition2,0.0


##### Catboost Normalised RMSE: 0.1162533727829413

### Aggregate columns with higher feature importance

In [116]:
model_df = df[['SalePrice', 'OverallQual', 'LotArea', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'BsmtQual', 'TotalBsmtSF', 'HeatingQC', 'FireplaceQu', 'SaleCondition']]

In [117]:
model_df.head()

Unnamed: 0,SalePrice,OverallQual,LotArea,GarageYrBlt,GarageFinish,GarageCars,GarageArea,BsmtQual,TotalBsmtSF,HeatingQC,FireplaceQu,SaleCondition
0,208500,7,8450,2003.0,RFn,2,548,Gd,856,Ex,,Normal
1,181500,6,9600,1976.0,RFn,2,460,Gd,1262,Ex,TA,Normal
2,223500,7,11250,2001.0,RFn,2,608,Gd,920,Ex,TA,Normal
3,140000,7,9550,1998.0,Unf,3,642,TA,756,Gd,Gd,Abnorml
4,250000,8,14260,2000.0,RFn,3,836,Gd,1145,Ex,TA,Normal


In [125]:
#Converting to numerical values
model_df["GarageFinish"] = model_df["GarageFinish"].astype('category').cat.codes
model_df["BsmtQual"] = model_df["BsmtQual"].astype('category').cat.codes
model_df["HeatingQC"] = model_df["HeatingQC"].astype('category').cat.codes
model_df["FireplaceQu"] = model_df["FireplaceQu"].astype('category').cat.codes
model_df["SaleCondition"] = model_df["SaleCondition"].astype('category').cat.codes
model_df["GarageYrBlt"] = model_df["GarageYrBlt"].astype(int)

model_df = model_df.fillna(0)

In [126]:
model_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   SalePrice      1460 non-null   int64
 1   OverallQual    1460 non-null   int64
 2   LotArea        1460 non-null   int64
 3   GarageYrBlt    1460 non-null   int64
 4   GarageFinish   1460 non-null   int8 
 5   GarageCars     1460 non-null   int64
 6   GarageArea     1460 non-null   int64
 7   BsmtQual       1460 non-null   int8 
 8   TotalBsmtSF    1460 non-null   int64
 9   HeatingQC      1460 non-null   int8 
 10  FireplaceQu    1460 non-null   int8 
 11  SaleCondition  1460 non-null   int8 
dtypes: int64(7), int8(5)
memory usage: 87.1 KB


### CatBoost

In [127]:
X = model_df.drop(["SalePrice"], axis = 1)
y = model_df[["SalePrice"]]

In [128]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [129]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 292 entries, 529 to 61
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   OverallQual    292 non-null    int64
 1   LotArea        292 non-null    int64
 2   GarageYrBlt    292 non-null    int64
 3   GarageFinish   292 non-null    int8 
 4   GarageCars     292 non-null    int64
 5   GarageArea     292 non-null    int64
 6   BsmtQual       292 non-null    int8 
 7   TotalBsmtSF    292 non-null    int64
 8   HeatingQC      292 non-null    int8 
 9   FireplaceQu    292 non-null    int8 
 10  SaleCondition  292 non-null    int8 
dtypes: int64(6), int8(5)
memory usage: 17.4 KB


In [130]:
category = ["OverallQual", "GarageYrBlt", "GarageFinish", "GarageCars", "BsmtQual", "HeatingQC", "FireplaceQu", "SaleCondition"]

In [131]:
cat_boost = CatBoostRegressor(depth=3, learning_rate=0.1, loss_function='RMSE')
cat_boost.fit(X_train, y_train, cat_features=category)

0:	learn: 74336.4668077	total: 3.11ms	remaining: 3.11s
1:	learn: 70047.9046351	total: 7.29ms	remaining: 3.64s
2:	learn: 66361.4720953	total: 10.3ms	remaining: 3.43s
3:	learn: 62986.1545667	total: 12.9ms	remaining: 3.21s
4:	learn: 60286.4136729	total: 15ms	remaining: 2.99s
5:	learn: 57681.0957765	total: 17.4ms	remaining: 2.88s
6:	learn: 55283.1674941	total: 19.1ms	remaining: 2.71s
7:	learn: 53098.3369067	total: 21.9ms	remaining: 2.72s
8:	learn: 51244.8401309	total: 23.8ms	remaining: 2.62s
9:	learn: 49339.9228176	total: 25.7ms	remaining: 2.54s
10:	learn: 47769.0169343	total: 27.5ms	remaining: 2.47s
11:	learn: 46305.1646212	total: 29.8ms	remaining: 2.45s
12:	learn: 45001.3661966	total: 31.9ms	remaining: 2.42s
13:	learn: 43872.6608695	total: 33.6ms	remaining: 2.37s
14:	learn: 42823.1152753	total: 35.8ms	remaining: 2.35s
15:	learn: 41784.1056658	total: 38.9ms	remaining: 2.39s
16:	learn: 41002.1916385	total: 41.7ms	remaining: 2.41s
17:	learn: 40334.5152106	total: 44.3ms	remaining: 2.42s
18:	

201:	learn: 26820.0718865	total: 372ms	remaining: 1.47s
202:	learn: 26817.3351047	total: 375ms	remaining: 1.47s
203:	learn: 26815.8155831	total: 377ms	remaining: 1.47s
204:	learn: 26799.9191066	total: 379ms	remaining: 1.47s
205:	learn: 26768.1728680	total: 381ms	remaining: 1.47s
206:	learn: 26765.5278824	total: 383ms	remaining: 1.47s
207:	learn: 26763.0228172	total: 384ms	remaining: 1.46s
208:	learn: 26757.6624801	total: 387ms	remaining: 1.46s
209:	learn: 26752.2073671	total: 389ms	remaining: 1.46s
210:	learn: 26751.3838032	total: 392ms	remaining: 1.47s
211:	learn: 26724.6405086	total: 395ms	remaining: 1.47s
212:	learn: 26712.7388073	total: 397ms	remaining: 1.47s
213:	learn: 26697.6472334	total: 399ms	remaining: 1.47s
214:	learn: 26694.4672112	total: 401ms	remaining: 1.47s
215:	learn: 26661.1206256	total: 404ms	remaining: 1.47s
216:	learn: 26612.8259297	total: 406ms	remaining: 1.47s
217:	learn: 26608.1060306	total: 408ms	remaining: 1.46s
218:	learn: 26594.8477203	total: 410ms	remaining

433:	learn: 23441.1063822	total: 739ms	remaining: 964ms
434:	learn: 23423.2282805	total: 741ms	remaining: 963ms
435:	learn: 23382.8629849	total: 744ms	remaining: 962ms
436:	learn: 23362.2630273	total: 746ms	remaining: 961ms
437:	learn: 23359.9353931	total: 748ms	remaining: 960ms
438:	learn: 23353.8758371	total: 751ms	remaining: 959ms
439:	learn: 23348.6023246	total: 753ms	remaining: 959ms
440:	learn: 23329.0348970	total: 755ms	remaining: 957ms
441:	learn: 23326.1186924	total: 757ms	remaining: 956ms
442:	learn: 23311.3385062	total: 759ms	remaining: 954ms
443:	learn: 23301.2434372	total: 761ms	remaining: 952ms
444:	learn: 23296.6826235	total: 762ms	remaining: 950ms
445:	learn: 23291.6108529	total: 763ms	remaining: 948ms
446:	learn: 23275.8371109	total: 765ms	remaining: 946ms
447:	learn: 23272.3946575	total: 766ms	remaining: 944ms
448:	learn: 23257.8592880	total: 768ms	remaining: 942ms
449:	learn: 23250.8761475	total: 769ms	remaining: 940ms
450:	learn: 23224.6038911	total: 771ms	remaining

661:	learn: 21556.6824605	total: 1.1s	remaining: 565ms
662:	learn: 21548.8713315	total: 1.11s	remaining: 563ms
663:	learn: 21536.8632392	total: 1.11s	remaining: 562ms
664:	learn: 21522.7341513	total: 1.11s	remaining: 561ms
665:	learn: 21513.9163342	total: 1.12s	remaining: 560ms
666:	learn: 21510.7789433	total: 1.12s	remaining: 559ms
667:	learn: 21507.8191737	total: 1.12s	remaining: 558ms
668:	learn: 21501.7856024	total: 1.13s	remaining: 557ms
669:	learn: 21501.3199465	total: 1.13s	remaining: 556ms
670:	learn: 21484.9618834	total: 1.13s	remaining: 554ms
671:	learn: 21481.1100890	total: 1.13s	remaining: 553ms
672:	learn: 21479.1398012	total: 1.13s	remaining: 551ms
673:	learn: 21471.8196717	total: 1.14s	remaining: 550ms
674:	learn: 21458.1818840	total: 1.14s	remaining: 548ms
675:	learn: 21457.6799318	total: 1.14s	remaining: 547ms
676:	learn: 21440.8416919	total: 1.14s	remaining: 546ms
677:	learn: 21440.2621419	total: 1.15s	remaining: 544ms
678:	learn: 21428.6153588	total: 1.15s	remaining:

876:	learn: 20016.2947718	total: 1.47s	remaining: 207ms
877:	learn: 20015.3126908	total: 1.48s	remaining: 205ms
878:	learn: 20011.3302222	total: 1.48s	remaining: 203ms
879:	learn: 19989.4748562	total: 1.48s	remaining: 202ms
880:	learn: 19982.7831691	total: 1.48s	remaining: 200ms
881:	learn: 19977.0955845	total: 1.49s	remaining: 199ms
882:	learn: 19971.3324972	total: 1.49s	remaining: 197ms
883:	learn: 19970.6184964	total: 1.49s	remaining: 196ms
884:	learn: 19968.7494261	total: 1.49s	remaining: 194ms
885:	learn: 19962.4739502	total: 1.49s	remaining: 192ms
886:	learn: 19956.5861063	total: 1.5s	remaining: 191ms
887:	learn: 19937.8143861	total: 1.5s	remaining: 189ms
888:	learn: 19929.7536434	total: 1.5s	remaining: 187ms
889:	learn: 19927.3037227	total: 1.5s	remaining: 186ms
890:	learn: 19916.8381701	total: 1.5s	remaining: 184ms
891:	learn: 19905.0581182	total: 1.5s	remaining: 182ms
892:	learn: 19899.8539852	total: 1.51s	remaining: 180ms
893:	learn: 19892.0516458	total: 1.51s	remaining: 179m

<catboost.core.CatBoostRegressor at 0x7fcfd8feb100>

In [132]:
y_pred = cat_boost.predict(X_test)

In [133]:
#Results using regressor
df = pd.DataFrame({'Actual': y_test["SalePrice"].to_list(), 'Predicted': list(y_pred.flatten())})
df.head()

Unnamed: 0,Actual,Predicted
0,200624,264519.127967
1,133000,138191.602051
2,110000,123936.534407
3,192000,218618.151546
4,88000,93551.884459


In [134]:
## We should also look at RMSE
rmse = sqrt(mean_squared_error(y_test, y_pred))
rmse

38139.97311238505

In [135]:
## Normalised RMSE
sqrt(mean_squared_error(y_test, y_pred)) / (y_test["SalePrice"].max() - y_test["SalePrice"].min())

0.05625364765838503

In [136]:
new_main_df.drop(["SalePrice"], axis = 1).columns.values

array(['Id', 'OverallQual', 'OverallCond', 'LotArea', 'YearBuilt',
       'Street', 'LotShape'], dtype=object)

In [137]:
list(cat_boost.get_feature_importance(Pool(X_test, label = y_test, cat_features=category)))

[41.615440933138856,
 8.395037932553377,
 2.563248483317475,
 2.012797298739493,
 11.671316916652003,
 8.782618474239058,
 5.406414822034362,
 10.28683039508616,
 1.4916044606271386,
 5.8450420995421695,
 1.9296481840698316]

#### We can see that the model is not predicting the higher Actual value houses very well. There might be additional features that we can include to help with higher valued houses

In [140]:
df[df["Actual"] > 400000]

Unnamed: 0,Actual,Predicted
8,745000,551875.576856
44,403000,306940.616606
50,412500,350474.695632
126,415298,478943.196834
253,438780,399945.656638
258,538000,343311.337477
288,555000,479523.933456
