# Combining main_analysis with garage & basement data

### Pre-process data

#### We have also decided that catboost is the strongest model for our regression problem, so we will go ahead and use it for the combined files

In [9]:
#Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [10]:
from sklearn.model_selection import train_test_split 
from sklearn import metrics

from catboost import CatBoostRegressor, Pool

from sklearn.metrics import mean_squared_error
from math import sqrt

In [11]:
data = r'/Users/OliverPan/Desktop/house/train.csv'

In [12]:
df = pd.read_csv(data)

In [13]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [14]:
new_main_df = df[["Id", "SalePrice", "OverallQual", "LotArea", "GarageFinish", "GarageCars",
       "GarageArea", "BsmtQual", "TotalBsmtSF", "HeatingQC", "FireplaceQu"]]

In [15]:
new_main_df.head()

Unnamed: 0,Id,SalePrice,OverallQual,LotArea,GarageFinish,GarageCars,GarageArea,BsmtQual,TotalBsmtSF,HeatingQC,FireplaceQu
0,1,208500,7,8450,RFn,2,548,Gd,856,Ex,
1,2,181500,6,9600,RFn,2,460,Gd,1262,Ex,TA
2,3,223500,7,11250,RFn,2,608,Gd,920,Ex,TA
3,4,140000,7,9550,Unf,3,642,TA,756,Gd,Gd
4,5,250000,8,14260,RFn,3,836,Gd,1145,Ex,TA


In [16]:
new_main_df["GarageFinish"] = new_main_df["GarageFinish"].astype(str)
new_main_df["BsmtQual"] = new_main_df["BsmtQual"].astype(str)
new_main_df["FireplaceQu"] = new_main_df["FireplaceQu"].astype(str)
new_main_df["HeatingQC"] = new_main_df["HeatingQC"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

### Analysis

In [17]:
new_main_df.corr()["SalePrice"]

Id            -0.021917
SalePrice      1.000000
OverallQual    0.790982
LotArea        0.263843
GarageCars     0.640409
GarageArea     0.623431
TotalBsmtSF    0.613581
Name: SalePrice, dtype: float64

In [18]:
new_main_df = new_main_df.drop(["Id"], axis = 1)

In [19]:
new_main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   SalePrice     1460 non-null   int64 
 1   OverallQual   1460 non-null   int64 
 2   LotArea       1460 non-null   int64 
 3   GarageFinish  1460 non-null   object
 4   GarageCars    1460 non-null   int64 
 5   GarageArea    1460 non-null   int64 
 6   BsmtQual      1460 non-null   object
 7   TotalBsmtSF   1460 non-null   int64 
 8   HeatingQC     1460 non-null   object
 9   FireplaceQu   1460 non-null   object
dtypes: int64(6), object(4)
memory usage: 114.2+ KB


### CatBoost

In [20]:
X = new_main_df.drop(["SalePrice"], axis = 1)
y = new_main_df[["SalePrice"]]

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0)

In [22]:
category = ["OverallQual", "GarageFinish", "GarageCars", "BsmtQual", "HeatingQC", "FireplaceQu"]

In [23]:
cat_boost = CatBoostRegressor(depth=3, learning_rate=0.1, loss_function='RMSE')
cat_boost.fit(X_train, y_train, cat_features=category)

0:	learn: 74445.9801748	total: 55.5ms	remaining: 55.5s
1:	learn: 71359.9403793	total: 58.4ms	remaining: 29.1s
2:	learn: 68226.0438136	total: 60.5ms	remaining: 20.1s
3:	learn: 65584.0450321	total: 62ms	remaining: 15.4s
4:	learn: 62288.8499188	total: 63.4ms	remaining: 12.6s
5:	learn: 59232.5131467	total: 64.9ms	remaining: 10.7s
6:	learn: 56725.7167438	total: 66.3ms	remaining: 9.4s
7:	learn: 54398.9583157	total: 67.8ms	remaining: 8.4s
8:	learn: 52470.0449160	total: 69ms	remaining: 7.6s
9:	learn: 50610.4027444	total: 70ms	remaining: 6.93s
10:	learn: 49020.5510622	total: 71.1ms	remaining: 6.39s
11:	learn: 47482.7165173	total: 72.2ms	remaining: 5.94s
12:	learn: 46089.5340500	total: 73.2ms	remaining: 5.56s
13:	learn: 44965.3782441	total: 74.3ms	remaining: 5.23s
14:	learn: 43615.2861620	total: 75.3ms	remaining: 4.95s
15:	learn: 42618.6141444	total: 76.7ms	remaining: 4.71s
16:	learn: 41649.8239620	total: 77.7ms	remaining: 4.49s
17:	learn: 40931.9409392	total: 79ms	remaining: 4.31s
18:	learn: 40

156:	learn: 28495.1402708	total: 305ms	remaining: 1.64s
157:	learn: 28476.9664216	total: 307ms	remaining: 1.64s
158:	learn: 28455.2877122	total: 309ms	remaining: 1.64s
159:	learn: 28443.2768253	total: 310ms	remaining: 1.63s
160:	learn: 28433.5903110	total: 312ms	remaining: 1.62s
161:	learn: 28415.1540021	total: 313ms	remaining: 1.62s
162:	learn: 28366.3236372	total: 314ms	remaining: 1.61s
163:	learn: 28348.3481940	total: 316ms	remaining: 1.61s
164:	learn: 28334.1012300	total: 317ms	remaining: 1.6s
165:	learn: 28331.4067475	total: 318ms	remaining: 1.6s
166:	learn: 28298.8310682	total: 320ms	remaining: 1.6s
167:	learn: 28292.2646080	total: 324ms	remaining: 1.6s
168:	learn: 28284.9924084	total: 326ms	remaining: 1.6s
169:	learn: 28236.2344282	total: 329ms	remaining: 1.6s
170:	learn: 28226.5148426	total: 331ms	remaining: 1.6s
171:	learn: 28184.7129367	total: 333ms	remaining: 1.6s
172:	learn: 28167.9736231	total: 336ms	remaining: 1.61s
173:	learn: 28159.3422081	total: 339ms	remaining: 1.61s


348:	learn: 25043.8411281	total: 592ms	remaining: 1.1s
349:	learn: 25036.0003802	total: 595ms	remaining: 1.1s
350:	learn: 25015.9442544	total: 598ms	remaining: 1.1s
351:	learn: 24994.8590933	total: 600ms	remaining: 1.1s
352:	learn: 24983.1461412	total: 602ms	remaining: 1.1s
353:	learn: 24969.2971531	total: 605ms	remaining: 1.1s
354:	learn: 24966.8983266	total: 606ms	remaining: 1.1s
355:	learn: 24963.3274576	total: 607ms	remaining: 1.1s
356:	learn: 24942.7966948	total: 608ms	remaining: 1.09s
357:	learn: 24924.1902033	total: 610ms	remaining: 1.09s
358:	learn: 24918.9601217	total: 611ms	remaining: 1.09s
359:	learn: 24908.4416707	total: 613ms	remaining: 1.09s
360:	learn: 24904.2167718	total: 614ms	remaining: 1.09s
361:	learn: 24878.5964393	total: 615ms	remaining: 1.08s
362:	learn: 24871.8031376	total: 616ms	remaining: 1.08s
363:	learn: 24869.7382990	total: 617ms	remaining: 1.08s
364:	learn: 24847.8788293	total: 619ms	remaining: 1.08s
365:	learn: 24842.0542973	total: 620ms	remaining: 1.07s


535:	learn: 23235.1105385	total: 844ms	remaining: 731ms
536:	learn: 23233.6809890	total: 847ms	remaining: 731ms
537:	learn: 23225.2691819	total: 851ms	remaining: 731ms
538:	learn: 23211.8516601	total: 860ms	remaining: 735ms
539:	learn: 23207.8596913	total: 861ms	remaining: 733ms
540:	learn: 23201.2272888	total: 862ms	remaining: 731ms
541:	learn: 23182.1143986	total: 863ms	remaining: 730ms
542:	learn: 23179.6126026	total: 865ms	remaining: 728ms
543:	learn: 23178.3762873	total: 866ms	remaining: 726ms
544:	learn: 23151.9940573	total: 868ms	remaining: 725ms
545:	learn: 23135.0691999	total: 869ms	remaining: 723ms
546:	learn: 23133.8120565	total: 871ms	remaining: 721ms
547:	learn: 23098.9077160	total: 872ms	remaining: 719ms
548:	learn: 23089.8282573	total: 875ms	remaining: 719ms
549:	learn: 23087.0022720	total: 876ms	remaining: 717ms
550:	learn: 23076.7321590	total: 877ms	remaining: 715ms
551:	learn: 23074.1195668	total: 879ms	remaining: 713ms
552:	learn: 23069.0812414	total: 880ms	remaining

706:	learn: 21795.3189080	total: 1.13s	remaining: 468ms
707:	learn: 21793.9832000	total: 1.13s	remaining: 467ms
708:	learn: 21785.2949511	total: 1.13s	remaining: 466ms
709:	learn: 21761.1414663	total: 1.14s	remaining: 466ms
710:	learn: 21754.5555662	total: 1.14s	remaining: 465ms
711:	learn: 21740.9927753	total: 1.15s	remaining: 463ms
712:	learn: 21738.1414530	total: 1.15s	remaining: 462ms
713:	learn: 21729.8170301	total: 1.15s	remaining: 460ms
714:	learn: 21722.6153496	total: 1.15s	remaining: 458ms
715:	learn: 21717.1554607	total: 1.15s	remaining: 456ms
716:	learn: 21715.6784132	total: 1.15s	remaining: 455ms
717:	learn: 21714.4248276	total: 1.15s	remaining: 453ms
718:	learn: 21709.7735704	total: 1.16s	remaining: 452ms
719:	learn: 21708.7747949	total: 1.16s	remaining: 450ms
720:	learn: 21707.7116004	total: 1.16s	remaining: 449ms
721:	learn: 21704.1020906	total: 1.16s	remaining: 447ms
722:	learn: 21698.1536077	total: 1.16s	remaining: 445ms
723:	learn: 21691.7308575	total: 1.16s	remaining

879:	learn: 20662.0162883	total: 1.38s	remaining: 189ms
880:	learn: 20660.2004601	total: 1.39s	remaining: 187ms
881:	learn: 20648.5566427	total: 1.39s	remaining: 186ms
882:	learn: 20640.2975711	total: 1.39s	remaining: 184ms
883:	learn: 20631.5270893	total: 1.39s	remaining: 183ms
884:	learn: 20626.5239102	total: 1.4s	remaining: 181ms
885:	learn: 20622.3088724	total: 1.4s	remaining: 180ms
886:	learn: 20615.6433848	total: 1.4s	remaining: 178ms
887:	learn: 20602.9860241	total: 1.4s	remaining: 177ms
888:	learn: 20596.7331858	total: 1.4s	remaining: 175ms
889:	learn: 20593.4817843	total: 1.4s	remaining: 173ms
890:	learn: 20575.8518122	total: 1.4s	remaining: 172ms
891:	learn: 20554.4219521	total: 1.41s	remaining: 170ms
892:	learn: 20550.9571358	total: 1.41s	remaining: 169ms
893:	learn: 20542.6742256	total: 1.41s	remaining: 167ms
894:	learn: 20541.5847894	total: 1.41s	remaining: 165ms
895:	learn: 20539.8750247	total: 1.41s	remaining: 164ms
896:	learn: 20528.8825916	total: 1.41s	remaining: 162ms

<catboost.core.CatBoostRegressor at 0x1a261fcf50>

In [24]:
y_pred = cat_boost.predict(X_test)

In [25]:
#Results using regressor
df = pd.DataFrame({'Actual': y_test["SalePrice"].to_list(), 'Predicted': list(y_pred.flatten())})
df.head()

Unnamed: 0,Actual,Predicted
0,200624,243374.89352
1,133000,140535.646682
2,110000,119181.151517
3,192000,226294.319972
4,88000,97158.796181


In [26]:
## We should also look at RMSE
rmse = sqrt(mean_squared_error(y_test, y_pred))
rmse

39327.9114640483

In [27]:
## Normalised RMSE
sqrt(mean_squared_error(y_test, y_pred)) / (y_test["SalePrice"].max() - y_test["SalePrice"].min())

0.0580057691210152

In [28]:
new_main_df.drop(["SalePrice"], axis = 1).columns.values

array(['OverallQual', 'LotArea', 'GarageFinish', 'GarageCars',
       'GarageArea', 'BsmtQual', 'TotalBsmtSF', 'HeatingQC',
       'FireplaceQu'], dtype=object)

In [29]:
list(cat_boost.get_feature_importance(Pool(X_test, label = y_test, cat_features=category)))

[23.697955481511418,
 10.975646221054449,
 8.280808399695259,
 6.268772647812654,
 18.31587692547164,
 7.954129195641539,
 17.643005972622,
 1.2055780998977301,
 5.658227056293314]

#### We can see that the model is not predicting the higher Actual value houses very well. There might be additional features that we can include to help with higher valued houses

In [30]:
df[df["Actual"] > 500000]

Unnamed: 0,Actual,Predicted
8,745000,568552.116374
258,538000,356818.186234
288,555000,424169.173552
