# Analysing if garage is important to SalesPrice of house

## Random Forest

### Pre-process data

In [65]:
#Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [66]:
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

In [67]:
data = r'/Users/OliverPan/Desktop/house/garage.csv'

In [68]:
garage_df = pd.read_csv(data)

In [69]:
garage_df.head()

Unnamed: 0,SalePrice,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea
0,208500,Attchd,2003.0,RFn,2,548
1,181500,Attchd,1976.0,RFn,2,460
2,223500,Attchd,2001.0,RFn,2,608
3,140000,Detchd,1998.0,Unf,3,642
4,250000,Attchd,2000.0,RFn,3,836


### Additional pre-processing

In [70]:
garage_df["GarageType"] = garage_df["GarageType"].astype("category").cat.codes
garage_df["GarageFinish"] = garage_df["GarageFinish"].astype("category").cat.codes

garage_df = garage_df.dropna()

### Random Forest with all variables

In [71]:
X = garage_df.drop(["SalePrice"], axis = 1)
y = garage_df[["SalePrice"]]

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [73]:
regr = RandomForestRegressor(max_depth=2, n_estimators = 250)

In [74]:
regr.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=2, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=250, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [75]:
y_pred = regr.predict(X_test)

In [76]:
df = pd.DataFrame({'Actual': y_test["SalePrice"].to_list(), 'Predicted': list(y_pred.flatten())})
df.head()

Unnamed: 0,Actual,Predicted
0,165400,190165.560971
1,143000,164142.380624
2,132250,163314.66269
3,90000,162180.433788
4,153900,228844.427392


In [77]:
## Error is very high, so we need to tune the model
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  

Mean Absolute Error: 35182.59205851747


### Random Forest with three categorical variables

In [78]:
garage_df = garage_df.drop(["GarageYrBlt", "GarageArea"], axis = 1)

In [79]:
X = garage_df.drop(["SalePrice"], axis = 1)
y = garage_df[["SalePrice"]]

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [81]:
regr = RandomForestRegressor(max_depth=2, n_estimators = 250)

In [82]:
regr.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=2, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=250, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [83]:
y_pred = regr.predict(X_test)

In [84]:
df = pd.DataFrame({'Actual': y_test["SalePrice"].to_list(), 'Predicted': list(y_pred.flatten())})
df.head()

Unnamed: 0,Actual,Predicted
0,78000,134351.456016
1,118858,158199.141068
2,275000,295628.535673
3,169500,165749.300295
4,374000,323396.348322


In [85]:
## Error is very high, so we need to tune the model
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  

Mean Absolute Error: 40340.360060716645


In [None]:
### Having used random forest and linear regression, we need to use more than just three categorical variables for garage analysis