# Analysing whether the garage is important to the SalePrice of a house

### Pre-process data

In [19]:
#Import required libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [20]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [21]:
# Path to the training CSV. Set the HOUSE_TRAIN_CSV environment variable to point
# at your own copy of train.csv; when it is unset, the original author's local
# path is used, so existing runs behave exactly as before.
data = os.environ.get("HOUSE_TRAIN_CSV", r'/Users/OliverPan/Desktop/house/train.csv')

In [22]:
# Load the training data from the CSV located at `data` into a DataFrame.
df = pd.read_csv(data)

In [23]:
# Show every column name in the loaded frame as a NumPy array.
df.columns.values

array(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu',
       'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars',
       'GarageArea', 'GarageQual', 'GarageCond', 'Pav

In [24]:
# Keep only the identifier, the sale price, and the garage-related columns.
# The explicit .copy() makes garage_df an independent frame, so the column
# assignments performed on it later can never write into (or warn about writing
# into) a slice of df.
garage_df = df[
    [
        "Id",
        "SalePrice",
        "GarageType",
        "GarageYrBlt",
        "GarageFinish",
        "GarageCars",
        "GarageArea",
        "GarageQual",
        "GarageCond",
    ]
].copy()

In [25]:
# Preview the first rows of the garage subset.
garage_df.head()

Unnamed: 0,Id,SalePrice,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond
0,1,208500,Attchd,2003.0,RFn,2,548,TA,TA
1,2,181500,Attchd,1976.0,RFn,2,460,TA,TA
2,3,223500,Attchd,2001.0,RFn,2,608,TA,TA
3,4,140000,Detchd,1998.0,Unf,3,642,TA,TA
4,5,250000,Attchd,2000.0,RFn,3,836,TA,TA


### Analyse Categorical Garage Variables

In [26]:
# Count how many houses have each garage type. value_counts() skips missing
# values by default, so houses with no recorded GarageType are not counted here.
df["GarageType"].value_counts()

Attchd     870
Detchd     387
BuiltIn     88
Basment     19
CarPort      9
2Types       6
Name: GarageType, dtype: int64

In [27]:
# Count how many houses fall under each GarageFinish level.
df["GarageFinish"].value_counts()

Unf    605
RFn    422
Fin    352
Name: GarageFinish, dtype: int64

In [28]:
# Count houses by the number of cars the garage can hold.
df["GarageCars"].value_counts()

2    824
1    369
3    181
0     81
4      5
Name: GarageCars, dtype: int64

In [29]:
# GarageQual is dominated by a single level ("TA"), so for now the column is
# dropped from garage_df in a later cell rather than fed to the model.
df["GarageQual"].value_counts()

TA    1311
Fa      48
Gd      14
Ex       3
Po       3
Name: GarageQual, dtype: int64

In [30]:
# GarageCond is dominated by a single level ("TA"), so for now the column is
# dropped from garage_df in a later cell rather than fed to the model.
df["GarageCond"].value_counts()

TA    1326
Fa      35
Gd       9
Po       7
Ex       2
Name: GarageCond, dtype: int64

In [31]:
# Remove the two near-constant quality/condition columns from the garage subset.
garage_df = garage_df.drop(columns=["GarageQual", "GarageCond"])

In [32]:
# Summary statistics for the numeric columns of the garage subset. A `count`
# below the number of rows indicates missing values in that column.
garage_df.describe()

Unnamed: 0,Id,SalePrice,GarageYrBlt,GarageCars,GarageArea
count,1460.0,1460.0,1379.0,1460.0,1460.0
mean,730.5,180921.19589,1978.506164,1.767123,472.980137
std,421.610009,79442.502883,24.689725,0.747315,213.804841
min,1.0,34900.0,1900.0,0.0,0.0
25%,365.75,129975.0,1961.0,1.0,334.5
50%,730.5,163000.0,1980.0,2.0,480.0
75%,1095.25,214000.0,2002.0,2.0,576.0
max,1460.0,755000.0,2010.0,4.0,1418.0


In [33]:
# Final modelling frame: the row identifier is not a feature, so remove it.
garage_df = garage_df.drop(columns="Id")
garage_df.head()

Unnamed: 0,SalePrice,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea
0,208500,Attchd,2003.0,RFn,2,548
1,181500,Attchd,1976.0,RFn,2,460
2,223500,Attchd,2001.0,RFn,2,608
3,140000,Detchd,1998.0,Unf,3,642
4,250000,Attchd,2000.0,RFn,3,836


In [None]:
### Encode Categorical Variables

In [41]:
# Replace each categorical garage column with its integer category code.
# Missing values receive the code -1, so they are no longer NaN afterwards.
# NOTE(review): integer codes impose an arbitrary order on nominal levels;
# consider one-hot encoding if the linear model should not assume that order.
for col in ("GarageType", "GarageFinish"):
    garage_df[col] = garage_df[col].astype("category").cat.codes

In [47]:
# Discard every row that still contains at least one missing value.
garage_df = garage_df.dropna(axis=0, how="any")

### Linear Regression 

##### To start, I am going to both train and test on train.csv

In [48]:
# Features: every remaining column except the target.
X = garage_df.drop(columns="SalePrice")
# Target: kept two-dimensional (a one-column DataFrame) by selecting with a list.
y = garage_df.loc[:, ["SalePrice"]]

In [49]:
# Hold out 25% of the rows for evaluation. random_state pins the shuffle so the
# split, and every coefficient and metric computed from it, is identical on
# each run instead of changing with every execution of this cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [50]:
# Fit a linear regression of SalePrice on the garage features, using the
# training split only. The trailing call returns the fitted estimator, which
# the notebook displays.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [51]:
# Print the fitted intercept first, then the coefficients (one per column of X,
# in column order), each on its own line.
print(regressor.intercept_, regressor.coef_, sep="\n")

[85077.3580097]
[[-5.20013582e+03  6.85614508e+00 -2.30515486e+04  4.06871313e+04
   9.91301288e+01]]


In [52]:
# Predict sale prices for the held-out test features.
y_pred = regressor.predict(X_test)

In [74]:
# Side-by-side comparison of actual and predicted sale prices on the test split.
# np.ravel flattens both inputs to 1-D whatever their type: y_test is a
# one-column DataFrame (which has no .flatten() method, so the previous call
# raised AttributeError on a fresh kernel), while y_pred is a NumPy array.
# NOTE(review): this rebinds `df`, which held the raw data loaded earlier;
# re-run the loading cell before re-running any earlier cell that reads `df`.
df = pd.DataFrame({'Actual': np.ravel(y_test), 'Predicted': np.ravel(y_pred)})
df.head(15)

Unnamed: 0,Actual,Predicted
0,152000,176347.15368
1,130000,140469.496154
2,139000,116462.49015
3,187000,202024.078596
4,113000,93358.241374
5,168500,204181.274148
6,139600,173358.969576
7,133000,116373.360264
8,139000,117029.846343
9,266000,297024.520244


In [73]:
# Error metrics on the held-out split. The error is very high, so the model
# needs tuning. The squared error is computed once and reused for the RMSE.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Mean Absolute Error: 38491.753657353074
Mean Squared Error: 3417357438.959789
Root Mean Squared Error: 58458.16828262573
