# Analysing the basement data to predict house prices

### Pre-process data

In [43]:
#Import required libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [44]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [45]:
# Location of the training CSV. The HOUSE_TRAIN_CSV environment variable
# overrides it, so the notebook can run on another machine without editing this
# cell; when the variable is unset the original local path is used.
data = os.environ.get("HOUSE_TRAIN_CSV", r'/Users/OliverPan/Desktop/house/train.csv')

In [46]:
# Read the training CSV at the path held in `data` into a DataFrame.
df = pd.read_csv(filepath_or_buffer=data)

In [47]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [48]:
# List every column name as a NumPy array.
df.columns.to_numpy()

array(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu',
       'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars',
       'GarageArea', 'GarageQual', 'GarageCond', 'Pav

In [49]:
# Keep the identifier, the target (SalePrice) and the basement-related columns.
# An explicit copy makes basement_df independent of df, so the column
# assignments made on it later cannot trigger SettingWithCopyWarning or be
# mistaken for writes into a view of the raw frame.
basement_columns = [
    "Id",
    "SalePrice",
    "BsmtQual",
    "BsmtCond",
    "BsmtExposure",
    "BsmtFinType1",
    "BsmtFinSF1",
    "BsmtFinType2",
    "BsmtFinSF2",
    "BsmtUnfSF",
    "TotalBsmtSF",
]
basement_df = df[basement_columns].copy()

In [50]:
# Preview the first five rows of the basement subset.
basement_df.head(n=5)

Unnamed: 0,Id,SalePrice,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF
0,1,208500,Gd,TA,No,GLQ,706,Unf,0,150,856
1,2,181500,Gd,TA,Gd,ALQ,978,Unf,0,284,1262
2,3,223500,Gd,TA,Mn,GLQ,486,Unf,0,434,920
3,4,140000,TA,Gd,No,ALQ,216,Unf,0,540,756
4,5,250000,Gd,TA,Av,GLQ,655,Unf,0,490,1145


In [51]:
# Pairwise absolute correlations between the numeric basement columns.
# The frame is restricted to numeric dtypes explicitly: the quality/condition
# columns still hold strings at this point, and pandas >= 2.0 raises on
# DataFrame.corr() when non-numeric columns are present instead of silently
# skipping them.
high_corr = basement_df.select_dtypes(include="number").corr().abs().unstack()
sort = high_corr.sort_values(kind="quicksort").to_frame()

# Keep pairs with |r| >= 0.4, excluding every column paired with itself.
# Self-pairs are identified by their labels rather than by `!= 1`, which
# depends on exact floating-point equality.
is_self_pair = sort.index.get_level_values(0) == sort.index.get_level_values(1)
sort = sort[(sort[0] >= 0.4) & ~is_self_pair]

In [52]:
# Display the filtered correlations between the basement variables.
# Each pair is listed twice (once per ordering) because the unstacked
# correlation matrix is symmetric.
sort

Unnamed: 0,Unnamed: 1,0
TotalBsmtSF,BsmtUnfSF,0.41536
BsmtUnfSF,TotalBsmtSF,0.41536
BsmtFinSF1,BsmtUnfSF,0.495251
BsmtUnfSF,BsmtFinSF1,0.495251
BsmtFinSF1,TotalBsmtSF,0.522396
TotalBsmtSF,BsmtFinSF1,0.522396
SalePrice,TotalBsmtSF,0.613581
TotalBsmtSF,SalePrice,0.613581


In [53]:
# Count the distinct values in each column, to separate the continuous columns
# from the low-cardinality ones.
basement_df.nunique()

Id              1460
SalePrice        663
BsmtQual           4
BsmtCond           4
BsmtExposure       4
BsmtFinType1       6
BsmtFinSF1       637
BsmtFinType2       6
BsmtFinSF2       144
BsmtUnfSF        780
TotalBsmtSF      721
dtype: int64

##### TotalBsmtSF appears to be the sum of BsmtFinSF1, BsmtFinSF2 and BsmtUnfSF (e.g. 706 + 0 + 150 = 856 in the first row), so we keep only TotalBsmtSF as a predictor of SalePrice

In [54]:
# Remove the three component square-footage columns; TotalBsmtSF is retained.
component_sf_columns = ["BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF"]
basement_df = basement_df.drop(columns=component_sf_columns)

In [55]:
# Preview the first five rows after dropping the component columns.
basement_df.head(n=5)

Unnamed: 0,Id,SalePrice,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,TotalBsmtSF
0,1,208500,Gd,TA,No,GLQ,Unf,856
1,2,181500,Gd,TA,Gd,ALQ,Unf,1262
2,3,223500,Gd,TA,Mn,GLQ,Unf,920
3,4,140000,TA,Gd,No,ALQ,Unf,756
4,5,250000,Gd,TA,Av,GLQ,Unf,1145


##### The remaining columns each have only 4-6 unique values, so we can encode them as categorical codes

In [56]:
# Encode the string-valued basement columns as integer category codes.
# NOTE: the codes follow the sorted order of the labels, not a quality ranking,
# and missing values are encoded as -1.
categorical_columns = ["BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2"]
for column in categorical_columns:
    # Skip columns that are already numeric so that re-running this cell does
    # not re-encode the codes themselves (which would shift -1 to 0, 0 to 1, ...).
    if not pd.api.types.is_numeric_dtype(basement_df[column]):
        basement_df[column] = basement_df[column].astype("category").cat.codes

# Drop Id so it is not used as a feature; errors="ignore" keeps the cell
# re-runnable once Id has already been removed.
basement_df = basement_df.drop(columns=["Id"], errors="ignore")

In [57]:
# Correlation of every remaining column with the target.
basement_df.corr().loc[:, "SalePrice"]

SalePrice       1.000000
BsmtQual       -0.438881
BsmtCond        0.147367
BsmtExposure   -0.193079
BsmtFinType1   -0.013233
BsmtFinType2    0.130814
TotalBsmtSF     0.613581
Name: SalePrice, dtype: float64

In [58]:
### Linear Regression

In [59]:
# Features are every column except the target; the target is kept as a
# one-column DataFrame.
target_column = "SalePrice"
X = basement_df.drop(columns=[target_column])
y = basement_df[[target_column]]

In [60]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [61]:
# Fit a linear regression on the training split. The fit call is the last
# expression of the cell, so the fitted estimator is echoed as its output.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [62]:
# Show the fitted coefficients; there is one per feature column of X, in X's
# column order.
print(regressor.coef_)

[[-33464.34039312   6922.93794066  -2261.58724876   -146.09421598
    6031.94494458    103.16721141]]


In [63]:
# Predict sale prices for the held-out test rows.
y_pred = regressor.predict(X=X_test)

In [64]:
# Side-by-side comparison of actual and predicted sale prices on the test split.
# It is stored under its own name so that the raw training frame `df` loaded
# from the CSV is not overwritten and earlier cells stay re-runnable.
results_df = pd.DataFrame({
    'Actual': y_test["SalePrice"].to_list(),
    'Predicted': y_pred.ravel(),  # predictions are 2-D (one column); flatten to 1-D
})
results_df.head()

Unnamed: 0,Actual,Predicted
0,200624,257794.05418
1,133000,125407.889055
2,110000,121140.42606
3,192000,201425.367234
4,88000,101865.470728


In [65]:
# Mean absolute error on the held-out test split.
# The error is still very high, so the model needs tuning.
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print('Mean Absolute Error:', mae)

Mean Absolute Error: 39569.25556806494


### Refit the model without TotalBsmtSF

In [66]:
# Rebuild the feature matrix without the target and without TotalBsmtSF; the
# target is kept as a one-column DataFrame.
excluded_columns = ["SalePrice", "TotalBsmtSF"]
X = basement_df.drop(columns=excluded_columns)
y = basement_df[["SalePrice"]]

In [67]:
# Same 80/20 split and seed as the first model, applied to the reduced
# feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [68]:
# Refit a fresh linear regression on the reduced feature set. The fit call is
# the last expression of the cell, so the fitted estimator is echoed as its
# output.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [69]:
# Show the coefficients of the refitted model; there is one per feature column
# of X, in X's column order.
print(regressor.coef_)

[[-43283.88086041  22561.2454893   -4796.39485987  -1423.17495143
   12423.4137382 ]]


In [70]:
# Predict sale prices for the held-out test rows with the refitted model.
y_pred = regressor.predict(X=X_test)

In [71]:
# Side-by-side comparison of actual and predicted sale prices for the model
# fitted without TotalBsmtSF. It is stored under its own name so that the raw
# training frame `df` loaded from the CSV is not overwritten.
results_without_total_df = pd.DataFrame({
    'Actual': y_test["SalePrice"].to_list(),
    'Predicted': y_pred.ravel(),  # predictions are 2-D (one column); flatten to 1-D
})
results_without_total_df.head()

Unnamed: 0,Actual,Predicted
0,200624,152223.430712
1,133000,144069.541828
2,110000,153646.605663
3,192000,199776.836426
4,88000,150800.25576


In [72]:
# Mean absolute error on the held-out test split for the reduced model.
# The error is still very high, so further analysis may need to be conducted.
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
print('Mean Absolute Error:', mae)

Mean Absolute Error: 47101.434364094835
