In [1]:
# Dependencies and setup
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm_api
import statsmodels.formula.api as sm
from scipy import stats
from scipy.special import boxcox1p
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# NOTE(review): `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22
# (successor: sklearn.impute.SimpleImputer); this line only imports on older releases.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, Imputer
# from yellowbrick.regressor import ResidualsPlot
%matplotlib inline

In [2]:
# Notebook display limits: render at most 30 rows and 150 columns per frame
for display_option, display_limit in (("display.max_rows", 30), ("display.max_columns", 150)):
    pd.set_option(display_option, display_limit)

In [3]:
# Read the cleaned training set; the first CSV column is used as the row index
training_data = pd.read_csv(
    filepath_or_buffer="02-cleaned_data/cleaned_training.csv",
    index_col=0,
)

In [4]:
# Remove the extreme outliers in place: rows with GrLivArea above 4000
# whose SalePrice is nevertheless below 300000
is_outlier = (training_data["GrLivArea"] > 4000) & (training_data["SalePrice"] < 300000)
training_data.drop(index=training_data.index[is_outlier.values], inplace=True)

In [5]:
# Independent copy of the continuous columns (plus the target), so the
# transformations applied to it do not touch training_data
numerical_columns = [
    "GrLivArea",
    "LotFrontage",
    "MasVnrArea",
    "TotalBsmtSF",
    "1stFlrSF",
    "2ndFlrSF",
    "SalePrice",
]
numerical_variables = training_data[numerical_columns].copy()

In [6]:
# Natural log of living area, first-floor area and sale price
log_columns = ["GrLivArea", "1stFlrSF", "SalePrice"]
numerical_variables[log_columns] = np.log(numerical_variables[log_columns])

# TotalBsmtSF contains zeros: take the log of the non-zero entries only and
# store 0 wherever the log is undefined
basement_log = np.log(numerical_variables["TotalBsmtSF"].replace(0, np.nan))
numerical_variables["TotalBsmtSF"] = basement_log.fillna(0)

# Square root of the second-floor area
second_floor_area = numerical_variables["2ndFlrSF"]
numerical_variables["2ndFlrSF"] = np.sqrt(second_floor_area)

# Cube root of lot frontage and masonry veneer area
cbrt_columns = ["LotFrontage", "MasVnrArea"]
numerical_variables[cbrt_columns] = np.cbrt(numerical_variables[cbrt_columns])

In [7]:
# year_bins = [0,1920,1940,1960,1980,2000,2020]
# year_groups = ["pre_1920","1920_1940","1940_1960","1960_1980","1980_2000","2000_up"]
# training_data["YearBuiltBin"] = pd.cut(training_data["YearBuilt"],year_bins,labels=year_groups)

In [8]:
# Keep only the columns considered for modelling; SalePrice (the target) stays
# last because later cells slice the frame by position
model_columns = [
    "MSSubClass", "LotFrontage", "LotArea", "MasVnrArea", "Neighborhood",
    "OverallQual", "OverallCond", "YearBuilt", "ExterQual", "BsmtQual",
    "BsmtCond", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "GrLivArea",
    "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr",
    "KitchenAbvGr", "KitchenQual", "TotRmsAbvGrd", "Fireplaces", "FireplaceQu",
    "GarageType", "GarageCars", "GarageArea", "GarageQual", "GarageCond",
    "PoolQC", "YrSold", "SaleCondition",
    "SalePrice",
]
training_data = training_data[model_columns]

In [9]:
# Split the frame by position: the last column is the target, the rest are predictors
target_position = training_data.shape[1] - 1
# Predictor matrix: every column before the target, as a NumPy array
X = training_data.iloc[:, :target_position].values
# Target vector: natural log of the last column (home sale price)
y = np.log(training_data.iloc[:, target_position].values)

In [10]:
# Predictor columns only (all but the last column), kept as a DataFrame for encoding
data = training_data.iloc[:, 0:len(training_data.columns) - 1]

In [11]:
# One-hot encode the columns treated as categorical; every column not listed
# here is passed through unchanged
categorical_columns = [
    "MSSubClass",
    "Neighborhood",
    "OverallQual",
    "ExterQual",
    "BsmtQual",
    "TotRmsAbvGrd",
    "GarageType",
    "GarageCars",
    "GarageQual",
]
data_encoded = pd.get_dummies(data, columns=categorical_columns)
data_encoded.head()

Unnamed: 0_level_0,LotFrontage,LotArea,MasVnrArea,OverallCond,YearBuilt,BsmtCond,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,Fireplaces,FireplaceQu,GarageArea,GarageCond,PoolQC,YrSold,SaleCondition,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_160,MSSubClass_180,MSSubClass_190,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,OverallQual_1,OverallQual_2,OverallQual_3,OverallQual_4,OverallQual_5,OverallQual_6,OverallQual_7,OverallQual_8,OverallQual_9,OverallQual_10,ExterQual_Ex,ExterQual_Fa,ExterQual_Gd,ExterQual_TA,BsmtQual_Ex,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_None,BsmtQual_TA,TotRmsAbvGrd_2,TotRmsAbvGrd_3,TotRmsAbvGrd_4,TotRmsAbvGrd_5,TotRmsAbvGrd_6,TotRmsAbvGrd_7,TotRmsAbvGrd_8,TotRmsAbvGrd_9,TotRmsAbvGrd_10,TotRmsAbvGrd_11,TotRmsAbvGrd_12,TotRmsAbvGrd_14,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None,GarageCars_0,GarageCars_1,GarageCars_2,GarageCars_3,GarageCars_4,GarageQual_Ex,GarageQual_Fa,GarageQual_Gd,GarageQual_None,GarageQual_Po,GarageQual_TA
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1
1,65.0,8450,196.0,5,2003,TA,856,856,854,1710,1,0,2,1,3,1,Gd,0,,548,TA,,2008,Normal,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
2,80.0,9600,0.0,8,1976,TA,1262,1262,0,1262,0,1,2,0,3,1,TA,1,TA,460,TA,,2007,Normal,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
3,68.0,11250,162.0,5,2001,TA,920,920,866,1786,1,0,2,1,3,1,Gd,1,TA,608,TA,,2008,Normal,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
4,60.0,9550,0.0,5,1915,Gd,756,961,756,1717,1,0,1,0,3,1,Gd,1,Gd,642,TA,,2006,Abnorml,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1
5,84.0,14260,350.0,5,2000,TA,1145,1145,1053,2198,1,0,2,1,4,1,Gd,1,TA,836,TA,,2008,Normal,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1


In [12]:
# Natural log of living area and first-floor area
log_columns = ["GrLivArea", "1stFlrSF"]
data_encoded[log_columns] = np.log(data_encoded[log_columns])

# TotalBsmtSF contains zeros: take the log of the non-zero entries only and
# store 0 wherever the log is undefined
basement_log = np.log(data_encoded["TotalBsmtSF"].replace(0, np.nan))
data_encoded["TotalBsmtSF"] = basement_log.fillna(0)

In [13]:
# Replace the second-floor area with its square root
second_floor_area = data_encoded["2ndFlrSF"]
data_encoded["2ndFlrSF"] = np.sqrt(second_floor_area)

In [14]:
# Replace lot frontage and masonry veneer area with their cube roots
cbrt_columns = ["LotFrontage", "MasVnrArea"]
data_encoded[cbrt_columns] = np.cbrt(data_encoded[cbrt_columns])

In [15]:
# Predictors for the multiple linear regression. The column order is significant:
# the OLS summary below labels coefficients x1, x2, ... in this order.
predictor_columns = [
    "GrLivArea",
    "Neighborhood_Blmngtn", "Neighborhood_BrDale", "Neighborhood_BrkSide",
    "Neighborhood_ClearCr", "Neighborhood_CollgCr", "Neighborhood_Crawfor",
    "Neighborhood_Edwards", "Neighborhood_Gilbert", "Neighborhood_IDOTRR",
    "Neighborhood_MeadowV", "Neighborhood_Mitchel", "Neighborhood_NAmes",
    "Neighborhood_NPkVill", "Neighborhood_NWAmes", "Neighborhood_NoRidge",
    "Neighborhood_NridgHt", "Neighborhood_SWISU", "Neighborhood_Sawyer",
    "Neighborhood_SawyerW", "Neighborhood_Somerst", "Neighborhood_StoneBr",
    "Neighborhood_Timber", "Neighborhood_Veenker",
    "OverallQual_2", "OverallQual_3", "OverallQual_4", "OverallQual_5",
    "OverallQual_6", "OverallQual_7", "OverallQual_8", "OverallQual_9",
    "OverallQual_10",
    "GarageCars_1", "GarageCars_2", "GarageCars_3", "GarageCars_4",
    "ExterQual_Ex", "ExterQual_TA", "ExterQual_Gd",
    "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr",
    "YearBuilt",
]
X_1 = data_encoded[predictor_columns]
# Dummy levels left out to avoid the dummy variable trap (they form the baseline):
# "Neighborhood_Blueste", "OverallQual_1", "GarageCars_0", "ExterQual_Fa".
# The bath/bedroom counts and YearBuilt are plain numeric columns, not dummies.
# NOTE(review): "Neighborhood_OldTown" is also absent from the list, so the
# neighborhood baseline is Blueste + OldTown combined - confirm this is intended.

# Redefine X as a float matrix. The explicit cast keeps the array numeric even when
# get_dummies produces bool columns (pandas >= 2.0), where .values alone would
# yield an object array.
X = X_1.astype(float).values

# Split data into training and testing set (random_state keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)

# Fit a multiple linear regression on the training split and predict the test split
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Backward-elimination groundwork: prepend an intercept column of ones and fit an
# OLS model on ALL rows (not only the training split) to inspect coefficient p-values
X = np.append(arr=np.ones((len(X), 1)).astype(int), values=X, axis=1)
# X_opt = X[:, [0,1]]
# OLS comes from statsmodels.api: the `sm` alias points at statsmodels.formula.api,
# which no longer exposes the upper-case model classes in current statsmodels.
regressor_ols = sm_api.OLS(endog=y, exog=X).fit()
regressor_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.876
Model:,OLS,Adj. R-squared:,0.871
Method:,Least Squares,F-statistic:,215.7
Date:,"Mon, 13 May 2019",Prob (F-statistic):,0.0
Time:,19:48:18,Log-Likelihood:,787.64
No. Observations:,1458,AIC:,-1481.0
Df Residuals:,1411,BIC:,-1233.0
Df Model:,46,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.5537,0.627,5.672,0.000,2.325,4.783
x1,0.4745,0.024,19.777,0.000,0.427,0.522
x2,0.0386,0.044,0.873,0.383,-0.048,0.125
x3,-0.1738,0.042,-4.161,0.000,-0.256,-0.092
x4,0.0980,0.024,4.169,0.000,0.052,0.144
x5,0.2305,0.033,6.998,0.000,0.166,0.295
x6,0.1115,0.027,4.147,0.000,0.059,0.164
x7,0.2515,0.025,10.042,0.000,0.202,0.301
x8,0.0164,0.022,0.736,0.462,-0.027,0.060

0,1,2,3
Omnibus:,239.934,Durbin-Watson:,1.982
Prob(Omnibus):,0.0,Jarque-Bera (JB):,773.052
Skew:,-0.811,Prob(JB):,1.36e-168
Kurtosis:,6.177,Cond. No.,331000.0
