In [1]:
from IPython.display import HTML, display

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn.model_selection import train_test_split, GridSearchCV
from scipy import stats
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn import metrics
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")


import seaborn as sns
sns.set_style("darkgrid")

import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from sklearn.metrics import r2_score, mean_squared_error

In [2]:
df = pd.read_csv('/Users/muratvarlik/Documents/DATASCIENCE/HousingData (3).csv',encoding='latin-1')
df.shape
#(1460, 81)

(1460, 81)

In [3]:
print(df.head())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [4]:
#drop id out of dataframe
SalePrice = np.array(df['SalePrice'])
df= df.drop(['Id','SalePrice'], axis = 1)

In [5]:
#manage missing categorical missing data
features_nan=[feature for feature in df.columns if df[feature].isnull().sum()>1 and df[feature].dtypes=='O']
#percentage of missing catagorical data in columns
for feature in features_nan:
    print("{}: {}% missing values".format(feature,np.round(df[feature].isnull().mean(),4)))

Alley: 0.9377% missing values
MasVnrType: 0.0055% missing values
BsmtQual: 0.0253% missing values
BsmtCond: 0.0253% missing values
BsmtExposure: 0.026% missing values
BsmtFinType1: 0.0253% missing values
BsmtFinType2: 0.026% missing values
FireplaceQu: 0.4726% missing values
GarageType: 0.0555% missing values
GarageFinish: 0.0555% missing values
GarageQual: 0.0555% missing values
GarageCond: 0.0555% missing values
PoolQC: 0.9952% missing values
Fence: 0.8075% missing values
MiscFeature: 0.963% missing values


In [6]:
#replace missing categorical features with "Missing"
def replace_cat(df,features_nan):
    data=df.copy()
    data[features_nan]=data[features_nan].fillna('Missing')
    return data

df=replace_cat(df,features_nan)

In [7]:
#check missing features in categorical values have been replaced by 'missing'
for feature in features_nan:
    print("{}: {}% missing values".format(feature,np.round(df[feature].isnull().mean(),4)))

Alley: 0.0% missing values
MasVnrType: 0.0% missing values
BsmtQual: 0.0% missing values
BsmtCond: 0.0% missing values
BsmtExposure: 0.0% missing values
BsmtFinType1: 0.0% missing values
BsmtFinType2: 0.0% missing values
FireplaceQu: 0.0% missing values
GarageType: 0.0% missing values
GarageFinish: 0.0% missing values
GarageQual: 0.0% missing values
GarageCond: 0.0% missing values
PoolQC: 0.0% missing values
Fence: 0.0% missing values
MiscFeature: 0.0% missing values


In [8]:
#check numerical variables the contains missing values
numerical_with_nan=[feature for feature in df.columns if df[feature].isnull().sum()>1 and df[feature].dtypes!='O']

for feature in numerical_with_nan:
    print("{}: {}% missing values".format(feature,np.round(df[feature].isnull().mean(),4)))

LotFrontage: 0.1774% missing values
MasVnrArea: 0.0055% missing values
GarageYrBlt: 0.0555% missing values


In [9]:
#replacing numberical missing data with medians
for feature in numerical_with_nan:
    median_value=df[feature].median() 
    df[feature].fillna(median_value,inplace=True)
    
for feature in numerical_with_nan:
    print("{}: {}% missing values".format(feature,np.round(df[feature].isnull().mean(),4)))

LotFrontage: 0.0% missing values
MasVnrArea: 0.0% missing values
GarageYrBlt: 0.0% missing values


In [10]:
df[['YearBuilt','YearRemodAdd','GarageYrBlt']].head()

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt
0,2003,2003,2003.0
1,1976,1976,1976.0
2,2001,2002,2001.0
3,1915,1970,1998.0
4,2000,2000,2000.0


In [11]:
#Yearly values
df[['YearBuilt','YearRemodAdd','GarageYrBlt']].head()

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt
0,2003,2003,2003.0
1,1976,1976,1976.0
2,2001,2002,2001.0
3,1915,1970,1998.0
4,2000,2000,2000.0


In [12]:
#change the yearly values to numerical features by doing "YrSold - feature"
for feature in ['YearBuilt','YearRemodAdd','GarageYrBlt']:     
    df[feature]=df['YrSold']-df[feature]

In [13]:
df[['YearBuilt','YearRemodAdd','GarageYrBlt']].head()

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt
0,5,5,5.0
1,31,31,31.0
2,7,6,7.0
3,91,36,8.0
4,8,8,8.0


In [14]:
df.shape

(1460, 79)

In [15]:
#skewed variable into a more normalized dataset
num_features=['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea']

for feature in num_features:
    df[feature]=np.log(df[feature])

In [16]:
df = pd.get_dummies(df)
df.head(20)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,4.174387,9.041922,7,5,5,5,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,4.382027,9.169518,6,8,31,31,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,4.219508,9.328123,7,5,7,6,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,4.094345,9.164296,7,5,91,36,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,4.430817,9.565214,8,5,8,8,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0
5,50,4.442651,9.554993,5,5,16,14,0.0,732,0,...,0,0,0,1,0,0,0,0,1,0
6,20,4.317488,9.218705,8,5,3,2,186.0,1369,0,...,0,0,0,1,0,0,0,0,1,0
7,60,4.234107,9.247829,7,6,36,36,240.0,859,32,...,0,0,0,1,0,0,0,0,1,0
8,50,3.931826,8.719317,7,5,77,58,0.0,0,0,...,0,0,0,1,1,0,0,0,0,0
9,190,3.912023,8.911934,5,6,69,58,0.0,851,0,...,0,0,0,1,0,0,0,0,1,0


In [17]:
print(df.head())

   MSSubClass  LotFrontage   LotArea  OverallQual  OverallCond  YearBuilt  \
0          60     4.174387  9.041922            7            5          5   
1          20     4.382027  9.169518            6            8         31   
2          60     4.219508  9.328123            7            5          7   
3          70     4.094345  9.164296            7            5         91   
4          60     4.430817  9.565214            8            5          8   

   YearRemodAdd  MasVnrArea  BsmtFinSF1  BsmtFinSF2  ...  SaleType_ConLw  \
0             5       196.0         706           0  ...               0   
1            31         0.0         978           0  ...               0   
2             6       162.0         486           0  ...               0   
3            36         0.0         216           0  ...               0   
4             8       350.0         655           0  ...               0   

   SaleType_New  SaleType_Oth  SaleType_WD  SaleCondition_Abnorml  \
0          

In [18]:
df.shape

(1460, 303)

In [19]:
#train set
x_train,x_test,y_train,y_test=train_test_split(df,SalePrice,test_size=0.3,random_state=10)

In [20]:
#OLS
ols_reg = LinearRegression()
ols_reg.fit(x_train, y_train)
ols_pred = ols_reg.predict(x_test)

print("Mean absolute error:" , metrics.mean_absolute_error(y_test, ols_pred))
print("Mean squared error:" , metrics.mean_squared_error(y_test, ols_pred))
print("Root mean squared error:" ,np.sqrt(metrics.mean_squared_error(y_test, ols_pred)))

Mean absolute error: 19360.68410674302
Mean squared error: 855771321.1240983
Root mean squared error: 29253.569374079776


In [21]:
#Recursive feature elemination with cross validation
#regressor selection
#decided to use cross validation, as the processing time is not relevant 
ols= LinearRegression()

In [22]:
rfe = RFE(ols, n_features_to_select=120)
train_x_rfe = rfe.fit_transform(x_train,y_train)  
print(rfe.support_)

[False False False False False False False False  True  True  True  True
 False False False  True False False False False False False False False
 False False False False False False False False False False False False
  True False  True False False  True  True False False False  True False
 False  True False  True False  True  True  True False  True  True False
 False  True False False  True False False False False False  True  True
 False  True  True  True  True  True False  True  True  True  True  True
 False  True  True  True  True  True False  True  True False  True  True
 False False False False  True  True  True  True False  True  True False
 False False False False  True False  True False False False False False
 False  True False  True  True  True  True False False False  True  True
  True False  True  True  True False  True False  True False False False
  True False False False  True  True  True  True False  True False  True
 False  True False False  True False False False Fa

In [23]:
x_train_RFE = x_train[x_train.columns[rfe.support_]]
x_test_RFE = x_test[x_test.columns[rfe.support_]]

In [24]:
#test the ols with reduced features

ols_rfe = sm.OLS(y_train, x_train_RFE)
type(ols_rfe)

results = ols_rfe.fit()
type(results)

ols_pred_rfe = results.predict(x_test_RFE)

print("Mean absolute error:" , metrics.mean_absolute_error(y_test, ols_pred_rfe))
print("Mean squared error:" , metrics.mean_squared_error(y_test, ols_pred_rfe))
print("Root mean squared error:" ,np.sqrt(metrics.mean_squared_error(y_test, ols_pred_rfe)))

Mean absolute error: 21660.699340113264
Mean squared error: 852467172.2068151
Root mean squared error: 29197.040469999953


In [25]:
#create a list of alpha values for ridge
parameters = {'alpha':list(x / 10 for x in range(0, 101))}

In [26]:
#ridge
#create ridge
ridge = Ridge()

In [27]:
ridge_regressor = GridSearchCV(ridge,parameters,scoring='neg_mean_squared_error',cv=4)
ridge_regressor.fit(x_train, y_train)

print(ridge_regressor.best_params_)

ridge_pred = ridge_regressor.predict(x_test.values)

print("Mean absolute error:" , metrics.mean_absolute_error(y_test, ridge_pred))
print("Mean squared error:" , metrics.mean_squared_error(y_test, ridge_pred))
print("Root mean squared error:" ,np.sqrt(metrics.mean_squared_error(y_test, ridge_pred)))

{'alpha': 10.0}
Mean absolute error: 17768.667099060105
Mean squared error: 672516444.6837317
Root mean squared error: 25932.922023631116


In [28]:
#lasso
lasso = Lasso()

In [29]:
# parameters = {'alpha':list(x / 10 for x in range(0, 101))}

lasso_reg = GridSearchCV(lasso,parameters,scoring='neg_mean_squared_error',cv=4)
lasso_reg.fit(x_train, y_train)

print(lasso_reg.best_params_)

lasso_pred = lasso_reg.predict(x_test.values)

print("Mean absolute error:" , metrics.mean_absolute_error(y_test, lasso_pred))
print("Mean squared error:" , metrics.mean_squared_error(y_test, lasso_pred))
print("Root mean squared error:" ,np.sqrt(metrics.mean_squared_error(y_test, lasso_pred)))

{'alpha': 10.0}
Mean absolute error: 17478.92580902901
Mean squared error: 627275117.259218
Root mean squared error: 25045.46101111373


In [30]:
elastic = ElasticNet()

parameters = {'alpha':[1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 3, 5, 10, 15, 20]}

elastic_regressor = GridSearchCV(elastic,parameters,scoring='neg_mean_squared_error',cv=4)
elastic_regressor.fit(x_train, y_train)

print(elastic_regressor.best_params_)

elastic_pred = elastic_regressor.predict(x_test.values)

print("Mean absolute error:" , metrics.mean_absolute_error(y_test, elastic_pred))
print("Mean squared error:" , metrics.mean_squared_error(y_test, elastic_pred))
print("Root mean squared error:" ,np.sqrt(metrics.mean_squared_error(y_test, elastic_pred)))

{'alpha': 0.01}
Mean absolute error: 17992.16907129154
Mean squared error: 678078900.7258426
Root mean squared error: 26039.948170567517


In [31]:
print("Ols root mean squared error:" ,np.sqrt(metrics.mean_squared_error(y_test, ols_pred)))
print("Ols with recursive feature elemination root mean squared error:" ,np.sqrt(metrics.mean_squared_error(y_test, ols_pred_rfe)))
print("Ridge root mean squared error:" ,np.sqrt(metrics.mean_squared_error(y_test, ridge_pred)))
print("Lasso root mean squared error:" ,np.sqrt(metrics.mean_squared_error(y_test, lasso_pred)))
print("Elastic net root mean squared error:" ,np.sqrt(metrics.mean_squared_error(y_test, elastic_pred)))

Ols root mean squared error: 29253.569374079776
Ols with recursive feature elemination root mean squared error: 29197.040469999953
Ridge root mean squared error: 25932.922023631116
Lasso root mean squared error: 25045.46101111373
Elastic net root mean squared error: 26039.948170567517


In [32]:
# As we can see from the comparison of the performances; Ridge, Lasso and Elastic Net Regression have performed 
# signifacantly better than the OLS. While Ridge, Lasso and Elastic Net Regression are worth exploring, their 
# run time and memory usage is signifacantly higher. OLS with recursive feature elemination(with 
# cross validation)has performed slightly better and has not made a signifacant improvemant.