In [1]:
import numpy as np
import pandas as pd
from  matplotlib import pyplot as plt, patches as mpatches
import seaborn as sns
import sklearn as skl
from sklearn import model_selection as skl_ms, preprocessing as skl_pre, feature_selection as skl_fs, linear_model as skl_lm, metrics as skl_met
# Allow up to 200 rows when pandas objects are displayed.
pd.set_option('display.max_rows', 200)

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides library deprecation and
# convergence warnings raised by later cells - confirm this is intended.
warnings.filterwarnings('ignore')

### DATA CLEANING

In [2]:
#Load the training data set
# The relative path is resolved against the kernel's working directory.
train_csv_path = 'train.csv'
housing_data = pd.read_csv(train_csv_path)

In [3]:
#Fill null values of categorical variables with an explicit "feature absent" category
# One DataFrame-level fillna with a column -> value mapping is used instead of
# per-column `housing_data[col].fillna(..., inplace=True)`: an in-place fill on a
# column selection is chained assignment, which pandas deprecates and which no
# longer updates the parent frame once Copy-on-Write is active.
# Columns not listed in the mapping are left untouched.
categorical_fill_values = {
    'Alley': 'no_alley',
    'BsmtQual': 'no_basement',
    'BsmtCond': 'no_basement',
    'BsmtExposure': 'no_basement',
    'BsmtFinType1': 'no_basement',
    'BsmtFinType2': 'no_basement',
    'FireplaceQu': 'no_fireplace',
    'GarageType': 'no_garage',
    'GarageFinish': 'no_garage',
    'GarageQual': 'no_garage',
    'GarageCond': 'no_garage',
    'PoolQC': 'no_pool',
    # Fence gets its own label; it previously reused PoolQC's 'no_pool',
    # which mislabelled the resulting dummy column.
    'Fence': 'no_fence',
    'MiscFeature': 'no_misc',
    'Electrical': 'no_elec',
    'MasVnrType': 'no_maso',
}
housing_data = housing_data.fillna(value=categorical_fill_values)

In [4]:
#Percentage of null values per column, highest first
null_share = housing_data.isna().sum().div(len(housing_data))
null_share.mul(100).sort_values(ascending=False)

LotFrontage      17.739726
GarageYrBlt       5.547945
MasVnrArea        0.547945
Id                0.000000
KitchenAbvGr      0.000000
GarageType        0.000000
FireplaceQu       0.000000
Fireplaces        0.000000
Functional        0.000000
TotRmsAbvGrd      0.000000
KitchenQual       0.000000
BedroomAbvGr      0.000000
GarageCars        0.000000
HalfBath          0.000000
FullBath          0.000000
BsmtHalfBath      0.000000
BsmtFullBath      0.000000
GrLivArea         0.000000
LowQualFinSF      0.000000
2ndFlrSF          0.000000
GarageFinish      0.000000
GarageArea        0.000000
Electrical        0.000000
PoolQC            0.000000
SaleCondition     0.000000
SaleType          0.000000
YrSold            0.000000
MoSold            0.000000
MiscVal           0.000000
MiscFeature       0.000000
Fence             0.000000
PoolArea          0.000000
GarageQual        0.000000
ScreenPorch       0.000000
3SsnPorch         0.000000
EnclosedPorch     0.000000
OpenPorchSF       0.000000
W

In [5]:
#Fill null values of numeric columns with the column median
# The filled column is assigned back instead of calling
# `housing_data[col].fillna(..., inplace=True)`: the in-place form is chained
# assignment, which pandas deprecates and which leaves the frame unchanged
# once Copy-on-Write is active.
# NOTE(review): the median is taken over all rows, before the train/test split
# performed later in this notebook, so test rows influence the imputed value.
for numeric_col in ['LotFrontage', 'MasVnrArea']:
    housing_data[numeric_col] = housing_data[numeric_col].fillna(housing_data[numeric_col].median())

In [6]:
#Derive age features relative to the year of sale
# PropertyAge: years between construction and sale.
# PropertyRemodifiedAge: years between the recorded remodel year and sale.
housing_data = housing_data.assign(
    PropertyAge=housing_data['YrSold'].sub(housing_data['YearBuilt']),
    PropertyRemodifiedAge=housing_data['YrSold'].sub(housing_data['YearRemodAdd']),
)

In [7]:
#Remove columns that are not used as model inputs
unused_columns = ['MoSold', 'YrSold', 'YearRemodAdd', 'YearBuilt', 'Id', 'GarageYrBlt']
housing_data = housing_data.drop(columns=unused_columns)

### DATA PREPARATION

In [8]:
#Identify Categorical and Numerical Variables
# catvars: columns treated as categorical (one-hot encoded in the next step).
# The order of this list determines the order of the generated dummy columns.
catvars=['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 
'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 
'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 
'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'BsmtQual', 
'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
'CentralAir', 'Electrical', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 
'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'GarageType', 'BedroomAbvGr',
'GarageFinish', 'GarageCars', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 
'KitchenAbvGr', 'FireplaceQu', 'Foundation', 'Fence', 'MiscFeature', 'MiscVal','SaleType','SaleCondition']
# num_vars: every remaining column except the target. Built as an ordered list
# (frame column order) rather than via set arithmetic, so the order is the same
# on every run instead of depending on string hashing.
num_vars=[col for col in housing_data.columns if col != 'SalePrice' and col not in catvars]

In [9]:
#One-hot encode the categorical columns
# drop_first=True omits the first level of each variable.
housing_data = pd.get_dummies(data=housing_data, columns=catvars, drop_first=True)

In [10]:
#Separate the target (Y) from the predictors (X)
target_column = 'SalePrice'
Y = housing_data[target_column]
X = housing_data.drop(columns=[target_column])

In [11]:
#Train-Test Split
# X and Y are split in a single call so features and targets share one shuffle
# and are guaranteed to stay row-aligned. Splitting them in two separate calls
# only lines up because both calls happen to use the same seed and row count.
# 70% of the rows go to training; random_state fixes the shuffle.
X_train, X_test, Y_train, Y_test = skl_ms.train_test_split(
    X, Y, train_size=0.7, random_state=123)

In [12]:
#Min-max scale the numeric training predictors
# The scaler is fitted on the training rows only; the fitted `scaler` object is
# kept so the same transformation can be applied to the test rows later.
scaler = skl_pre.MinMaxScaler()
train_numeric_scaled = scaler.fit(X_train[num_vars]).transform(X_train[num_vars])
X_train[num_vars] = train_numeric_scaled

### MODEL BUILDING

In [13]:
# Candidate regularisation strengths (alpha) to tune
params = {
    'alpha': [
        0.0001, 0.001, 0.01, 0.05, 0.1,
        0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
        2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0,
        20, 50, 100, 500, 1000,
    ]
}

# Base estimators
ridge = skl_lm.Ridge()
lasso = skl_lm.Lasso()

# Number of cross-validation folds
folds = 5


def _grid_search_alpha(estimator):
    """Grid-search `params` for `estimator` on the training data.

    Candidates are scored with negative mean absolute error over `folds`
    cross-validation folds; the fitted GridSearchCV object is returned.
    """
    search = skl_ms.GridSearchCV(
        estimator=estimator,
        param_grid=params,
        scoring='neg_mean_absolute_error',
        cv=folds,
        return_train_score=True,
        verbose=1,
    )
    return search.fit(X_train, Y_train)


# Ridge is searched first, then Lasso
model_ridge = _grid_search_alpha(ridge)
model_lasso = _grid_search_alpha(lasso)

print('Lasso Alpha: ', model_lasso.best_params_)
print('Ridge Alpha: ',model_ridge.best_params_)

Fitting 5 folds for each of 28 candidates, totalling 140 fits
Fitting 5 folds for each of 28 candidates, totalling 140 fits
Lasso Alpha:  {'alpha': 100}
Ridge Alpha:  {'alpha': 10.0}


### PREDICTION AND EVALUATION

In [14]:
#Prediction and evaluation
# Apply the scaler fitted on the training data to the numeric test columns.
# NOTE(review): X_test is overwritten in place, so running this cell a second
# time would scale already-scaled values.
X_test[num_vars] = scaler.transform(X_test[num_vars])

#Refit each model with a fixed alpha
# NOTE(review): 100 and 10 are hard-coded; they match the best alphas printed by
# the grid search but will not follow it if the data or the grid changes.
model_lasso = skl_lm.Lasso(alpha=100).fit(X_train, Y_train)
model_ridge = skl_lm.Ridge(alpha=10).fit(X_train, Y_train)

Y_train_predicted_lasso = model_lasso.predict(X_train)
Y_test_predicted_lasso = model_lasso.predict(X_test)
Y_train_predicted_ridge = model_ridge.predict(X_train)
Y_test_predicted_ridge = model_ridge.predict(X_test)


def _regression_scores(y_true, y_pred):
    """Return (MSE, RMSE, R2) for the given true and predicted targets."""
    mse = skl_met.mean_squared_error(y_true, y_pred)
    rmse = mse ** 0.5
    r2 = skl_met.r2_score(y_true, y_pred)
    return mse, rmse, r2


#Lasso
mse_train_lasso, rmse_train_lasso, r2_train_lasso = _regression_scores(Y_train, Y_train_predicted_lasso)
mse_test_lasso, rmse_test_lasso, r2_test_lasso = _regression_scores(Y_test, Y_test_predicted_lasso)
#Ridge
mse_train_ridge, rmse_train_ridge, r2_train_ridge = _regression_scores(Y_train, Y_train_predicted_ridge)
mse_test_ridge, rmse_test_ridge, r2_test_ridge = _regression_scores(Y_test, Y_test_predicted_ridge)

# Rows: (split, metric); columns: one per model.
result_index = pd.MultiIndex.from_tuples(
    [(split, metric) for split in ('Train', 'Test') for metric in ('MSE', 'RMSE', 'R2')])
result_df = pd.DataFrame(
    {
        'Ridge': [mse_train_ridge, rmse_train_ridge, r2_train_ridge,
                  mse_test_ridge, rmse_test_ridge, r2_test_ridge],
        'Lasso': [mse_train_lasso, rmse_train_lasso, r2_train_lasso,
                  mse_test_lasso, rmse_test_lasso, r2_test_lasso],
    },
    index=result_index,
    columns=['Ridge', 'Lasso'])
# Print floats with ten decimal places from here on.
pd.set_option('display.float_format', lambda x: '%.10f' % x)
print(result_df)

                          Ridge                Lasso
Train MSE  655052921.4288556576 590433482.6583887339
      RMSE     25594.0016689234     24298.8370639088
      R2           0.9000374871         0.9098985552
Test  MSE  721493159.5533237457 643053587.4664866924
      RMSE     26860.6247051948     25358.5012858900
      R2           0.8740448900         0.8877385263


>>> Based on the evaluation metrics (lower RMSE and higher R2 on both the train and test sets), Lasso performs slightly better than Ridge.

In [33]:
def _print_top_coefficients(title, model, top_n=5):
    """Print `title`, then the `top_n` training columns with the largest
    absolute coefficients in `model`, smallest of them first."""
    print(title)
    magnitudes = pd.Series(model.coef_).abs()
    # Sorted ascending, so the last `top_n` entries are the largest magnitudes;
    # the index holds each coefficient's position in the feature columns.
    for position in magnitudes.sort_values().iloc[-top_n:].index:
        print(X_train.columns[position])


_print_top_coefficients('Most important params Lassso:', model_lasso)
print()
_print_top_coefficients('Most important params RIDGE:', model_ridge)


Most important params Lassso:
OverallQual_9
PoolQC_Gd
OverallQual_10
Condition2_PosN
GrLivArea

Most important params RIDGE:
Neighborhood_NridgHt
OverallQual_10
FullBath_3
Neighborhood_NoRidge
GrLivArea
