In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
df = pd.read_csv("final.csv")

In [3]:
df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,3,65.0,8450,1,0,1,1,0,0,...,0,0,0,0,0,2,2008,4,4,208500
1,20,3,80.0,9600,1,0,1,1,1,0,...,0,0,0,0,0,5,2007,4,4,181500
2,60,3,68.0,11250,1,1,1,1,0,0,...,0,0,0,0,0,9,2008,4,4,223500
3,70,3,60.0,9550,1,1,1,1,2,0,...,272,0,0,0,0,2,2006,4,1,140000
4,60,3,84.0,14260,1,1,1,1,1,0,...,0,0,0,0,0,12,2008,4,4,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,3,62.0,7917,1,0,1,1,0,0,...,0,0,0,0,0,8,2007,4,4,175000
1456,20,3,85.0,13175,1,0,1,1,0,0,...,0,0,0,0,0,2,2010,4,4,210000
1457,70,3,66.0,9042,1,0,1,1,0,0,...,0,0,0,0,2500,5,2010,4,4,266500
1458,20,3,68.0,9717,1,0,1,1,0,0,...,112,0,0,0,0,4,2010,4,4,142125


### Feature Selection

In [4]:
x = df.drop('SalePrice', axis= 1)
y = df['SalePrice']

In [5]:
scaler = StandardScaler()
x = scaler.fit_transform(x)

In [6]:
columns = df.drop('SalePrice', axis= 1)


In [7]:
df_final = pd.DataFrame(x, columns=columns.columns)

In [8]:
df_final

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0.073375,0.359044,-0.220875,-0.207142,0.064238,-0.657040,-0.111168,0.02618,-0.561478,-0.225716,...,0.216503,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,0.138777,-0.239005,0.189969
1,-0.872563,0.359044,0.460320,-0.091886,0.064238,-0.657040,-0.111168,0.02618,0.291548,-0.225716,...,-0.704483,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-0.489110,-0.614439,-0.239005,0.189969
2,0.073375,0.359044,-0.084636,0.073480,0.064238,0.872909,-0.111168,0.02618,-0.561478,-0.225716,...,-0.070361,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.990891,0.138777,-0.239005,0.189969
3,0.309859,0.359044,-0.447940,-0.096897,0.064238,0.872909,-0.111168,0.02618,1.144574,-0.225716,...,-0.176048,4.092524,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,-1.367655,-0.239005,-3.192412
4,0.073375,0.359044,0.641972,0.375148,0.064238,0.872909,-0.111168,0.02618,0.291548,-0.225716,...,0.563760,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,2.100892,0.138777,-0.239005,0.189969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.073375,0.359044,-0.357114,-0.260560,0.064238,-0.657040,-0.111168,0.02618,-0.561478,-0.225716,...,-0.100558,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.620891,-0.614439,-0.239005,0.189969
1456,-0.872563,0.359044,0.687385,0.266407,0.064238,-0.657040,-0.111168,0.02618,-0.561478,-0.225716,...,-0.704483,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,1.645210,-0.239005,0.189969
1457,0.309859,0.359044,-0.175462,-0.147810,0.064238,-0.657040,-0.111168,0.02618,-0.561478,-0.225716,...,0.201405,-0.359325,-0.116339,-0.270208,-0.068692,4.953112,-0.489110,1.645210,-0.239005,0.189969
1458,-0.872563,0.359044,-0.084636,-0.080160,0.064238,-0.657040,-0.111168,0.02618,-0.561478,-0.225716,...,-0.704483,1.473789,-0.116339,-0.270208,-0.068692,-0.087688,-0.859110,1.645210,-0.239005,0.189969


In [9]:
### Apply Feature Selection
# first, I specify the Lasso Regression model, and I
# select a suitable alpha (equivalent of penalty).
# The bigger the alpha the less features that will be selected.

# Then I use the selectFromModel object from sklearn, which
# will select the features which coefficients are non-zero
feature_model_sel = SelectFromModel(Lasso(alpha=777, random_state=0)) # remember to set the seed, the random state in this function
feature_model_sel.fit(df_final, y)

SelectFromModel(estimator=Lasso(alpha=777, random_state=0))

In [10]:
feature_model_sel.get_support()


array([ True, False,  True,  True, False, False,  True,  True,  True,
       False,  True,  True,  True, False, False,  True,  True, False,
       False,  True,  True,  True, False, False,  True,  True, False,
       False,  True,  True,  True, False,  True, False, False, False,
       False, False,  True, False,  True,  True, False,  True,  True,
        True, False,  True, False,  True,  True,  True,  True,  True,
        True,  True, False, False, False,  True,  True, False,  True,
       False,  True, False, False,  True,  True, False, False,  True,
       False,  True,  True])

In [11]:
selected_feature = df_final.columns[(feature_model_sel.get_support())]
print(f"total features: {df_final.shape[1]}")
print(f"selected features: {len(selected_feature)}")

total features: 75
selected features: 43


In [12]:
selected_feature

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'LandContour', 'Utilities',
       'LotConfig', 'Neighborhood', 'Condition1', 'Condition2', 'OverallQual',
       'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'MasVnrArea',
       'ExterQual', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinSF1',
       'HeatingQC', 'Electrical', '1stFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'FullBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces',
       'FireplaceQu', 'GarageCars', 'GarageArea', 'GarageCond', 'WoodDeckSF',
       '3SsnPorch', 'ScreenPorch', 'MoSold', 'SaleType', 'SaleCondition'],
      dtype='object')

In [13]:
df = df_final[selected_feature]

In [14]:
df

Unnamed: 0,MSSubClass,LotFrontage,LotArea,LandContour,Utilities,LotConfig,Neighborhood,Condition1,Condition2,OverallQual,...,FireplaceQu,GarageCars,GarageArea,GarageCond,WoodDeckSF,3SsnPorch,ScreenPorch,MoSold,SaleType,SaleCondition
0,0.073375,-0.220875,-0.207142,-0.111168,0.02618,-0.561478,0.626747,0.212875,0.051631,0.651479,...,-0.944190,0.311725,0.351000,0.303263,-0.752176,-0.116339,-0.270208,-1.599111,-0.239005,0.189969
1,-0.872563,0.460320,-0.091886,-0.111168,0.02618,0.291548,1.249443,-1.779413,0.051631,-0.071836,...,0.526229,0.311725,-0.060731,0.303263,1.626195,-0.116339,-0.270208,-0.489110,-0.239005,0.189969
2,0.073375,-0.084636,0.073480,-0.111168,0.02618,-0.561478,0.626747,0.212875,0.051631,0.651479,...,0.526229,0.311725,0.631726,0.303263,-0.752176,-0.116339,-0.270208,0.990891,-0.239005,0.189969
3,0.309859,-0.447940,-0.096897,-0.111168,0.02618,1.144574,0.782421,0.212875,0.051631,0.651479,...,1.261438,1.650307,0.790804,0.303263,-0.752176,-0.116339,-0.270208,-1.599111,-0.239005,-3.192412
4,0.073375,0.641972,0.375148,-0.111168,0.02618,0.291548,1.872139,0.212875,0.051631,1.374795,...,0.526229,1.650307,1.698485,0.303263,0.780197,-0.116339,-0.270208,2.100892,-0.239005,0.189969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.073375,-0.357114,-0.260560,-0.111168,0.02618,-0.561478,0.315400,0.212875,0.051631,-0.071836,...,0.526229,0.311725,-0.060731,0.303263,-0.752176,-0.116339,-0.270208,0.620891,-0.239005,0.189969
1456,-0.872563,0.687385,0.266407,-0.111168,0.02618,-0.561478,0.159726,0.212875,0.051631,-0.071836,...,0.526229,0.311725,0.126420,0.303263,2.033231,-0.116339,-0.270208,-1.599111,-0.239005,0.189969
1457,0.309859,-0.175462,-0.147810,-0.111168,0.02618,-0.561478,0.782421,0.212875,0.051631,0.651479,...,1.261438,-1.026858,-1.033914,0.303263,-0.752176,-0.116339,-0.270208,-0.489110,-0.239005,0.189969
1458,-0.872563,-0.084636,-0.080160,-0.111168,0.02618,-0.561478,-0.307296,0.212875,0.051631,-0.795151,...,-0.944190,-1.026858,-1.090059,0.303263,2.168910,-0.116339,-0.270208,-0.859110,-0.239005,0.189969


### Model Training

In [15]:
x = df.copy()

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=33)

In [17]:
lr = LinearRegression()
lr.fit(x_train, y_train)

LinearRegression()

In [18]:
y_pred_lr = lr.predict(x_test)

In [19]:
def metrics(actual, predicted):
    mse = np.sqrt(mean_squared_error(actual, predicted))
    r2_value = r2_score(actual, predicted)
    print(f"mean_squard error is {mse}")
    print(f"r2_score is {r2_value}")    

In [20]:
metrics(y_test, y_pred_lr)

mean_squard error is 38317.49336626855
r2_score is 0.7740666036109509


### Random Forest

In [21]:
rf = RandomForestRegressor()
rf.fit(x_train, y_train)

RandomForestRegressor()

In [22]:
y_pred_rf = rf.predict(x_test)
metrics(y_test, y_pred_rf)

mean_squard error is 29626.158464436034
r2_score is 0.8649368320133202


### SVR

In [23]:
svr = SVR()
svr.fit(x_train, y_train)

SVR()

In [24]:
y_pred_svr = svr.predict(x_test)
metrics(y_test, y_pred_svr)

mean_squard error is 82808.7570859169
r2_score is -0.05520971640293526


## GridSearchCv LinearRegression

In [25]:
lr_cv = LinearRegression()
parameters = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}
grid_lr = GridSearchCV(estimator=lr_cv, param_grid = parameters, cv = 2, n_jobs=-1)
grid_lr.fit(x_train, y_train)



GridSearchCV(cv=2, estimator=LinearRegression(), n_jobs=-1,
             param_grid={'copy_X': [True, False],
                         'fit_intercept': [True, False],
                         'normalize': [True, False]})

In [26]:
grid_lr.best_params_

{'copy_X': True, 'fit_intercept': True, 'normalize': False}

In [27]:
lr_cv = LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

In [28]:
lr_cv.fit(x_train, y_train)



LinearRegression(normalize=False)

In [29]:
y_pred_lr_Cv = lr_cv.predict(x_test)

In [30]:
metrics(y_test, y_pred_lr_Cv)

mean_squard error is 38317.49336626855
r2_score is 0.7740666036109509


### GridSearchCV RandomForest

In [31]:
rf_cv = RandomForestRegressor()

In [37]:
params = {'bootstrap': [True, False],
         'max_depth': [10, 20, 30, 40, 50],
         'max_features': ['auto', 'sqrt'],
         'min_samples_leaf': [1, 2, 4],
         'min_samples_split': [2, 5, 10],
         'n_estimators': [20, 30, 40, 100, 200, 400]}
grid_rf = GridSearchCV(estimator=rf_cv, param_grid=params, cv=2,  verbose=1)
grid_rf.fit(x_train, y_train)

Fitting 2 folds for each of 1080 candidates, totalling 2160 fits


GridSearchCV(cv=2, estimator=RandomForestRegressor(),
             param_grid={'bootstrap': [True, False],
                         'max_depth': [10, 20, 30, 40, 50],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [20, 30, 40, 100, 200, 400]},
             verbose=1)

In [39]:
grid_rf.best_params_

{'bootstrap': False,
 'max_depth': 40,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 20}

In [40]:
rf_cv = RandomForestRegressor(bootstrap=False, max_depth=40, max_features='sqrt', n_estimators=20, min_samples_leaf=1,
                             min_samples_split=2)

In [41]:
rf_cv.fit(x_train, y_train)

RandomForestRegressor(bootstrap=False, max_depth=40, max_features='sqrt',
                      n_estimators=20)

In [42]:
y_pred_rf_cv = rf_cv.predict(x_test)
metrics(y_test, y_pred_rf_cv)

mean_squard error is 27486.67644068626
r2_score is 0.8837398953433744
