# House Prices - Advanced Regression Techniques
## Feature Selection and Prediction

In this notebook we will perform feature selection on the feature engineered train and test dataset and predict the house prices of the test dataset.

In [26]:
import pandas as pd
import numpy as np

# For feature slection
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Models for prediction
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

# Model evaluation
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# to visualise all the columns in the dataframe
pd.pandas.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Importing feature engineered train and test csv files
dataset_train = pd.read_csv('feature_train.csv')
dataset_test = pd.read_csv('feature_test.csv')

In [3]:
# Using Lasso regression model and selectecting a suitable alpha (equivalent of penalty).
# The bigger the alpha the less features that will be selected.

# Using the SelectFromModel object from sklearn, which will select the features whose coefficients are non-zero.

X_train=dataset_train.drop(['Id','SalePrice'],axis=1)
y_train=dataset_train[['SalePrice']]

feature_sel_model = SelectFromModel(Lasso(alpha=0.005, random_state=0))
feature_sel_model.fit(X_train, y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, random_state=0))

In [4]:
feature_sel_model.get_support()

array([False,  True, False, False, False, False,  True, False, False,
       False, False, False, False, False,  True, False,  True, False,
       False,  True, False, False, False, False, False, False,  True,
       False, False,  True, False,  True,  True, False, False, False,
       False, False, False,  True,  True, False,  True, False, False,
        True,  True, False, False, False, False, False,  True, False,
       False,  True, False,  True, False,  True,  True,  True, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [5]:
# Printing the number of total and selected features

# Making a list of the selected features
selected_features = X_train.columns[(feature_sel_model.get_support())]

# let's print some stats
print('Total features: {}'.format((X_train.shape[1])))
print('Selected features: {}'.format(len(selected_features)))
print('Features with coefficients shrank to zero: {}'.format(
    np.sum(feature_sel_model.estimator_.coef_ == 0)))

Total features: 82
Selected features: 21
Features with coefficients shrank to zero: 61


In [6]:
selected_features

Index(['MSZoning', 'LotShape', 'BldgType', 'OverallQual', 'YearRemodAdd',
       'ExterQual', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'HeatingQC',
       'CentralAir', '1stFlrSF', 'GrLivArea', 'BsmtFullBath', 'KitchenQual',
       'Fireplaces', 'GarageType', 'GarageFinish', 'GarageCars', 'GarageArea',
       'PavedDrive'],
      dtype='object')

In [7]:
X_train = X_train[selected_features]
X_train.head()

Unnamed: 0,MSZoning,LotShape,BldgType,OverallQual,YearRemodAdd,ExterQual,BsmtQual,BsmtExposure,BsmtFinType1,HeatingQC,CentralAir,1stFlrSF,GrLivArea,BsmtFullBath,KitchenQual,Fireplaces,GarageType,GarageFinish,GarageCars,GarageArea,PavedDrive
0,0.5,1.0,0.0,0.666667,0.112903,0.333333,0.5,1.0,0.333333,0.0,1.0,0.345385,0.577712,0.333333,0.5,0.0,0.0,0.666667,0.4,0.36828,1.0
1,0.5,1.0,0.0,0.555556,0.532258,1.0,0.5,0.25,0.0,0.0,1.0,0.487844,0.470245,0.0,1.0,0.25,0.0,0.666667,0.4,0.30914,1.0
2,0.5,0.0,0.0,0.666667,0.129032,0.333333,0.5,0.75,0.333333,0.0,1.0,0.371846,0.593095,0.333333,0.5,0.25,0.0,0.666667,0.4,0.408602,1.0
3,0.5,0.0,0.0,0.666667,0.612903,1.0,1.0,1.0,0.0,0.5,1.0,0.387847,0.579157,0.333333,0.5,0.25,0.6,1.0,0.6,0.431452,1.0
4,0.5,0.0,0.0,0.777778,0.16129,0.333333,0.5,0.0,0.333333,0.0,1.0,0.452138,0.666523,0.333333,0.5,0.25,0.0,0.666667,0.6,0.561828,1.0


## Model Training

In [23]:
# Putting models in a dictionary
models = {"Linear Regression": LinearRegression(),
          "Lasso": Lasso(),
          "Elastic Net": ElasticNet(),
          "Decision Tree Regressor": DecisionTreeRegressor(),
          "K Neighbours Regressor": KNeighborsRegressor(),
          "Gradient Boosting Regressor": GradientBoostingRegressor(),
          "Random Forest Regressor": RandomForestRegressor()}

def model_selection(models, X_train, y_train):
    np.random.seed(0)
    model_scores = {}
    for name, model in models.items():
        kfold = KFold(n_splits=10)
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, n_jobs=-1, verbose=4, scoring='neg_mean_squared_error')
        model_scores[name] = cv_results.mean()
    return model_scores

In [24]:
model_scores = model_selection(models=models,
                               X_train=X_train,
                               y_train=y_train)
model_scores

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    1.0s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    1.0s r

{'Linear Regression': -0.0211478718968292,
 'Lasso': -0.15965699341266254,
 'Elastic Net': -0.15965699341266254,
 'Decision Tree Regressor': -0.042390368376733836,
 'K Neighbours Regressor': -0.04217416071148981,
 'Gradient Boosting Regressor': -0.018636555755007264,
 'Random Forest Regressor': -0.021742510474728482}

From the above output it is clear that Gradient Boosting Regressor performs the best. Therefore using the same model to train the model and tuning hyperparameters.

### Hyperparameter tuning with RandomizedSearchCV

In [27]:
%%time

# Setting up hyperparameters
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt','log2']
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
max_depth = [int(x) for x in np.linspace(10,1000,10)]
min_samples_split = [1, 3, 4, 5, 7, 9, 10]
min_samples_leaf = [1, 2, 4, 6, 8, 10]

# Creating the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'learning_rate': learning_rate,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

# Instantinate RandomizedSearchCV model
rs_model = RandomizedSearchCV(estimator=GradientBoostingRegressor(),
                             param_distributions=random_grid,
                             n_iter=100, cv=5, verbose=True,
                             random_state=0, n_jobs=-1)
 
# Fitting the RandomizedSearchCV model
rs_model.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Wall time: 11min 35s
Parser   : 240 ms


RandomizedSearchCV(cv=5, estimator=GradientBoostingRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'learning_rate': [0.0001, 0.001, 0.01,
                                                          0.1, 0.2, 0.3],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8, 10],
                                        'min_samples_split': [1, 3, 4, 5, 7, 9,
                                                              10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
   

In [28]:
rs_model.best_params_

{'n_estimators': 1600,
 'min_samples_split': 10,
 'min_samples_leaf': 6,
 'max_features': 'sqrt',
 'max_depth': 1000,
 'learning_rate': 0.01}

### Hyperparameter tuning with GridSearchCV
Now on the basis of best parameters obtained in RandomizedSearchCV, tuning hyperparameters using GridSearchCV.

In [33]:
%%time

# Setting up hyperparameters
grid_search = {
    'n_estimators': [rs_model.best_params_['n_estimators']-200,
                     rs_model.best_params_['n_estimators']-100, 
                     rs_model.best_params_['n_estimators'], 
                     rs_model.best_params_['n_estimators']+100,
                     rs_model.best_params_['n_estimators']+200],
    'max_features': [rs_model.best_params_['max_features']],
    'learning_rate': [rs_model.best_params_['learning_rate']/4,
                      rs_model.best_params_['learning_rate']/2,
                      rs_model.best_params_['learning_rate'], 
                      rs_model.best_params_['learning_rate']*2,
                      rs_model.best_params_['learning_rate']*4],
    'max_depth': [rs_model.best_params_['max_depth']-200,
                  rs_model.best_params_['max_depth']-100, 
                  rs_model.best_params_['max_depth'], 
                  rs_model.best_params_['max_depth']+100,
                  rs_model.best_params_['max_depth']+200],
    'min_samples_split': [rs_model.best_params_['min_samples_split']-1,
                          rs_model.best_params_['min_samples_split'], 
                          rs_model.best_params_['min_samples_split']+1],
    'min_samples_leaf': [rs_model.best_params_['min_samples_leaf']-1,
                         rs_model.best_params_['min_samples_leaf'], 
                         rs_model.best_params_['min_samples_leaf']+1]}

# Instantinate GridSearchCV model
gs_model = GridSearchCV(estimator=GradientBoostingRegressor(),
                         param_grid=grid_search,
                         cv=10, n_jobs=-1, verbose=True)

# Fitting the GridSearchCV model
gs_model.fit(X_train, y_train)

Fitting 10 folds for each of 1125 candidates, totalling 11250 fits
Wall time: 4h 9min 8s


GridSearchCV(cv=10, estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'learning_rate': [0.0025, 0.005, 0.01, 0.02, 0.04],
                         'max_depth': [800, 900, 1000, 1100, 1200],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [5, 6, 7],
                         'min_samples_split': [9, 10, 11],
                         'n_estimators': [1400, 1500, 1600, 1700, 1800]},
             verbose=True)

In [34]:
gs_model.best_params_

{'learning_rate': 0.005,
 'max_depth': 800,
 'max_features': 'sqrt',
 'min_samples_leaf': 6,
 'min_samples_split': 9,
 'n_estimators': 1400}

In [42]:
# Applying feature selection on test data
X_test = pd.DataFrame()
train_features = X_train.columns
for feature in train_features:
    X_test[feature] = dataset_test[feature]

In [43]:
# Using best params for test predictions
ideal_model=gs_model.best_estimator_

# Making predictions on the test data
y_preds = ideal_model.predict(X_test)

# Format predictions into the same format Kaggle is after
df_preds = pd.DataFrame()
df_preds["Id"] = dataset_test["Id"]
df_preds["SalePrice"] = y_preds
df_preds

Unnamed: 0,Id,SalePrice
0,1461,11.654291
1,1462,11.849000
2,1463,12.041664
3,1464,12.119565
4,1465,12.153038
...,...,...
1454,2915,11.287426
1455,2916,11.337426
1456,2917,11.901501
1457,2918,11.577293


In [46]:
# Export predicted data as required by Kaggle
df_preds.to_csv("submission.csv", index=False)