
<br>
<div style="text-align:center">
<img src="images/search.jpg" alt="search" width="600"/>
<figcaption>Photo by <a href="https://unsplash.com/@laughayette?utm_source=unsplash&amp;utm_medium=referral&amp;utm_content=creditCopyText">Marten Newhall</a> on <a href="https://unsplash.com/s/photos/search?utm_source=unsplash&amp;utm_medium=referral&amp;utm_content=creditCopyText">Unsplash</a></figcaption>
</div>

# Prepare Data

In [1]:
import pandas as pd 

# preparing data 
from sklearn.model_selection import train_test_split

# feature scaling, encoding
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# putting together in pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# model selection
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.model_selection import GridSearchCV

In [2]:
# load the house price data, using the 'Id' column as the index
df = pd.read_csv('../data/house_price/train.csv', index_col='Id')

# separate the features from the target ('SalePrice') once and reuse them below
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

# numerical columns vs. categorical columns, selected by dtype from the features only
num_cols = X.select_dtypes('number').columns
cat_cols = X.select_dtypes('object').columns

# split train and test dataset: 30% of the rows are held out for testing,
# and random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

# check the size of train and test data
X_train.shape, X_test.shape

((1022, 79), (438, 79))

# Setup Pipeline

In [3]:
# numerical features: fill missing values with the column mean, then standardize
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# categorical features: fill missing values with the most frequent category,
# then one-hot encode; handle_unknown='ignore' keeps categories that were not
# seen during fit from raising an error at transform time
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# route each group of columns through its own feature engineering pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, num_cols),
    ('cat_pipeline', cat_pipeline, cat_cols),
])

# chain the preprocessing and the estimator into a single model
pipe = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('lasso', Lasso(max_iter=10000)),  # max_iter raised so the solver converges
])

# fit on the training split, then report the score on the held-out split
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.6308258188969262

# Finding the best imputation technique using GridSearchCV

In [4]:
# Imputation strategies to compare. Each key is the double-underscore path
# from `pipe` down to the parameter: step name, transformer name, step name,
# parameter name.
# TODO(review): the original carried a bare "KNN" note on the categorical
# entry - presumably a reminder to try KNN imputation; confirm with the author.
param_grid = {
    'preprocessing__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],
    'preprocessing__cat_pipeline__imputer__strategy': ['most_frequent'],
}

# cross-validated search over every combination in the grid (default settings)
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid)

# NOTE: the search is fit on the full X, y rather than X_train, y_train, so
# the cross-validation scores are the only generalization estimate it yields.
grid_search.fit(X, y)

GridSearchCV(estimator=Pipeline(steps=[('preprocessing',
                                        ColumnTransformer(transformers=[('num_pipeline',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrS...
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'Sa

In [5]:
# raw cross-validation results: a dict of per-candidate timings, parameters and scores
grid_search.cv_results_

{'mean_fit_time': array([2.32447925, 2.24286404, 2.2551795 ]),
 'std_fit_time': array([2.15180666, 2.05670675, 2.04442326]),
 'mean_score_time': array([0.01562643, 0.0149888 , 0.01514149]),
 'std_score_time': array([0.00092181, 0.0004646 , 0.00042794]),
 'param_preprocessing__cat_pipeline__imputer__strategy': masked_array(data=['most_frequent', 'most_frequent', 'most_frequent'],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'param_preprocessing__num_pipeline__imputer__strategy': masked_array(data=['mean', 'median', 'most_frequent'],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'preprocessing__cat_pipeline__imputer__strategy': 'most_frequent',
   'preprocessing__num_pipeline__imputer__strategy': 'mean'},
  {'preprocessing__cat_pipeline__imputer__strategy': 'most_frequent',
   'preprocessing__num_pipeline__imputer__strategy': 'median'},
  {'preprocessing__cat_pipeline__imputer__

In [6]:
# the same results as a table, one row per parameter combination
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessing__cat_pipeline__imputer__strategy,param_preprocessing__num_pipeline__imputer__strategy,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.324479,2.151807,0.015626,0.000922,most_frequent,mean,{'preprocessing__cat_pipeline__imputer__strate...,0.855704,0.822841,0.803811,0.888467,0.632368,0.800638,0.088954,3
1,2.242864,2.056707,0.014989,0.000465,most_frequent,median,{'preprocessing__cat_pipeline__imputer__strate...,0.855513,0.823084,0.803867,0.888615,0.632403,0.800696,0.088959,2
2,2.25518,2.044423,0.015141,0.000428,most_frequent,most_frequent,{'preprocessing__cat_pipeline__imputer__strate...,0.855823,0.823196,0.804183,0.888832,0.635,0.801407,0.088066,1


In [7]:
# the pipeline configured with the best-scoring parameter combination
grid_search.best_estimator_

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num_pipeline',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlr...
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'G

In [8]:
# parameter combination with the highest mean cross-validated score
grid_search.best_params_

{'preprocessing__cat_pipeline__imputer__strategy': 'most_frequent',
 'preprocessing__num_pipeline__imputer__strategy': 'most_frequent'}

In [9]:
# mean cross-validated score of the best parameter combination
grid_search.best_score_

0.8014070988628038

In [10]:
# seconds spent refitting the best estimator on the whole dataset passed to fit
grid_search.refit_time_

5.486163854598999

# References
* [scikit-learn GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)