# Prepare Data

In [11]:
import pandas as pd 

# preparing data 
from sklearn.model_selection import train_test_split

# feature scaling, encoding
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# putting together in pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# model selection
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.model_selection import GridSearchCV

In [12]:
# import house price data 
df = pd.read_csv('../data/house_price/train.csv', index_col='Id')

# numerical columns vs. categorical columns 
num_cols = df.drop('SalePrice', axis=1).select_dtypes('number').columns
cat_cols = df.drop('SalePrice', axis=1).select_dtypes('object').columns

# split train and test dataset 
X_train, X_test, y_train, y_test = train_test_split(df.drop('SalePrice', axis=1), 
                                                    df['SalePrice'], 
                                                    test_size=0.3, 
                                                    random_state=0)

# check the size of train and test data
X_train.shape, X_test.shape

((1022, 79), (438, 79))

# Setup Pipeline

In [14]:
# feature engineering pipeline for numerical variables 
num_pipeline= Pipeline([('imputer', SimpleImputer(strategy='mean')),
                        ('scaler', StandardScaler())])

# feature engineering pipeline for categorical variables 
cat_pipeline = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),
                         ('encoder', OneHotEncoder(handle_unknown='ignore'))])

# put numerical and categorical feature engineering pipelines together
preprocessor = ColumnTransformer([("num_pipeline", num_pipeline, num_cols),
                                  ("cat_pipeline", cat_pipeline, cat_cols)])


# put transformers and an estimator together
pipe = Pipeline([('preprocessing', preprocessor),
                 ('lasso', Lasso(max_iter=10000))])  # increased max_iter to converge

# fit model 
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.6308258188969262

# Finding the best imputation technique using GridSearchCV

In [19]:
param_grid = dict(preprocessor__num_pipeline__imputer__strategy=['mean', 'median', 'most_frequent'],
                  preprocessor__cat_pipeline__imputer__strategy=['most_frequent'])  # KNN 

grid_search = GridSearchCV(pipe, param_grid)

grid_search.fit(X_train, y_train)

ValueError: Invalid parameter preprocessor for estimator Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num_pipeline',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualF...
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object'))])),
                ('lasso', Lasso(max_iter=10000))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
grid_search.cv_results_

In [None]:
grid_search.best_estimator_

In [None]:
grid_search.best_params_

In [None]:
grid_search.refit_time_