In [1]:
import pandas as pd 
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV

# Read data

In [2]:
df = pd.read_csv('../data/house_price/train.csv')

In [3]:
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [4]:
categorical_cols = df.drop('SalePrice', axis=1).select_dtypes('object').columns
print(f"number of categorical columns: {len(categorical_cols)}")
print("categorical columns:", categorical_cols)

number of categorical columns: 43
categorical columns: Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')


In [5]:
numerical_cols = df.drop('SalePrice', axis=1).select_dtypes(np.number).columns
print(f"number of numerical columns: {len(numerical_cols)}")
print("numerical columns:", numerical_cols)

number of numerical columns: 37
numerical columns: Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold'],
      dtype='object')


# Split data

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('SalePrice', axis=1),
                                                    df['SalePrice'],
                                                    test_size=0.3,
                                                    random_state=0)
X_train.shape, X_test.shape

((1022, 80), (438, 80))

In [7]:
# categorical variables that have missing data 
X_train[categorical_cols].isnull().mean()[X_train[categorical_cols].isnull().mean() > 0].sort_values(ascending=False)

PoolQC          0.997065
MiscFeature     0.956947
Alley           0.939335
Fence           0.813112
FireplaceQu     0.467710
GarageCond      0.052838
GarageQual      0.052838
GarageFinish    0.052838
GarageType      0.052838
BsmtFinType2    0.024462
BsmtFinType1    0.023483
BsmtExposure    0.023483
BsmtCond        0.023483
BsmtQual        0.023483
MasVnrType      0.004892
Electrical      0.000978
dtype: float64

In [8]:
# categorical variables that have missing data 
X_train[numerical_cols].isnull().mean()[X_train[numerical_cols].isnull().mean() > 0].sort_values(ascending=False)

LotFrontage    0.184932
GarageYrBlt    0.052838
MasVnrArea     0.004892
dtype: float64

# Create preprocessing pipelines

In [20]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_cols), 
    ('cat', categorical_transformer, categorical_cols)
])


In [21]:
pipe = Pipeline(steps=[('preprocessor', preprocessor), 
                       ('classifier', Lasso(max_iter=2000))])

In [22]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrS...
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 

In [23]:
print(f"model score: {pipe.score(X_test, y_test):.3f}")

model score: 0.631


# GridSearchCV

In [28]:
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'preprocessor__cat__imputer__strategy': ['most_frequent', 'constant'],
    'classifier__alpha': [10, 100, 200]
}

grid_search = GridSearchCV(pipe, param_grid, cv=5, iid=False, n_jobs=-1, scoring='r2')
# n_jobs=-1 means to use all available cpus 
# scoring='r2' indicates to evaluate using the r squared 

In [29]:
# train over all the possible combinations of parameters 
grid_search.fit(X_train, y_train)



GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'Tota...
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeatur

In [30]:
# prin the best score over the train set 
print(f"Best linear regression from grid search: {grid_search.score(X_train, y_train): .3f}")

Best linear regression from grid search:  0.933


In [31]:
# best estimator parameters 
grid_search.best_estimator_

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrS...
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 

In [None]:
Pipeline(steps=[('preprocessor', 
                 ColumnTransformer(transformers=[('num', Pipeline(steps=[('imputer',SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())]), 
                                                  Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrS...
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object'))])),
                ('classifier', Lasso(alpha=100, max_iter=2000))])

In [33]:
# combination of prameters that GridSearchCV evaluated 
grid_search.cv_results_['params']

[{'classifier__alpha': 10,
  'preprocessor__cat__imputer__strategy': 'most_frequent',
  'preprocessor__num__imputer__strategy': 'mean'},
 {'classifier__alpha': 10,
  'preprocessor__cat__imputer__strategy': 'most_frequent',
  'preprocessor__num__imputer__strategy': 'median'},
 {'classifier__alpha': 10,
  'preprocessor__cat__imputer__strategy': 'constant',
  'preprocessor__num__imputer__strategy': 'mean'},
 {'classifier__alpha': 10,
  'preprocessor__cat__imputer__strategy': 'constant',
  'preprocessor__num__imputer__strategy': 'median'},
 {'classifier__alpha': 100,
  'preprocessor__cat__imputer__strategy': 'most_frequent',
  'preprocessor__num__imputer__strategy': 'mean'},
 {'classifier__alpha': 100,
  'preprocessor__cat__imputer__strategy': 'most_frequent',
  'preprocessor__num__imputer__strategy': 'median'},
 {'classifier__alpha': 100,
  'preprocessor__cat__imputer__strategy': 'constant',
  'preprocessor__num__imputer__strategy': 'mean'},
 {'classifier__alpha': 100,
  'preprocessor__ca