## Automatic selection of best imputation technique with Sklearn



Finding the best imputation technique for handling missing data can be a challenging task. However, with the help of Scikit-learn, we can automate this process by evaluating multiple imputation methods and selecting the one that yields the best performance on a given dataset. Below is an example of how to implement this in Python.



In [1]:
import pandas as pd
import numpy as np

# import classes for imputation
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# import classes for modelling
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Lasso # for Lasso regression, we need StandardScaler as it is sensitive to feature scaling
#Also dataset contains categorical variables, so we will need OneHotEncoder for that.
from sklearn.model_selection import train_test_split, GridSearchCV
# we will use GridSearchCV to perform the grid search over the different imputation techniques

np.random.seed(0) # for reproducibility : Using same train-test split every time

In [2]:
# load dataset with all the variables

data = pd.read_csv("../../Datasets/houseprice.csv")

data.head(30)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


# Creating lists which contain numerical and categorical variables

In [3]:

features_categorical = [c for c in data.columns if data[c].dtypes == "O"]
print(features_categorical)
print("----------------------")

features_numerical = [
    c for c in data.columns if data[c].dtypes != "O" and c != "SalePrice"
]
print(features_numerical)

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
----------------------
['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDe

In [4]:
# inspect the categorical variables

data[features_categorical].head() # check the first 5 rows of categorical variables

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [5]:
# inspect the numerical variables

data[features_numerical].head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,548,0,61,0,0,0,0,0,2,2008
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,460,298,0,0,0,0,0,0,5,2007
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,608,0,42,0,0,0,0,0,9,2008
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,642,0,35,272,0,0,0,0,2,2006
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,836,192,84,0,0,0,0,0,12,2008


In [6]:
# separate intro train and test set

# We did load train_test_split from sklearn.model_selection above

X_train, X_test, y_train, y_test = train_test_split(
    data.drop("SalePrice", axis=1),  # as SalePrice is the target, we will drop this
    data["SalePrice"],  # the target
    test_size=0.3,  # 30% of the data for testing
    random_state=0, #For reproducibility
)  # for reproducibility

X_train.shape, X_test.shape

((1022, 80), (438, 80))

In [7]:
# We create the preprocessing pipelines for both numerical and categorical data

# here for numerical data we will impute missing values with median and then scale the data using StandardScaler.
# we are scaling the numerical data because we will use Lasso regression later which is sensitive to feature scaling.
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)


# for categorical data we will impute missing values with a constant 'missing' and then apply OneHotEncoder.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")), # we use onehot encoder to convert categorical variables into numerical formats.We are getting numbers from numerical variables as well. The goal is to use Lasso regression later which requires all features to be in numerical format.
        #Also, we set handle_unknown='ignore' to handle any categories in the test set that were not present in the training set.
    ]
)


# now using the ColumnTransformer to apply the transformations to the correct columns
preprocessor = ColumnTransformer(
    transformers=[
        # name of the transformer, the transformer object (From above), the list of features to be transformed (we also created that above)
        ("numerical", numeric_transformer, features_numerical),
        ("categorical", categorical_transformer, features_categorical),
    ]
)

# Note that to initialise the pipeline I pass any argument to the transformers.
# Those will be changed during the gridsearch below.

In [8]:
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.

pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", Lasso(max_iter=2000))]
)

In [9]:
# now we create the grid with all the parameters that we would like to test

param_grid = {
    #for numerical variables we will test mean and median imputation
    #for categorical variables we will test most_frequent and constant imputation
    "preprocessor__numerical__imputer__strategy": ["mean", "median"],
    "preprocessor__categorical__imputer__strategy": ["most_frequent", "constant"],
    "regressor__alpha": [10, 100, 200], # alpha parameter for Lasso regression means the regularization strength
}

#final one
grid_search = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, scoring="r2")

# pipe is the pipeline we created above
# param_grid is the grid of parameters we created above
# cv=5 indicates 5-fold cross-validation
# n_jobs=-1 indicates to use all available cpus
# scoring='r2' indicates to evaluate using the "r squared"



"""R-squared (R²), also called the coefficient of determination, is a statistical measure that tells you how well your regression model explains the variability of the target variable.

When R² Value is    1.0, it means Perfect fit — model explains *all* the variance in the target.

When R² Value is    0.0, it means Model explains *none* of the variance (same as predicting the mean every time).

When R² Value is   < 0.0, it means Model performs *worse than predicting the mean* (bad model).

"""


# for more details in the grid parameters visit:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

'R-squared (R²), also called the coefficient of determination, is a statistical measure that tells you how well your regression model explains the variability of the target variable.\n\nWhen R² Value is    1.0, it means Perfect fit — model explains *all* the variance in the target.\n\nWhen R² Value is    0.0, it means Model explains *none* of the variance (same as predicting the mean every time).\n\nWhen R² Value is   < 0.0, it means Model performs *worse than predicting the mean* (bad model).\n\n'

When setting the grid parameters, this is how we indicate the parameters:

preprocessor__numerical__imputer__strategy': ['mean', 'median'],

the above line of code indicates that I would like to test the mean and the median in the imputer step of the numerical processor.

preprocessor__categorical__imputer__strategy': ['most_frequent', 'constant']

the above line of code indicates that I would like to test the most frequent or a constant value in the imputer step of the categorical processor

classifier__alpha': [0.1, 1.0, 0.5]

the above line of code indicates that I want to test those 3 values for the alpha parameter of Lasso. Note that Lasso is the 'classifier' step of our last pipeline

In [10]:
# and now we train over all the possible combinations of the parameters above

grid_search.fit(X_train, y_train) # fitting the grid search to learn from data

# and we print the best score over the train set
print(
    (
        "best linear regression from grid search: %.3f"
        % grid_search.score(X_train, y_train)
    )
)

best linear regression from grid search: 0.933


In [11]:
grid_search.best_params_  # to see the best parameters found by grid search


# We can see that best imputation strategies for numerical and categorical variables along with the best alpha for Lasso regression have been selected by the grid search.

{'preprocessor__categorical__imputer__strategy': 'constant',
 'preprocessor__numerical__imputer__strategy': 'mean',
 'regressor__alpha': 100}

In [12]:
# we can print the best estimator parameters like this
grid_search.best_estimator_

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numerical', ...), ('categorical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,alpha,100
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,2000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [13]:

grid_search.cv_results_["params"] # all the combinations tried during the grid search

[{'preprocessor__categorical__imputer__strategy': 'most_frequent',
  'preprocessor__numerical__imputer__strategy': 'mean',
  'regressor__alpha': 10},
 {'preprocessor__categorical__imputer__strategy': 'most_frequent',
  'preprocessor__numerical__imputer__strategy': 'mean',
  'regressor__alpha': 100},
 {'preprocessor__categorical__imputer__strategy': 'most_frequent',
  'preprocessor__numerical__imputer__strategy': 'mean',
  'regressor__alpha': 200},
 {'preprocessor__categorical__imputer__strategy': 'most_frequent',
  'preprocessor__numerical__imputer__strategy': 'median',
  'regressor__alpha': 10},
 {'preprocessor__categorical__imputer__strategy': 'most_frequent',
  'preprocessor__numerical__imputer__strategy': 'median',
  'regressor__alpha': 100},
 {'preprocessor__categorical__imputer__strategy': 'most_frequent',
  'preprocessor__numerical__imputer__strategy': 'median',
  'regressor__alpha': 200},
 {'preprocessor__categorical__imputer__strategy': 'constant',
  'preprocessor__numerical__

In [14]:
# and here the scores for each of one of the above combinations
grid_search.cv_results_["mean_test_score"]

array([0.84705347, 0.86572577, 0.86538734, 0.84700725, 0.86569462,
       0.86535737, 0.84857915, 0.86673792, 0.86536049, 0.84852923,
       0.86670218, 0.86530628])

In [15]:
# and finally let's check the performance over the test set
print(
    (
        "best linear regression from grid search: %.3f"
        % grid_search.score(X_test, y_test)
    )
)

best linear regression from grid search: 0.738


This model overfits to the train set, look at the r^2 of 0.93 obtained for the train set vs 
0.738 for the test set.
