# MICE imputation with Scikit-learn 

knearest neighbours with the IterativeImputer from Scikit-learn:

http://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html

which performs MICE (multivariate imputation with chain equations) and you can use KNN as the estimator of missing data(you can actually use any estimator, like random forest, bayes, etc).

In [9]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

# these are the objects we need to impute missing data
# with sklearn
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.neighbors import KNeighborsRegressor

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# to split the datasets
from sklearn.model_selection import train_test_split

In [10]:
df1 = pd.read_csv('train.csv')
df2 = pd.read_csv('test.csv')

data = df1.append(df2)

# we use only the following variables for the demo:
# 3 of which contain NA

cols_to_use = [
    'OverallQual', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'WoodDeckSF',
    'BsmtUnfSF', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt', 'SalePrice'
]

data = data[cols_to_use]

data.head(4)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,OverallQual,TotalBsmtSF,1stFlrSF,GrLivArea,WoodDeckSF,BsmtUnfSF,LotFrontage,MasVnrArea,GarageYrBlt,SalePrice
0,7,856.0,856,1710,0,150.0,65.0,196.0,2003.0,208500.0
1,6,1262.0,1262,1262,298,284.0,80.0,0.0,1976.0,181500.0
2,7,920.0,920,1786,0,434.0,68.0,162.0,2001.0,223500.0
3,7,756.0,961,1717,0,540.0,60.0,0.0,1998.0,140000.0


In [11]:
data.isnull().mean()

OverallQual    0.000000
TotalBsmtSF    0.000343
1stFlrSF       0.000000
GrLivArea      0.000000
WoodDeckSF     0.000000
BsmtUnfSF      0.000343
LotFrontage    0.166495
MasVnrArea     0.007879
GarageYrBlt    0.054471
SalePrice      0.499829
dtype: float64

In [12]:
# let's separate into training and testing 

# first let's remove the target from the features
cols_to_use.remove('SalePrice')

X_train, X_test, y_train, y_test = train_test_split(data[cols_to_use], # just the features
                                                    data['SalePrice'], # the target
                                                    test_size=0.3, # the percentage of obs in the test set
                                                    random_state=0) # for reproducibility
X_train.shape, X_test.shape

((2043, 9), (876, 9))

In [13]:
# let's check the misssing data again
X_train.isnull().mean()

OverallQual    0.000000
TotalBsmtSF    0.000489
1stFlrSF       0.000000
GrLivArea      0.000000
WoodDeckSF     0.000000
BsmtUnfSF      0.000489
LotFrontage    0.162996
MasVnrArea     0.006853
GarageYrBlt    0.058248
dtype: float64

# Iterative imputer with KNN 

In [14]:
# Now we impute the missing values with SimpleImputer

# create an instance of the simple imputer
# we indicate that we want to impute with the median
imputer = IterativeImputer(random_state=0, estimator=KNeighborsRegressor(n_neighbors=3))

# we fit the imputer to the train set
# the imputer will learn the median of all variables
imputer.fit(X_train[cols_to_use])

IterativeImputer(add_indicator=False,
                 estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                               metric='minkowski',
                                               metric_params=None, n_jobs=None,
                                               n_neighbors=3, p=2,
                                               weights='uniform'),
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=0,
                 sample_posterior=False, tol=0.001, verbose=0)

In [19]:
imputer.estimator

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                    weights='uniform')

In [22]:
# and now we impute the train and test set

# NOTE: the data is returned as a numpy array!!!
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [23]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [24]:
X_train.isnull().mean()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
5    0.0
6    0.0
7    0.0
8    0.0
dtype: float64

In [25]:
X_test.isnull().mean()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
5    0.0
6    0.0
7    0.0
8    0.0
dtype: float64