In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import SGDClassifier

# SimpleImputer

In [3]:
from sklearn.impute import SimpleImputer

In [4]:
# Toy training matrix with two features; the last row is missing its first
# feature so the imputer has something to fill.
X = np.array([[10, 3], [0, 4], [5, 3], [np.nan, 3]])

In [5]:
# Fit on X and replace every NaN with the mean of its column.
imputer = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
)
imputer.fit_transform(X)

array([[10.,  3.],
       [ 0.,  4.],
       [ 5.,  3.],
       [ 5.,  3.]])

In [12]:
# Unseen rows, the last one entirely missing. transform() fills the gaps with
# the statistics learned from the earlier fit; it does not refit on X_test.
X_test = np.array([[12, 5], [40, 2], [5, 5], [np.nan, np.nan]])
imputer.transform(X_test)

array([[12.  ,  5.  ],
       [40.  ,  2.  ],
       [ 5.  ,  5.  ],
       [ 5.  ,  3.25]])

# KNNImputer

In [16]:
from sklearn.impute import KNNImputer

In [18]:
# Toy matrix for the nearest-neighbour imputer; the last row is missing its
# first feature and only has the second one (20) to compare against other rows.
X = np.array([[1, 100], [3, 15], [3, 15], [np.nan, 20]])

In [20]:
# Impute each missing entry from the most similar row.
imputer = KNNImputer(
    n_neighbors=1,  # copy the value from the single closest row
)
imputer.fit_transform(X)

array([[  1., 100.],
       [  3.,  15.],
       [  3.,  15.],
       [  3.,  20.]])

# MissingIndicator

In [26]:
from sklearn.impute import MissingIndicator

In [28]:
# Toy matrix whose last row is missing both features.
X = np.array([[1, 100], [3, 15], [3, 15], [np.nan, np.nan]])

In [30]:
# Build a boolean mask that is True wherever X holds a missing value.
MissingIndicator().fit_transform(X)

array([[False, False],
       [False, False],
       [False, False],
       [ True,  True]])

In [40]:
from sklearn.pipeline import make_union

In [44]:
# Run both transformers on X and concatenate their outputs column-wise:
# the constant-filled features first, then the missing-value indicator columns.
pipeline = make_union(
    SimpleImputer(strategy='constant', fill_value=-99),
    MissingIndicator(),
)
pipeline.fit_transform(X)

array([[  1., 100.,   0.,   0.],
       [  3.,  15.,   0.,   0.],
       [  3.,  15.,   0.,   0.],
       [-99., -99.,   1.,   1.]])

# Application 

In [58]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
import seaborn as sns
from sklearn.model_selection import train_test_split

In [60]:
# Load the Titanic dataset through seaborn and keep two numeric features.
# NOTE(review): 'age' is presumably the column with missing values that the
# imputer below is meant to fill; confirm with titanic.isna().sum().
titanic = sns.load_dataset('titanic')
X = titanic[['pclass', 'age']]
y = titanic['survived']

# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and therefore every downstream score, is identical on each re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [62]:
# Impute missing values with KNN, then fit a linear classifier trained by SGD.
# SGDClassifier shuffles the training data between epochs, so its seed is
# fixed to keep scores and the selected hyper-parameters reproducible.
# NOTE(review): SGD is sensitive to feature scale; consider standardising the
# features if the scores look unstable.
model = make_pipeline(KNNImputer(), SGDClassifier(random_state=0))

In [64]:
# Candidate neighbour counts for the imputer step. The key follows
# scikit-learn's `<step name>__<parameter>` convention for pipeline parameters.
params = {'knnimputer__n_neighbors': list(range(1, 5))}

In [66]:
# Exhaustive search over `params`, scoring each candidate with 5-fold
# cross-validation.
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=5,
)


In [68]:
# Run the search on the training split only; the test split stays untouched.
grid.fit(X_train, y_train)

In [70]:
# Show the parameter combination that the grid search selected.
grid.best_params_

{'knnimputer__n_neighbors': 3}