In [None]:
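#kNN imputation of the Age column (Sex has no missing values; it is only encoded and used as a feature)
#KNNImputer measures the distance between the sample with a missing value and the other samples, and picks the N closest ones (as specified by the n_neighbors parameter).
#It then fills the missing value with the mean of that feature over those N closest neighbors that have a non-null value for it.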


In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
# Titanic passenger data, read directly from the datasciencedojo/datasets GitHub repository.
TITANIC_CSV_URL = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = pd.read_csv(TITANIC_CSV_URL)

In [2]:
# Number of missing values in each column of the raw data.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [3]:
# The per-column null counts from the previous cell show which values need to be imputed.
# Keep only the features used for imputation. Selecting the columns explicitly already
# leaves out PassengerId, Name and every other unused column, so no separate drop() is needed.
feature_columns = ["Survived", "Pclass", "Sex", "SibSp", "Parch", "Fare", "Age"]
# Explicit copy so the Sex assignment below works on an independent frame.
df = df[feature_columns].copy()
# Binary-encode Sex as a single 0/1 column ("male" -> 1, anything else -> 0).
# This is not one-hot encoding: no extra columns are created.
df["Sex"] = (df["Sex"] == "male").astype(int)

In [4]:
# Each missing value will be filled with the mean of that feature over its N_NEIGHBORS closest samples.
# NOTE(review): the features are not scaled before fitting, so wide-ranging columns such as Fare
# presumably dominate the neighbour distance - confirm whether scaling is wanted.
N_NEIGHBORS = 5
imputer = KNNImputer(n_neighbors=N_NEIGHBORS)
imputer.fit(df)

KNNImputer()

In [5]:
# transform() does not modify `df`; it returns the imputed values as a new NumPy array.
# Keep that result (wrapped in a DataFrame with the same columns and index) so the
# filled-in Age values are not thrown away once the cell output has been displayed.
imputed_values = imputer.transform(df)
df_imputed = pd.DataFrame(imputed_values, columns=df.columns, index=df.index)
# Display the raw imputed array, as before.
imputed_values

array([[ 0.    ,  3.    ,  1.    , ...,  0.    ,  7.25  , 22.    ],
       [ 1.    ,  1.    ,  0.    , ...,  0.    , 71.2833, 38.    ],
       [ 1.    ,  3.    ,  0.    , ...,  0.    ,  7.925 , 26.    ],
       ...,
       [ 0.    ,  3.    ,  0.    , ...,  2.    , 23.45  , 26.8   ],
       [ 1.    ,  1.    ,  1.    , ...,  0.    , 30.    , 26.    ],
       [ 0.    ,  3.    ,  1.    , ...,  0.    ,  7.75  , 32.    ]])

In [6]:
# `df` is the frame that was passed to the imputer, not the output of transform().
# transform() returns a new array instead of editing `df`, so Age still reports its
# missing values here; inspect the transform result to verify the imputation itself.
df.isna().sum()

Survived      0
Pclass        0
Sex           0
SibSp         0
Parch         0
Fare          0
Age         177
dtype: int64

In [None]:
#Advantages of the KNNImputer:
#Can be much more accurate than the mean, median or mode (it depends on the dataset).
#Disadvantages of the KNNImputer:
#Computationally expensive, as it stores the entire dataset in memory.
#Is quite sensitive to outliers, so imputed values may cause the model to not perform as well as possible.
#You have to specify the number of neighbors.