## https://towardsdatascience.com/missing-value-imputation-with-python-and-k-nearest-neighbors-308e7abd273d

In [10]:
import pandas as pd
import numpy as np
df = pd.read_csv(r'files\realestate.csv')
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [11]:
i1 = np.random.choice(a=df.index, size=35)
i2 = np.random.choice(a=df.index, size=20)

In [12]:
i1

array([400, 227, 431, 440, 197, 333, 204, 509, 506, 496, 159,  75, 118,
       157, 379, 206, 328,   7, 339, 279, 303, 506, 269, 238, 140,  48,
       388, 481, 470, 201, 104, 181, 244, 428, 397], dtype=int64)

In [13]:
i2

array([290, 241, 422, 447,  52, 238, 203,  40, 180, 254, 287, 106, 144,
       347,  88, 269, 146, 456,  98, 483], dtype=int64)

In [14]:
df.loc[i1, 'INDUS'] = np.nan
df.loc[i2, 'TAX'] = np.nan

In [15]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [16]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
imputed = imputer.fit_transform(df)
df_imputed = pd.DataFrame(imputed, columns=df.columns)

## Kijken of er lege waardes in zitten.
### Kolom INDUS en Tax heeft blijkbaar nog lege waardes

In [21]:
df.isna().any()

CRIM       False
ZN         False
INDUS       True
CHAS       False
NOX        False
RM          True
AGE        False
DIS        False
RAD        False
TAX         True
PTRATIO    False
B          False
LSTAT      False
MEDV       False
dtype: bool

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

rmse = lambda y, yhat: np.sqrt(mean_squared_error(y, yhat))

In [25]:
def optimize_k(data, target):
    errors = []
    for k in range(1, 20, 2):
        imputer = KNNImputer(n_neighbors=k)
        imputed = imputer.fit_transform(data)
        df_imputed = pd.DataFrame(imputed, columns=df.columns)
        
        X = df_imputed.drop(target, axis=1)
        y = df_imputed[target]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        model = RandomForestRegressor()
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        error = rmse(y_test, preds)
        errors.append({'K': k, 'RMSE': error})
        
    return errors

In [26]:
k_errors = optimize_k(data=df, target='MEDV')

In [27]:
k_errors

[{'K': 1, 'RMSE': 5.060415691509325},
 {'K': 3, 'RMSE': 5.061987670348208},
 {'K': 5, 'RMSE': 5.018888473044045},
 {'K': 7, 'RMSE': 5.25550292273478},
 {'K': 9, 'RMSE': 5.3015622372410025},
 {'K': 11, 'RMSE': 5.1404245149768295},
 {'K': 13, 'RMSE': 5.0478957723149716},
 {'K': 15, 'RMSE': 4.945234725009209},
 {'K': 17, 'RMSE': 5.102840766033677},
 {'K': 19, 'RMSE': 5.24099213815791}]