In [1]:
#as per https://towardsdatascience.com/6-different-ways-to-compensate-for-missing-values-data-imputation-with-examples-6022d9ca0779
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from math import sqrt
import random
import numpy as np
random.seed(0)

In [3]:
#Fetch the dataset
import pandas as pd
dataset = fetch_california_housing()
type(dataset)

sklearn.utils.Bunch

In [4]:
train, target = pd.DataFrame(dataset.data), pd.DataFrame(dataset.target)
train.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24


In [5]:
target.head(3)

Unnamed: 0,0
0,4.526
1,3.585
2,3.521


In [6]:
train.columns = ['0','1','2','3','4','5','6','7']
train.insert(loc=len(train.columns), column='target', value=target)
train.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521


In [7]:
#Randomly replace 40% of the first column with NaN values
#grab first column
column = train['0']
print(column.size)

20640


In [8]:
missing_pct = int(column.size * 0.4)
i = [random.choice(range(column.shape[0])) for _ in range(missing_pct)]
column[i] = np.NaN
print(column.shape[0])
train.head()

20640


Unnamed: 0,0,1,2,3,4,5,6,7,target
0,,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [9]:
#Impute the values using scikit-learn SimpleImpute Class
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer( strategy='mean') #for median imputation replace 'mean' with 'median'

In [10]:
imp_mean.fit(train)
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,target
0,,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [11]:
imputed_train_df = imp_mean.transform(train)
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,target
0,,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


# Simple Imputer

In [12]:
#Impute the values using scikit-learn SimpleImpute Class
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer( strategy='most_frequent')
imp_mean.fit(train)
imputed_train_df = imp_mean.transform(train)

In [13]:
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,target
0,,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


# k-NN

In [17]:
import sys
from impyute.imputation.cs import fast_knn
sys.setrecursionlimit(100000) #Increase the recursion limit of the OS

# start the KNN training
imputed_training=fast_knn(train.values, k=30)
print(imputed_training)

[[   3.30117882   41.            6.98412698 ...   37.88
  -122.23          4.526     ]
 [   8.3014       21.            6.23813708 ...   37.86
  -122.22          3.585     ]
 [   4.31326938   52.            8.28813559 ...   37.85
  -122.24          3.521     ]
 ...
 [   1.7          17.            5.20554273 ...   39.43
  -121.22          0.923     ]
 [   3.40189346   18.            5.32951289 ...   39.43
  -121.32          0.847     ]
 [   2.3886       16.            5.25471698 ...   39.37
  -121.24          0.894     ]]


# MICE

In [18]:
from impyute.imputation.cs import mice

# start the MICE training
imputed_training=mice(train.values)

# DataWig

# Stochastic Regression Imputation

# Hot Deck Imputation