Univariate feature imputation

Univariate imputation fills in the missing values of a feature using only the non-missing values of that same feature (and no other features).

The SimpleImputer class provides basic strategies for imputing missing values. Missing values can be imputed with a provided constant value, or with a statistic (mean, median, or most frequent value) of the column in which they are located. The class also supports different missing-value encodings.

In [1]:
import numpy as np
from sklearn.impute import SimpleImputer

# Toy 4x2 matrix with a single missing entry (row 1, column 0).
x = np.array(
    [
        [1, 2],
        [np.nan, 3],
        [7, 6],
        [1, 6],
    ]
)
print("Original data: \n", x)

# Mean imputation: the NaN is replaced by the mean of the observed values
# in its own column.
imp = SimpleImputer(strategy="mean", missing_values=np.nan)
transformed_x = imp.fit(x).transform(x)
print("Transformed data (mean imputation): \n", transformed_x)

Original data: 
 [[ 1.  2.]
 [nan  3.]
 [ 7.  6.]
 [ 1.  6.]]
Transformed data (mean imputation): 
 [[1. 2.]
 [3. 3.]
 [7. 6.]
 [1. 6.]]


In [2]:
# Same toy matrix as before, rebuilt so this cell runs on its own.
x = np.array(
    [
        [1, 2],
        [np.nan, 3],
        [7, 6],
        [1, 6],
    ]
)
print("Original data: \n", x)

# Median imputation: the NaN is replaced by the median of the observed
# values in its own column.
imp = SimpleImputer(strategy="median", missing_values=np.nan)
transformed_x = imp.fit(x).transform(x)
print("Transformed data (median imputation): \n", transformed_x)

Original data: 
 [[ 1.  2.]
 [nan  3.]
 [ 7.  6.]
 [ 1.  6.]]
Transformed data (median imputation): 
 [[1. 2.]
 [1. 3.]
 [7. 6.]
 [1. 6.]]


In [3]:
# Same toy matrix as before, rebuilt so this cell runs on its own.
x = np.array(
    [
        [1, 2],
        [np.nan, 3],
        [7, 6],
        [1, 6],
    ]
)
print("Original data: \n", x)

# Constant imputation: every NaN is replaced by the user-supplied
# fill_value, independent of the other values in the column.
imp = SimpleImputer(
    strategy="constant",
    fill_value=17,
    missing_values=np.nan,
)
transformed_x = imp.fit(x).transform(x)
print("Transformed data (constant imputation): \n", transformed_x)

Original data: 
 [[ 1.  2.]
 [nan  3.]
 [ 7.  6.]
 [ 1.  6.]]
Transformed data (constant imputation): 
 [[ 1.  2.]
 [17.  3.]
 [ 7.  6.]
 [ 1.  6.]]


In [4]:
# Same toy matrix as before, rebuilt so this cell runs on its own.
x = np.array(
    [
        [1, 2],
        [np.nan, 3],
        [7, 6],
        [1, 6],
    ]
)
print("Original data: \n", x)

# Most-frequent imputation: the NaN is replaced by the value that occurs
# most often among the observed values in its own column.
imp = SimpleImputer(strategy="most_frequent", missing_values=np.nan)
transformed_x = imp.fit(x).transform(x)
print("Transformed data (mostfrequent value imputation): \n", transformed_x)

Original data: 
 [[ 1.  2.]
 [nan  3.]
 [ 7.  6.]
 [ 1.  6.]]
Transformed data (mostfrequent value imputation): 
 [[1. 2.]
 [1. 3.]
 [7. 6.]
 [1. 6.]]


In [5]:
# Univariate imputation with nominal values -- first try (expected to fail).
#
# No dtype is given, so NumPy coerces the mixed list into a single string
# array; np.nan is stored as the text 'nan' (visible in the printed array)
# rather than as a missing-value marker.
x_string= np.array([["Mike", 2], [np.nan, 3], ["Peter", 6],["Peter", 6]])
print("Original data: \n",x_string)

# Most-frequent-value imputation using strategy='most_frequent'
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
try:
    transformed_x_string=imp.fit_transform(x_string)
except ValueError as err:
    # The error is the point of this cell: SimpleImputer rejects the string
    # dtype. Catch and display it so "Restart & Run All" continues to the
    # following cells instead of stopping here.
    print(type(err).__name__ + ":", err)
else:
    # NOTE(review): only reached if the installed scikit-learn version accepts
    # this dtype -- the recorded run raised ValueError instead.
    print("Transformed data (mostfrequent value imputation): \n",transformed_x_string)

Original data: 
 [['Mike' '2']
 ['nan' '3']
 ['Peter' '6']
 ['Peter' '6']]


ValueError: SimpleImputer does not support data with dtype <U32. Please provide either a numeric array (with a floating point or integer dtype) or categorical data represented either as an array with integer dtype or an array of string values with an object dtype.

In [6]:
# Univariate imputation with string values -- second try.
# dtype="object" keeps each element as its original Python object, so the
# names stay strings, the counts stay ints, and np.nan stays a real NaN.
x_string = np.array(
    [
        ["Mike", 2],
        [np.nan, 3],
        ["Peter", 6],
        ["Peter", 6],
    ],
    dtype="object",
)
print("Original data: \n", x_string)

# Most-frequent imputation: the NaN is replaced by the most common value
# in its own column.
imp = SimpleImputer(strategy="most_frequent", missing_values=np.nan)
transformed_x_string = imp.fit(x_string).transform(x_string)
print("Transformed data (most frequent value imputation): \n", transformed_x_string)

# Arithmetic on an element of the numeric column still works after imputation.
print(
    "Int values keep their type: ",
    transformed_x_string[1, 1],
    " + 4 = ",
    transformed_x_string[1, 1] + 4,
)

Original data: 
 [['Mike' 2]
 [nan 3]
 ['Peter' 6]
 ['Peter' 6]]
Transformed data (most frequent value imputation): 
 [['Mike' 2]
 ['Peter' 3]
 ['Peter' 6]
 ['Peter' 6]]
Int values keep their type:  3  + 4 =  7


In [7]:
# Univariate imputation with nominal values, this time filling with a
# constant instead of the most frequent value.
x_string = np.array(
    [
        ["Mike", 2],
        [np.nan, 3],
        ["Peter", 6],
        ["Peter", 6],
    ],
    dtype="object",
)
print("Original data: \n", x_string)

# Constant imputation: every NaN is replaced by the supplied fill_value.
imp = SimpleImputer(
    strategy="constant",
    fill_value="Hugo",
    missing_values=np.nan,
)
transformed_x_string = imp.fit(x_string).transform(x_string)
print("Transformed data (constant value imputation): \n", transformed_x_string)

Original data: 
 [['Mike' 2]
 [nan 3]
 ['Peter' 6]
 ['Peter' 6]]
Transformed data (constant value imputation): 
 [['Mike' 2]
 ['Hugo' 3]
 ['Peter' 6]
 ['Peter' 6]]
