# ICE 5: Missing Data Imputation
### Nikita Tejwani
### HUDK 4051: Learning Analytics

In [1]:
#Import necessary packages
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.impute import SimpleImputer
from sklearn import linear_model
from sklearn.impute import KNNImputer

In [2]:
#Load data
#Fetch the iris dataset from scikit-learn, then wrap its measurement matrix in a
#DataFrame whose columns carry the dataset's feature names.
iris_bunch = datasets.load_iris()
iris = pd.DataFrame(columns = iris_bunch.feature_names, data = iris_bunch.data)
iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [3]:
#Simulating missing data
missing = 0.3   #fraction of rows whose sepal length is blanked out
seed = 42       #fixed seed so the same rows go missing on every Restart & Run All

target_col = 'sepal length (cm)'

#iris dataset with data missing completely at random (MCAR)
#30% of values missing in sepal length column; every row is equally likely to be picked
iris_mcar = iris.copy()
mcar_rows = iris_mcar.sample(frac = missing, random_state = seed).index
iris_mcar.loc[mcar_rows, target_col] = np.nan

#iris dataset with data missing at random (MAR)
#rows with larger values of sepal width are more likely to be sampled,
#so which sepal lengths go missing depends on another, fully observed column
iris_mar = iris.copy()
#a different seed keeps the MAR draw from reusing the MCAR random stream
mar_rows = iris_mar.sample(frac = missing, weights = 'sepal width (cm)', random_state = seed + 1).index
iris_mar.loc[mar_rows, target_col] = np.nan

In [4]:
#Deletion methods
sepal_length = 'sepal length (cm)'

#Listwise deletion: drop every row that contains a missing value
complete_rows = iris_mcar.dropna()
print(complete_rows)

#Pairwise deletion: take each column mean over the values that are present
#(Series.mean() skips NaN by default), then compare against the untouched data
mcar_mean = iris_mcar[sepal_length].mean()
mar_mean = iris_mar[sepal_length].mean()
original_mean = iris[sepal_length].mean()

print('\nMean sepal length, MCAR: ', mcar_mean)
print('\nMean sepal length, MAR: ', mar_mean)
print('\nMean sepal length, original: ', original_mean)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                  5.1               3.5                1.4               0.2
1                  4.9               3.0                1.4               0.2
2                  4.7               3.2                1.3               0.2
3                  4.6               3.1                1.5               0.2
8                  4.4               2.9                1.4               0.2
..                 ...               ...                ...               ...
144                6.7               3.3                5.7               2.5
146                6.3               2.5                5.0               1.9
147                6.5               3.0                5.2               2.0
148                6.2               3.4                5.4               2.3
149                5.9               3.0                5.1               1.8

[105 rows x 4 columns]

Mean sepal length, MCAR:  5.93999999999

In [9]:
#Imputation Methods

#Simple Imputation
#every missing value is replaced by the mean of its column
imp = SimpleImputer(missing_values = np.nan, strategy = 'mean')
simple_values = imp.fit_transform(iris_mar)
#store the imputed result; previously the transform output was only printed and
#iris_simple_imp kept its NaN values
iris_simple_imp = pd.DataFrame(simple_values, columns = iris_mar.columns, index = iris_mar.index)
print(simple_values)

#Regression Imputation
#predict the missing sepal lengths from sepal width with a linear model
iris_regression = iris_mar.copy()
iris_model = iris_mar.dropna()   #complete rows only, used to fit the model

#model creation
model = linear_model.LinearRegression()
x = iris_model['sepal width (cm)'].values.reshape(-1, 1)
y = iris_model['sepal length (cm)'].values.reshape(-1, 1)
model.fit(X = x, y = y)

#find and replace null values
null_index = iris_regression['sepal length (cm)'].isnull()
if null_index.any():   #predict() raises on an empty selection, so skip when nothing is missing
    imp_values = model.predict(iris_regression.loc[null_index, 'sepal width (cm)'].values.reshape(-1, 1))
    #predictions come back as an (n, 1) column; flatten before writing into the Series
    iris_regression.loc[null_index, 'sepal length (cm)'] = imp_values.ravel()
print(iris_regression)

#KNN Imputation
#each missing value is filled from the 2 nearest rows, weighted equally
knn_imputer = KNNImputer(n_neighbors = 2, weights = 'uniform')
knn_values = knn_imputer.fit_transform(iris_mar)
#store the imputed result so iris_knn no longer contains NaN
iris_knn = pd.DataFrame(knn_values, columns = iris_mar.columns, index = iris_mar.index)
knn_values

[[5.1        3.5        1.4        0.2       ]
 [4.9        3.         1.4        0.2       ]
 [4.7        3.2        1.3        0.2       ]
 [4.6        3.1        1.5        0.2       ]
 [5.         3.6        1.4        0.2       ]
 [5.86571429 3.9        1.7        0.4       ]
 [4.6        3.4        1.4        0.3       ]
 [5.86571429 3.4        1.5        0.2       ]
 [4.4        2.9        1.4        0.2       ]
 [5.86571429 3.1        1.5        0.1       ]
 [5.4        3.7        1.5        0.2       ]
 [5.86571429 3.4        1.6        0.2       ]
 [5.86571429 3.         1.4        0.1       ]
 [5.86571429 3.         1.1        0.1       ]
 [5.8        4.         1.2        0.2       ]
 [5.86571429 4.4        1.5        0.4       ]
 [5.4        3.9        1.3        0.4       ]
 [5.1        3.5        1.4        0.3       ]
 [5.86571429 3.8        1.7        0.3       ]
 [5.1        3.8        1.5        0.3       ]
 [5.86571429 3.4        1.7        0.2       ]
 [5.1        

array([[5.1 , 3.5 , 1.4 , 0.2 ],
       [4.9 , 3.  , 1.4 , 0.2 ],
       [4.7 , 3.2 , 1.3 , 0.2 ],
       [4.6 , 3.1 , 1.5 , 0.2 ],
       [5.  , 3.6 , 1.4 , 0.2 ],
       [5.1 , 3.9 , 1.7 , 0.4 ],
       [4.6 , 3.4 , 1.4 , 0.3 ],
       [5.15, 3.4 , 1.5 , 0.2 ],
       [4.4 , 2.9 , 1.4 , 0.2 ],
       [4.7 , 3.1 , 1.5 , 0.1 ],
       [5.4 , 3.7 , 1.5 , 0.2 ],
       [5.15, 3.4 , 1.6 , 0.2 ],
       [4.65, 3.  , 1.4 , 0.1 ],
       [4.85, 3.  , 1.1 , 0.1 ],
       [5.8 , 4.  , 1.2 , 0.2 ],
       [5.6 , 4.4 , 1.5 , 0.4 ],
       [5.4 , 3.9 , 1.3 , 0.4 ],
       [5.1 , 3.5 , 1.4 , 0.3 ],
       [5.1 , 3.8 , 1.7 , 0.3 ],
       [5.1 , 3.8 , 1.5 , 0.3 ],
       [5.15, 3.4 , 1.7 , 0.2 ],
       [5.1 , 3.7 , 1.5 , 0.4 ],
       [4.95, 3.6 , 1.  , 0.2 ],
       [5.1 , 3.3 , 1.7 , 0.5 ],
       [5.1 , 3.4 , 1.9 , 0.2 ],
       [5.  , 3.  , 1.6 , 0.2 ],
       [5.25, 3.4 , 1.6 , 0.4 ],
       [5.2 , 3.5 , 1.5 , 0.2 ],
       [5.2 , 3.4 , 1.4 , 0.2 ],
       [4.7 , 3.2 , 1.6 , 0.2 ],
       [4.