In [185]:
import pandas as pd

In [186]:
# Load the UCI credit-screening file straight from the archive.
# The file has no header row, and "?" entries are read as NaN.
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data',
    header=None,
    na_values=["?"],
)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [188]:
# Report the (rows, columns) tuple of the loaded frame.
frame_shape = data.shape
print("Shape: ", frame_shape)

Shape:  (690, 16)


In [189]:
# Replace the positional column labels with A1..A16, then summarise
# dtypes and non-null counts per column.
data.columns = [f"A{position}" for position in range(1, 17)]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      678 non-null    object 
 1   A2      678 non-null    float64
 2   A3      690 non-null    float64
 3   A4      684 non-null    object 
 4   A5      684 non-null    object 
 5   A6      681 non-null    object 
 6   A7      681 non-null    object 
 7   A8      690 non-null    float64
 8   A9      690 non-null    object 
 9   A10     690 non-null    object 
 10  A11     690 non-null    int64  
 11  A12     690 non-null    object 
 12  A13     690 non-null    object 
 13  A14     677 non-null    float64
 14  A15     690 non-null    int64  
 15  A16     690 non-null    object 
dtypes: float64(4), int64(2), object(10)
memory usage: 86.4+ KB


In [190]:
# Preview the first rows that contain at least one missing value.
row_has_missing = data.isna().any(axis=1)
data.loc[row_has_missing].head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
71,b,34.83,4.0,u,g,d,bb,12.5,t,f,0,t,g,,0,-
83,a,,3.5,u,g,d,v,3.0,t,f,0,t,g,300.0,0,-
86,b,,0.375,u,g,d,v,0.875,t,f,0,t,s,928.0,0,-
92,b,,5.0,y,p,aa,v,8.5,t,f,0,f,g,0.0,0,-
97,b,,0.5,u,g,c,bb,0.835,t,f,0,t,s,320.0,0,-


In [191]:
# Number of missing values in each column.
data.isna().sum(axis=0)

A1     12
A2     12
A3      0
A4      6
A5      6
A6      9
A7      9
A8      0
A9      0
A10     0
A11     0
A12     0
A13     0
A14    13
A15     0
A16     0
dtype: int64

### Iterative Imputation with sklearn

In [192]:
# Features: only the float64/int64 columns. Target: column A16.
X = data.select_dtypes(
    include=["float64", "int64"],
)
Y = data["A16"]

In [193]:
# Missing values per numeric feature column.
X.isna().sum()

A2     12
A3      0
A8      0
A11     0
A14    13
A15     0
dtype: int64

In [194]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2020,
)

In [235]:
# Show the per-column missing-value counts for each split.
for split_header, split_frame in (("Train:\n", xtrain), ("Test:\n", xtest)):
    print(split_header, split_frame.isna().sum())

Train:
 A2     10
A3      0
A8      0
A11     0
A14    12
A15     0
dtype: int64
Test:
 A2     2
A3     0
A8     0
A11    0
A14    1
A15    0
dtype: int64


### Create an imputation model, fit it, then produce the imputed results

In [238]:
from sklearn.linear_model import BayesianRidge
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Fit ONE imputation model, on the training split only, and reuse it to
# transform the test split. Fitting a second imputer on the test rows learns
# different regression parameters for each split, so train and test end up
# preprocessed inconsistently; it also cannot be reproduced in production,
# where unseen rows arrive without a dataset to fit on.
imputer_train = IterativeImputer(estimator=BayesianRidge(), max_iter=10, random_state=0)

# `imputer_test` is kept as an alias of the train-fitted model so that any
# code referring to that name still works.
imputer_test = imputer_train

# NOTE: despite the `y_` prefix, these frames hold imputed *features*, not targets.
# The original row labels are passed through explicitly: by default the imputer
# returns an array without labels, and without `index=` the rows would be
# renumbered 0..n-1 and could no longer be matched back to `data`.
y_train_predicted = pd.DataFrame(
    imputer_train.fit_transform(xtrain),
    columns=xtrain.columns,
    index=xtrain.index,
)
y_test_predicted = pd.DataFrame(
    imputer_test.transform(xtest),
    columns=xtest.columns,
    index=xtest.index,
)

### Update NaN values in the numeric columns of the original dataset with the imputed ones

In [248]:
# Stitch the two imputed splits back together in the ORIGINAL row order.
# train_test_split shuffles rows by default, so concatenating train + test and
# renumbering 0..n-1 (ignore_index=True) would pair each imputed row with an
# unrelated row of `data` when the values are written back by label.
# set_axis re-attaches the labels each split had before imputation (a no-op if
# they are already present); reindex then restores the row order of `X`.
imputedValues = pd.concat(
    [
        y_train_predicted.set_axis(xtrain.index, axis=0),
        y_test_predicted.set_axis(xtest.index, axis=0),
    ]
).reindex(X.index)
imputedValues.index.unique()

RangeIndex(start=0, stop=690, step=1)

In [250]:
# Work on a copy so the frame loaded earlier is left untouched.
dataCopy = data.copy()
# Overwrite exactly the rows and columns that `imputedValues` covers.
target_rows = imputedValues.index
target_cols = imputedValues.columns
dataCopy.loc[target_rows, target_cols] = imputedValues
# Remaining missing values per column.
dataCopy.isna().sum()

A1     12
A2      0
A3      0
A4      6
A5      6
A6      9
A7      9
A8      0
A9      0
A10     0
A11     0
A12     0
A13     0
A14     0
A15     0
A16     0
dtype: int64