## Missing data Imputation 

## Imports

In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

## Load data

In [6]:
dataset = pd.read_csv('D:/data/csv/diabetes.csv', header=None)
data = dataset.iloc[2:]
data = pd.DataFrame(data)
data = data.apply(pd.to_numeric)
print(data.dtypes)
data.head()

0      int64
1      int64
2      int64
3      int64
4      int64
5    float64
6    float64
7      int64
8      int64
dtype: object


Unnamed: 0,0,1,2,3,4,5,6,7,8
2,1,85,66,29,0,26.6,0.351,31,0
3,8,183,64,0,0,23.3,0.672,32,1
4,1,89,66,23,94,28.1,0.167,21,0
5,0,137,40,35,168,43.1,2.288,33,1
6,5,116,74,0,0,25.6,0.201,30,0


## Data Analysis

#### Primary analisis

In [9]:
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,767.0,762.0,732.0,540.0,394.0,756.0,767.0,767.0,767.0
mean,3.842243,121.652231,72.405738,29.142593,155.548223,32.455952,0.471674,33.219035,0.34811
std,3.370877,30.540786,12.390616,10.483667,118.775855,6.929448,0.331497,11.752296,0.476682
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,22.0,76.25,27.5,0.2435,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.371,29.0,0.0
75%,6.0,140.75,80.0,36.0,190.0,36.6,0.625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


#### Identify zeros

In [11]:
(data[[1,2,3,4,5]] == 0).sum()

1    0
2    0
3    0
4    0
5    0
dtype: int64

#### Marking

In [12]:
data[[1,2,3,4,5]] = data[[1,2,3,4,5]].replace(0, np.NaN)
print(data.isnull().sum()) # NaNs by column
data.head(20)

0      0
1      5
2     35
3    227
4    373
5     11
6      0
7      0
8      0
dtype: int64


Unnamed: 0,0,1,2,3,4,5,6,7,8
2,1,85.0,66.0,29.0,,26.6,0.351,31,0
3,8,183.0,64.0,,,23.3,0.672,32,1
4,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
5,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
6,5,116.0,74.0,,,25.6,0.201,30,0
7,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
8,10,115.0,,,,35.3,0.134,29,0
9,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
10,8,125.0,96.0,,,,0.232,54,1
11,4,110.0,92.0,,,37.6,0.191,30,0


## Strategy 1 : Remove rows with missing values

In [16]:
print(data.shape)
data.dropna(inplace=True)
print(data.shape)

(392, 9)
(392, 9)


#### Strategy 2 : Impute Missing Values (mean)

In [18]:
data.fillna(data.mean(), inplace=True)
print(data.isnull().sum()) # count NaNs by column
data.head(20)

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
dtype: int64


Unnamed: 0,0,1,2,3,4,5,6,7,8
4,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
5,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
7,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
9,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
14,1,189.0,60.0,23.0,846.0,30.1,0.398,59,1
15,5,166.0,72.0,19.0,175.0,25.8,0.587,51,1
17,0,118.0,84.0,47.0,230.0,45.8,0.551,31,1
19,1,103.0,30.0,38.0,83.0,43.3,0.183,33,0
20,1,115.0,70.0,30.0,96.0,34.6,0.529,32,1
21,3,126.0,88.0,41.0,235.0,39.3,0.704,27,0


#### Strategy 3 :  LDA imputation (to review)

In [19]:
# split dataset into inputs and outputs
values = data.values
X = values[:,0:8]
y = values[:,8]

# fill missing values with mean column values
imputer = SimpleImputer()
transfX = imputer.fit_transform(X)

# evaluate an LDA model on the dataset using k-fold cross validation
model = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=3, random_state=7)
result = cross_val_score(model, transfX, y, cv=kfold, scoring='accuracy')
print(result.mean())

0.7858289293403797


## Credits & Links