In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw diabetes dataset from the project's datasets/ folder and
# preview the first rows to confirm the columns parsed as expected.
diabetes = pd.read_csv('datasets/diabetes.csv')
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# (rows, columns) of the loaded frame.
diabetes.shape

(768, 9)

In [4]:
# Column dtypes and non-null counts; missing values only show up here if they
# are encoded as NaN in the file.
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
# Summary statistics, transposed so that each feature is one row.
# The `min` column is the one to watch: the next cell treats a value of 0 in
# Glucose, BloodPressure, SkinThickness, Insulin and BMI as missing.
diabetes.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [6]:
# Zeros in these measurement columns are treated as missing-value placeholders,
# so convert them to NaN for the null counts and imputation steps that follow.
#
# The result is assigned back to the frame instead of calling
# `diabetes[col].replace(..., inplace=True)`: that form mutates a temporary
# Series selected from the frame, which triggers a chained-assignment warning
# in recent pandas and leaves `diabetes` unchanged once Copy-on-Write is
# active. Assigning back is also idempotent, so the cell can be re-run safely.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
diabetes[zero_as_missing_cols] = diabetes[zero_as_missing_cols].replace(0, np.nan)

In [7]:
# Count NaNs per column now that zero placeholders have been converted.
diabetes.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [8]:
# Preview the 2-D, single-column layout (n_rows, 1) that the imputation cells
# below build from a column before passing it to SimpleImputer.
arr = diabetes['SkinThickness'].values[:, np.newaxis]
arr.shape

(768, 1)

In [9]:
from sklearn.impute import SimpleImputer

In [10]:
# Fill missing SkinThickness entries with the column's most frequent value.
# The imputer is fitted and applied on the same single-column array, so the
# two steps are combined into one fit_transform call.
skin_thickness_2d = diabetes['SkinThickness'].values.reshape(-1, 1)
imp = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
diabetes['SkinThickness'] = imp.fit_transform(skin_thickness_2d)

In [11]:
# Inspect the SkinThickness distribution after imputation.
diabetes['SkinThickness'].describe()

count    768.000000
mean      29.994792
std        8.886506
min        7.000000
25%       25.000000
50%       32.000000
75%       32.000000
max       99.000000
Name: SkinThickness, dtype: float64

In [12]:
# Re-check per-column NaN counts after imputing SkinThickness.
diabetes.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness                 0
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [13]:
# Fill missing Glucose entries with the median of the observed values.
# Fit and apply happen on the same single-column array, hence fit_transform.
glucose_2d = diabetes['Glucose'].values.reshape(-1, 1)
imp = SimpleImputer(strategy='median', missing_values=np.nan)
diabetes['Glucose'] = imp.fit_transform(glucose_2d)

In [14]:
# Inspect the Glucose distribution after imputation.
diabetes['Glucose'].describe()

count    768.000000
mean     121.656250
std       30.438286
min       44.000000
25%       99.750000
50%      117.000000
75%      140.250000
max      199.000000
Name: Glucose, dtype: float64

In [15]:
# Re-check per-column NaN counts after imputing Glucose.
diabetes.isnull().sum()

Pregnancies                   0
Glucose                       0
BloodPressure                35
SkinThickness                 0
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [16]:
# Fill missing BloodPressure entries with the mean of the observed values.
# Fit and apply happen on the same single-column array, hence fit_transform.
blood_pressure_2d = diabetes['BloodPressure'].values.reshape(-1, 1)
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
diabetes['BloodPressure'] = imp.fit_transform(blood_pressure_2d)

In [17]:
# Inspect the BloodPressure distribution after imputation.
diabetes['BloodPressure'].describe()

count    768.000000
mean      72.405184
std       12.096346
min       24.000000
25%       64.000000
50%       72.202592
75%       80.000000
max      122.000000
Name: BloodPressure, dtype: float64

In [18]:
# Re-check per-column NaN counts after imputing BloodPressure.
diabetes.isnull().sum()

Pregnancies                   0
Glucose                       0
BloodPressure                 0
SkinThickness                 0
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [19]:
# Fill missing BMI entries with a fixed constant instead of a statistic that
# the imputer computes from the data.
# NOTE(review): 32 matches the BMI median reported by describe() earlier in
# the notebook (32.0), which is presumably why it was chosen -- confirm.
bmi_fill_value = 32
bmi_2d = diabetes['BMI'].values.reshape(-1, 1)
imp = SimpleImputer(strategy='constant', fill_value=bmi_fill_value, missing_values=np.nan)
diabetes['BMI'] = imp.fit_transform(bmi_2d)


In [20]:
# Inspect the BMI distribution after imputation.
diabetes['BMI'].describe()

count    768.000000
mean      32.450911
std        6.875366
min       18.200000
25%       27.500000
50%       32.000000
75%       36.600000
max       67.100000
Name: BMI, dtype: float64

In [21]:
# Re-check per-column NaN counts after imputing BMI; Insulin has not been
# imputed by any cell above.
diabetes.isnull().sum()

Pregnancies                   0
Glucose                       0
BloodPressure                 0
SkinThickness                 0
Insulin                     374
BMI                           0
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [23]:
# Persist the partially cleaned frame, omitting the index column.
# Insulin is never imputed in this notebook, so the saved file still contains
# NaNs in that column -- presumably the reason for the "incomplete" file name.
diabetes.to_csv('datasets/diabetes_processed_incomplete.csv', index=False)