In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [10]:
df_diabetes = pd.read_csv('./diabetes.csv')

In [12]:
features = list(df_diabetes.columns.values)
features.remove('Outcome')
print(features)
X = df_diabetes[features]
y = df_diabetes['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

print(X_train.shape)
print(X_test.shape)

['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
(576, 8)
(192, 8)


In [20]:
def check_zero_entries(data, fields):
    """ List number of 0-entries in each of the given fields"""
    for field in fields:
        print('field %s: num 0-entries: %d' % (field, len(data.loc[ data[field] == 0, field ])))

In [21]:
# lets fix the 0-entry for a field in the dataset with its mean value
def impute_zero_field_by_mean(data, field):
    nonzero_vals = data.loc[data[field] != 0, field]
    mean = np.mean(nonzero_vals)
    k = len(data.loc[ data[field] == 0, field])   # num of 0-entries
    data.loc[ data[field] == 0, field ] = mean
    print('Field: %s; fixed %d entries with value: %.3f' % (field, k, mean))

In [22]:
# lets fix the 0-entry for a field in the dataset with its mean value
def impute_zero_field_by_medium(data, field):
    nonzero_vals = data.loc[data[field] != 0, field]
    median = np.median(nonzero_vals)
    k = len(data.loc[ data[field] == 0, field])   # num of 0-entries
    data.loc[ data[field] == 0, field ] = median
    print('Field: %s; fixed %d entries with value: %.3f' % (field, k, median))

In [24]:
for field in ['Glucose', 'BloodPressure']:
    impute_zero_field_by_mean(X_train, field)

for field in ['SkinThickness', 'Insulin', 'BMI']:
    impute_zero_field_by_medium(X_train, field)


Field: Glucose; fixed 4 entries with value: 122.003
Field: BloodPressure; fixed 25 entries with value: 72.846
Field: SkinThickness; fixed 163 entries with value: 30.000
Field: Insulin; fixed 270 entries with value: 127.500
Field: BMI; fixed 8 entries with value: 32.500


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [26]:
check_zero_entries(X_train, ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'])

field Glucose: num 0-entries: 0
field BloodPressure: num 0-entries: 0
field SkinThickness: num 0-entries: 0
field Insulin: num 0-entries: 0
field BMI: num 0-entries: 0


In [27]:
for field in ['Glucose', 'BloodPressure']:
    impute_zero_field_by_mean(X_test, field)

for field in ['SkinThickness', 'Insulin', 'BMI']:
    impute_zero_field_by_medium(X_test, field)

Field: Glucose; fixed 1 entries with value: 120.738
Field: BloodPressure; fixed 10 entries with value: 71.071
Field: SkinThickness; fixed 64 entries with value: 28.500
Field: Insulin; fixed 104 entries with value: 115.000
Field: BMI; fixed 3 entries with value: 31.200


In [28]:
check_zero_entries(X_test, ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'])

field Glucose: num 0-entries: 0
field BloodPressure: num 0-entries: 0
field SkinThickness: num 0-entries: 0
field Insulin: num 0-entries: 0
field BMI: num 0-entries: 0


In [38]:
print(X_train.shape)
print(X_test.shape)
print(y_train.size)
print(y_test.size)

(576, 8)
(192, 8)
576
192


In [35]:
X_train.to_csv('train/X_train.csv')
y_train.to_csv('train/y_train.csv')
X_test.to_csv('test/X_test.csv')
y_test.to_csv('test/y_test.csv')

In [39]:
test = pd.read_csv('train/X_train.csv')
print(test.shape)

(576, 9)
