In [197]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelBinarizer

In [44]:
# Dataset location, resolved relative to the notebook's working directory.
csv_path: str = "diabetes.csv"
# Load the entire CSV into one DataFrame; the cells below all read from it.
data: pd.DataFrame = pd.read_csv(filepath_or_buffer=csv_path)

Data description:
- Pregnancies - number of times pregnant
- Glucose - plasma glucose concentration over 2 hours in an oral glucose tolerance test
- BloodPressure - diastolic blood pressure (mm Hg)
- SkinThickness - triceps skin fold thickness (mm)
- Insulin - 2-Hour serum insulin (mu U/ml)
- BMI - body mass index (weight in kg / (height in m)$^2$)
- DiabetesPedigreeFunction - diabetes pedigree function (a function which scores likelihood of diabetes based on family history)
- Age - age (years)
- Outcome - class variable (0 -> non-diabetic, 1 -> diabetic)

In [119]:
# Quick look at the loaded frame, one item per line: the first rows, the
# overall (rows, columns) shape, the first record as a raw array, a single
# scalar lookup, and the column labels.
print(
    data.head(),
    data.values.shape,
    data.values[0],
    data["Glucose"][0],
    data.columns,
    sep="\n",
)

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            2      138             62             35        0  33.6   
1            0       84             82             31      125  38.2   
2            0      145              0              0        0  44.2   
3            0      135             68             42      250  42.3   
4            1      139             62             41      480  40.7   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.127   47        1  
1                     0.233   23        0  
2                     0.630   31        1  
3                     0.365   24        1  
4                     0.536   21        0  
(2000, 9)
[2.00e+00 1.38e+02 6.20e+01 3.50e+01 0.00e+00 3.36e+01 1.27e-01 4.70e+01
 1.00e+00]
138
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [246]:
# Prints True if at least one cell anywhere in the frame is missing (NaN/None).
print(np.any(data.isna().values))

False


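The dataset contains no NaN or None values, so no handling of explicitly missing entries is required. Note, however, that the preview above shows zeros in columns such as BloodPressure, SkinThickness and Insulin; these may be placeholders for missing measurements and are worth checking before modelling.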

In [167]:
# Split the rows into three contiguous, non-overlapping sets in file order
# (no shuffling): the first third, the second third, and the remainder.
# Positional .iloc slicing is used instead of np.split because np.split on a
# DataFrame is routed through DataFrame.swapaxes, which newer pandas releases
# deprecate. The resulting rows and set sizes are the same as before.
third: int = len(data) // 3
data_train = data.iloc[:third]
data_test = data.iloc[third:2 * third]
data_validate = data.iloc[2 * third:]

# Sanity check: every row lands in exactly one of the three sets.
assert len(data_train) + len(data_test) + len(data_validate) == len(data)
print(len(data_train), len(data_test), len(data_validate))

666 666 668


In [183]:
# Show the training split as a raw array (NumPy abbreviates the printout),
# followed by its column labels.
print(data_train.values, data_train.columns, sep="\n")

# Fit a LabelBinarizer on the column *names* and one-hot encode them.
# NOTE(review): the input is data_train.columns, not the "Outcome" values,
# and encoded_labels is not read by any cell visible in this excerpt --
# confirm whether this step is still needed.
binarizer = LabelBinarizer()
encoded_labels: np.ndarray = binarizer.fit(data_train.columns).transform(data_train.columns)

[[2.00e+00 1.38e+02 6.20e+01 ... 1.27e-01 4.70e+01 1.00e+00]
 [0.00e+00 8.40e+01 8.20e+01 ... 2.33e-01 2.30e+01 0.00e+00]
 [0.00e+00 1.45e+02 0.00e+00 ... 6.30e-01 3.10e+01 1.00e+00]
 ...
 [9.00e+00 1.45e+02 8.00e+01 ... 6.37e-01 4.00e+01 1.00e+00]
 [6.00e+00 1.15e+02 6.00e+01 ... 2.45e-01 4.00e+01 1.00e+00]
 [1.00e+00 1.12e+02 8.00e+01 ... 2.17e-01 2.40e+01 0.00e+00]]
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')
<class 'numpy.ndarray'>


In [355]:
# Separate the training split into a feature matrix (every column except
# "Outcome") and the label vector.
x_train = data_train.drop(columns=["Outcome"]).values
y_train = data_train["Outcome"].values

# Small feed-forward classifier: one hidden ReLU layer with 14 units followed
# by a 2-unit softmax output layer.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(14, activation=tf.nn.relu),
    tf.keras.layers.Dense(2, activation=tf.nn.softmax),
])

# The labels are passed as plain class indices, which is the form the sparse
# variant of categorical cross-entropy expects (no one-hot encoding needed).
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=tf.optimizers.Adam(),
    metrics=["accuracy"],
)

# NOTE(review): no random seed is set in the visible cells and the features
# are fed in unscaled, so results will vary between runs -- confirm whether
# that is intended.
model.fit(x=x_train, y=y_train, epochs=3)

Train on 666 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f1251fb97f0>

In [324]:
# Feature matrix for the test split (label column removed).
x_test = data_test.drop(columns="Outcome").values
# Pass the array itself: a list argument to predict() denotes multiple model
# inputs, and this Sequential model takes a single input.
predictions = model.predict(x_test)

# Spot-check one test row: the predicted class (arg-max over the two softmax
# outputs) followed by the true label for the same row.
idx = 8
print(np.argmax(predictions[idx]))
print(data_test["Outcome"].values[idx])

1
0
