In [197]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelBinarizer

## Reading input

In [44]:
csv_path: str = "diabetes.csv"
data: pd.DataFrame = pd.read_csv(csv_path)

## Data description:
- Pregnancies - number of times pregnant
- Glucose - plasma glucose concentration over 2 hours in an oral glucose tolerance test            	|
- BloodPressure - diastolic blood pressure (mm Hg)
- SkinThickness - triceps skin fold thickness (mm)
- Insulin - 2-Hour serum insulin (mu U/ml)
- BMI - body mass index (weight in kg/(height in m)2)
- DiabetesPedigreeFunction - diabetes pedigree function (a function which scores likelihood of diabetes based on family history)
- Age - age (years)
- Outcome - class variable (0 -> non-diabetic, 1 -> diabetic)

In [422]:
print(data.values.shape)
print(data.head())

(2000, 9)
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            2       11             62             35        0  33.6   
1            0       84             82             31      125  38.2   
2            0      145              0              0        0  44.2   
3            0      135             68             42      250  42.3   
4            1      139             62             41      480  40.7   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.127   47        1  
1                     0.233   23        0  
2                     0.630   31        1  
3                     0.365   24        1  
4                     0.536   21        0  


## Checking for mismatched data

In [423]:
print(data.isna().values.any())
# No NaN and None variables - no further data processing is required.

False


## Spliting the data

In [416]:
# split data into 3 sets
data_train, data_test, data_validate  = np.split(data, [int(len(data)/3), int(len(data)/3) * 2])
print(len(data_train), len(data_test), len(data_validate))

666 666 668


## Selecting dependend and independend variables

In [417]:
y_column = "Outcome"
x_columns = list(set(list(data_train.columns)) - set(y_column))
# x_train = data_train.drop(columns="Outcome").values
# y_train = data_train["Outcome"].values

## Data normalization
Normalization the data by scaling between 0 and 1.<br/>
Dividing each value by its column max value.

In [446]:
data[x_columns] = data[x_columns] / data[x_columns].max()
print(data.head())

   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0     0.117647  0.055276       0.508197       0.318182  0.000000  0.416873   
1     0.000000  0.422111       0.672131       0.281818  0.168011  0.473945   
2     0.000000  0.728643       0.000000       0.000000  0.000000  0.548387   
3     0.000000  0.678392       0.557377       0.381818  0.336022  0.524814   
4     0.058824  0.698492       0.508197       0.372727  0.645161  0.504963   

   DiabetesPedigreeFunction       Age  Outcome  
0                  0.052479  0.580247      1.0  
1                  0.096281  0.283951      0.0  
2                  0.260331  0.382716      1.0  
3                  0.150826  0.296296      1.0  
4                  0.221488  0.259259      0.0  


## Setting up the model

- input layer length == 8 for 8 depended variables
- one deep layers
- output layer length == 2 -> 0 no diabetes, 1 -> diabetes

In [600]:
input_shape = (data[x_columns].values.shape[1],)
output_layer_length = len(np.unique(data[y_column].values))

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(500, activation=tf.nn.relu, input_shape=input_shape))
model.add(tf.keras.layers.Dense(100, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(50, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(output_layer_length, activation=tf.nn.softmax))

model.compile(
    optimizer=tf.optimizers.Adam(),
#     loss=tf.losses.SparseCategoricalCrossentropy(),
    loss=tf.losses.CategoricalCrossentropy(),
    metrics=["accuracy"]
)

In [601]:
x_train = data_train[x_columns].values
y_train = tf.keras.utils.to_categorical(data_train[y_column].values)
# y_train = data_train[y_column].values

model.fit(x_train, y_train, epochs=20)

Train on 666 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f1219e65278>

In [602]:
x_test = data_test[x_columns].values
predictions = model.predict([x_test])

idx = 7
print(np.argmax(predictions[idx]))
print(data_test["Outcome"].values[idx])

y_test = tf.keras.utils.to_categorical(data_test[y_column].values)
# y_test = data_test[y_column].values

scores = model.evaluate(x_test, y_test, verbose=0)
print("accuracy: {}%".format(scores[1]))

0
0
accuracy: 0.6486486196517944%
