In [60]:
import pandas as pd
import numpy as np
import autoviz
from autoviz import data_cleaning_suggestions

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

In [61]:
# Loading the dataset
df = pd.read_csv('./diabetes_prediction_dataset.csv')

df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [62]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,100000.0,41.885856,22.51684,0.08,24.0,43.0,60.0,80.0
hypertension,100000.0,0.07485,0.26315,0.0,0.0,0.0,0.0,1.0
heart_disease,100000.0,0.03942,0.194593,0.0,0.0,0.0,0.0,1.0
bmi,100000.0,27.320767,6.636783,10.01,23.63,27.32,29.58,95.69
HbA1c_level,100000.0,5.527507,1.070672,3.5,4.8,5.8,6.2,9.0
blood_glucose_level,100000.0,138.05806,40.708136,80.0,100.0,140.0,159.0,300.0
diabetes,100000.0,0.085,0.278883,0.0,0.0,0.0,0.0,1.0


In [63]:
#df['gender'].value_counts()
df.value_counts(['gender', 'smoking_history'])

gender  smoking_history
Female  never              22869
        No Info            19700
Male    No Info            16110
        never              12223
Female  current             5058
        former              4774
Male    former              4578
        current             4228
Female  not current         3913
Male    not current         2526
Female  ever                2238
Male    ever                1765
Other   not current            8
        No Info                6
        never                  3
        ever                   1
Name: count, dtype: int64

In [64]:
# Data cleaning suggestions
data_cleaning_suggestions(df)

There are 3854 duplicate rows in your dataset
    Alert: Dropping duplicate rows can sometimes cause your column data types to change to object!
    All variables classified into correct types.


Unnamed: 0,Data Type,Missing Values%,Unique Values%,Minimum Value,Maximum Value,DQ Issue
gender,object,0.0,0.0,,,1 rare categories: ['Other']. Group them into a single category or drop the categories.
age,float64,0.0,,0.08,80.0,No issue
hypertension,int64,0.0,0.0,0.0,1.0,No issue
heart_disease,int64,0.0,0.0,0.0,1.0,No issue
smoking_history,object,0.0,0.0,,,No issue
bmi,float64,0.0,,10.01,95.69,Column has 5354 outliers greater than upper bound (39.55) or lower than lower bound(13.71). Cap them or remove them.
HbA1c_level,float64,0.0,,3.5,9.0,Column has 1312 outliers greater than upper bound (8.30) or lower than lower bound(2.70). Cap them or remove them.
blood_glucose_level,int64,0.0,0.0,80.0,300.0,Column has 2031 outliers greater than upper bound (247.50) or lower than lower bound(11.50). Cap them or remove them.
diabetes,int64,0.0,0.0,0.0,1.0,No issue


Unnamed: 0,Data Type,Missing Values%,Unique Values%,Minimum Value,Maximum Value,DQ Issue
gender,object,0.0,0.0,,,1 rare categories: ['Other']. Group them into a single category or drop the categories.
age,float64,0.0,,0.08,80.0,No issue
hypertension,int64,0.0,0.0,0.0,1.0,No issue
heart_disease,int64,0.0,0.0,0.0,1.0,No issue
smoking_history,object,0.0,0.0,,,No issue
bmi,float64,0.0,,10.01,95.69,Column has 5354 outliers greater than upper bound (39.55) or lower than lower bound(13.71). Cap them or remove them.
HbA1c_level,float64,0.0,,3.5,9.0,Column has 1312 outliers greater than upper bound (8.30) or lower than lower bound(2.70). Cap them or remove them.
blood_glucose_level,int64,0.0,0.0,80.0,300.0,Column has 2031 outliers greater than upper bound (247.50) or lower than lower bound(11.50). Cap them or remove them.
diabetes,int64,0.0,0.0,0.0,1.0,No issue


In [65]:
le = LabelEncoder()

list_str = ['gender', 'smoking_history']
for c in list_str:
    df[c] = le.fit_transform(df[c])
    print(dict(zip(le.classes_,range(len(le.classes_)))))



{'Female': 0, 'Male': 1, 'Other': 2}
{'No Info': 0, 'current': 1, 'ever': 2, 'former': 3, 'never': 4, 'not current': 5}


In [66]:
X = df.drop('diabetes', axis = 1)
y = df['diabetes']

xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.2, random_state = 0)

X.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
0,0,80.0,0,1,4,25.19,6.6,140
1,0,54.0,0,0,0,27.32,6.6,80
2,1,28.0,0,0,4,27.32,5.7,158
3,0,36.0,0,0,1,23.45,5.0,155
4,1,76.0,1,1,1,20.14,4.8,155


In [11]:
# scaler = StandardScaler()
# xtrain = scaler.fit_transform(xtrain)
# xtest = scaler.transform(xtest)

In [67]:
model = Sequential([
    Input(shape=(8,)),
    Dense(32, activation = 'relu', input_shape = (xtrain.shape[1],)),
    Dropout(0.1),
    Dense(32, activation = 'relu'),
    Dropout(0.5),
    Dense(1, activation = 'sigmoid')
])

In [68]:
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.summary()

In [69]:
model.fit(xtrain, ytrain, epochs = 5, batch_size = 16, validation_data = (xtest, ytest))


Epoch 1/5
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 3ms/step - accuracy: 0.8934 - loss: 0.8589 - val_accuracy: 0.9148 - val_loss: 0.2459
Epoch 2/5
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.9202 - loss: 0.2538 - val_accuracy: 0.9290 - val_loss: 0.2221
Epoch 3/5
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.9235 - loss: 0.2356 - val_accuracy: 0.9344 - val_loss: 0.1977
Epoch 4/5
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - accuracy: 0.9271 - loss: 0.2075 - val_accuracy: 0.9380 - val_loss: 0.1546
Epoch 5/5
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - accuracy: 0.9363 - loss: 0.1775 - val_accuracy: 0.9548 - val_loss: 0.1268


<keras.src.callbacks.history.History at 0x192f19410>

In [59]:
loss, accuracy = model.evaluate(xtest, ytest)

print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9536 - loss: 0.1660
Test loss: 0.1637
Test accuracy: 0.9532


In [91]:
model.save('y.h5')

In [36]:
import tensorflow as tf
import tensorflowjs as tfjs
model = tf.keras.models.load_model('y.h5')

for layer in model.layers:
    for weight in layer.weights:
        weight.name = '{0}/{1}'.format(layer.name, weight.name)

tfjs.converters.save_keras_model(model, 'web')

ModuleNotFoundError: No module named 'tensorflowjs'