## Preprocessing

In [15]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the diabetes_data.csv.
diabtetes_df = pd.read_csv("Resources/diabetes_data.csv")
diabtetes_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [16]:
# Determine the number of unique values in each column.
diabtetes_df.nunique()

gender                    3
age                     102
hypertension              2
heart_disease             2
smoking_history           6
bmi                    4247
HbA1c_level              18
blood_glucose_level      18
diabetes                  2
dtype: int64

In [21]:
# Convert categorical data to numeric with `pd.get_dummies`
numeric_diabtetes_df=pd.get_dummies(diabtetes_df)
numeric_diabtetes_df.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,80.0,0,1,25.19,6.6,140,0,True,False,False,False,False,False,False,True,False
1,54.0,0,0,27.32,6.6,80,0,True,False,False,True,False,False,False,False,False
2,28.0,0,0,27.32,5.7,158,0,False,True,False,False,False,False,False,True,False
3,36.0,0,0,23.45,5.0,155,0,True,False,False,False,True,False,False,False,False
4,76.0,1,1,20.14,4.8,155,0,False,True,False,False,True,False,False,False,False


In [22]:
# Create 'smoke_yes' and 'smoke_no' columns
numeric_diabtetes_df['smoke_yes'] = numeric_diabtetes_df[['smoking_history_current', 
                                                          'smoking_history_ever', 
                                                          'smoking_history_former']].any(axis=1)
numeric_diabtetes_df['smoke_no'] = numeric_diabtetes_df['smoking_history_never']

# Drop the original smoking history columns
numeric_diabtetes_df = numeric_diabtetes_df.drop(['smoking_history_No Info', 
                                                  'smoking_history_current', 
                                                  'smoking_history_ever', 
                                                  'smoking_history_former', 
                                                  'smoking_history_never', 
                                                  'smoking_history_not current'], axis=1)

# View the updated DataFrame
numeric_diabtetes_df.head()


Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other,smoke_yes,smoke_no
0,80.0,0,1,25.19,6.6,140,0,True,False,False,False,True
1,54.0,0,0,27.32,6.6,80,0,True,False,False,False,False
2,28.0,0,0,27.32,5.7,158,0,False,True,False,False,True
3,36.0,0,0,23.45,5.0,155,0,True,False,False,True,False
4,76.0,1,1,20.14,4.8,155,0,False,True,False,True,False


In [30]:
import pandas as pd

# Sample DataFrame setup
data = {
    'age': [80, 54, 28, 36, 76],
    'hypertension': [0, 0, 0, 0, 1],
    'heart_disease': [1, 0, 0, 0, 1],
    'bmi': [25.19, 27.32, 27.32, 23.45, 20.14],
    'HbA1c_level': [6.6, 6.6, 5.7, 5.0, 4.8],
    'blood_glucose_level': [140, 80, 158, 155, 155],
    'diabetes': [0, 0, 0, 0, 0],
    'gender_Female': [True, True, False, True, False],
    'gender_Male': [False, False, True, False, True],
    'gender_Other': [False, False, False, False, False],
    'smoking_history_No Info': [False, False, False, False, False],
    'smoking_history_current': [False, False, False, True, True],
    'smoking_history_ever': [False, False, False, False, False],
    'smoking_history_former': [False, False, False, False, True],
    'smoking_history_never': [True, True, True, False, False],
    'smoking_history_not current': [False, False, False, False, False]
}

numeric_diabtetes_df = pd.DataFrame(data)

# Create 'smoke_yes' and 'smoke_no' columns
numeric_diabtetes_df['smoke_yes'] = numeric_diabtetes_df[['smoking_history_current', 
                                                          'smoking_history_ever', 
                                                          'smoking_history_former']].any(axis=1)

numeric_diabtetes_df['smoke_no'] = numeric_diabtetes_df['smoking_history_never']

# Drop the original smoking history columns
numeric_diabtetes_df = numeric_diabtetes_df.drop(['smoking_history_No Info', 
                                                  'smoking_history_current', 
                                                  'smoking_history_ever', 
                                                  'smoking_history_former', 
                                                  'smoking_history_never', 
                                                  'smoking_history_not current'], axis=1)

# Drop 'gender_Other' column
numeric_diabtetes_df = numeric_diabtetes_df.drop('gender_Other', axis=1)

# View the updated DataFrame
print(numeric_diabtetes_df.head())


   age  hypertension  heart_disease    bmi  HbA1c_level  blood_glucose_level  \
0   80             0              1  25.19          6.6                  140   
1   54             0              0  27.32          6.6                   80   
2   28             0              0  27.32          5.7                  158   
3   36             0              0  23.45          5.0                  155   
4   76             1              1  20.14          4.8                  155   

   diabetes  gender_Female  gender_Male  smoke_yes  smoke_no  
0         0           True        False      False      True  
1         0           True        False      False      True  
2         0          False         True      False      True  
3         0           True        False       True     False  
4         0          False         True       True     False  


In [31]:
# Define features and target variable again
X = numeric_diabtetes_df.drop('diabetes', axis=1)
y = numeric_diabtetes_df['diabetes']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [32]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [35]:
# Define the model
number_input_features = X_train_scaled.shape[1]  # Number of input features
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer for binary classification
nn.add(tf.keras.layers.Dense(1, activation="sigmoid"))

# Compile the model
nn.compile(optimizer='adam',
           loss='binary_crossentropy',
           metrics=['accuracy'])

# Check the structure of the model
nn.summary()

# Train the model
history = nn.fit(X_train_scaled, y_train, epochs=20, validation_split=0.2, verbose=1)

# Evaluate the model
test_loss, test_accuracy = nn.evaluate(X_test_scaled, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.6667 - loss: 0.6455 - val_accuracy: 0.0000e+00 - val_loss: 0.8062
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.6667 - loss: 0.6373 - val_accuracy: 0.0000e+00 - val_loss: 0.8027
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.6667 - loss: 0.6292 - val_accuracy: 0.0000e+00 - val_loss: 0.7992
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.6667 - loss: 0.6212 - val_accuracy: 0.0000e+00 - val_loss: 0.7957
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.6667 - loss: 0.6135 - val_accuracy: 0.0000e+00 - val_loss: 0.7922
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.6667 - loss: 0.6068 - val_accuracy: 0.0000e+00 - val_loss: 0.7888
Epoch 7/20
[1m1/1[0m 

In [36]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1/1 - 0s - 21ms/step - accuracy: 1.0000 - loss: 0.6289
Loss: 0.6288600564002991, Accuracy: 1.0


In [34]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1/1 - 0s - 19ms/step - accuracy: 0.0000e+00 - loss: 1.0534
Loss: 1.053396224975586, Accuracy: 0.0


In [10]:
import tensorflow as tf

# Define the model
number_input_features = X_train_scaled.shape[1]  # Number of input features
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer for binary classification
nn.add(tf.keras.layers.Dense(1, activation="sigmoid"))

# Compile the model
nn.compile(optimizer='adam',
           loss='binary_crossentropy',
           metrics=['accuracy'])

# Check the structure of the model
nn.summary()

# Train the model
history = nn.fit(X_train_scaled, y_train, epochs=20, validation_split=0.2, verbose=1)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 874us/step - accuracy: 0.8927 - loss: 0.2789 - val_accuracy: 0.9592 - val_loss: 0.1181
Epoch 2/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 704us/step - accuracy: 0.9592 - loss: 0.1178 - val_accuracy: 0.9621 - val_loss: 0.1121
Epoch 3/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 753us/step - accuracy: 0.9623 - loss: 0.1125 - val_accuracy: 0.9628 - val_loss: 0.1102
Epoch 4/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 677us/step - accuracy: 0.9651 - loss: 0.1083 - val_accuracy: 0.9639 - val_loss: 0.1089
Epoch 5/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 702us/step - accuracy: 0.9657 - loss: 0.1068 - val_accuracy: 0.9648 - val_loss: 0.1076
Epoch 6/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 754us/step - accuracy: 0.9658 - loss: 0.1038 - val_accuracy: 0.9646 - val_loss: 0.1063
Epoc

In [11]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

625/625 - 0s - 659us/step - accuracy: 0.9696 - loss: 0.0927
Loss: 0.09272713959217072, Accuracy: 0.9695500135421753


In [7]:
# Compile the model
nn.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])



In [8]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=100)

Epoch 1/100
[1m2344/2344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 890us/step - accuracy: 0.8942 - loss: 0.2987
Epoch 2/100
[1m2344/2344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 734us/step - accuracy: 0.9579 - loss: 0.1193
Epoch 3/100
[1m2344/2344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 660us/step - accuracy: 0.9609 - loss: 0.1120
Epoch 4/100
[1m2344/2344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 659us/step - accuracy: 0.9602 - loss: 0.1122
Epoch 5/100
[1m2344/2344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 650us/step - accuracy: 0.9606 - loss: 0.1121
Epoch 6/100
[1m2344/2344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 654us/step - accuracy: 0.9608 - loss: 0.1137
Epoch 7/100
[1m2344/2344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 650us/step - accuracy: 0.9611 - loss: 0.1121
Epoch 8/100
[1m2344/2344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 652us/step - accuracy: 0.9611 - loss: 0.1098


In [9]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

782/782 - 1s - 715us/step - accuracy: 0.9682 - loss: 0.0895
Loss: 0.08948338031768799, Accuracy: 0.9682400226593018


In [10]:
# Export our model to HDF5 file
nn.save('diabetes_prediction_model.h5')

