## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
from pathlib import Path
from sqlalchemy import create_engine
import sqlite3
import os
import tensorflow as tf

#Define the paths
resources_folder = os.path.join(os.getcwd(), 'resources')
csv_file_path = os.path.join(resources_folder, 'diabetes_data.csv')
database_path = os.path.join(os.getcwd(), 'my_database.db')

#Load the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file_path)

#Connect to the SQLite database (or create it)
connection = sqlite3.connect(database_path)

#Save the DataFrame to the SQLite database
df.to_sql('diabetes_data', connection, if_exists='replace', index=False)

#Commit and close the connection
connection.commit()
connection.close()

#Verify the data by querying the database
connection = sqlite3.connect(database_path)
diabetes_df = pd.read_sql('SELECT * FROM diabetes_data', connection)
diabetes_BMI = pd.read_sql('SELECT BMI FROM diabetes_data', connection)

diabetes_df.head()
#diabetes_BMI.head(3)

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [2]:
# Determine the number of unique values in each column.
diabetes_df.nunique()

Diabetes_012             3
HighBP                   2
HighChol                 2
CholCheck                2
BMI                     84
Smoker                   2
Stroke                   2
HeartDiseaseorAttack     2
PhysActivity             2
Fruits                   2
Veggies                  2
HvyAlcoholConsump        2
AnyHealthcare            2
NoDocbcCost              2
GenHlth                  5
MentHlth                31
PhysHlth                31
DiffWalk                 2
Sex                      2
Age                     13
Education                6
Income                   8
dtype: int64

In [3]:
# Rename the outcome column 
diabetes_df = diabetes_df.rename(columns={'Diabetes_012': 'Diabetes'})

# Convert the values of 1.0 and 2.0 in the column "Diabetes" into 1.0 so that the binary model can be used
diabetes_df['Diabetes'] = diabetes_df['Diabetes'].replace({2.0: 1.0})

# Check the number of unique values in each column again
diabetes_df.nunique()


Diabetes                 2
HighBP                   2
HighChol                 2
CholCheck                2
BMI                     84
Smoker                   2
Stroke                   2
HeartDiseaseorAttack     2
PhysActivity             2
Fruits                   2
Veggies                  2
HvyAlcoholConsump        2
AnyHealthcare            2
NoDocbcCost              2
GenHlth                  5
MentHlth                31
PhysHlth                31
DiffWalk                 2
Sex                      2
Age                     13
Education                6
Income                   8
dtype: int64

In [4]:
# Split our preprocessed data into our features and target arrays
y = diabetes_df['Diabetes'].values
X = diabetes_df.drop(['Diabetes'], axis=1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Compile, Train and Evaluate the Model

In [5]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [6]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])



In [7]:
# Train the model
fit_model = nn.fit(X_train,y_train,epochs=100)

Epoch 1/100
[1m5946/5946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 680us/step - accuracy: 0.8437 - loss: 0.4206
Epoch 2/100
[1m5946/5946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 619us/step - accuracy: 0.8456 - loss: 0.3562
Epoch 3/100
[1m5946/5946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 590us/step - accuracy: 0.8468 - loss: 0.3520
Epoch 4/100
[1m5946/5946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 633us/step - accuracy: 0.8479 - loss: 0.3501
Epoch 5/100
[1m5946/5946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 654us/step - accuracy: 0.8482 - loss: 0.3495
Epoch 6/100
[1m5946/5946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 650us/step - accuracy: 0.8493 - loss: 0.3469
Epoch 7/100
[1m5946/5946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 588us/step - accuracy: 0.8509 - loss: 0.3432
Epoch 8/100
[1m5946/5946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 598us/step - accuracy: 0.8499 - loss: 0.3442


In [8]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1982/1982 - 1s - 570us/step - accuracy: 0.8512 - loss: 0.3385
Loss: 0.3385173976421356, Accuracy: 0.8512141108512878


In [9]:
# Export our model to HDF5 file
nn.save('diabetes_prediction_model.h5')

