In [40]:
!pip install imblearn
!pip install scikeras tensorflow



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from scikeras.wrappers import KerasClassifier

In [None]:
# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/data.csv')
df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Data Cleaning
df.shape

(769, 9)

In [None]:
df.drop_duplicates(inplace=True)
df.shape

(769, 9)

In [None]:
# Removing outliers for each numerical feature
for feature in df.columns:
  Q1 = df[feature].quantile(0.25)
  Q3 = df[feature].quantile(0.75)
  IQR = Q3 - Q1
  lower_bound = Q1 - 1.5 * IQR
  upper_bound = Q3 + 1.5 * IQR
  df = df[(df[feature] >= lower_bound) & (df[feature] <= upper_bound)]

print("Data shape after removing outliers:", df.shape)

Data shape after removing outliers: (641, 9)


In [None]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
5,5,116,74,0,0,25.6,0.201,30,0


In [43]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33,0,50,1
1,1,85,66,29,0,26,0,31,0
2,8,183,64,0,0,23,0,32,1
3,1,89,66,23,94,28,0,21,0
5,5,116,74,0,0,25,0,30,0


In [44]:
X = df.drop(columns=['Outcome'])
y = df['Outcome']

In [45]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [46]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

In [47]:
y_resampled.value_counts()

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
1,440
0,440


In [48]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42)

In [49]:
# Building the Neural Network
def build_model(optimizer='adam', layers=[128, 64, 32, 16]):
    model = keras.Sequential()
    model.add(keras.layers.Input(shape=(X_train.shape[1],)))
    for units in layers:
        model.add(keras.layers.Dense(units, activation='relu'))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['recall'])
    return model

In [50]:
# Wrap the model for compatibility with scikit-learn
model = KerasClassifier(model=build_model, epochs=10, batch_size=32, verbose= False)

In [51]:
# Define hyperparameters to tune.
# The 'model__' prefix is used to access arguments of the build_model function
param_grid = {
    'batch_size': [32],
    'epochs': [100],
    'model__optimizer': ['adam'],
    'model__layers': [
        [128, 64, 16],
    ]
}

grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, error_score='raise') # Added error_score='raise'

# If using GPU ensure its enabled
with tf.device('/device:GPU:0'): # Or CPU if no GPU is available: tf.device('/device:CPU:0')
    grid_result = grid.fit(X_train, y_train)


# Print the best results
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")

Best: 0.8338095238095239 using {'batch_size': 32, 'epochs': 100, 'model__layers': [128, 64, 16], 'model__optimizer': 'adam'}


In [42]:
# After GridSearchCV finishes and finds the best model, make predictions
best_model = grid_result.best_estimator_
y_test_pred = best_model.predict(X_test)
y_train_pred = best_model.predict(X_train)

# Convert probabilities to binary predictions
y_test_pred = (y_test_pred > 0.5).astype(int)
y_train_pred = (y_train_pred > 0.5).astype(int)

# Calculate evaluation metrics
recall_test = recall_score(y_test, y_test_pred)
recall_train = recall_score(y_train, y_train_pred)

from sklearn.metrics import accuracy_score

# Calculate accuracy on the test and train sets and convert to percentage
accuracy_test = accuracy_score(y_test, y_test_pred) * 100
accuracy_train = accuracy_score(y_train, y_train_pred) * 100

print(f"Accuracy on test set: {accuracy_test:.2f}%")
print(f"Accuracy on train set: {accuracy_train:.2f}%")

# Print recall for comparison
print(f"Recall on test set: {recall_test * 100:.2f}%")
print(f"Recall on train set: {recall_train * 100:.2f}%")

# Classification report and confusion matrix
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))

Accuracy on test set: 82.39%
Accuracy on train set: 100.00%
Recall on test set: 88.64%
Recall on train set: 100.00%

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.87      0.76      0.81        88
           1       0.79      0.89      0.83        88

    accuracy                           0.82       176
   macro avg       0.83      0.82      0.82       176
weighted avg       0.83      0.82      0.82       176

