In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tqdm.notebook import tqdm  # Use tqdm for progress visualization


2025-04-06 22:38:37.508762: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-06 22:38:37.512867: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-06 22:38:37.524010: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743993517.542932 2954621 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743993517.549408 2954621 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743993517.563394 2954621 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

In [None]:

# 1. Load and Preprocess Data
df = pd.read_csv("./diabetes_prediction_dataset.csv")  
df.drop_duplicates(inplace=True)
print("Missing values in each column:\n", df.isna().sum())
categorical_columns = ["gender", "smoking_history"]
le_dict = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    le_dict[col] = le

# Define features and target
X = df.drop("diabetes", axis=1)
y = df["diabetes"]

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#  Conv1D deep learning model reshaping the input to (samples, time_steps, channels)
X_dl = X_scaled.reshape((X_scaled.shape[0], X_scaled.shape[1], 1))



Missing values in each column:
 gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64


In [3]:

# 2. Model Definitions
# Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# SVM Model
svm_model = SVC(kernel='linear', probability=True, random_state=42)

# Conv1D Deep Learning Model
def create_conv1d_model(input_shape):
    model = tf.keras.Sequential([
        tf.keras.Input(shape=input_shape),
        tf.keras.layers.Conv1D(filters=32, kernel_size=2, activation='relu'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


In [4]:

# 3. Metrics Calculation Function
def calculate_metrics(cm):
    TN, FP, FN, TP = cm.ravel()
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    TPR = TP / (TP + FN) if (TP + FN) > 0 else 0  
    FPR = FP / (FP + TN) if (FP + TN) > 0 else 0
    FNR = FN / (TP + FN) if (TP + FN) > 0 else 0
    TSS = TPR - FPR  
    denominator = ((TP + FN) * (FN + TN) + (TP + FP) * (FP + TN))
    HSS = 2 * (TP * TN - FN * FP) / denominator if denominator != 0 else 0
    return {"accuracy": accuracy, "TPR": TPR, "FPR": FPR, "FNR": FNR, "TSS": TSS, "HSS": HSS,
            "TP": TP, "TN": TN, "FP": FP, "FN": FN}


In [5]:

# 4. KFold Cross-Validation Setup
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Lists to store metrics for each model across folds
rf_metrics_list = []
svm_metrics_list = []
conv1d_metrics_list = []

# 5. Training and Evaluation using KFold with tqdm
print("Training Random Forest Model:")
for i, (train_index, test_index) in enumerate(tqdm(kf.split(X), total=n_splits, desc="RF CV"), start=1):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Measure training time
    start_time = time.time()
    rf_model.fit(X_train, y_train)
    train_time = time.time() - start_time
    print(f"Fold {i} RF training time: {train_time:.6f} seconds")
    
    # Prediction and metrics calculation
    y_pred = rf_model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    rf_metrics_list.append(calculate_metrics(cm))


Training Random Forest Model:


RF CV:   0%|          | 0/10 [00:00<?, ?it/s]

Fold 1 RF training time: 4.828195 seconds
Fold 2 RF training time: 4.747567 seconds
Fold 3 RF training time: 4.663944 seconds
Fold 4 RF training time: 4.769497 seconds
Fold 5 RF training time: 4.896482 seconds
Fold 6 RF training time: 4.779871 seconds
Fold 7 RF training time: 4.857546 seconds
Fold 8 RF training time: 5.295923 seconds
Fold 9 RF training time: 5.237453 seconds
Fold 10 RF training time: 5.330139 seconds


In [6]:

print("\nTraining SVM Model:")
for i, (train_index, test_index) in enumerate(tqdm(kf.split(X), total=n_splits, desc="SVM CV"), start=1):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    start_time = time.time()
    svm_model.fit(X_train, y_train)
    train_time = time.time() - start_time
    print(f"Fold {i} SVM training time: {train_time:.6f} seconds")
    
    y_pred = svm_model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    svm_metrics_list.append(calculate_metrics(cm))



Training SVM Model:


SVM CV:   0%|          | 0/10 [00:00<?, ?it/s]

Fold 1 SVM training time: 235.199971 seconds
Fold 2 SVM training time: 210.981436 seconds
Fold 3 SVM training time: 231.348896 seconds
Fold 4 SVM training time: 152.992190 seconds
Fold 5 SVM training time: 168.868646 seconds
Fold 6 SVM training time: 154.605697 seconds
Fold 7 SVM training time: 157.822639 seconds
Fold 8 SVM training time: 149.309565 seconds
Fold 9 SVM training time: 147.865381 seconds
Fold 10 SVM training time: 138.212827 seconds


In [None]:

print("\nTraining Conv1D Model:")
for i, (train_index, test_index) in enumerate(tqdm(kf.split(X), total=n_splits, desc="Conv1D CV"), start=1):
    X_train_dl, X_test_dl = X_dl[train_index], X_dl[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Create a new model instance for each fold to ensure fresh weights
    model = create_conv1d_model((X_train_dl.shape[1], 1))
    start_time = time.time()
    model.fit(X_train_dl, y_train, epochs=20, batch_size=16, verbose=0)
    train_time = time.time() - start_time
    print(f"Fold {i} Conv1D training time: {train_time:.6f} seconds")
    
    y_pred = (model.predict(X_test_dl) > 0.5).astype("int32").flatten()
    cm = confusion_matrix(y_test, y_pred)
    conv1d_metrics_list.append(calculate_metrics(cm))



Training Conv1D Model:


Conv1D CV:   0%|          | 0/10 [00:00<?, ?it/s]

E0000 00:00:1743995332.869811 2954621 cuda_executor.cc:1228] INTERNAL: CUDA Runtime error: Failed call to cudaGetRuntimeVersion: Error loading CUDA libraries. GPU will not be used.: Error loading CUDA libraries. GPU will not be used.
W0000 00:00:1743995332.870680 2954621 gpu_device.cc:2341] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Fold 1 Conv1D training time: 125.798733 seconds
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 711us/step
Fold 2 Conv1D training time: 132.645822 seconds
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 742us/step
Fold 3 Conv1D training time: 124.719298 seconds
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 797us/step
Fold 4 Conv1D training time: 139.025658 seconds
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 726us/step


In [None]:

# 6. Aggregate and Display Results per Fold and Overall
def display_fold_metrics(model_name, metrics_list):
    df_folds = pd.DataFrame(metrics_list)
    print(f"\n{model_name} Metrics for Each Fold:")
    display(df_folds)  
    
    # Calculate overall average metrics
    overall_avg = df_folds.mean()
    print(f"\n{model_name} Overall Average Metrics (10-Fold CV):")
    display(overall_avg.to_frame().transpose())  

# Display metrics 
display_fold_metrics("Random Forest", rf_metrics_list)
display_fold_metrics("SVM", svm_metrics_list)
display_fold_metrics("Conv1D", conv1d_metrics_list)




In [None]:
# combine overall averages into one comparison table
overall_results = pd.DataFrame({
    "Random Forest": pd.DataFrame(rf_metrics_list).mean(),
    "SVM": pd.DataFrame(svm_metrics_list).mean(),
    "Conv1D": pd.DataFrame(conv1d_metrics_list).mean()
}).transpose()

print("\nComparison of Overall Average Metrics for All Models:")
display(overall_results)