In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, cohen_kappa_score, matthews_corrcoef
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, BatchNormalization, LeakyReLU, MaxPool1D, GlobalAveragePooling1D, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.backend import clear_session
import csv
import datetime

In [7]:
# Report how many GPU devices TensorFlow has detected
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

# Make 'mixed_float16' the global Keras dtype policy
tf.keras.mixed_precision.set_global_policy('mixed_float16')

Num GPUs Available:  1


In [8]:
# Load data
# NOTE(review): the file name suggests the rows were oversampled before any
# train/test split happens in this notebook - confirm that synthetic rows cannot
# leak information between the training and test partitions.
df = pd.read_csv("oversample_smote.csv")

# Feature columns used as model inputs
selected_columns = ['hv021', 'hv104', 'hv106', 'hml18',
                    'shb70', 'ha53', 'shb13', 'avg_sys']

# Split the data into features and target, as numpy arrays
X = np.array(df[selected_columns])
y = np.array(df['final_diabetes'])

# Standardize the data.
# The scaler is fitted on the training rows only, so the held-out test rows do not
# contribute to the mean/variance used for scaling (fitting on every row leaks
# test-set statistics into preprocessing). Without stratification,
# train_test_split's partition depends only on the number of rows, test_size and
# random_state, so calling it on the row indices with the same arguments as the
# later split (test_size=0.2, random_state=42) recovers that split's training rows.
# Keep these arguments in sync with the split performed later in the notebook.
train_rows = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)[0]
scaler = StandardScaler()
scaler.fit(X[train_rows])
X = scaler.transform(X)

In [9]:
# Define the CNN model architecture using a function
def cnnmodel(input_shape=(8, 1), learning_rate=0.001):
    """Build and compile the 1D-CNN binary classifier.

    The network stacks three Conv1D -> BatchNormalization -> LeakyReLU stages
    with 128, 64 and 32 filters (kernel size 3, 'same' padding). The first two
    stages are each followed by MaxPool1D(pool_size=2, strides=2) and
    Dropout(0.3); the third is followed by GlobalAveragePooling1D and a single
    sigmoid unit.

    Args:
        input_shape: Shape of one sample as passed to the first Conv1D layer.
            Defaults to (8, 1), the value that used to be hard-coded. The first
            dimension is halved by each of the two pooling layers, so it has to
            be large enough to survive both.
        learning_rate: Learning rate handed to the Adam optimizer.
            Defaults to 0.001.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, accuracy
        metric).
    """
    # Called before building so that every call starts from a cleared Keras session
    clear_session()
    model = Sequential()
    model.add(Conv1D(filters=128, kernel_size=3, strides=1, padding='same', input_shape=input_shape))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(MaxPool1D(pool_size=2, strides=2))
    model.add(Dropout(0.3))

    model.add(Conv1D(filters=64, kernel_size=3, strides=1, padding='same'))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(MaxPool1D(pool_size=2, strides=2))
    model.add(Dropout(0.3))

    model.add(Conv1D(filters=32, kernel_size=3, strides=1, padding='same'))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(GlobalAveragePooling1D())

    # The output layer is pinned to float32 regardless of the global dtype policy
    model.add(Dense(1, activation='sigmoid', dtype='float32'))

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [10]:
# Append a trailing axis of size 1 so each sample has the (steps, 1) layout the CNN takes
X = X.reshape(*X.shape[:2], 1)

# Hold out 20% of the rows as independent test data; the remaining 80% is used
# for training/validation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Turn all four arrays into float32 tensors
X_train, y_train, X_test, y_test = [
    tf.convert_to_tensor(array, dtype=tf.float32)
    for array in (X_train, y_train, X_test, y_test)
]

In [11]:
def evaluate_model(model, X_val_cv, y_val_cv, threshold=0.5):
    """Score a fitted binary classifier on one dataset.

    Args:
        model: Object exposing ``predict(X)`` that returns one score per sample.
        X_val_cv: Inputs passed unchanged to ``model.predict``.
        y_val_cv: Ground-truth labels, one per sample. Assumed to be encoded as
            0 (negative) / 1 (positive) - TODO confirm against the dataset.
        threshold: Scores strictly greater than this value are predicted as
            class 1. Defaults to 0.5.

    Returns:
        A list ``[accuracy, precision, recall, f1, roc_auc, specificity, kappa,
        mcc, cm]`` where ``cm`` is the 2x2 confusion matrix (rows are true
        labels, columns are predictions, in the order 0, 1).
    """
    # Flatten both sides to 1-D so an (N, 1) prediction column lines up with the labels
    y_true = np.asarray(y_val_cv).ravel().astype(int)
    y_score = np.asarray(model.predict(X_val_cv)).ravel()
    y_pred = (y_score > threshold).astype(int)

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    # ROC AUC is computed from the raw scores, not the thresholded predictions
    roc_auc = roc_auc_score(y_true, y_score)

    # Fixing the label order keeps the matrix 2x2 even when one class is absent
    # from this subset; without it cm[0, 1] can be out of bounds.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    true_negatives, false_positives = cm[0]
    negatives = true_negatives + false_positives
    # Specificity is undefined when the subset holds no negative samples
    specificity = true_negatives / negatives if negatives else float("nan")

    kappa = cohen_kappa_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)

    metrics = [accuracy, precision, recall, f1, roc_auc, specificity, kappa, mcc, cm]
    return metrics

In [12]:
# 10-fold cross-validator with shuffling and a fixed seed
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold results go to this CSV file; start it with a header row
csv_filename = "CNN_8_Features_kfold_performance.csv"
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC', 'Specificity', 'Kappa', 'MCC']
header = ['Fold', *metric_names, 'Confusion Matrix']
with open(csv_filename, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)

# Running totals across folds: one slot per scalar metric, plus a 2x2 confusion matrix
metrics_sum = np.zeros(len(metric_names))
confusion_matrices = np.zeros((2, 2))

# Cross-validation: build and train a new model per fold, then score it on that fold
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    # Pick this fold's training and validation rows out of the training tensors
    X_train_cv = tf.gather(X_train, train_idx)
    y_train_cv = tf.gather(y_train, train_idx)
    X_val_cv = tf.gather(X_train, val_idx)
    y_val_cv = tf.gather(y_train, val_idx)

    model = cnnmodel()
    # Fit under an explicit GPU device scope
    with tf.device('/GPU:0'):
        # TensorBoard log directory is named after the current timestamp
        log_dir = "logs/profile/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        tensorboard_callback = TensorBoard(log_dir=log_dir, profile_batch=0)

        history = model.fit(
            X_train_cv,
            y_train_cv,
            validation_data=(X_val_cv, y_val_cv),
            epochs=100,
            batch_size=256,
            verbose=1,
            callbacks=[tensorboard_callback],
        )

    # Score the fold; the final entry of `metrics` is the confusion matrix
    metrics = evaluate_model(model, X_val_cv, y_val_cv)
    fold_scores, fold_cm = metrics[:-1], metrics[-1]
    metrics_sum += fold_scores
    confusion_matrices += fold_cm

    # The confusion matrix is written to the CSV as a string
    cm_str = np.array2string(fold_cm, separator=',')

    with open(csv_filename, 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([fold + 1, *fold_scores, cm_str])

    print(f"Fold {fold + 1} - Accuracy: {metrics[0]}, Precision: {metrics[1]}, Recall: {metrics[2]}, F1-Score: {metrics[3]}, ROC AUC: {metrics[4]}, Specificity: {metrics[5]}, Kappa: {metrics[6]}, MCC: {metrics[7]}")
    print(f"Confusion Matrix:\n{metrics[-1]}")

# Calculate and save the average metrics.
# The divisor is read from the cross-validator instead of repeating its fold count
# as a literal, so the averages stay correct if n_splits is changed.
n_folds = kf.get_n_splits()
average_metrics = metrics_sum / n_folds
average_confusion_matrix = confusion_matrices / n_folds
with open(csv_filename, 'a', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Overall'] + list(average_metrics) + [np.array2string(average_confusion_matrix, separator=',')])

print("Average Metrics - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-Score: {:.4f}, ROC AUC: {:.4f}, Specificity: {:.4f}, Kappa: {:.4f}, MCC: {:.4f}".format(*average_metrics))
print("Average Confusion Matrix:\n", average_confusion_matrix)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [13]:
# Score the model on the held-out test split.
# `model` is whatever the earlier cells left bound to that name; the
# cross-validation loop rebinds it on every fold.
print("Independent Data Test:")
test_metrics = evaluate_model(model, X_test, y_test)
(test_accuracy, test_precision, test_recall, test_f1, test_roc_auc,
 test_specificity, test_kappa, test_mcc, test_cm) = test_metrics

# Print the confusion matrix first, then each scalar metric on its own line
print("Test Confusion Matrix:\n", test_cm)
test_report = [
    ("Test Accuracy:", test_accuracy),
    ("Test Precision:", test_precision),
    ("Test Recall:", test_recall),
    ("Test F1-score:", test_f1),
    ("Test ROC AUC:", test_roc_auc),
    ("Test Specificity:", test_specificity),
    ("Test Cohen's kappa:", test_kappa),
    ("Test Matthews correlation coefficient:", test_mcc),
]
for label, value in test_report:
    print(label, value)

# Write a header row and a single row of test metrics to a CSV file
independent_csv_filename = "CNN_8_Features_independent_test_performance.csv"
test_csv_rows = [
    ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC', 'Specificity', 'Kappa', 'MCC', 'Confusion Matrix'],
    test_metrics[:-1] + [np.array2string(test_cm, separator=',')],
]
with open(independent_csv_filename, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(test_csv_rows)

Independent Data Test:
Test Confusion Matrix:
 [[139882   2244]
 [  3168 139699]]
Test Accuracy: 0.9810100598962079
Test Precision: 0.9841908371670318
Test Recall: 0.9778255300384273
Test F1-score: 0.9809978582212703
Test ROC AUC: 0.9988996563066145
Test Specificity: 0.9842111928851864
Test Cohen's kappa: 0.9620205033641954
Test Matthews correlation coefficient: 0.9620407286563472


In [14]:
# Save the model and report the path that was actually written
# (the message previously named "cnn_model", which is not the file being saved)
model_path = "cnn_8_Features_model.keras"
model.save(model_path)
print(f"Model saved as {model_path}")

Model saved as cnn_model
