In [1]:
# Mount Google Drive so the dataset and the output folders under MyDrive
# can be reached through the local file system.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, cohen_kappa_score, matthews_corrcoef
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, BatchNormalization, LeakyReLU, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
from tensorflow.keras.backend import clear_session
import datetime
import csv
from tensorflow.keras.layers import Conv1D, BatchNormalization, LeakyReLU, MaxPool1D, Dropout, GlobalAveragePooling1D, Dense

In [3]:
# Report how many GPU devices TensorFlow can see in this runtime
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

# Set the global Keras dtype policy to 'mixed_float16'
tf.keras.mixed_precision.set_global_policy('mixed_float16')

Num GPUs Available:  1


In [4]:
# Read the dataset from Drive (the file name suggests a SMOTE-oversampled table)
df = pd.read_csv("/content/drive/MyDrive/DHS/Diabetes/Data/India/Data Balance/oversample_smote.csv")

# The eight predictor columns used by the model
selected_columns = ['hv021', 'hv104', 'hv106', 'hml18',
                    'shb70', 'ha53', 'shb13', 'avg_sys']

# Features and target as plain numpy arrays
X = np.array(df[selected_columns])
y = np.array(df['final_diabetes'])

# Standardize every feature column.
# NOTE(review): the scaler is fitted on all rows, before any of the
# train/validation/test splits made later in the notebook — confirm that
# sharing these statistics across splits is intended.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Add a trailing channel axis for Conv1D: (samples, features) -> (samples, features, 1)
X = X[:, :, np.newaxis]

In [5]:
# Define the CNN model architecture using a function
def cnnmodel(input_shape=(8, 1), learning_rate=0.001):
    """Build and compile the 1-D CNN binary classifier.

    The network has three Conv1D -> BatchNormalization -> LeakyReLU stages
    (128, 64 and 32 filters, kernel size 3, 'same' padding). The first two
    stages are each followed by MaxPool1D(pool_size=2, strides=2) and
    Dropout(0.3); the third is followed by global average pooling and a single
    sigmoid unit.

    Args:
        input_shape: (steps, channels) of one sample. Defaults to (8, 1), the
            value that used to be hard-coded, so existing ``cnnmodel()`` calls
            are unaffected. The step axis is halved by two pooling layers, so
            it should be at least 4 (assuming the default 'valid' pooling).
        learning_rate: Static learning rate for the Adam optimizer.
            Defaults to 0.001.

    Returns:
        A compiled ``Sequential`` model using binary cross-entropy loss and
        the accuracy metric.

    Note:
        ``clear_session()`` is called on every invocation, before the new
        model is built.
    """
    clear_session()
    model = Sequential()

    # Stage 1
    model.add(Conv1D(filters=128, kernel_size=3, strides=1, padding='same', input_shape=input_shape))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(MaxPool1D(pool_size=2, strides=2))
    model.add(Dropout(0.3))

    # Stage 2
    model.add(Conv1D(filters=64, kernel_size=3, strides=1, padding='same'))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(MaxPool1D(pool_size=2, strides=2))
    model.add(Dropout(0.3))

    # Stage 3: no pooling/dropout, the step axis is collapsed by averaging
    model.add(Conv1D(filters=32, kernel_size=3, strides=1, padding='same'))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(GlobalAveragePooling1D())

    # The output layer is pinned to float32 even though the notebook sets a
    # mixed_float16 global policy.
    model.add(Dense(1, activation='sigmoid', dtype='float32'))

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [6]:
# Evaluate the model
def evaluate_model(model, X_val, y_val):
    """Score a binary classifier on a labelled evaluation set.

    Args:
        model: Object with a ``predict(X)`` method returning one probability
            per sample (any shape that flattens to ``len(X_val)`` values).
        X_val: Evaluation features, passed to ``model.predict`` unchanged.
        y_val: True labels, expected to be 0/1.

    Returns:
        list: ``[accuracy, precision, recall, f1, roc_auc, specificity, kappa,
        mcc, confusion_matrix]``. The confusion matrix is always 2x2 with
        rows/columns ordered ``[0, 1]``. ``roc_auc`` is NaN when ``y_val``
        contains a single class, and ``specificity`` is NaN when ``y_val``
        contains no negative samples.
    """
    # Flatten (n, 1) model output to (n,) so every metric sees 1-D scores.
    y_val_pred = np.asarray(model.predict(X_val)).ravel()
    y_val_pred_binary = (y_val_pred > 0.5).astype(int)

    accuracy = accuracy_score(y_val, y_val_pred_binary)
    precision = precision_score(y_val, y_val_pred_binary)
    recall = recall_score(y_val, y_val_pred_binary)
    f1 = f1_score(y_val, y_val_pred_binary)

    # ROC AUC is undefined without both classes in the ground truth.
    if np.unique(y_val).size < 2:
        roc_auc = float('nan')
    else:
        roc_auc = roc_auc_score(y_val, y_val_pred)

    # Pin the label order: without `labels`, the matrix collapses to 1x1 when
    # only one class occurs in both y_val and the predictions, and cm[0, 1]
    # would raise IndexError.
    cm = confusion_matrix(y_val, y_val_pred_binary, labels=[0, 1])
    true_negatives, false_positives = cm[0, 0], cm[0, 1]
    negatives = true_negatives + false_positives
    specificity = true_negatives / negatives if negatives else float('nan')

    kappa = cohen_kappa_score(y_val, y_val_pred_binary)
    mcc = matthews_corrcoef(y_val, y_val_pred_binary)

    metrics = [accuracy, precision, recall, f1, roc_auc, specificity, kappa, mcc, cm]
    return metrics

In [7]:
from sklearn.model_selection import StratifiedKFold

class FederatedData:
    """Split one dataset into per-client train/validation/test partitions.

    Partitions come from ``StratifiedKFold(n_splits=num_clients)``: client *i*
    gets fold *i* as its test set and all remaining folds as its training
    pool. As a consequence the training pools of different clients overlap,
    and each client's test fold is part of the other clients' training pools.
    NOTE(review): confirm this overlap is intended for the federated setup.

    Args:
        X: Feature array, indexable by an integer index array on axis 0.
        y: Label array aligned with ``X``.
        num_clients: Number of clients; also the number of folds.
        test_size: Fraction of a client's training pool that
            ``get_training_and_validation_data`` holds out as validation data.
            Defaults to 0.2.
    """

    def __init__(self, X, y, num_clients, test_size=0.2):
        self.X = X
        self.y = y
        self.num_clients = num_clients
        self.test_size = test_size
        self.partitions = []
        self.create_partitions()

    def create_partitions(self):
        """(Re)build ``self.partitions`` as (X_train, y_train, X_test, y_test) tuples."""
        # Start from an empty list so a second call replaces the partitions
        # instead of appending duplicates.
        self.partitions = []
        skf = StratifiedKFold(n_splits=self.num_clients, shuffle=True, random_state=42)
        for train_index, test_index in skf.split(self.X, self.y):
            X_train, X_test = self.X[train_index], self.X[test_index]
            y_train, y_test = self.y[train_index], self.y[test_index]
            self.partitions.append((X_train, y_train, X_test, y_test))
            print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

    def get_training_and_validation_data(self, client_idx):
        """Return one client's data, with its training pool split into train/validation.

        Args:
            client_idx: Zero-based client index.

        Returns:
            tuple: ``(X_train, X_val, y_train, y_val, X_test, y_test)``.

        Raises:
            ValueError: If ``client_idx`` is outside ``[0, len(partitions) - 1]``.
        """
        if client_idx < 0 or client_idx >= len(self.partitions):
            raise ValueError(f"Invalid client index. Must be between 0 and {len(self.partitions) - 1}.")
        partition_X_train, partition_y_train, partition_X_test, partition_y_test = self.partitions[client_idx]
        # Honour the configured hold-out fraction (it used to be hard-coded to
        # 0.2, silently ignoring the constructor argument).
        X_train, X_val, y_train, y_val = train_test_split(
            partition_X_train,
            partition_y_train,
            test_size=self.test_size,
            stratify=partition_y_train,
            random_state=42,
        )
        return X_train, X_val, y_train, y_val, partition_X_test, partition_y_test

# Build the per-client partitions (the constructor prints each fold's shapes)
federated_data = FederatedData(X=X, y=y, num_clients=3)

Train shape: (949976, 8, 1), Test shape: (474988, 8, 1)
Train shape: (949976, 8, 1), Test shape: (474988, 8, 1)
Train shape: (949976, 8, 1), Test shape: (474988, 8, 1)


In [8]:
import os
import datetime
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import csv

# Custom CSV Logger
class CustomCSVLogger(tf.keras.callbacks.Callback):
    """Keras callback that writes the per-epoch ``logs`` dict to a CSV file.

    The column order is the sorted key order of the first epoch's ``logs``;
    one row is written per epoch and flushed immediately, so completed epochs
    survive an interrupted run.

    Args:
        filename: Path of the CSV file to write.
        separator: Field delimiter. Defaults to ','.
        append: If True, rows are appended to an existing file and the header
            is only written when the file is missing or empty. If False
            (default), the file is overwritten at the start of training.
    """

    def __init__(self, filename, separator=',', append=False):
        super().__init__()
        self.sep = separator
        self.filename = filename
        self.append = append
        self.file = None
        self.writer = None
        self.keys = None
        self._write_header = True

    def on_train_begin(self, logs=None):
        # When appending to a file that already has content, its header is
        # already there; writing another one would corrupt the table.
        has_content = os.path.isfile(self.filename) and os.path.getsize(self.filename) > 0
        self._write_header = not (self.append and has_content)
        # Re-derive the columns for every fit() so a reused callback instance
        # still writes a header into a freshly truncated file.
        self.keys = None
        mode = 'a' if self.append else 'w'  # 'w' truncates any previous file
        self.file = open(self.filename, mode, newline='')
        self.writer = csv.writer(self.file, delimiter=self.sep)

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        if self.keys is None:
            self.keys = sorted(logs.keys())
            if self._write_header:
                self.writer.writerow(self.keys)
        row = [logs.get(key) for key in self.keys]
        self.writer.writerow(row)
        # Persist each epoch right away instead of relying on on_train_end.
        self.file.flush()

    def on_train_end(self, logs=None):
        if self.file is not None:
            self.file.close()
            self.file = None
            self.writer = None


# Initialize global model
input_shape = X.shape[1:]
global_model = cnnmodel()
# Take the client count from the partitioner so the loop below can never ask
# for a partition that does not exist.
num_clients = federated_data.num_clients
local_epochs = 1  # number of federated rounds (train every client, then average)
client_fit_epochs = 50  # Keras epochs each client trains within one round
client_batch_size = 256

# Every artefact of this run is written below this Drive folder. Create it up
# front: the metrics CSV is opened before any per-round directory is made.
output_dir = "/content/drive/MyDrive/DHS/Diabetes/Data/India/Deep Learning/CNN Federated/8 Features"
os.makedirs(output_dir, exist_ok=True)

# CSV file to store performance metrics (overwritten on every run)
csv_filename = os.path.join(output_dir, "CNN_8_features_federated_learning_metrics.csv")
header = ['Local Epoch', 'Client', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC', 'Specificity', 'Kappa', 'MCC', 'Confusion Matrix']
with open(csv_filename, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)

# Federated training process
for local_epoch in range(1, local_epochs + 1):
    print(f"Local Epoch {local_epoch}:")
    client_models = []

    # Create a directory for the current local epoch
    epoch_dir = os.path.join(output_dir, f"epoch_{local_epoch}")
    os.makedirs(epoch_dir, exist_ok=True)

    for client in range(num_clients):
        # NOTE(review): X_test / y_test are returned here but not used below.
        X_train, X_val, y_train, y_val, X_test, y_test = federated_data.get_training_and_validation_data(client)
        client_model = cnnmodel()
        client_model.set_weights(global_model.get_weights())  # Initialize with global weights

        with tf.device('/GPU:0'):
            log_dir = os.path.join(epoch_dir, "logs/profile/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
            tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, profile_batch=0)
            csv_logger = CustomCSVLogger(os.path.join(epoch_dir, f"training_client_{client+1}_epoch_{local_epoch}.csv"))

            history = client_model.fit(
                X_train, y_train,
                epochs=client_fit_epochs,
                batch_size=client_batch_size,
                validation_data=(X_val, y_val),
                verbose=1,
                callbacks=[tensorboard_callback, csv_logger]
            )

        # Evaluate the client model.
        # NOTE(review): these metrics use the same validation split that was
        # passed to fit() as validation_data.
        client_metrics = evaluate_model(client_model, X_val, y_val)
        print(f"Client {client + 1} - Metrics after Local Epoch {local_epoch}: {client_metrics}")

        # Save client model performance to CSV; the confusion matrix (last
        # metric) is serialised to a single string cell.
        with open(csv_filename, 'a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([local_epoch, client + 1] + client_metrics[:-1] + [np.array2string(client_metrics[-1], separator=',')])

        client_models.append(client_model)

    # Aggregate: unweighted element-wise mean of each weight tensor across
    # the client models, then load the result into the global model.
    global_weights = global_model.get_weights()
    new_weights = [client_model.get_weights() for client_model in client_models]

    averaged_weights = [np.mean(np.array(layer_weights), axis=0) for layer_weights in zip(*new_weights)]
    global_model.set_weights(averaged_weights)

# Save the global model
global_model.save(os.path.join(output_dir, "CNN_8_features_global_model.keras"))

Local Epoch 1:
Epoch 1/50
   6/2969 [..............................] - ETA: 32s - loss: 0.5692 - accuracy: 0.6960    



Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Client 1 - Metrics after Local Epoch 1: [0.979973262595002, 0.9835823903106407, 0.9762416050864229, 0.9798982497477375, 0.9987747690212885, 0.9837049201035811, 0.959946525190004, 0.9599732613317326, array([[93450,  1548],
       [ 2257, 92741]])]
Epoch 1/50
   1/2969 [..............................] - ETA: 3:22:50 - loss: 0.7651 - accuracy: 0.4844



Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Client 2 - Metrics after Local Epoch 1: [0.978783763868713, 0.9874816458206059, 0.9698625234215457, 0.9785927849560012, 0.9986995170439533, 0.9877050043158804, 0.957567527737426, 0.957719986926355, array([[93830,  1168],
       [ 2863, 92135]])]
Epoch 1/50
   1/2969 [..............................] - ETA: 2:44:17 - loss: 0.7938 - accuracy: 0.4727



Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Client 3 - Metrics after Local Epoch 1: [0.9791890355586433, 0.9829715449742186, 0.9752731636455504, 0.9791072221165429, 0.9987085521058134, 0.9831049074717363, 0.9583780711172867, 0.9584074641094005, array([[93393,  1605],
       [ 2349, 92649]])]
