
I utilized the CNN implementation specifically tailored for EEG data from the following GitHub repository: https://github.com/theyou21/BigProject. This resource provided invaluable support for my CNN analysis.

In [None]:
# Mount Google Drive inside the Colab runtime; every data path used below
# lives under /content/drive/MyDrive. This cell only works on Google Colab.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
# Folders holding the pre-processed EC and EO recordings
# (presumably eyes-closed / eyes-open -- confirm against the dataset docs).
ec_data_dir = "/content/drive/MyDrive/TD-BRAIN/training_data/data/EC"
eo_data_dir = "/content/drive/MyDrive/TD-BRAIN/training_data/data/EO"

# Both folders store their epochs under the same file name.
ec_eeg_data, eo_eeg_data = (
    np.load(os.path.join(data_dir, "normalized_epoch_eeg_data.npy"))
    for data_dir in (ec_data_dir, eo_data_dir)
)

In [None]:
# Sanity check: one shape per line, EC first and EO second.
print(ec_eeg_data.shape, eo_eeg_data.shape, sep="\n")

(4356, 1, 32, 4975)
(4344, 1, 32, 4975)


In [None]:
# The label files sit next to the epoch files in the EC / EO folders.
ec_labels_dir = "/content/drive/MyDrive/TD-BRAIN/training_data/data/EC"
eo_labels_dir = "/content/drive/MyDrive/TD-BRAIN/training_data/data/EO"

# Each label array has one row per epoch; later cells read column 0 as the
# participant ID and column 1 as the diagnosis string.
ec_eeg_labels, eo_eeg_labels = (
    np.load(os.path.join(labels_dir, "labels_data.npy"))
    for labels_dir in (ec_labels_dir, eo_labels_dir)
)

In [None]:
# Sanity check: one shape per line, EC first and EO second.
print(ec_eeg_labels.shape, eo_eeg_labels.shape, sep="\n")

(4356, 2)
(4344, 2)


In [None]:
# Keep only the EC epochs whose participant ID (label column 0) also occurs in
# the EO labels. A single vectorised mask replaces the per-row np.delete loop,
# which re-copied the whole EEG array on every deletion.
ec_in_eo_mask = np.isin(ec_eeg_labels[:, 0], eo_eeg_labels[:, 0])
if not ec_in_eo_mask.all():  # skip the (large) copy when nothing has to go
    ec_eeg_labels = ec_eeg_labels[ec_in_eo_mask]
    ec_eeg_data = ec_eeg_data[ec_in_eo_mask]
print(ec_eeg_labels.shape)
print(ec_eeg_data.shape)

# The concatenation below pairs EC row i with EO row i, so both label arrays
# must list the same participant in every row. Fail loudly instead of silently
# stacking recordings that belong to different participants.
if not np.array_equal(ec_eeg_labels[:, 0], eo_eeg_labels[:, 0]):
    raise ValueError(
        "EC and EO epochs are not aligned row-by-row on participant ID; "
        "sort or re-match them before concatenating."
    )

# Drop the singleton axis 1 and stack the EC and EO channels along the channel
# axis: (n, 32, T) + (n, 32, T) -> (n, 64, T).
eeg_data = np.concatenate((ec_eeg_data[:, 0], eo_eeg_data[:, 0]), axis=1)
eeg_data.shape

(4344, 2)
(4344, 1, 32, 4975)


(4344, 64, 4975)

In [None]:
# eeg_data was built by pairing EC row i with EO row i, so the (filtered) EC
# label array is reused as the per-row label set. This is an alias, not a copy.
eeg_labels = ec_eeg_labels

In [None]:
# Tally the rows of eeg_labels by diagnosis (label column 1). Every row that
# is not tagged "MDD" is counted as healthy. Note that one row is one epoch,
# so these are row counts.
mdd_count = sum(1 for row in eeg_labels if row[1] == "MDD")
healthy_count = len(eeg_labels) - mdd_count

print(f"Number of MDD participants: {mdd_count}")
print(f"Number of Healthy participants: {healthy_count}")

Number of MDD participants: 3780
Number of Healthy participants: 564


Extracting data for female participants

In [None]:
import pandas as pd
import numpy as np

# Value of the 'gender' column that this cell selects.
# NOTE(review): the notebook treats code 1 as "female" -- confirm this against
# the TDBRAIN participants documentation before trusting the subset.
FEMALE_GENDER_CODE = 1
# Diagnoses (label column 1) that are kept in the subset.
SELECTED_CONDITIONS = ("HEALTHY", "MDD")

# Load the participants data.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
df_participants = pd.read_pickle('/content/drive/MyDrive/TD-BRAIN/TDBRAIN_participants_V2_data/df_participants.pkl')

# Build the ID -> gender lookup once instead of scanning the DataFrame for
# every epoch. keep='first' mirrors the previous `.values[0]` behaviour when a
# participant ID appears on several rows.
gender_by_id = (
    df_participants
    .drop_duplicates(subset='participants_ID', keep='first')
    .set_index('participants_ID')['gender']
    .to_dict()
)

# True for every epoch whose participant is known, has the selected gender
# code and carries one of the selected diagnoses. Unknown IDs map to None and
# are therefore excluded.
female_mask = np.array(
    [
        gender_by_id.get(labels[0]) == FEMALE_GENDER_CODE
        and labels[1] in SELECTED_CONDITIONS
        for labels in eeg_labels
    ],
    dtype=bool,
)

# Boolean indexing yields the NumPy arrays directly, without first collecting
# the epochs in a Python list and then copying them again.
eeg_data_female = eeg_data[female_mask]
eeg_label_female = eeg_labels[female_mask]

# Output the shape of the arrays to verify the results
print(f"Shape of female EEG data: {eeg_data_female.shape}")
print(f"Shape of female EEG labels: {eeg_label_female.shape}")

Shape of female EEG data: (1932, 64, 4975)
Shape of female EEG labels: (1932, 2)


In [None]:
# Tally the rows of the female subset by diagnosis (label column 1). Every row
# that is not tagged "MDD" is counted as healthy. One row is one epoch, so
# these are row counts.
mdd_count_female = sum(1 for row in eeg_label_female if row[1] == "MDD")
healthy_count_female = len(eeg_label_female) - mdd_count_female

print(f"Number of MDD female participants: {mdd_count_female}")
print(f"Number of Healthy female participants: {healthy_count_female}")

Number of MDD female participants: 1740
Number of Healthy female participants: 192


### **Converting the labels to binary**
1 -> MDD

0 -> HEALTHY

# **Model**

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, BatchNormalization, Conv1D, MaxPooling1D, Flatten
import matplotlib.pyplot as plt
from keras.regularizers import l2
from keras.metrics import Precision, Recall
from sklearn.metrics import f1_score, confusion_matrix

In [None]:
# Tunables for this cell.
MAX_EPOCHS_PER_SAMPLE = 12   # epochs kept per (participant ID, diagnosis) pair
SAMPLING_SEED = 42           # makes the undersampling and the shuffle repeatable
sampling_rng = np.random.default_rng(SAMPLING_SEED)

###### Cap the number of epochs per (participant ID, diagnosis) pair ########
ll = eeg_label_female
encountered_sample_ids = {}
sample_ids_with_more_than_12_entries = []

for index, sample_id in enumerate(ll):
    sample_id_tuple = tuple(sample_id)
    count = encountered_sample_ids.get(sample_id_tuple, 0) + 1
    encountered_sample_ids[sample_id_tuple] = count
    # Every occurrence beyond the cap is marked for removal.
    if count > MAX_EPOCHS_PER_SAMPLE:
        sample_ids_with_more_than_12_entries.append((sample_id_tuple, index))

indices_to_remove = [index for _, index in sample_ids_with_more_than_12_entries]
removal_lookup = set(indices_to_remove)  # O(1) membership instead of a list scan
eeg_label_female = [sample for i, sample in enumerate(eeg_label_female) if i not in removal_lookup]
eeg_data_female = [data for i, data in enumerate(eeg_data_female) if i not in removal_lookup]
print("Length of filtered eeg_label_female:", len(eeg_label_female))
print("Length of filtered eeg_data_female:", len(eeg_data_female))


###### Undersampling and preparing training data ########
# Collect each distinct (participant ID, diagnosis) pair once, in first-seen order.
ll = eeg_label_female
unique_sample_id = []
encountered_sample_ids = set()
print(len(ll))
for sample_id in ll:
    sample_id_tuple = tuple(sample_id)
    if sample_id_tuple not in encountered_sample_ids:
        unique_sample_id.append(sample_id)
        encountered_sample_ids.add(sample_id_tuple)
print(len(unique_sample_id))

indices_maj = [index for index, sample in enumerate(unique_sample_id) if sample[1] == "MDD"]
indices_min = [index for index, sample in enumerate(unique_sample_id) if sample[1] == "HEALTHY"]

# Derive the minority size from the data instead of hard-coding it, so the cell
# stays balanced if the upstream filtering changes.
num_samples_minority = len(indices_min)
if num_samples_minority == 0:
    raise ValueError("No HEALTHY participants available to balance against.")
if len(indices_maj) < num_samples_minority:
    raise ValueError(
        f"Only {len(indices_maj)} MDD participants available; "
        f"cannot draw {num_samples_minority} without replacement."
    )

# Randomly keep as many MDD participants as there are HEALTHY ones.
undersampled = sampling_rng.choice(indices_maj, size=num_samples_minority, replace=False)

balanced_data_indices = np.concatenate([indices_min, undersampled]).astype(int)
balanced_unique_sample_id = [unique_sample_id[i] for i in balanced_data_indices]

# Participant IDs that made it into the balanced selection.
unique_sample_ids = [sample_id[0] for sample_id in balanced_unique_sample_id]
print(len(unique_sample_ids))

# Positions of every epoch in eeg_label_female that belongs to a selected participant.
selected_ids = set(unique_sample_ids)
indices = np.array(
    [i for i, sample_id in enumerate(eeg_label_female) if sample_id[0] in selected_ids],
    dtype=int,
)

X_train = np.array([eeg_data_female[i] for i in indices])
y_train = np.array([eeg_label_female[i] for i in indices])

# Shuffle epochs and labels with the same permutation so they stay paired.
permutation = sampling_rng.permutation(len(X_train))
X_train = X_train[permutation]
y_train = y_train[permutation]

print(X_train.shape)

# Per-epoch participant IDs (same order as X_train) and binary targets:
# 1 -> MDD, 0 -> anything else (HEALTHY after the filtering above).
sample_ids = np.array([sample[0] for sample in y_train])
l = np.array([1 if label[1] == "MDD" else 0 for label in y_train])

Length of filtered eeg_label_female: 1920
Length of filtered eeg_data_female: 1920
1920
160
32
(384, 64, 4975)


In [None]:
class EEGClassifier:
    def __init__(self, input_shape=(64, 4975), lstm_units=64):
        self.input_shape = input_shape
        self.lstm_units = lstm_units
        self.model = self.build_model()

    def build_model(self):
        from keras.models import Sequential
        from keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, BatchNormalization
        from keras.metrics import Precision, Recall

        model = Sequential()
        model.add(Conv1D(filters=32, kernel_size=4, activation='relu', input_shape=self.input_shape))
        model.add(MaxPooling1D(pool_size=2))
        model.add(LSTM(units=128, return_sequences=True))
        model.add(BatchNormalization())
        model.add(Dropout(0.2))
        model.add(LSTM(units=self.lstm_units))
        model.add(BatchNormalization())
        model.add(Dropout(0.2))
        model.add(Dense(units=64, activation='sigmoid'))
        model.add(Dense(units=1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', Precision(), Recall()])
        return model

    def train(self, X_train, y_train, X_val, y_val, epochs=20, batch_size=1):
        # Calculate class weights
        class_weights = {0: 1, 1: 1}  # Initialize with equal weights
        num_minority = np.sum(y_train == 0)
        num_majority = np.sum(y_train == 1)
        total_samples = len(y_train)
        class_weights[0] = (1 / num_minority) * (total_samples / 2.0)
        class_weights[1] = (1 / num_majority) * (total_samples / 2.0)

        history = self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val, y_val), class_weight=class_weights, verbose=0)
        return history

    def evaluate(self, X_test, y_test):
        loss, accuracy, precision, recall = self.model.evaluate(X_test, y_test, verbose=0)
        print(f'Test Loss: {loss}, Test Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}')
        y_pred = self.model.predict(X_test)
        y_pred_classes = np.round(y_pred)
        f1 = f1_score(y_test, y_pred_classes)
        print(f'F1 Score: {f1}')
        cm = confusion_matrix(y_test, y_pred_classes)
        print('Confusion Matrix:')
        print(cm)

        evaluation_metrics = {
            'loss': loss,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'confusion_matrix': cm
        }

        return evaluation_metrics

    def predict(self, X):
        return self.model.predict(X)

    def plot_loss(self, history):
        plt.plot(history.history['loss'], label='Training Loss')
        plt.plot(history.history['val_loss'], label='Validation Loss')
        plt.title('Training and Validation Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.show()


In [None]:
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold

def main():
    """Run 5-fold cross-validation of EEGClassifier on X_train / l.

    Two safeguards keep the validation scores honest:

    * Folds are grouped by participant (``sample_ids``), so epochs of one
      participant never appear in both the training and the validation split.
    * A fresh classifier is built for every fold. Reusing one instance would
      carry weights trained on earlier folds, which include the current
      validation epochs, into later folds.

    The classifier of the last fold stays available as the global
    ``classifier``.
    """
    global classifier

    num_splits = 5
    cv = StratifiedGroupKFold(n_splits=num_splits, shuffle=True, random_state=42)

    for fold_idx, (train_index, val_index) in enumerate(cv.split(X_train, l, groups=sample_ids), 1):
        print(f"Fold {fold_idx}:")

        # Start every fold from untrained weights.
        classifier = EEGClassifier()

        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = l[train_index], l[val_index]

        history = classifier.train(X_train_fold, y_train_fold, X_val_fold, y_val_fold)

        # Evaluate on training set after training
        train_metrics = classifier.evaluate(X_train_fold, y_train_fold)
        print(f'Training Results - Loss: {train_metrics["loss"]}, Accuracy: {train_metrics["accuracy"]}, '
              f'Precision: {train_metrics["precision"]}, Recall: {train_metrics["recall"]}, '
              f'F1 Score: {train_metrics["f1_score"]}')

        # Evaluate on the validation set after training
        val_metrics = classifier.evaluate(X_val_fold, y_val_fold)
        print(f'Validation Results - Loss: {val_metrics["loss"]}, Accuracy: {val_metrics["accuracy"]}, '
              f'Precision: {val_metrics["precision"]}, Recall: {val_metrics["recall"]}, '
              f'F1 Score: {val_metrics["f1_score"]}')
        print()

if __name__ == "__main__":
    main()

Fold 1:
Test Loss: 0.730476975440979, Test Accuracy: 0.5276873111724854, Precision: 0.5279502868652344, Recall: 0.551948070526123
F1 Score: 0.5396825396825397
Confusion Matrix:
[[77 76]
 [69 85]]
Training Results - Loss: 0.730476975440979, Accuracy: 0.5276873111724854, Precision: 0.5279502868652344, Recall: 0.551948070526123, F1 Score: 0.5396825396825397
Test Loss: 0.7136221528053284, Test Accuracy: 0.5194805264472961, Precision: 0.5151515007019043, Recall: 0.44736841320991516
F1 Score: 0.4788732394366197
Confusion Matrix:
[[23 16]
 [21 17]]
Validation Results - Loss: 0.7136221528053284, Accuracy: 0.5194805264472961, Precision: 0.5151515007019043, Recall: 0.44736841320991516, F1 Score: 0.4788732394366197

Fold 2:
Test Loss: 0.7140184044837952, Test Accuracy: 0.5179153084754944, Precision: 0.5306122303009033, Recall: 0.33766233921051025
F1 Score: 0.4126984126984127
Confusion Matrix:
[[107  46]
 [102  52]]
Training Results - Loss: 0.7140184044837952, Accuracy: 0.5179153084754944, Precisi