I utilized the LSTM implementation specifically tailored for EEG data from the following GitHub repository: https://github.com/theyou21/BigProject. This resource provided invaluable support for my LSTM analysis.

In [1]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd
import os

In [3]:
ec_data_dir = "/content/drive/MyDrive/TD-BRAIN/training_data/data/EC"
eo_data_dir = "/content/drive/MyDrive/TD-BRAIN/training_data/data/EO"
ec_eeg_data = np.load(os.path.join(ec_data_dir, "normalized_epoch_eeg_data.npy"))
eo_eeg_data = np.load(os.path.join(eo_data_dir, "normalized_epoch_eeg_data.npy"))

In [4]:
print(ec_eeg_data.shape)
print(eo_eeg_data.shape)

(4356, 1, 32, 4975)
(4344, 1, 32, 4975)


In [5]:
ec_labels_dir = "/content/drive/MyDrive/TD-BRAIN/training_data/data/EC"
eo_labels_dir = "/content/drive/MyDrive/TD-BRAIN/training_data/data/EO"
ec_eeg_labels = np.load(os.path.join(ec_labels_dir, "labels_data.npy"))
eo_eeg_labels = np.load(os.path.join(eo_labels_dir, "labels_data.npy"))

In [6]:
print(ec_eeg_labels.shape)
print(eo_eeg_labels.shape)

(4356, 2)
(4344, 2)


In [7]:
for label in ec_eeg_labels:
  sample_id = label[0]
  if sample_id not in eo_eeg_labels[:, 0]:
        index_to_remove = np.where(ec_eeg_labels[:, 0] == sample_id)[0]
        ec_eeg_labels = np.delete(ec_eeg_labels, index_to_remove, axis=0)
        ec_eeg_data = np.delete(ec_eeg_data, index_to_remove, axis=0)
print(ec_eeg_labels.shape)
print(ec_eeg_data.shape)

(4344, 2)
(4344, 1, 32, 4975)


In [8]:
eeg_data = np.concatenate((ec_eeg_data[:, 0], eo_eeg_data[:, 0]), axis=1)
eeg_data.shape

(4344, 64, 4975)

In [9]:
eeg_labels = eo_eeg_labels

In [10]:
healthy_count, mdd_count = 0, 0
for sample in eeg_labels:
  if sample[1] == "MDD":
      mdd_count += 1
  else:
      healthy_count += 1

print(f"Number of MDD patient: {mdd_count}")
print(f"Number of Healthy patient: {healthy_count}")

Number of MDD patient: 3780
Number of Healthy patient: 564


### **Converting the labels to binary**
1 -> MDD

0 -> HEALTHY

# **Model**

In [11]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
import matplotlib.pyplot as plt
from keras.regularizers import l2
from keras.metrics import Precision, Recall
from sklearn.metrics import f1_score, confusion_matrix

In [12]:
ll = ec_eeg_labels
encountered_sample_ids = {}
sample_ids_with_more_than_12_entries = []

for index, sample_id in enumerate(ll):
    sample_id_tuple = tuple(sample_id)
    count = encountered_sample_ids.get(sample_id_tuple, 0)
    count += 1
    encountered_sample_ids[sample_id_tuple] = count
    if count > 12:
        sample_ids_with_more_than_12_entries.append((sample_id_tuple, index))

indices_to_remove = [index for _, index in sample_ids_with_more_than_12_entries]
ec_eeg_labels = [sample for i, sample in enumerate(ec_eeg_labels) if i not in indices_to_remove]
eeg_data = [data for i, data in enumerate(eeg_data) if i not in indices_to_remove]
print("Length of filtered ec_eeg_labels:", len(ec_eeg_labels))
print("Length of filtered eeg_data:", len(eeg_data))


###### Undersampling and preparing training data ########
ll = ec_eeg_labels
unique_sample_id = []
encountered_sample_ids = set()
print(len(ll))
for sample_id in ll:
    sample_id_tuple = tuple(sample_id)
    if sample_id_tuple not in encountered_sample_ids:
        unique_sample_id.append(sample_id)
        encountered_sample_ids.add(sample_id_tuple)
print(len(unique_sample_id))

num_samples_minority = 47
indices_maj = [index for index, sample in enumerate(unique_sample_id) if sample[1] == "MDD"]
indices_min = [index for index, sample in enumerate(unique_sample_id) if sample[1] == "HEALTHY"]
undersampled = np.random.choice(indices_maj, num_samples_minority, replace=False)

balanced_data_indices = np.concatenate([indices_min, undersampled])
# print(unique_sample_id)
balanced_unique_sample_id = [unique_sample_id[i] for i in balanced_data_indices]

# Extract all unique sample IDs from train_unique_sample_id
unique_sample_ids = [sample_id[0] for sample_id in balanced_unique_sample_id]
print(len(unique_sample_ids))
# Extract all indices from eeg_labels for sample IDs in train_unique_sample_id
indices = []
for i, sample_id in enumerate(ec_eeg_labels):
  # print(sample_id[0])
  if sample_id[0] in unique_sample_ids:
        indices.append(i)

# Convert indices to a NumPy array
indices = np.array(indices)
X_train = []
y_train = []
for i in indices:
    X_train.append(eeg_data[i])
    y_train.append(eeg_labels[i])

X_train = np.array(X_train)
y_train = np.array(y_train)

# Shuffle together with their indices
permutation = np.random.permutation(len(X_train))
X_train = X_train[permutation]
y_train = y_train[permutation]

print(X_train.shape)
# print(y_train)

sample_ids = []
for sample in y_train:
  sample_ids.append(sample[0])
sample_ids = np.array(sample_ids)
l = np.array([1 if label[1] == "MDD" else 0 for label in y_train])

Length of filtered ec_eeg_labels: 4248
Length of filtered eeg_data: 4248
4248
354
94
(1128, 64, 4975)


In [19]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, confusion_matrix
from keras.models import Sequential
from keras.layers import LSTM, Dropout, BatchNormalization, Dense
from keras.optimizers import Adam
from keras.metrics import Precision, Recall
from sklearn.model_selection import StratifiedKFold

class EEGClassifier:
    def __init__(self, input_shape=(64, 4975)):
        self.input_shape = input_shape
        self.model = self.build_model()

    def build_model(self):
        model = Sequential()
        model.add(LSTM(units=64, input_shape=self.input_shape, return_sequences=True))
        model.add(Dropout(0.5))
        model.add(BatchNormalization())

        model.add(LSTM(units=64, return_sequences=True))
        model.add(Dropout(0.5))

        model.add(LSTM(units=32))
        model.add(Dropout(0.5))

        model.add(Dense(units=32, activation='relu'))
        model.add(Dropout(0.5))

        model.add(Dense(units=1, activation='sigmoid'))

        optimizer = Adam(learning_rate=0.001)
        model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy', Precision(), Recall()])
        return model

    def train(self, X_train, y_train, X_val, y_val, epochs=20, batch_size=32):
        # Calculate class weights
        class_weights = {0: 1, 1: 1}  # Initialize with equal weights
        num_minority = np.sum(y_train == 0)
        num_majority = np.sum(y_train == 1)
        total_samples = len(y_train)
        class_weights[0] = (1 / num_minority) * (total_samples / 2.0)
        class_weights[1] = (1 / num_majority) * (total_samples / 2.0)

        history = self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val, y_val), class_weight=class_weights, verbose=0)
        return history

    def evaluate(self, X_test, y_test):
        loss, accuracy, precision, recall = self.model.evaluate(X_test, y_test, verbose=0)
        y_pred = self.model.predict(X_test, verbose=0)
        y_pred_classes = np.round(y_pred)
        f1 = f1_score(y_test, y_pred_classes)
        cm = confusion_matrix(y_test, y_pred_classes)

        evaluation_metrics = {
            'loss': loss,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'confusion_matrix': cm
        }

        return evaluation_metrics

    def predict(self, X):
        return self.model.predict(X, verbose=0)

    def plot_loss(self, history):
        plt.plot(history.history['loss'], label='Training Loss')
        plt.plot(history.history['val_loss'], label='Validation Loss')
        plt.title('Training and Validation Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.show()

def main():
    classifier = EEGClassifier()

    num_splits = 5
    cv = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

    overall_train_metrics = []
    overall_val_metrics = []

    for fold_idx, (train_index, val_index) in enumerate(cv.split(X_train, l), 1):
        print(f"Fold {fold_idx}:")

        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = l[train_index], l[val_index]

        history = classifier.train(X_train_fold, y_train_fold, X_val_fold, y_val_fold)

        # Evaluate on training set after training
        train_metrics = classifier.evaluate(X_train_fold, y_train_fold)
        print(f'Training Results - Loss: {train_metrics["loss"]}, Accuracy: {train_metrics["accuracy"]}, '
              f'Precision: {train_metrics["precision"]}, Recall: {train_metrics["recall"]}, '
              f'F1 Score: {train_metrics["f1_score"]}')
        overall_train_metrics.append(train_metrics)

        # Evaluate on the validation set after training
        val_metrics = classifier.evaluate(X_val_fold, y_val_fold)
        print(f'Validation Results - Loss: {val_metrics["loss"]}, Accuracy: {val_metrics["accuracy"]}, '
              f'Precision: {val_metrics["precision"]}, Recall: {val_metrics["recall"]}, '
              f'F1 Score: {val_metrics["f1_score"]}')
        overall_val_metrics.append(val_metrics)
        print()

    # Calculate and print overall metrics
    def calculate_overall_metrics(metrics_list):
        avg_metrics = {}
        for key in metrics_list[0].keys():
            avg_metrics[key] = np.mean([metrics[key] for metrics in metrics_list], axis=0)
        return avg_metrics

    overall_train_metrics = calculate_overall_metrics(overall_train_metrics)
    overall_val_metrics = calculate_overall_metrics(overall_val_metrics)

    print("Overall Training Metrics:")
    print(overall_train_metrics)
    print("\nOverall Validation Metrics:")
    print(overall_val_metrics)

if __name__ == "__main__":
    main()


  super().__init__(**kwargs)


Fold 1:
Training Results - Loss: 1.765676142871379852, Accuracy: 0.6188913536071777, Precision: 0.7285027502058202, Recall: 0.708344361782074, F1 Score: 0.6991714995857498
Validation Results - Loss: 2.5720951557159424, Accuracy: 0.5442478060722351, Precision: 0.6899224519729614, Recall: 0.5855262875556946, F1 Score: 0.6334519572953736

Fold 2:
Training Results - Loss: 0.322660944825038314, Accuracy: 0.7488913536071777, Precision: 0.8309397492791843, Recall: 0.7983471035957336, F1 Score: 0.7291728701406121
Validation Results - Loss: 0.4839482009410858, Accuracy: 0.7182300758361816, Precision: 0.7683561611175537, Recall: 0.75072847962379456, F1 Score: 0.69225589225589226

Fold 3:
Training Results - Loss: 0.512056710198521614, Accuracy: 0.8477827072143555, Precision: 0.7463826482682733, Recall: 0.7966942071914673, F1 Score: 0.783443708609272
Validation Results - Loss: 0.5872975587844849, Accuracy: 0.8149557638168335, Precision: 0.7139072895050049, Recall: 0.7139072895050049, F1 Score: 0.7