I utilized the SVM implementation specifically tailored for EEG data from the following GitHub repository: https://github.com/jayavardhanravi/EEG-Data-predection/blob/master/mypart1.py. This resource provided invaluable support for my SVM analysis.

In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import os

In [3]:
ec_data_dir = "/content/drive/MyDrive/TD-BRAIN/training_data/data/EC"
eo_data_dir = "/content/drive/MyDrive/TD-BRAIN/training_data/data/EO"
ec_eeg_data = np.load(os.path.join(ec_data_dir, "normalized_epoch_eeg_data.npy"))
eo_eeg_data = np.load(os.path.join(eo_data_dir, "normalized_epoch_eeg_data.npy"))

In [4]:
print(ec_eeg_data.shape)
print(eo_eeg_data.shape)

(4356, 1, 32, 4975)
(4344, 1, 32, 4975)


In [5]:
ec_labels_dir = "/content/drive/MyDrive/TD-BRAIN/training_data/data/EC"
eo_labels_dir = "/content/drive/MyDrive/TD-BRAIN/training_data/data/EO"
ec_eeg_labels = np.load(os.path.join(ec_labels_dir, "labels_data.npy"))
eo_eeg_labels = np.load(os.path.join(eo_labels_dir, "labels_data.npy"))

In [6]:
print(ec_eeg_labels.shape)
print(eo_eeg_labels.shape)

(4356, 2)
(4344, 2)


In [7]:
for label in ec_eeg_labels:
  sample_id = label[0]
  if sample_id not in eo_eeg_labels[:, 0]:
        index_to_remove = np.where(ec_eeg_labels[:, 0] == sample_id)[0]
        ec_eeg_labels = np.delete(ec_eeg_labels, index_to_remove, axis=0)
        ec_eeg_data = np.delete(ec_eeg_data, index_to_remove, axis=0)
print(ec_eeg_labels.shape)
print(ec_eeg_data.shape)

eeg_data = np.concatenate((ec_eeg_data[:, 0], eo_eeg_data[:, 0]), axis=1)
eeg_data.shape

eeg_labels = ec_eeg_labels

(4344, 2)
(4344, 1, 32, 4975)


In [8]:
healthy_count, mdd_count = 0, 0
for sample in eeg_labels:
  if sample[1] == "MDD":
      mdd_count += 1
  else:
      healthy_count += 1

print(f"Number of MDD participants: {mdd_count}")
print(f"Number of Healthy participants: {healthy_count}")

Number of MDD participants: 3780
Number of Healthy participants: 564


#Extracting data for female participants

In [9]:
df_participants = pd.read_pickle('/content/drive/MyDrive/TD-BRAIN/TDBRAIN_participants_V2_data/df_participants.pkl')
female_count = 0

eeg_data_female = []
eeg_label_female = []

for i, labels in enumerate(eeg_labels):
     sample_id = labels[0]
     index = df_participants.loc[df_participants['participants_ID'] == sample_id].index
     if [value for value in df_participants.loc[index, 'gender']][0] == 1:
       eeg_data_female.append(eeg_data[i])
       eeg_label_female.append(eeg_labels[i])

eeg_data_female = np.array(eeg_data_female)
eeg_label_female = np.array(eeg_label_female)

print(eeg_data_female.shape, eeg_label_female.shape)

(1932, 64, 4975) (1932, 2)


In [10]:
healthy_count_female, mdd_count_female = 0, 0
for sample in eeg_label_female:
  if sample[1] == "MDD":
      mdd_count_female += 1
  else:
      healthy_count_female += 1

print(f"Number of MDD female participants: {mdd_count_female}")
print(f"Number of Healthy female participants: {healthy_count_female}")

Number of MDD female participants: 1740
Number of Healthy female participants: 192


# **Model**

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.utils import resample, shuffle

In [12]:
ll = eeg_label_female
encountered_sample_ids = {}
sample_ids_with_more_than_12_entries = []

for index, sample_id in enumerate(ll):
    sample_id_tuple = tuple(sample_id)
    count = encountered_sample_ids.get(sample_id_tuple, 0)
    count += 1
    encountered_sample_ids[sample_id_tuple] = count
    if count > 12:
        sample_ids_with_more_than_12_entries.append((sample_id_tuple, index))

indices_to_remove = [index for _, index in sample_ids_with_more_than_12_entries]
eeg_label_female = [sample for i, sample in enumerate(eeg_label_female) if i not in indices_to_remove]
eeg_data_female = [data for i, data in enumerate(eeg_data_female) if i not in indices_to_remove]
print("Length of filtered eeg_label_female:", len(eeg_label_female))
print("Length of filtered eeg_data_female:", len(eeg_data_female))


###### Undersampling and preparing training data ########
ll = eeg_label_female
unique_sample_id = []
encountered_sample_ids = set()
print(len(ll))
for sample_id in ll:
    sample_id_tuple = tuple(sample_id)
    if sample_id_tuple not in encountered_sample_ids:
        unique_sample_id.append(sample_id)
        encountered_sample_ids.add(sample_id_tuple)
print(len(unique_sample_id))

num_samples_minority = 16
indices_maj = [index for index, sample in enumerate(unique_sample_id) if sample[1] == "MDD"]
indices_min = [index for index, sample in enumerate(unique_sample_id) if sample[1] == "HEALTHY"]
undersampled = np.random.choice(indices_maj, num_samples_minority, replace=False)

balanced_data_indices = np.concatenate([indices_min, undersampled])
# print(unique_sample_id)
balanced_unique_sample_id = [unique_sample_id[i] for i in balanced_data_indices]

# Extract all unique sample IDs from train_unique_sample_id
unique_sample_ids = [sample_id[0] for sample_id in balanced_unique_sample_id]
print(len(unique_sample_ids))
# Extract all indices from eeg_labels for sample IDs in train_unique_sample_id
indices = []
for i, sample_id in enumerate(eeg_label_female):
  # print(sample_id[0])
  if sample_id[0] in unique_sample_ids:
        indices.append(i)

# Convert indices to a NumPy array
indices = np.array(indices)
X_train = []
y_train = []
for i in indices:
    X_train.append(eeg_data_female[i])
    y_train.append(eeg_label_female[i])

X_train = np.array(X_train)
y_train = np.array(y_train)

# Shuffle together with their indices
permutation = np.random.permutation(len(X_train))
X_train = X_train[permutation]
y_train = y_train[permutation]

print(X_train.shape)
# print(y_train)

sample_ids = []
for sample in y_train:
  sample_ids.append(sample[0])
sample_ids = np.array(sample_ids)
l = np.array([1 if label[1] == "MDD" else 0 for label in y_train])

Length of filtered eeg_label_female: 1920
Length of filtered eeg_data_female: 1920
1920
160
32
(384, 64, 4975)


# Results

In [22]:
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, BatchNormalization
from keras.metrics import Precision, Recall
import matplotlib.pyplot as plt

class EEGClassifier:
    def __init__(self, input_shape=(64, 4975), lstm_units=64):
        self.input_shape = input_shape
        self.lstm_units = lstm_units
        self.model = self.build_model()

    def build_model(self):
        model = Sequential()
        model.add(Conv1D(filters=32, kernel_size=4, activation='relu', input_shape=self.input_shape))
        model.add(MaxPooling1D(pool_size=2))
        model.add(LSTM(units=128, return_sequences=True))
        model.add(BatchNormalization())
        model.add(Dropout(0.2))
        model.add(LSTM(units=self.lstm_units))
        model.add(BatchNormalization())
        model.add(Dropout(0.2))
        model.add(Dense(units=64, activation='sigmoid'))
        model.add(Dense(units=1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', Precision(), Recall()])
        return model

    def train(self, X_train, y_train, X_val, y_val, epochs=20, batch_size=1):
        class_weights = {0: 1, 1: 1}  # Initialize with equal weights
        num_minority = np.sum(y_train == 0)
        num_majority = np.sum(y_train == 1)
        total_samples = len(y_train)
        class_weights[0] = (1 / num_minority) * (total_samples / 2.0)
        class_weights[1] = (1 / num_majority) * (total_samples / 2.0)

        history = self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val, y_val), class_weight=class_weights, verbose=0)
        return history

    def evaluate(self, X_test, y_test):
        loss, accuracy, precision, recall = self.model.evaluate(X_test, y_test, verbose=0)
        y_pred = self.model.predict(X_test)
        y_pred_classes = np.round(y_pred)
        f1 = f1_score(y_test, y_pred_classes)
        cm = confusion_matrix(y_test, y_pred_classes)
        evaluation_metrics = {
            'loss': loss,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'confusion_matrix': cm
        }
        return evaluation_metrics

    def plot_loss(self, history):
        plt.plot(history.history['loss'], label='Training Loss')
        plt.plot(history.history['val_loss'], label='Validation Loss')
        plt.title('Training and Validation Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.show()

def main():
    global X, y
    df_participants = pd.read_pickle('/content/drive/MyDrive/TD-BRAIN/TDBRAIN_participants_V2_data/df_participants.pkl')
    female_count = 0

    eeg_data_female = []
    eeg_label_female = []

    for i, labels in enumerate(eeg_labels):
      sample_id = labels[0]
      index = df_participants.loc[df_participants['participants_ID'] == sample_id].index
      if [value for value in df_participants.loc[index, 'gender']][0] == 1:
        eeg_data_female.append(eeg_data[i])
        eeg_label_female.append(eeg_labels[i])

    eeg_data_female = np.array(eeg_data_female)
    eeg_label_female = np.array(eeg_label_female)

    print(eeg_data_female.shape, eeg_label_female.shape)

    # Step 1: Filter out entries with more than 12 instances
    ll = eeg_label_female
    encountered_sample_ids = {}
    sample_ids_with_more_than_12_entries = []

    for index, sample_id in enumerate(ll):
        sample_id_tuple = tuple(sample_id)
        count = encountered_sample_ids.get(sample_id_tuple, 0)
        count += 1
        encountered_sample_ids[sample_id_tuple] = count
        if count > 12:
            sample_ids_with_more_than_12_entries.append((sample_id_tuple, index))

    indices_to_remove = [index for _, index in sample_ids_with_more_than_12_entries]
    eeg_label_female = [sample for i, sample in enumerate(eeg_label_female) if i not in indices_to_remove]
    eeg_data_female = [data for i, data in enumerate(eeg_data_female) if i not in indices_to_remove]
    print("Length of filtered eeg_label_female:", len(eeg_label_female))
    print("Length of filtered eeg_data_female:", len(eeg_data_female))

    # Step 2: Undersampling and preparing training data
    ll = eeg_label_female
    unique_sample_id = []
    encountered_sample_ids = set()
    print(len(ll))
    for sample_id in ll:
        sample_id_tuple = tuple(sample_id)
        if sample_id_tuple not in encountered_sample_ids:
            unique_sample_id.append(sample_id)
            encountered_sample_ids.add(sample_id_tuple)
    print(len(unique_sample_id))

    num_samples_minority = 16
    indices_maj = [index for index, sample in enumerate(unique_sample_id) if sample[1] == "MDD"]
    indices_min = [index for index, sample in enumerate(unique_sample_id) if sample[1] == "HEALTHY"]
    undersampled = np.random.choice(indices_maj, num_samples_minority, replace=False)

    balanced_data_indices = np.concatenate([indices_min, undersampled])
    balanced_unique_sample_id = [unique_sample_id[i] for i in balanced_data_indices]

    # Extract all unique sample IDs from balanced_unique_sample_id
    unique_sample_ids = [sample_id[0] for sample_id in balanced_unique_sample_id]
    print(len(unique_sample_ids))
    # Extract all indices from eeg_label_female for sample IDs in balanced_unique_sample_id
    indices = []
    for i, sample_id in enumerate(eeg_label_female):
        if sample_id[0] in unique_sample_ids:
            indices.append(i)

    # Convert indices to a NumPy array
    indices = np.array(indices)
    X = np.array([eeg_data_female[i] for i in indices])
    y = np.array([1 if label[1] == "MDD" else 0 for label in eeg_label_female if label[0] in unique_sample_ids])

    # Shuffle together with their indices
    permutation = np.random.permutation(len(X))
    X = X[permutation]
    y = y[permutation]

    print(X.shape)

    # Split the data into training, validation, and test sets
    X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=42)

    classifier = EEGClassifier()

    num_splits = 5
    cv = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

    val_accuracies = []
    val_precisions = []
    val_recalls = []
    val_f1_scores = []
    val_confusion_matrices = []

    for fold_idx, (train_index, val_index) in enumerate(cv.split(X_train, y_train), 1):
        print(f"Fold {fold_idx}:")

        X_train_fold = X_train[train_index]
        y_train_fold = y_train[train_index]
        X_val_fold = X_train[val_index]
        y_val_fold = y_train[val_index]

        history = classifier.train(X_train_fold, y_train_fold, X_val_fold, y_val_fold)

        train_metrics = classifier.evaluate(X_train_fold, y_train_fold)
        print(f'Training Results - Loss: {train_metrics["loss"]}, Accuracy: {train_metrics["accuracy"]}, '
              f'Precision: {train_metrics["precision"]}, Recall: {train_metrics["recall"]}, '
              f'F1 Score: {train_metrics["f1_score"]}')

        val_metrics = classifier.evaluate(X_val_fold, y_val_fold)
        print(f'Validation Results - Loss: {val_metrics["loss"]}, Accuracy: {val_metrics["accuracy"]}, '
              f'Precision: {val_metrics["precision"]}, Recall: {val_metrics["recall"]}, '
              f'F1 Score: {val_metrics["f1_score"]}')
        print('Validation Confusion Matrix:')
        print(val_metrics['confusion_matrix'])
        print()

        val_accuracies.append(val_metrics["accuracy"])
        val_precisions.append(val_metrics["precision"])
        val_recalls.append(val_metrics["recall"])
        val_f1_scores.append(val_metrics["f1_score"])
        val_confusion_matrices.append(val_metrics["confusion_matrix"])

    # Calculate average validation metrics
    avg_val_accuracy = np.mean(val_accuracies)
    avg_val_precision = np.mean(val_precisions)
    avg_val_recall = np.mean(val_recalls)
    avg_val_f1_score = np.mean(val_f1_scores)
    avg_val_confusion_matrix = np.mean(val_confusion_matrices, axis=0)

    print(f'Average Validation Accuracy: {avg_val_accuracy}')
    print(f'Average Validation Precision: {avg_val_precision}')
    print(f'Average Validation Recall: {avg_val_recall}')
    print(f'Average Validation F1 Score: {avg_val_f1_score}')
    print(f'Average Validation Confusion Matrix: \n{avg_val_confusion_matrix}')

    # Evaluate on the test set
    test_metrics = classifier.evaluate(X_test, y_test)
    print(f'Test Results - Accuracy: {test_metrics["accuracy"]}, Precision: {test_metrics["precision"]}, '
          f'Recall: {test_metrics["recall"]}, F1 Score: {test_metrics["f1_score"]}')
    print('Test Confusion Matrix:')
    print(test_metrics['confusion_matrix'])

if __name__ == "__main__":
    main()


(1932, 64, 4975) (1932, 2)
Length of filtered eeg_label_female: 1920
Length of filtered eeg_data_female: 1920
1920
160
32
(384, 64, 4975)
Fold 1:
Training Results - Loss: 0.730476975440979, Accuracy: 0.5276873111724854, Precision: 0.5279502868652344, Recall: 0.551948070526123, F1 Score: 0.5396825396825397
Validation Results - Loss: 0.7136221528053284, Accuracy: 0.5194805264472961, Precision: 0.5151515007019043, Recall: 0.44736841320991516, F1 Score: 0.4788732394366197
Validation Confusion Matrix:
[[20  12]
 [18 12]]

Fold 2:
Training Results - Loss: 0.7140184044837952, Accuracy: 0.5179153084754944, Precision: 0.5306122303009033, Recall: 0.33766233921051025, F1 Score: 0.4126984126984127
Validation Results - Loss: 0.7472937703132629, Accuracy: 0.41558441519737244, Precision: 0.37037035822868347, Recall: 0.2631579041481018, F1 Score: 0.30769230769230765
Validation Confusion Matrix:
[[21 7]
 [29 4]]

Fold 3:
Training Results - Loss: 0.7454760670661926, Accuracy: 0.45928338170051575, Precis