# EEG Classification

In [1]:
!wget -O file.zip "https://figshare.com/ndownloader/articles/4244171/versions/2"

--2024-03-06 15:28:30--  https://figshare.com/ndownloader/articles/4244171/versions/2
Resolving figshare.com (figshare.com)... 34.249.32.236, 34.249.22.33, 2a05:d018:1f4:d003:8d1a:b96f:3b54:baed, ...
Connecting to figshare.com (figshare.com)|34.249.32.236|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 851639264 (812M) [application/zip]
Saving to: 'file.zip'


2024-03-06 15:29:02 (25.7 MB/s) - 'file.zip' saved [851639264/851639264]



In [3]:
!unzip -q file.zip -d dataset

In [2]:
# importing necessary libraries
from glob import glob
import mne
import numpy as np 
import matplotlib.pyplot as plt 

In [4]:
# Reading all files path
all_files_path = glob('dataset/*.edf')
len(all_files_path)

181

In [5]:
# Checking a file path
all_files_path[0]

'dataset/H S15 EC.edf'

In [6]:
# Separating healthy and patients edf files
healthy_file_path = [i for i in all_files_path if 'H' in i.split('/')[1]]
patient_file_path=[i for i in all_files_path if  'MDD' in i.split('/')[1] ]
print(len(healthy_file_path),len(patient_file_path))

86 95


In [7]:
# defining a function to read data
def read_data(file_path):
    datax=mne.io.read_raw_edf(file_path,preload=True)
    datax.set_eeg_reference()
    datax.filter(l_freq=1,h_freq=45)
    epochs=mne.make_fixed_length_epochs(datax,duration=5,overlap=1)
    epochs=epochs.get_data()
    return epochs #trials,channel,length

In [8]:
# testing read data function with one file path
data = read_data(healthy_file_path[0])

Extracting EDF parameters from /kaggle/working/dataset/H S15 EC.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 76799  =      0.000 ...   299.996 secs...
EEG channel type selected for re-referencing
Applying average reference.
Applying a custom ('EEG',) reference.
Filtering raw data in 1 contiguous segment
Setting up band-pass filter from 1 - 45 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 1.00
- Lower transition bandwidth: 1.00 Hz (-6 dB cutoff frequency: 0.50 Hz)
- Upper passband edge: 45.00 Hz
- Upper transition bandwidth: 11.25 Hz (-6 dB cutoff frequency: 50.62 Hz)
- Filter length: 845 samples (3.301 s)

Not setting metadata
74 matching events found
No baseline correction applied
0 projection items activated
Using data f

[Parallel(n_jobs=1)]: Done  17 tasks      | elapsed:    0.1s


In [9]:
# Checking the shape of data
data.shape

(74, 22, 1280)

### Reading All Files

In [10]:
%%capture
# capture is the magic function used to hide the large output getting during read function
# reading the healthy and patient data using our read data function with list comprehension 
control_epochs_array=[read_data(subject) for subject in healthy_file_path]
patients_epochs_array=[read_data(subject) for subject in patient_file_path]

In [11]:
# Creating a labels array with respect to epochs array
control_epochs_labels=[len(i)*[0] for i in control_epochs_array]
patients_epochs_labels=[len(i)*[1] for i in patients_epochs_array]
print(len(control_epochs_labels),len(patients_epochs_labels))

86 95


In [12]:
# Combining the data
data_list=control_epochs_array+patients_epochs_array
label_list=control_epochs_labels+patients_epochs_labels
print(len(data_list),len(label_list))

181 181


In [13]:
# Combining/grouping the data according to patients
groups_list=[[i]*len(j) for i, j in enumerate(data_list)]

In [14]:
# Find the maximum number of channels and length of epochs among all subjects
max_num_channels = max([epoch.shape[1] for epoch in data_list])
max_epochs_length = max([epoch.shape[2] for epoch in data_list])

# Pad or truncate epochs to the maximum number of channels and length
padded_data_list = []
for epoch in data_list:
    num_trials, num_channels, epoch_length = epoch.shape
    if num_channels < max_num_channels:
        # Pad the epoch along the channels dimension (axis=1)
        pad_width_channels = ((0, 0), (0, max_num_channels - num_channels), (0, 0))
        epoch = np.pad(epoch, pad_width_channels, mode='constant')
    elif num_channels > max_num_channels:
        # Truncate the epoch along the channels dimension (axis=1)
        epoch = epoch[:, :max_num_channels, :]
    
    if epoch_length < max_epochs_length:
        # Pad the epoch along the time dimension (axis=2)
        pad_width_time = ((0, 0), (0, 0), (0, max_epochs_length - epoch_length))
        epoch = np.pad(epoch, pad_width_time, mode='constant')
    elif epoch_length > max_epochs_length:
        # Truncate the epoch along the time dimension (axis=2)
        epoch = epoch[:, :, :max_epochs_length]
    
    padded_data_list.append(epoch)

In [15]:
data_array = np.vstack(padded_data_list)
label_array = np.hstack(label_list)
group_array = np.hstack(groups_list)
print(data_array.shape,label_array.shape,group_array.shape)

(18387, 22, 1280) (18387,) (18387,)


### Feature Engineering

In [16]:
from scipy import stats
def mean(data):
    return np.mean(data,axis=-1)
    
def std(data):
    return np.std(data,axis=-1)

def ptp(data):
    return np.ptp(data,axis=-1)

def var(data):
        return np.var(data,axis=-1)

def minim(data):
      return np.min(data,axis=-1)


def maxim(data):
      return np.max(data,axis=-1)

def argminim(data):
      return np.argmin(data,axis=-1)


def argmaxim(data):
      return np.argmax(data,axis=-1)

def mean_square(data):
      return np.mean(data**2,axis=-1)

def rms(data): #root mean square
      return  np.sqrt(np.mean(data**2,axis=-1))  

def abs_diffs_signal(data):
    return np.sum(np.abs(np.diff(data,axis=-1)),axis=-1)


def skewness(data):
    return stats.skew(data,axis=-1)

def kurtosis(data):
    return stats.kurtosis(data,axis=-1)

def concatenate_features(data):
    return np.concatenate((mean(data),std(data),ptp(data),var(data),minim(data),maxim(data),argminim(data),argmaxim(data),
                          mean_square(data),rms(data),abs_diffs_signal(data),
                          skewness(data),kurtosis(data)),axis=-1)

In [17]:
from tqdm import tqdm_notebook
features=[]
for data in tqdm_notebook(data_array):
    features.append(concatenate_features(data))
features=np.array(features)
features.shape

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm_notebook(data_array):


  0%|          | 0/18387 [00:00<?, ?it/s]

(18387, 286)

In [18]:
# removing nan values
nan_mask = np.isnan(features).any(axis=0)

# Select columns that do not contain NaN values
features_cleaned = features[:, ~nan_mask]

print("Original features shape:", features.shape)
print("Features shape after dropping columns with NaN values:", features_cleaned.shape)

Original features shape: (18387, 286)
Features shape after dropping columns with NaN values: (18387, 282)


### Train Test Split

In [19]:
from sklearn.model_selection import GroupShuffleSplit 

splitter = GroupShuffleSplit(test_size=.20, n_splits=2, random_state = 7)
split = splitter.split(X=features_cleaned,y=label_array,groups=group_array)
train_inds, test_inds = next(split)

X_train,X_test = features_cleaned[train_inds], features_cleaned[test_inds]
y_train,y_test = label_array[train_inds], label_array[test_inds]
g_train,g_test = group_array[train_inds], group_array[test_inds]

In [20]:
print((X_train.shape,y_train.shape,g_train.shape,))
print((X_test.shape,y_test.shape,g_test.shape,))
group_array.max()

((14411, 282), (14411,), (14411,))
((3976, 282), (3976,), (3976,))


180

In [21]:
# dictionary to store KPIs
kpis = {}

# Logistic Regression

### Training Model

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [23]:
clf=LogisticRegression(max_iter=1000)
LGRP=Pipeline([('scaler',StandardScaler()),('classifier',clf)])
LGRP.fit(X_train,y_train)

### Model Evaluation

In [24]:
train_pred = LGRP.predict(X_train)
test_pred = LGRP.predict(X_test)

In [25]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score

# Calculate precision
precision = precision_score(y_train, train_pred)

# Calculate recall
recall = recall_score(y_train, train_pred)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_train, train_pred)

# Calculate accuracy
accuracy = accuracy_score(y_train, train_pred)

kpis['LGR_train'] = {
    'precision': precision,
    'recall': recall,
    'confusion_matrix' : conf_matrix,
    "accuracy": accuracy
}

print("Precision:", precision)
print("Recall:", recall)
print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:", accuracy)

Precision: 0.9392126860481291
Recall: 0.9667812142038946
Confusion Matrix:
 [[6990  437]
 [ 232 6752]]
Accuracy: 0.9535771285823329


In [26]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score

# Calculate precision
precision = precision_score(y_test, test_pred)

# Calculate recall
recall = recall_score(y_test, test_pred)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_pred)

# Calculate accuracy
accuracy = accuracy_score(y_test, test_pred)
kpis['LGR_test'] = {
    'precision': precision,
    'recall': recall,
    'confusion_matrix' : conf_matrix,
    "accuracy": accuracy
}
print("Precision:", precision)
print("Recall:", recall)
print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:", accuracy)

Precision: 0.9621730382293763
Recall: 0.8625541125541125
Confusion Matrix:
 [[1110   94]
 [ 381 2391]]
Accuracy: 0.880533199195171


# Random Forest Classifier

### Training Model

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [28]:
clf=RandomForestClassifier()
RFCP=Pipeline([('scaler',StandardScaler()),('classifier',clf)])
RFCP.fit(X_train,y_train)

### Model Evaluation

In [29]:
train_pred = RFCP.predict(X_train)
test_pred = RFCP.predict(X_test)

In [30]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score

# Calculate precision
precision = precision_score(y_train, train_pred)

# Calculate recall
recall = recall_score(y_train, train_pred)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_train, train_pred)

# Calculate accuracy
accuracy = accuracy_score(y_train, train_pred)

kpis['RFC_train'] = {
    'precision': precision,
    'recall': recall,
    'confusion_matrix' : conf_matrix,
    "accuracy": accuracy
}

print("Precision:", precision)
print("Recall:", recall)
print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:", accuracy)

Precision: 1.0
Recall: 1.0
Confusion Matrix:
 [[7427    0]
 [   0 6984]]
Accuracy: 1.0


In [31]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score

# Calculate precision
precision = precision_score(y_test, test_pred)

# Calculate recall
recall = recall_score(y_test, test_pred)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_pred)

# Calculate accuracy
accuracy = accuracy_score(y_test, test_pred)

kpis['RFC_test'] = {
    'precision': precision,
    'recall': recall,
    'confusion_matrix' : conf_matrix,
    "accuracy": accuracy
}

print("Precision:", precision)
print("Recall:", recall)
print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:", accuracy)

Precision: 0.9604685212298683
Recall: 0.9466089466089466
Confusion Matrix:
 [[1096  108]
 [ 148 2624]]
Accuracy: 0.9356136820925554


# Support Vector Classifier

### Training Model

In [32]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [33]:
clf=SVC(max_iter=1000)
SVCP=Pipeline([('scaler',StandardScaler()),('classifier',clf)])
SVCP.fit(X_train,y_train)



### Model Evaluation

In [34]:
train_pred = SVCP.predict(X_train)
test_pred = SVCP.predict(X_test)

In [35]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score

# Calculate precision
precision = precision_score(y_train, train_pred)

# Calculate recall
recall = recall_score(y_train, train_pred)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_train, train_pred)

# Calculate accuracy
accuracy = accuracy_score(y_train, train_pred)

kpis['SVC_train'] = {
    'precision': precision,
    'recall': recall,
    'confusion_matrix' : conf_matrix,
    "accuracy": accuracy
}

print("Precision:", precision)
print("Recall:", recall)
print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:", accuracy)

Precision: 0.6653830963665087
Recall: 0.9649198167239404
Confusion Matrix:
 [[4038 3389]
 [ 245 6739]]
Accuracy: 0.7478315175907293


In [36]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score

# Calculate precision
precision = precision_score(y_test, test_pred)

# Calculate recall
recall = recall_score(y_test, test_pred)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_pred)

# Calculate accuracy
accuracy = accuracy_score(y_test, test_pred)

kpis['SVC_test'] = {
    'precision': precision,
    'recall': recall,
    'confusion_matrix' : conf_matrix,
    "accuracy": accuracy
}

print("Precision:", precision)
print("Recall:", recall)
print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:", accuracy)

Precision: 0.8664648503195426
Recall: 0.9292929292929293
Confusion Matrix:
 [[ 807  397]
 [ 196 2576]]
Accuracy: 0.8508551307847082


In [37]:
kpis

{'LGR_train': {'precision': 0.9392126860481291,
  'recall': 0.9667812142038946,
  'confusion_matrix': array([[6990,  437],
         [ 232, 6752]]),
  'accuracy': 0.9535771285823329},
 'LGR_test': {'precision': 0.9621730382293763,
  'recall': 0.8625541125541125,
  'confusion_matrix': array([[1110,   94],
         [ 381, 2391]]),
  'accuracy': 0.880533199195171},
 'RFC_train': {'precision': 1.0,
  'recall': 1.0,
  'confusion_matrix': array([[7427,    0],
         [   0, 6984]]),
  'accuracy': 1.0},
 'RFC_test': {'precision': 0.9604685212298683,
  'recall': 0.9466089466089466,
  'confusion_matrix': array([[1096,  108],
         [ 148, 2624]]),
  'accuracy': 0.9356136820925554},
 'SVC_train': {'precision': 0.6653830963665087,
  'recall': 0.9649198167239404,
  'confusion_matrix': array([[4038, 3389],
         [ 245, 6739]]),
  'accuracy': 0.7478315175907293},
 'SVC_test': {'precision': 0.8664648503195426,
  'recall': 0.9292929292929293,
  'confusion_matrix': array([[ 807,  397],
         [ 

In [42]:
import pandas as pd
kpi_df = pd.DataFrame(kpis).transpose()

In [43]:
kpi_df.to_excel('performace.xlsx')

In [44]:
kpi_df

Unnamed: 0,precision,recall,confusion_matrix,accuracy
LGR_train,0.939213,0.966781,"[[6990, 437], [232, 6752]]",0.953577
LGR_test,0.962173,0.862554,"[[1110, 94], [381, 2391]]",0.880533
RFC_train,1.0,1.0,"[[7427, 0], [0, 6984]]",1.0
RFC_test,0.960469,0.946609,"[[1096, 108], [148, 2624]]",0.935614
SVC_train,0.665383,0.96492,"[[4038, 3389], [245, 6739]]",0.747832
SVC_test,0.866465,0.929293,"[[807, 397], [196, 2576]]",0.850855
