# Install Required Packages

In [None]:
%%capture
#The %%capture cell magic above hides the pip output of this cell
#Install the wget package into Colab; its download() function fetches the dataset files
!pip install wget
#Install pyedflib, the package whose EdfReader is used to read EDF files
!pip install pyedflib

# Mount Google Drive

In [None]:
#Colab-only: mount Google Drive at /content/gdrive/ so the exported CSV can be moved there
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


# Import Packages

In [None]:
#Import required packages
from wget import download
from os import chdir, getcwd, mkdir, path, listdir
import numpy as np
import pandas as pd
from pathlib import Path
from pyedflib import EdfReader
from tqdm.notebook import tqdm

# Download Dataset

In [None]:
#Number of patient folders in the dataset
folders = 24
#Local folder that receives the downloaded files
dataset_path = 'CHB-MIT'
#Remote location of the dataset
base_url = 'https://archive.physionet.org/pn6/chbmit/'

#Folder names chb01 ... chb24 (single-digit ids are zero-padded to two digits)
list_of_folders = ['chb{:02d}'.format(patient_id)
                   for patient_id in range(1, folders + 1)]

#Locate the n-th non-overlapping occurrence of a substring
def find_nth(haystack, needle, n):
    """Return the index of the n-th occurrence of ``needle`` in ``haystack``.

    Occurrences are counted without overlap (each search resumes right after
    the previous match). Returns -1 when there are fewer than ``n``
    occurrences. Any ``n`` of 1 or less yields the first occurrence.
    """
    width = len(needle)
    position = haystack.find(needle)
    remaining = n - 1
    while remaining > 0:
        if position < 0:
            #Ran out of matches before reaching the n-th one
            return position
        position = haystack.find(needle, position + width)
        remaining -= 1
    return position

#Remember the starting directory so it can always be restored
initial_path = getcwd()

#Create main dataset folder
if not path.exists(dataset_path):
    mkdir(dataset_path)

#List to store files names (not populated by this cell)
files = []

#Suffix that marks a seizure annotation entry in MD5SUMS
seizure_suffix = '.seizures'

#Only recordings that have a '.seizures' annotation are downloaded, together
#with that annotation. The try/finally guarantees the notebook ends up back in
#its starting directory even if a download fails half way through.
try:
    #Change directory to the created folder
    chdir(dataset_path)

    #Iterate over each folder
    for l in list_of_folders:

        #Create sub-folder if does not exist already
        if not path.exists(l):
            mkdir(l)
            print('Directory', l, 'Created')
        else:
            print ('Directory', l,  'Exists')

        #Change directory to the folder
        chdir(l)

        #Get MD5 file for each patient (announce the download only when it happens)
        folder_url = base_url + l
        url = folder_url + '/' + 'MD5SUMS'
        if not path.isfile('MD5SUMS'):
            print ('Downloading ', url, 'to ', getcwd())
            download(url)
        else:
            print('File MD5SUMS already exists to ', getcwd())

        #Read sub-file names; the with-block closes the file even on error
        with open('MD5SUMS','r') as f:
            lines = f.readlines()

        #Iterate over each file
        for line in lines:
            #strip() removes the line ending, so a last line without a
            #trailing newline keeps its final character
            entry = line.strip()
            name_start = entry.find('chb')
            if name_start < 0:
                continue
            fname = entry[name_start:]
            if not fname.endswith(seizure_suffix):
                continue

            #The recording has the same name without the annotation suffix
            edf_name = fname[:-len(seizure_suffix)]

            #Check each file on its own: downloading a file that is already
            #present would store a second copy under a different name
            for target in (edf_name, fname):
                if not path.isfile(target):
                    url = folder_url + '/' + target
                    print ('Downloading ', url, 'to ', getcwd())
                    download(url)

        chdir('..')
finally:
    chdir(initial_path)

Directory chb01 Created
Downloading  https://archive.physionet.org/pn6/chbmit/chb01/MD5SUMS to  /content/CHB-MIT/chb01
Downloading  https://archive.physionet.org/pn6/chbmit/chb01/chb01_03.edf to  /content/CHB-MIT/chb01
Downloading  https://archive.physionet.org/pn6/chbmit/chb01/chb01_03.edf.seizures to  /content/CHB-MIT/chb01
Downloading  https://archive.physionet.org/pn6/chbmit/chb01/chb01_04.edf to  /content/CHB-MIT/chb01
Downloading  https://archive.physionet.org/pn6/chbmit/chb01/chb01_04.edf.seizures to  /content/CHB-MIT/chb01
Downloading  https://archive.physionet.org/pn6/chbmit/chb01/chb01_15.edf to  /content/CHB-MIT/chb01
Downloading  https://archive.physionet.org/pn6/chbmit/chb01/chb01_15.edf.seizures to  /content/CHB-MIT/chb01
Downloading  https://archive.physionet.org/pn6/chbmit/chb01/chb01_16.edf to  /content/CHB-MIT/chb01
Downloading  https://archive.physionet.org/pn6/chbmit/chb01/chb01_16.edf.seizures to  /content/CHB-MIT/chb01
Downloading  https://archive.physionet.org/pn

# Load Dataset

In [None]:
#Method to read single EDF file
def Edf_Read(file_path, channels):
  """Read the requested channels of one EDF file into a 2-D float array.

  Parameters
  ----------
  file_path : path of the EDF file to open.
  channels  : signal labels to extract, in the desired column order. The name
              'seizure' is accepted even when the file has no such signal; its
              column is then filled with zeros so the caller can write labels
              into it. Repeated names are used once (first position wins).

  Returns
  -------
  numpy.ndarray of float64, one row per sample and one column per distinct
  requested channel. The array is freshly allocated, so the caller may modify
  it in place.

  Raises
  ------
  KeyError if a requested channel (other than 'seizure') is not in the file.

  Notes
  -----
  When a label occurs more than once in the file, the first signal carrying
  that label is used. The row count is taken from the first signal of the
  file; all requested signals are assumed to have that many samples.
  """
  #Drop repeated names while keeping the requested order
  wanted = list(dict.fromkeys(channels))
  #The with-block releases the file handle even if reading fails
  with EdfReader(file_path) as f_read:
    sig_labels = list(f_read.getSignalLabels())
    sigbufs = np.zeros((f_read.getNSamples()[0], len(wanted)))
    #Read only the signals that were asked for
    for col, label in enumerate(wanted):
      if label in sig_labels:
        sigbufs[:, col] = f_read.readSignal(sig_labels.index(label))
      elif label != 'seizure':
        raise KeyError('Channel %r not found in %s' % (label, file_path))
  return sigbufs

#Obtain seizure period from file
def Seizure_Period(f_loc):
  """Return ``(start, length)`` read from a '.seizures' annotation file.

  ``start`` is assembled from byte 38 (high byte) and byte 41 (low byte) of
  the file; ``length`` is byte 49. The caller multiplies both by the sampling
  frequency, i.e. treats them as seconds.

  The two start bytes are combined arithmetically. Joining their binary
  text forms drops the leading zeros of the low byte and gives a wrong start
  whenever that byte is below 128.

  NOTE(review): only one (start, length) pair is read and the length is a
  single byte, so further seizures in the same file or a length above 255
  would not be represented - confirm against the annotation format.

  Raises IndexError if the file is shorter than 50 bytes.
  """
  raw = Path(f_loc).read_bytes()
  start = (raw[38] << 8) | raw[41]
  length = raw[49]
  return start, length

#Load patient data to pandas dataframe
def Load_data (data_dir, Fs, channels, patients=None):
  """Load the EDF recordings of several patients into one DataFrame.

  Parameters
  ----------
  data_dir : folder that contains one sub-folder per patient.
  Fs       : sampling frequency, used to turn the annotated seizure start and
             length into sample indices.
  channels : column names to load. The LAST entry is the label column: it is
             0.0 everywhere except inside an annotated seizure, where it is
             set to 1.0.
  patients : optional list of patient sub-folder names. Defaults to the
             subset below; the original note says not all patients are
             loaded because of memory limits.

  Returns
  -------
  pandas.DataFrame with one row per sample and ``channels`` as columns. The
  recordings are stacked patient by patient, files in sorted name order.

  Notes
  -----
  Every recording of a patient is included, so memory use grows with the
  number of files; pass a shorter ``patients`` list if memory is tight.
  The working directory is never changed.
  """
  if patients is None:
      #Patient ids. Not all patients data is considered due to memory issues
      patients = ['chb01','chb04','chb07','chb10','chb13','chb16','chb19']
  #Column that receives the seizure labels
  label_col = len(channels) - 1
  #Start with an empty block so the result has the right width even with no files
  chunks = [np.array([], dtype=np.float64).reshape(0, len(channels))]
  #Iterate over each patient
  for patient in patients:
      patient_dir = path.join(data_dir, patient)
      #Current patient data
      print('Data reading from patient: ', patient)
      names = sorted(f for f in listdir(patient_dir)
                     if path.isfile(path.join(patient_dir, f)))
      #Recordings of the patient, in name order
      EDF_Files = [f for f in names if f.endswith('edf')]
      #Seizure annotation files of the patient
      Seizures_files = {f for f in names if f.endswith('seizures')}
      #Iterate over data files
      for file in tqdm(EDF_Files):
          #Read data from current file
          sigbufs = Edf_Read(path.join(patient_dir, file), channels)
          #A recording is annotated when '<recording>.seizures' exists
          annotation = file + '.seizures'
          if annotation in Seizures_files:
              (start, length) = Seizure_Period(path.join(patient_dir, annotation))
              #Slice assignment stops at the end of the recording instead of
              #failing when the annotated period runs past it
              sigbufs[start * Fs:(start + length) * Fs + 1, label_col] = 1.0
          #Keep every file, not only the last one of the patient
          chunks.append(sigbufs)
  #Concatenate the data once at the end
  arr = np.concatenate(chunks)
  #Create a pandas data frame
  return pd.DataFrame(arr, columns = channels)

In [None]:
#EEG Channels
channels = ['FP1-F7',
            'F7-T7',
            'T7-P7',
            'P7-O1',
            'FP1-F3',
            'F3-C3',
            'C3-P3',
            'P3-O1',
            'FP2-F4',
            'F4-C4',
            'C4-P4',
            'P4-O2',
            'FP2-F8',
            'F8-T8',
            'T8-P8',
            'P8-O2',
            'FZ-CZ',
            'CZ-PZ',
            'seizure']

#Data folder
Data_dir = 'CHB-MIT'

#Sampling frequency
Fs = 256

#Load the dataset to pandas frame
df = Load_data(Data_dir, Fs, channels)

#Save the frame to a csv file
df.to_csv('data.csv', index = False)

#Move the csv file to gdrive to store
!mv data.csv /content/gdrive/MyDrive/

Data reading from patient:  chb01


  0%|          | 0/7 [00:00<?, ?it/s]

Data reading from patient:  chb04


  0%|          | 0/3 [00:00<?, ?it/s]

Data reading from patient:  chb07


  0%|          | 0/3 [00:00<?, ?it/s]

Data reading from patient:  chb10


  0%|          | 0/7 [00:00<?, ?it/s]

Data reading from patient:  chb13


  0%|          | 0/8 [00:00<?, ?it/s]

Data reading from patient:  chb16


  0%|          | 0/6 [00:00<?, ?it/s]

Data reading from patient:  chb19


  0%|          | 0/3 [00:00<?, ?it/s]