# LIBRARIES & PACKAGES

In [None]:
import zipfile
import pandas as pd
import os
from google.colab import drive

In [None]:
# Mount Google Drive so its files are reachable under /content/gdrive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


# FUNCTIONS

In [None]:
def unzip_all(zip_file_path, zip_dir):
    """
    Extract every zip archive found in a directory into its own sub-folder.

    Each archive ``<name>.zip`` located directly inside ``zip_file_path`` is
    extracted into ``<zip_dir>/<name>``. The destination folder is created
    when it does not exist yet, and a message is printed after each archive
    has been extracted.

    Args:
        zip_file_path (str): Directory that contains the .zip archives.
        zip_dir (str): Directory under which one folder per archive is created.

    Returns:
        None
    """
    # The extension is matched case-insensitively so '.ZIP' archives are not
    # silently skipped; directories that merely end in '.zip' are ignored
    # because ZipFile cannot open them. Sorting makes the processing order
    # reproducible, since os.listdir returns entries in arbitrary order.
    zip_files = sorted(
        f for f in os.listdir(zip_file_path)
        if f.lower().endswith('.zip')
        and os.path.isfile(os.path.join(zip_file_path, f))
    )

    for zip_file in zip_files:
        path_zip = os.path.join(zip_file_path, zip_file)
        # Archive name without its extension becomes the destination folder name
        folder_name = os.path.splitext(zip_file)[0]
        path_extraction = os.path.join(zip_dir, folder_name)
        os.makedirs(path_extraction, exist_ok=True)

        with zipfile.ZipFile(path_zip, 'r') as zip_ref:
            zip_ref.extractall(path_extraction)

        print(f'Files of {zip_file} have been extracted in {path_extraction}')

In [None]:
def concatenate_txt(directory_path: str) -> pd.DataFrame:
    """
    Concatenates all txt files in the specified directory into a single
    DataFrame.

    Every file whose name ends in ``.txt`` (any letter case) is read with
    ``pd.read_table(..., header=None)`` and transposed before being stacked,
    so a file holding one value per line contributes one row. Files are
    processed in sorted name order, which makes the row order reproducible
    across runs and file systems.

    Args:
        directory_path (str): Path to the directory containing the txt files.

    Returns:
        pd.DataFrame: Concatenated DataFrame with a fresh 0..n-1 index,
        one block of rows per input file.

    Raises:
        ValueError: If the directory contains no txt files.
    """
    # os.listdir returns entries in arbitrary order; sort so that row i always
    # corresponds to the same file.
    txt_files = sorted(
        file for file in os.listdir(directory_path)
        if file.lower().endswith('.txt')
    )

    # pd.concat would raise an opaque "No objects to concatenate" here;
    # report the offending directory instead (same exception type).
    if not txt_files:
        raise ValueError(f"No .txt files found in {directory_path!r}")

    dfs = [
        pd.read_table(os.path.join(directory_path, txt), header=None).T
        for txt in txt_files
    ]

    return pd.concat(dfs, ignore_index=True)

# **Building the dataset**

The data (also known as the Bonn dataset) were downloaded from https://www.ukbonn.de/epileptologie/arbeitsgruppen/ag-lehnertz-neurophysik/downloads/. Specifically, five folders, each containing 100 .txt files, were obtained in .zip format for subsequent extraction.

Each .txt file contains 4097 numerical values (both positive and negative) representing the 12-bit analog-to-digital conversion of a signal acquired with a 128-channel recording procedure.

In [None]:
# Source folder holding the downloaded archives, and the folder that receives their contents
zip_file_path = '/content/gdrive/MyDrive/zip'
zip_dir = '/content/gdrive/MyDrive/zip/unzip'

# Unpack every archive into its own sub-folder of zip_dir
unzip_all(zip_file_path=zip_file_path, zip_dir=zip_dir)

Files of A.zip have been extracted in /content/gdrive/MyDrive/zip/unzip/A
Files of B.zip have been extracted in /content/gdrive/MyDrive/zip/unzip/B
Files of C.zip have been extracted in /content/gdrive/MyDrive/zip/unzip/C
Files of D.zip have been extracted in /content/gdrive/MyDrive/zip/unzip/D
Files of E.zip have been extracted in /content/gdrive/MyDrive/zip/unzip/E


To keep each block of 100 signals labeled, we added a final column named 'class' that records the EEG class each signal belongs to.
Specifically:
* **A** --> tracings recorded from healthy volunteers who were asked to stay awake with eyes open;
* **B** --> tracings recorded from healthy volunteers who were asked to stay awake with eyes closed;
* **C** --> tracings recorded from patients with epilepsy. The electrodes were placed on the hippocampus of the brain hemisphere opposite to the epileptogenic zone;
* **D** --> tracings recorded from patients with epilepsy. The electrodes were placed on the epileptogenic zone;
* **E** --> tracings recorded from patients with epilepsy during seizure activity.

In [None]:
# Build one sub-dataframe per class: load the folder's txt files, then append
# a constant 'class' column carrying the class letter.
A, B, C, D, E = (
    concatenate_txt(folder).assign(**{'class': label})
    for label, folder in (
        ('A', r"/content/gdrive/MyDrive/zip/unzip/A/Z"),
        ('B', r"/content/gdrive/MyDrive/zip/unzip/B/O"),
        ('C', r"/content/gdrive/MyDrive/zip/unzip/C/N"),
        ('D', r"/content/gdrive/MyDrive/zip/unzip/D/F"),
        ('E', r"/content/gdrive/MyDrive/zip/unzip/E/S"),
    )
)

In [None]:
# Obtaining the working dataframe by stacking the five sub_dataframes into a
# single one, expected shape (500, 4098).
# ignore_index=True gives every row a unique label (0..len-1); without it the
# labels of each sub_dataframe are kept and repeat across classes, so a
# label-based lookup such as eeg.loc[0] would return one row per class.
eeg = pd.concat([A, B, C, D, E], ignore_index=True)

# Writing the .csv: semicolon-separated, with a header row and no index column
eeg.to_csv(r'/content/gdrive/MyDrive/eeg.csv',
           sep=";", header=True, index=False)

print("dimension of dataset: ", eeg.shape)
eeg.head()

(500, 4098)
