# Convert File Format

This notebook converts the signal files generated by the `02_Data_Curation2.ipynb` notebook from the `EDF` file format to the `PyArrow Feather` and `NumPy memmap` formats in order to accelerate training.  
The files generated by this notebook are not intended for deployment.

-----

## Configure environments

In [1]:
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
# Move the working directory one level up; presumably so that the relative
# dataset path and the `datasets` package used in later cells resolve.
# NOTE: this is not idempotent -- every re-run of this cell moves up one more
# level, so restart the kernel before running it again.
%cd ..

C:\Users\Minjae\Desktop\EEG_Project


In [2]:
# Load some packages
import datetime
import glob
import json
import math
import os
import pprint

import numpy as np
import pandas as pd
import pyarrow.feather as feather
import pyedflib
from tqdm.auto import tqdm

# custom package
from datasets.caueeg_data_curation import *
from datasets.pipeline import *

In [3]:
# Root folder of the curated dataset, given relative to the current working
# directory; the EDF signals are read from its `signal/` sub-folder.
curate_path = 'local/dataset/02_Curated_Data_220419'

---
## Simple preprocessing during conversion

#### Test trimming of trailing zeros in the signals

In [5]:
# Sanity check: for the first few EDF files, print the file path, the recorded
# start date, and the end time implied by the signal length after trimming.
SAMPLE_RATE_HZ = 200    # samples per second used to convert a sample count into seconds
N_PREVIEW_FILES = 12    # number of files to preview

# sorted() keeps the preview deterministic; glob returns files in arbitrary order
edf_paths = sorted(glob.glob(os.path.join(curate_path, 'signal/*.edf')))

for f in edf_paths[:N_PREVIEW_FILES]:
    signals, signal_headers, edf_header = pyedflib.highlevel.read_edf(f)
    signals = trim_trailing_zeros(signals)  # trim garbage zeros

    # signals.shape[1] is the number of samples left after trimming
    start = edf_header['startdate']
    end = start + datetime.timedelta(seconds=signals.shape[1] / SAMPLE_RATE_HZ)

    # one line per file: path, start, end (tab-separated, same layout as before)
    print(f'{f}\t\t{start}\t{end}\t')

local/dataset/02_Curated_Data_220419\signal\00001.edf		2100-01-01 00:00:00	2100-01-01 00:12:05	
local/dataset/02_Curated_Data_220419\signal\00002.edf		2100-01-01 00:00:00	2100-01-01 00:17:18	
local/dataset/02_Curated_Data_220419\signal\00003.edf		2100-01-01 00:00:00	2100-01-01 00:10:50	
local/dataset/02_Curated_Data_220419\signal\00004.edf		2100-01-01 00:00:00	2100-01-01 00:14:01	
local/dataset/02_Curated_Data_220419\signal\00005.edf		2100-01-01 00:00:00	2100-01-01 00:15:15	
local/dataset/02_Curated_Data_220419\signal\00006.edf		2100-01-01 00:00:00	2100-01-01 00:13:57	
local/dataset/02_Curated_Data_220419\signal\00007.edf		2100-01-01 00:00:00	2100-01-01 00:15:00	
local/dataset/02_Curated_Data_220419\signal\00008.edf		2100-01-01 00:00:00	2100-01-01 00:08:18	
local/dataset/02_Curated_Data_220419\signal\00010.edf		2100-01-01 00:00:00	2100-01-01 00:16:42	
local/dataset/02_Curated_Data_220419\signal\00011.edf		2100-01-01 00:00:00	2100-01-01 00:15:16	
local/dataset/02_Curated_Data_220419\sig

---
## Convert and Save

In [None]:
# Output formats written by the conversion cell below.
save_memmap = True     # raw int32 NumPy memmap files (`signal/memmap/*.dat`)
save_feather = False   # PyArrow Feather tables (`signal/feather/*.feather`)

In [None]:
# Convert every curated EDF file into the enabled output formats.
feather_dir = os.path.join(curate_path, 'signal/feather')
memmap_dir = os.path.join(curate_path, 'signal/memmap')

# create the output folders (no-op when they already exist)
if save_feather:
    os.makedirs(feather_dir, exist_ok=True)

if save_memmap:
    os.makedirs(memmap_dir, exist_ok=True)

# sorted() makes the processing order deterministic; glob returns files in arbitrary order
for f in tqdm(sorted(glob.glob(os.path.join(curate_path, 'signal/*.edf')))):
    # file name without directory and extension, e.g. '.../00001.edf' -> '00001'
    # (works for any stem length, unlike slicing the last five characters)
    serial = os.path.splitext(os.path.basename(f))[0]

    # load signal
    signals, signal_headers, edf_header = pyedflib.highlevel.read_edf(f)
    signals = trim_trailing_zeros(signals)
    # NOTE(review): read_edf is called with its defaults, so the values are
    # presumably physical (float) units and this cast drops any fractional
    # part -- confirm this is intended.
    signals = signals.astype('int32')

    # save as feather: one column per channel label, one row per sample
    if save_feather:
        df = pd.DataFrame(data=signals.T, columns=[s_h['label'] for s_h in signal_headers], dtype=np.int32)
        feather.write_feather(df, os.path.join(feather_dir, serial + '.feather'))

    # save as numpy memmap
    # A raw memmap file stores neither shape nor dtype; readers must supply
    # dtype='int32' and the channel count themselves.
    if save_memmap:
        fp = np.memmap(os.path.join(memmap_dir, serial + '.dat'),
                       dtype='int32', mode='w+', shape=signals.shape)
        fp[:] = signals[:]
        fp.flush()
        del fp  # release the mapping so the file handle is not kept open