# Convert File Format

This notebook converts the signal files generated by the `02_Data_Curation2.ipynb` notebook from the `EDF` file format to `PyArrow Feather` and `NumPy memmap` in order to accelerate training.  
The files generated by this notebook are not meant to be deployed.

-----

## Configure environments

In [None]:
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
# Move the working directory up one level (presumably to the project root, so that
# the `datasets` package imported in the next cell resolves — confirm).
# NOTE: `%cd ..` is relative, so re-running this cell moves up one more level each time.
%cd ..

In [None]:
# Load some packages
import datetime
import glob
import json
import math
import os
import pprint

import numpy as np
import pandas as pd
import pyarrow.feather as feather
import pyedflib
from tqdm.auto import tqdm

# custom package
from datasets.caueeg_data_curation import *
from datasets.pipeline import *

In [3]:
# Data file path
# Root directory of the curated dataset: the cells below read EDF files from
# `signal/edf/` under it and write converted files to `signal/feather/` and
# `signal/memmap/` under it.
curate_path = r'/workspace/caueeg-dataset/'

---
## Simple preprocessing during conversion

#### Test trimming of trailing zero signals

In [None]:
# Sampling rate used to turn a sample count into a duration in seconds.
# NOTE(review): hard-coded; assumed to match every recording — confirm against
# the values reported in `signal_headers`.
SAMPLING_RATE_HZ = 200

# Preview a handful of recordings: for each file, trim the trailing zeros and
# print the file path, the recording start time, and the end time implied by the
# trimmed signal length.  `datetime` is imported explicitly in the import cell.
for i, f in enumerate(glob.glob(os.path.join(curate_path, 'signal/edf/*.edf'))):
    signals, signal_headers, edf_header = pyedflib.highlevel.read_edf(f)
    signals = trim_trailing_zeros(signals)  # trim garbage zeros

    start_time = edf_header['startdate']
    duration = datetime.timedelta(seconds=signals.shape[1] / SAMPLING_RATE_HZ)
    print(f'{f}\t\t{start_time}\t{start_time + duration}\t')

    # Stop after the file with index 11, i.e. at most 12 files are shown.
    if i > 10:
        break

---
## Convert and Save

In [5]:
# Output formats produced by the conversion cell below.
# NOTE: a later cell reads `signal/feather/00001.feather`, which the conversion
# cell only writes when `save_feather` is True.
save_feather = False  # write one `.feather` file per recording under signal/feather
save_memmap = True  # write one raw int32 `.dat` file per recording under signal/memmap

In [None]:
# Convert every EDF recording under `signal/edf/` into the enabled output formats.
feather_dir = os.path.join(curate_path, 'signal/feather')
memmap_dir = os.path.join(curate_path, 'signal/memmap')

if save_feather:
    os.makedirs(feather_dir, exist_ok=True)

if save_memmap:
    os.makedirs(memmap_dir, exist_ok=True)

# Recordings whose samples had fractional parts that the int32 cast discards.
lossy_serials = []

# sorted() makes the processing order (and the file left in `f` / `signals`
# after the loop) deterministic; glob itself returns files in arbitrary order.
for f in tqdm(sorted(glob.glob(os.path.join(curate_path, 'signal/edf/*.edf')))):
    # file name without directory and extension; identical to the previous
    # "last five characters of the path" rule for five-character serial names
    serial = os.path.splitext(os.path.basename(f))[0]

    # load signal
    signals, signal_headers, edf_header = pyedflib.highlevel.read_edf(f)
    signals = trim_trailing_zeros(signals)

    # The cast below truncates toward zero, so remember any recording that
    # actually carries non-integer samples instead of losing them silently.
    if (signals % 1 != 0).any():
        lossy_serials.append(serial)
    signals = signals.astype('int32')

    # save as feather: one column per signal label, one row per sample
    if save_feather:
        df = pd.DataFrame(data=signals.T, columns=[s_h['label'] for s_h in signal_headers], dtype=np.int32)
        feather.write_feather(df, os.path.join(feather_dir, serial + '.feather'))

    # save as numpy memmap: raw int32 values only, the array shape is NOT stored
    # in the file and must be supplied again by whoever reads it
    if save_memmap:
        fp = np.memmap(os.path.join(memmap_dir, serial + '.dat'),
                       dtype='int32', mode='w+', shape=signals.shape)
        fp[:] = signals
        fp.flush()
        del fp  # release the mapping and its file handle before the next file

if lossy_serials:
    print(f'{len(lossy_serials)} recording(s) had non-integer samples truncated by the int32 cast:')
    print(lossy_serials)

In [None]:
# Check whether the int32 cast in the conversion loop discards fractional values.
# `signals` has already been cast to int32 by that loop, so `signals % 1` is
# always zero there; re-read the last processed file (`f`) and test the uncast
# samples instead.
raw_signals = trim_trailing_zeros(pyedflib.highlevel.read_edf(f)[0])
(raw_signals % 1 > 0).any()

In [None]:
import struct

# File path
file_path = '/workspace/caueeg-dataset/signal/memmap/00001.dat'

# Read the raw binary file
with open(file_path, 'rb') as file:
    byte = file.read(4)  # the first sample occupies 4 bytes
    # The conversion cell writes the memmap with dtype int32, so decode the bytes
    # as a 4-byte signed integer in native byte order ('=i'); decoding them as a
    # float ('f') would reinterpret the bit pattern and print a meaningless value.
    data = struct.unpack('=i', byte)

    # Print the decoded value (a 1-tuple)
    print(data)



In [3]:
import numpy as np
import pandas as pd

# Load the feather export of recording 00001 into a DataFrame
featherdata = pd.read_feather('/workspace/caueeg-dataset/signal/feather/00001.feather')

# Location of the memmap export of the same recording
file_path = '/workspace/caueeg-dataset/signal/memmap/00001.dat'

# Map the file read-only as a flat int32 array (the file stores no shape), then
# view it as 21 rows (presumably one row per channel — confirm) with the column
# count inferred from the file size.
memdata = np.memmap(file_path, dtype=np.int32, mode='r')
memdata = memdata.reshape(21, -1)

# Print the number of samples in one row
print(memdata.shape[1])


145000
