# ECG database generator
By Stephen Larroque @ Coma Science Group, GIGA Research, University of Liege
Creation date: 2018-10-25
License: MIT
v0.4.2

DESCRIPTION:
Generate a CSV file listing all the subjects found in a folder of ECG/PPG files, where each file has the subject's name in its filename. This script relies solely on the filenames: the subject names and the recording information are extracted from them.

INSTALL NOTE:
You need to pip install pandas before launching this script.
Tested on Python 2.7.13

USAGE: Input the directory containing all ECG/PPG files. The files will be searched recursively (so subfolders will also be explored).

TODO:
* Nothing here.

In [None]:
# Automatically reload all imported Python modules before executing each cell (useful while the helper library is being edited)
%load_ext autoreload
%autoreload 2

In [None]:
# AUX FUNCTIONS

import os, sys

# Add the 'csg_fileutil_libs' folder located in the current working directory
# to the module search path. This must happen BEFORE the project import below.
cur_path = os.path.realpath('.')
sys.path.append(os.path.join(cur_path, 'csg_fileutil_libs'))  # needed for unidecode and cleanup_name, because the library does not support relative paths (yet?)

import re

# Project helpers used by the cells below (CSV export, progress bar wrapper,
# recursive file walker and filename cleanup routines)
from csg_fileutil_libs.aux_funcs import save_dict_as_csv, _tqdm, recwalk, cleanup_name, cleanup_name_customregex

In [None]:
# PARAMETERS - EDIT ME

# Root folder containing all the ECG/PPG files. It is walked recursively,
# so files located in subfolders are included too.
inputpath = r'G:\dropbox\Dropbox\Carol_Francesco'
# Path of the output database (csv file) listing the subjects.
# NOTE: relative path written with a backslash separator, i.e. Windows-style.
out_db = r'databases_output\ecg_subjects.csv'

In [None]:
# File types handed to recwalk's `filetype` filter (ECG, pulse and respiration recordings)
filetypes_to_include = ['ecg', 'ecg 2', 'puls', 'puls 2', 'resp', 'resp 2']

# Walker options shared by the counting pass and the processing pass
walk_options = dict(sorting=True, folders=False, topdown=True, filetype=filetypes_to_include)

# First pass: only count the matching files, so that the progress bar knows its total
total = sum(1 for _entry in recwalk(inputpath, **walk_options))

# Regular expressions (pattern -> replacement) stripping everything that is not
# part of the subject's name: separators, numbers, modality names and
# sequence/condition infos like 'repos', 'spatial', etc.
name_cleanup_patterns = {'_': ' ',
                         '(repos|rest)': '',
                         'ecg': '',
                         '[0-9]+': '',
                         'pulse?': '',
                         'respi?': '',
                         'spa(t|c)ial': '',
                         'tennis': '',
                         'musique': '',
                         '(compassion|meditation|neutre|opacite)': '',
                         'sedation': '',
                        }

# Second pass: build a dict mapping each subject name to the data types and
# the filename infos found for this subject
res = {}
for dirpath, filename in _tqdm(recwalk(inputpath, **walk_options), total=total):
    # Split the filename into its base part and its extension
    stem, ext = os.path.splitext(filename)
    # Generic cleanup first (weird and accented characters), then apply the
    # custom patterns, keeping track of what they matched
    cleaned = cleanup_name(str(stem))
    name, matches = cleanup_name_customregex(cleaned, customregex=name_cleanup_patterns, returnmatches=True)
    name = name.strip()
    # Nothing left after cleanup (eg, filename is '.thumbnail'): not a subject, skip
    if not name:
        continue
    # First time this subject is seen: create its entry, using sets so that
    # each data type and each info is stored only once
    if name not in res:
        res[name] = {'ecg_types': set(), 'ecg_infos': set()}
    subject = res[name]
    # The data type is the extension without its dot (eg, ecg, puls, resp, etc)
    subject['ecg_types'].add(ext.replace('.', ''))
    subject['ecg_infos'].update(matches)

print('Subjects count: %i' % len(res))
res

In [None]:
# Convert the dict to a list of dicts, one per subject (so that the key, ie the
# subject's name, gets its own 'name' column).
# The sets are converted to SORTED lists: a set has no defined iteration order,
# so a plain list(...) would make the exported columns vary between runs and
# Python versions. Sorting keeps the generated database reproducible.
res2 = [
    {
        'name': name,
        'ecg_types': sorted(value['ecg_types']),
        'ecg_infos': sorted(value['ecg_infos']),
    }
    for name, value in res.items()
]
res2

In [None]:
# Write the subjects list to the csv database, ordered by subject name, then
# report whether the helper signalled success (truthy return value) or not
saved = save_dict_as_csv(res2, out_db, fields_order=['name', 'ecg_types', 'ecg_infos'], csv_order_by='name')
if not saved:
    print('Error when trying to save the database!')
else:
    print('File successfully saved in: %s' % out_db)