# EEG db cleaner
v0.0.2
By Stephen Karl Larroque
License: MIT

In [None]:
# Forcefully autoreload all python modules
# (IPython magics: load the autoreload extension, then select its mode 2 so
# that edits made to imported modules are picked up without restarting the
# kernel. These lines only work inside IPython/Jupyter, not plain Python.)
%load_ext autoreload
%autoreload 2

In [None]:
# AUX FUNCTIONS

import os, sys

# Absolute, symlink-resolved path of the directory the notebook is run from
cur_path = os.path.realpath('.')
# Put the bundled csg_fileutil_libs folder on the module search path so that
# the modules it contains can be imported by their bare name
sys.path.append(os.path.join(cur_path, 'csg_fileutil_libs'))  # for unidecode and cleanup_name, because it does not support relative paths (yet?)

import re

from collections import OrderedDict
from csg_fileutil_libs.aux_funcs import save_df_as_csv, _tqdm, merge_two_df, df_remap_names, concat_vals, df_literal_eval, df_to_unicode, df_to_unicode_fast, reorder_cols_df, df_encode


In [None]:
# PARAMETERS
# Paths are relative to the working directory and written with Windows-style
# backslash separators.

# Input EEG database
# A path ending in '.csv' is read as a ';'-separated csv file; any other
# path is handed to pandas.read_excel.
eeg_db = r'databases_original\EEG_database_complete.xlsx'
# When True, the loaded table is passed through df_to_unicode_fast right
# after loading (can fix issues with accented characters).
eeg_db_to_unicode = True

# Output cleaned database
# Destination csv file given to save_df_as_csv at the end of the notebook.
# NOTE(review): 'eggdbclean' looks like a typo for 'eegdbclean' -- confirm
# that nothing else reads this exact filename before renaming it.
out_db = r'databases_output\eggdbclean.csv'

In [None]:
import pandas as pd

# Read the input EEG database, choosing the pandas reader from the file extension
if not eeg_db.endswith('.csv'):
    # Anything that is not a csv file is assumed to be an Excel workbook
    ceeg = pd.read_excel(eeg_db)
else:
    # csv inputs are expected to use ';' as the field separator
    ceeg = pd.read_csv(eeg_db, sep=';', low_memory=False)
# Discard the rows in which every single column is empty
ceeg = ceeg.dropna(how='all')
# Convert to unicode if required (can fix issues with accented characters)
if eeg_db_to_unicode:
    ceeg = df_to_unicode_fast(ceeg, progress_bar=True)
# Display the loaded table
ceeg

In [None]:
# Extract names

def camel_case_split(identifier):
    """Split a mixed case string into the list of words it is made of.

    A new word starts where a lowercase letter is followed by an uppercase
    one, and at the last capital of an uppercase run that is followed by a
    lowercase letter (e.g. 'XYZCamel' gives ['XYZ', 'Camel']). An empty
    string gives an empty list.

    Regular expression by 200_success: https://stackoverflow.com/a/29920015
    """
    word_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    # The pattern has no capturing group, so findall returns the whole matches
    return re.findall(word_pattern, identifier)
def split_oldcode(x):
    """Turn an old subject code into a (partial) name.

    Only the text before the first '_' is kept. If that text mixes upper and
    lower case, it is treated as a composite name and a space is inserted
    between its camel case words; otherwise it is returned unchanged.
    Null values (None, NaN) give None.
    """
    if pd.isnull(x):
        return None
    # Keep only the first part of the code
    prefix = x.split('_')[0]
    # A single-case prefix cannot be split any further
    if prefix.islower() or prefix.isupper():
        return prefix
    return ' '.join(camel_case_split(prefix))

# First try to recompose the full name from last name and first name if present
def _compose_full_name(row):
    """Join the non-null name parts of a row with a space.

    Returns None when every part is null. Iterating over the values (instead
    of indexing with row[0] / row[1]) avoids positional lookups on a
    label-indexed Series, and skipping the null parts keeps a row that has
    only a last name or only a first name from failing in ' '.join.
    """
    parts = [part for part in row if not pd.isnull(part)]
    if not parts:
        return None
    return ' '.join(parts).strip()

ceeg.loc[:, 'name'] = ceeg.loc[:, ['Last name', 'First name']].apply(_compose_full_name, axis=1)
# Else try to convert the old code into a full name (or at least the last name), first by splitting on '_', then by adding space between mixed case words (which indicates a composite name)
# Only the rows still lacking a name but having an old code are filled in; the right-hand side is aligned on the index
ceeg.loc[ceeg['name'].isnull() & ~ceeg['Old code'].isnull(), 'name'] = ceeg.loc[:, 'Old code'].apply(split_oldcode)
# Display the result
ceeg.loc[:, ['Last name', 'First name', 'Old code', 'name']]

In [None]:
# Add a column to signal that eeg is available for all these subjects
# (every row of this table gets hasEEG = True)
ceeg.loc[:, 'hasEEG'] = True

In [None]:
# Save the cleaned EEG database as a csv file at the out_db path
# (csv_order_by='name' presumably sorts the rows by the 'name' column -- see save_df_as_csv)
ceeg_unicode = df_to_unicode_fast(ceeg)
# save_df_as_csv returns a falsy value when the file could not be written
if not save_df_as_csv(ceeg_unicode, out_db, fields_order=False, csv_order_by='name'):
    print('ERROR: the cleaned EEG database could not be saved!')
else:
    print('Cleaned EEG database successfully saved in %s!' % out_db)