# FMP DB cleaner
By Stephen Larroque @ Coma Science Group, GIGA Research, University of Liege
Creation date: 2018-02-15
License: MIT
v1.2.3
2018-2019

DESCRIPTION:
This script compiles all CRS-R sessions of one subject in one row, extracting the best diagnosis and detecting misdiagnosis. It also cleans buggy rows/columns so that everything is easier to process in subsequent notebooks.

INSTALL NOTE:
You need to pip install pandas before launching this script.
Tested on Python 2.7.15

USAGE:
Export all records and columns from the FileMakerPro interface into a semicolon-separated CSV file, then set the `fmp_csv` parameter below to the path of that file.

TODO:

    * X Keep rows per CRS-R (use hierarchical index on name) so we can easily process per crs-r
    * X CRS-R 2 & 3: if non-null, rename as CRS-R 1 and add as additional rows (copy the date, since this means that multiple CRS-Rs were performed on the same day)
    * X extract CRS-R table separately from other fields? other fields are one row, CRS-R only are multiple rows.
    * X make one column "subscore" summary in one field eg 123456. If non integer then just replace by X
    * X Can concatenate CRS-R on one row and one field: dict with key = date and value = subitem scores. Then no loss of info.
    * X implement Sarah's rules to detect misdiagnosis.

In [None]:
# Forcefully autoreload all python modules (IPython "autoreload" extension in mode 2),
# so that edits to imported modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
# IMPORTS AND AUX FUNCTIONS (the helper functions live in csg_fileutil_libs.aux_funcs)

import os, sys

# Make the bundled libraries importable by adding their folder (relative to the current working directory) to the module search path
cur_path = os.path.realpath('.')
sys.path.append(os.path.join(cur_path, 'csg_fileutil_libs'))  # for unidecode and cleanup_name, because it does not support relative paths (yet?)

import re

from csg_fileutil_libs.aux_funcs import save_df_as_csv, _tqdm, compute_best_diag, reorder_cols_df, find_columns_matching, concat_strings, concat_vals_unique, df_to_unicode, df_to_unicode_fast, df_subscores_concat


In [None]:
# PARAMETERS

# FileMakerPro (FMP) database export to clean.
# The path is built with os.path.join so that it resolves on every OS
# (a hard-coded backslash separator is only understood on Windows).
fmp_csv = os.path.join('databases_original', 'fmp-db-export_2018-01-15-reordered2.csv')
# Directory where to save resulting csv files
output_dir = 'databases_output'

In [None]:
# AUX FUNCTIONS

# Moved to aux file

In [None]:
import pandas as pd
import numpy as np

# Load the raw FMP export (';'-separated)
cf = pd.read_csv(fmp_csv, sep=';', low_memory=False)
# Get rid of the lines that are entirely empty
cf = cf.dropna(axis=0, how='all')
# Convert the content to unicode (done by the aux helper, with a progress bar)
cf = df_to_unicode_fast(cf, progress_bar=True)
cf

In [None]:
# Drop odd columns that do not bring any info (their content is either redundant or constant)
useless_columns = [
    'CRSr::PatientIDMERGEFIELD',
    'CRSr::PatientNAMEMERGEFIELD',
    'CRSr::consistent movement to commandtest',
]
cf.drop(columns=useless_columns, inplace=True)

In [None]:
# Extract the CRS-R fields (plus the patient name) into a separate dataframe
cf_crsr_columns = find_columns_matching(cf, ['crsr', 'crs-r'])
print(cf_crsr_columns)
crsr_selection = ['Name'] + cf_crsr_columns
cf_crsr = cf.loc[:, crsr_selection]
cf_crsr

In [None]:
# Drop empty rows (not even a date!)
# The explicit copy makes cf_crsr own its data, so the assignment below is not a write on a slice of cf
cf_crsr = cf_crsr.dropna(axis=0, how='all').copy()
# Fill missing name for consecutive CRS-Rs: this is how FileMakerPro works, the name is set only for the first row, then it is empty to imply it is the same until a new name shows up
# Note: the forward-filled column is assigned back explicitly, because an in-place fillna() on a `.loc` selection
# is a chained assignment that is not guaranteed to modify the parent dataframe (and fillna(method=...) is deprecated)
cf_crsr['Name'] = cf_crsr['Name'].ffill()
cf_crsr

In [None]:
# Keep only the per-patient details in cf (one row = one patient's data) by taking the CRS-R columns out
cf = cf.drop(columns=cf_crsr_columns)
cf

In [None]:
# Now that the CRS-R columns are gone, discard the rows that are left entirely empty
cf = cf.dropna(axis=0, how='all')
# Display the nameless rows that still carry some info (eg, PET)
nameless_rows = cf['Name'].isnull()
cf.loc[nameless_rows]

In [None]:
# Fill missing name for additional infos such as PET (nameless rows belong to the last named patient above them)
# Note: the forward-filled column is assigned back explicitly, because an in-place fillna() on a `.loc` selection
# is a chained assignment that is not guaranteed to modify cf itself (and fillna(method=...) is deprecated)
cf['Name'] = cf['Name'].ffill()

In [None]:
# Merge all the rows of a same patient into a single row, so that we end up with one row per patient holding all the
# additional infos (the date of the PET scan is lost, but at least its existence is kept)
rows_per_patient = cf.groupby('Name')
cf_agg = rows_per_patient.agg(concat_vals_unique)
cf_agg

In [None]:
# List the subjects for whom the db holds nothing but the name
# PatientID is left out of the check: it is a field automatically added by FMP, so it is always filled
infos_without_id = cf_agg.drop(columns='PatientID')
has_no_info = infos_without_id.isnull().all(axis=1)
cf_agg_missing_all = cf_agg.loc[has_no_info]
# reset_index() puts the Name back as a column instead of index before saving
save_df_as_csv(
    df_to_unicode(cf_agg_missing_all.reset_index()),
    os.path.join(output_dir, 'fmp_db_subjects_missing_all_infos.csv'),
    fields_order=['Name'],
    csv_order_by='Name',
)
print('Saved in fmp_db_subjects_missing_all_infos.csv')
cf_agg_missing_all

In [None]:
# Remove patients with null infos from both dataframes
if len(cf_agg_missing_all.index) > 0:
    # cf_agg is indexed by patient name, so the names can be dropped as index labels
    cf_agg.drop(labels=cf_agg_missing_all.index, inplace=True)
    # cf_crsr keeps the patient name in its 'Name' column (it is not the index), so dropping by index label
    # could never match and the resulting error used to be silently swallowed: filter on the column instead.
    # NOTE(review): this also removes the CRS-R rows of these patients, if they have any - confirm this is wanted
    cf_crsr = cf_crsr.loc[~cf_crsr['Name'].isin(cf_agg_missing_all.index)]

In [None]:
# Stack CRSr 2/3 columns as rows and rename as CRSr, so that every CRS-R assessment ends up on its own row
# NOTE(review): no date is explicitly copied over here; the stacked rows only get a date if the CRSr 2/3 columns include one - confirm
cf_crsr_columns2 = find_columns_matching(cf_crsr, 'CRSr 2')
cf_crsr_columns3 = find_columns_matching(cf_crsr, 'CRSr 3')
# Extract the columns as separate dataframes
cf_crsr2 = cf_crsr[['Name'] + cf_crsr_columns2]
cf_crsr3 = cf_crsr[['Name'] + cf_crsr_columns3]
# Drop empty rows (Name is moved to the index first so that it does not count as a filled value)
cf_crsr2 = cf_crsr2.set_index('Name').dropna(axis=0, how='all')
cf_crsr3 = cf_crsr3.set_index('Name').dropna(axis=0, how='all')
# Rename columns (each dataframe is renamed from its OWN columns, so that the labels always match the data they describe)
cf_crsr2.columns = [x.replace('CRSr 2', 'CRSr') for x in cf_crsr2.columns]
cf_crsr3.columns = [x.replace('CRSr 3', 'CRSr') for x in cf_crsr3.columns]
# Append back into original dataframe
cf_crsr_first = cf_crsr.drop(columns=cf_crsr_columns2 + cf_crsr_columns3)
cf_crsr_all = pd.concat([cf_crsr_first, cf_crsr2.reset_index(), cf_crsr3.reset_index()], sort=True)
assert(not [x for x in cf_crsr_first.columns if x not in cf_crsr_all.columns]) # check we are not losing any column in the merge
# Clean up a bit by trimming empty rows (eg, only CRSr 3::* was filled but not CRSr::*, then we will be left with empty CRSr::*)
# Note: this will also drop subjects that have no CRS-R at all
cf_crsr_all = cf_crsr_all.set_index('Name').dropna(axis=0, how='all').reset_index()
# Change the index by Name and Date
cf_crsr_all = cf_crsr_all.set_index(['Name', 'CRSr::Date of CRSr'])
# Display the result
cf_crsr_all

In [None]:
# Build a one-field summary of the CRS-R subscores (eg, 123456) out of the six total score columns
subscore_columns = [
    'CRSr::Auditory Total Score',
    'CRSr::Visual Total Score',
    'CRSr::Motor Total Score',
    'CRSr::Oromotor Total Score',
    'CRSr::communication Total Score',
    'CRSr::Arousal Total Score',
]
cf_crsr_all = df_subscores_concat(cf_crsr_all, cols=subscore_columns, col_out='CRSr::Subscores')
cf_crsr_all

In [None]:
# Put the most useful columns of the CRS-R dataframe first, which makes it easier to have a quick look
reorder_cols = [
    'CRSr::Diagnosis',
    'CRSr::Computed Outcome',
    'CRSr::Subscores',
    'CRSr::Auditory Total Score',
    'CRSr::Visual Total Score',
    'CRSr::Motor Total Score',
    'CRSr::Oromotor Total Score',
    'CRSr::communication Total Score',
    'CRSr::Arousal Total Score',
]
cf_crsr_all = reorder_cols_df(cf_crsr_all, reorder_cols)
cf_crsr_all

In [None]:
# Find all subjects where all CRS-R are missing (ie, not even a single CRS-R recorded) in the db except the name

## OLD WAY
#cf_crsr_all_missing = cf_crsr_all.loc[cf_crsr_all.drop(columns=['CRSr::Subscores']).isnull().all(axis=1)]  # need to drop patientid which is an automatic field automatically added by FMP
# Get the indices (=patients names) that are missing crsr and those that have at least one available
#idx_missing = cf_crsr_all_missing.index.get_level_values(0)
#idx_present = cf_crsr_all.loc[~cf_crsr_all.drop(columns=['CRSr::Subscores']).isnull().all(axis=1)].index.get_level_values(0)
# Find patients without any CRSr (exclusion between two lists of indices)
#cf_crsr_all_really_missing = cf_crsr_all_missing[~idx_missing.isin(idx_present)]

## NEW WAY
# Set difference: the names listed in the raw CRS-R table minus the names that still have a row once the empty CRS-R rows got trimmed
names_in_raw_crsr = set(cf_crsr.reset_index()['Name'].unique())
names_with_a_crsr = set(cf_crsr_all.reset_index()['Name'].unique())
missing_all_crsr = list(names_in_raw_crsr - names_with_a_crsr)
cf_crsr_all_really_missing = pd.DataFrame(missing_all_crsr, columns=['Name'])
# Save the results (ordered by name)
save_df_as_csv(
    df_to_unicode(cf_crsr_all_really_missing),
    os.path.join(output_dir, 'fmp_db_subjects_missing_all_crsr.csv'),
    fields_order=['Name'],
    csv_order_by='Name',
)
print('Saved in fmp_db_subjects_missing_all_crsr.csv')
cf_crsr_all_really_missing

In [None]:
# Debug
#cf_crsr.reset_index()[cf_crsr.reset_index()['Name'] == 'Some Subject']

In [None]:
# Sort the CRS-R dataframe and the aggregated patients dataframe by their index, in place
for frame_to_sort in (cf_crsr_all, cf_agg):
    frame_to_sort.sort_index(inplace=True)

In [None]:
# Save the CRS-R database (one row per CRS-R session), with the summary columns placed first and the index kept
crsr_fields_order = [
    'CRSr::Diagnosis',
    'CRSr::Computed Outcome',
    'CRSr::Subscores',
    'CRSr::Auditory Total Score',
    'CRSr::Visual Total Score',
    'CRSr::Motor Total Score',
    'CRSr::Oromotor Total Score',
    'CRSr::communication Total Score',
    'CRSr::Arousal Total Score',
]
save_df_as_csv(
    df_to_unicode(cf_crsr_all),
    os.path.join(output_dir, 'fmp_db_subjects_crsr.csv'),
    fields_order=crsr_fields_order,
    keep_index=True,
)

In [None]:
# Compute best diagnosis for each patient

# Compute best diagnosis by using Pandas categories
# NOTE(review): the lists are presumably ordered from the lowest to the highest diagnosis - confirm against compute_best_diag
cf_crsr_bestdiag1 = compute_best_diag(cf_crsr_all.loc[:,'CRSr::Computed Outcome'], ['coma', 'vs/uws', 'mcs', 'mcs-', 'mcs+', 'emcs', 'lis'])
cf_crsr_bestdiag2 = compute_best_diag(cf_crsr_all.loc[:,'CRSr::Diagnosis'], ['coma', 'vs/uws', 'vs',  'mcs', 'mcs-', 'mcs+', 'emcs', 'lis', 'lis incomplete'])
# Place them back into the original dataframe as new columns
cf_agg.loc[:,'CRSr::Best Computed Outcome'] = cf_crsr_bestdiag1
cf_agg.loc[:,'CRSr::Best Diagnosis'] = cf_crsr_bestdiag2
# Bonus: add the best total score (maximum over all the CRS-Rs of each patient, ie, per first index level = Name)
# groupby(level=0).max() is used instead of max(level=0), because the `level` argument of max() was removed in pandas 2.0
cf_agg.loc[:,'CRSr::Best total'] = cf_crsr_all.loc[:,'CRSr::total'].groupby(level=0).max()

In [None]:
# Move the diagnosis, dates and etiology columns to the front for better visibility, then show the result
agg_first_columns = [
    'CRSr::Best Computed Outcome',
    'CRSr::Best Diagnosis',
    'Final diagnosis',
    'CRSr::Best total',
    'Date of Accident',
    'Date of Birth',
    'Date of Death',
    'Etiology',
    'Etiology specified.',
]
cf_agg = reorder_cols_df(cf_agg, agg_first_columns)
cf_agg

In [None]:
# Aggregate the CRS-R infos per patient and put them side by side with the per-patient details
crsr_per_patient = cf_crsr_all.reset_index().groupby('Name')
cf_crsr_all_agg = crsr_per_patient.agg(concat_vals_unique)
cf_agg_all = pd.concat([cf_agg, cf_crsr_all_agg], axis=1)
cf_agg_all.index.name = 'Name'  # add the name of the index
# Add an aggregate of CRSr dates with related subscore ("date:subscores"), so we don't lose any information (or at least we keep the essential)
crsr_date_and_subscores = cf_crsr_all.reset_index(level=1)[['CRSr::Date of CRSr', 'CRSr::Subscores']]
date_subscores_strings = crsr_date_and_subscores.apply(lambda row: concat_strings(row, sep=':'), axis=1)
cf_agg_all['CRSr::Date and subscores'] = date_subscores_strings.reset_index().groupby('Name').agg(concat_vals_unique)
# Save the full aggregated database
save_df_as_csv(
    df_to_unicode(cf_agg_all),
    os.path.join(output_dir, 'fmp_db_subjects_aggregated.csv'),
    fields_order=False,
    keep_index=True,
)
print('Saved in fmp_db_subjects_aggregated.csv')
cf_agg_all

-----------------------------
## Additional stats and inference (optional)

In [None]:
# Patients for whom no manual diagnosis could be found in any CRS-R
no_manual_diag = cf_agg['CRSr::Best Diagnosis'].isnull()
cf_missing_diag1 = cf_agg.loc[no_manual_diag, 'CRSr::Best Diagnosis']
save_df_as_csv(
    df_to_unicode(pd.DataFrame(cf_missing_diag1)),
    os.path.join(output_dir, 'fmp_db_subjects_missing_diagnosis.csv'),
    fields_order=False,
    keep_index=True,
)
cf_missing_diag1

In [None]:
# Patients for whom no automatic diagnosis (computed outcome) could be found in any CRS-R
no_computed_outcome = cf_agg['CRSr::Best Computed Outcome'].isnull()
cf_missing_diag2 = cf_agg.loc[no_computed_outcome, 'CRSr::Best Computed Outcome']
save_df_as_csv(
    df_to_unicode(pd.DataFrame(cf_missing_diag2)),
    os.path.join(output_dir, 'fmp_db_subjects_missing_computed_outcome.csv'),
    fields_order=False,
    keep_index=True,
)
cf_missing_diag2

In [None]:
# Patients whose final diagnosis field is empty
no_final_diag = cf_agg['Final diagnosis'].isnull()
cf_missing_diag3 = cf_agg.loc[no_final_diag, 'Final diagnosis']
save_df_as_csv(
    df_to_unicode(pd.DataFrame(cf_missing_diag3)),
    os.path.join(output_dir, 'fmp_db_subjects_missing_finaldiagnosis.csv'),
    fields_order=False,
    keep_index=True,
)
cf_missing_diag3

In [None]:
# Patients missing ALL kinds of diagnosis: those without manual diagnosis who are also listed as missing
# both the computed outcome and the final diagnosis
also_missing_computed = cf_missing_diag1.index.isin(cf_missing_diag2.index)
also_missing_final = cf_missing_diag1.index.isin(cf_missing_diag3.index)
cf_missing_all_diags = cf_missing_diag1[also_missing_computed & also_missing_final]
save_df_as_csv(
    df_to_unicode(pd.DataFrame(cf_missing_all_diags)),
    os.path.join(output_dir, 'fmp_db_subjects_missing_alldiags.csv'),
    fields_order=False,
    keep_index=True,
)
cf_missing_all_diags

In [None]:
# Find all patients where the diagnosis is different between the automatic and the manual
def compare_diff_2cols(x):
    """Tell whether the two diagnoses held by a row disagree.

    x is a row (positional access through .iloc) whose first two values are
    diagnosis labels. Returns True only when both labels are filled and differ
    once lowercased and stripped; two labels that both contain 'vs' are
    considered equivalent.
    """
    first, second = x.iloc[0], x.iloc[1]
    # A missing diagnosis cannot be compared, so it never counts as a difference
    if pd.isnull(first) or pd.isnull(second):
        return False
    first_norm = first.lower().strip()
    second_norm = second.lower().strip()
    if first_norm == second_norm:
        return False
    # special case of equivalence (TODO: do it more elegantly by replacing directly in the dataframe)
    both_vs = 'vs' in first_norm and 'vs' in second_norm
    return not both_vs

def _diagnoses_differ(first, second):
    # Tell whether two diagnosis labels are both filled and genuinely different (case and surrounding spaces are ignored)
    if pd.isnull(first) or pd.isnull(second):
        return False
    first_norm = first.lower().strip()
    second_norm = second.lower().strip()
    if first_norm == second_norm:
        return False
    # special case of equivalence: two labels that both contain 'vs' (eg, 'vs' and 'vs/uws') count as the same diagnosis
    # (TODO: do it more elegantly by replacing directly in the dataframe)
    return not ('vs' in first_norm and 'vs' in second_norm)


def compare_diff_3cols(x):
    """Tell whether the three diagnoses held by a row disagree.

    x is a row (positional access through .iloc) whose first three values are
    diagnosis labels. The 1st label is compared with the 2nd, then the 2nd with
    the 3rd; True is returned as soon as one pair is filled on both sides and
    differs. A pair with a missing label is never reported as different.

    The 'vs' equivalence is applied to both pairs, and an equivalent first pair
    no longer stops the comparison: the 2nd and 3rd labels are still checked,
    exactly as when the first two labels are strictly equal.

    Note: for this function to work, one must use concat_vals_unique to ensure
    that only one value remains for each diagnosis column.
    """
    # Compare 1st and 2nd columns
    if _diagnoses_differ(x.iloc[0], x.iloc[1]):
        return True
    # Compare 2nd and 3rd columns
    try:
        return _diagnoses_differ(x.iloc[1], x.iloc[2])
    except Exception:
        # Show the offending row to ease debugging, then let the error through
        print(x)
        raise

# Flag the patients whose automatic and manual diagnoses disagree (2 columns), and those for whom the final diagnosis is also taken into account (3 columns)
diag_columns_2 = ['CRSr::Best Computed Outcome', 'CRSr::Best Diagnosis']
diag_columns_3 = ['CRSr::Best Computed Outcome', 'CRSr::Best Diagnosis', 'Final diagnosis']
cf_different_diags = cf_agg[cf_agg[diag_columns_2].apply(compare_diff_2cols, axis=1)]
cf_different_diags_3cols = cf_agg[cf_agg[diag_columns_3].apply(compare_diff_3cols, axis=1)]
# Save both reports, with the same columns order as the aggregated dataframe
save_df_as_csv(
    df_to_unicode(cf_different_diags),
    os.path.join(output_dir, 'fmp_db_subjects_different_diags.csv'),
    fields_order=df_to_unicode(cf_agg).columns,
    keep_index=True,
)
save_df_as_csv(
    df_to_unicode(cf_different_diags_3cols),
    os.path.join(output_dir, 'fmp_db_subjects_different_diags_3cols.csv'),
    fields_order=df_to_unicode(cf_agg).columns,
    keep_index=True,
)
cf_different_diags

In [None]:
# List the CRS-R rows that have no date (the date index level is moved back to a column to test it)
crsr_with_date_column = cf_crsr_all.reset_index(level=1)
date_is_missing = crsr_with_date_column['CRSr::Date of CRSr'].isnull()
cf_missing_crsr_date = crsr_with_date_column[date_is_missing]
save_df_as_csv(
    df_to_unicode(cf_missing_crsr_date),
    os.path.join(output_dir, 'fmp_db_subjects_missing_crsr_date.csv'),
    fields_order=False,
    keep_index=True,
)
cf_missing_crsr_date

In [None]:
# List the patients for whom both the etiology and the etiology specified fields are empty (no etiology at all!)
etiology_columns = ['Etiology', 'Etiology specified.']
no_etiology_at_all = cf_agg[etiology_columns].isnull().all(axis=1)
cf_missing_etio = cf_agg.loc[no_etiology_at_all, etiology_columns]
save_df_as_csv(
    df_to_unicode(cf_missing_etio),
    os.path.join(output_dir, 'fmp_db_subjects_missing_etiology.csv'),
    fields_order=False,
    keep_index=True,
)
cf_missing_etio

In [None]:
# List the patients without sedation info for the MRI EPI, showing only the sedation related columns
no_epi_sedation = cf_agg['MRI::Sedation EPI'].isnull()
sedation_columns = find_columns_matching(cf_agg, 'sedat')
cf_missing_episedation = cf_agg.loc[no_epi_sedation, sedation_columns]
save_df_as_csv(
    df_to_unicode(cf_missing_episedation),
    os.path.join(output_dir, 'fmp_db_subjects_missing_episedation.csv'),
    fields_order=False,
    keep_index=True,
)
cf_missing_episedation

In [None]:
# List the patients missing at least one sedation info (MRI EPI or PET), showing only the sedation related columns
lacks_a_sedation_info = cf_agg[['MRI::Sedation EPI', 'PET::Sedation']].isnull().any(axis=1)
sedation_related_columns = find_columns_matching(cf_agg, 'sedat')
cf_missing_anysedation = cf_agg.loc[lacks_a_sedation_info, sedation_related_columns]
save_df_as_csv(
    df_to_unicode(cf_missing_anysedation),
    os.path.join(output_dir, 'fmp_db_subjects_missing_anysedation.csv'),
    fields_order=False,
    keep_index=True,
)
cf_missing_anysedation