# Sarah DB cleaner & FMP comparison
By Stephen Larroque @ Coma Science Group, GIGA Research, University of Liege
Creation date: 2018-02-16
License: MIT
v1.0.3

DESCRIPTION:
This script compiles all CRS-R sessions of each subject and creates a hierarchical multi-index by subject name and CRS-R date, which can then be compared with the FileMakerPro (FMP) database.

INSTALL NOTE:
You need to pip install pandas before launching this script.
Tested on Python 2.7.13

USAGE:
Please provide Sarah's database prepared as follows:
* Save it in CSV format.
* Add a "Name" header to the 1st column, which contains all the subject names.
* Fix the CRS-R columns that have changing names between iterations (else you will get an error: "AssertionError: Cannot stack the columns as they have changing names" after running a few cells below)

TODO:


In [None]:
# Forcefully autoreload all python modules:
# load IPython's autoreload extension, then select mode 2 (modules are reloaded before each execution)
%load_ext autoreload
%autoreload 2

In [None]:
# AUX FUNCTIONS

import os, sys

cur_path = os.path.realpath('.')
sys.path.append(os.path.join(cur_path, 'csg_fileutil_libs'))  # for unidecode and cleanup_name, because it does not support relative paths (yet?)

# For DB reorganization
from csg_fileutil_libs.aux_funcs import save_df_as_csv, _tqdm, merge_two_df, remove_strings_from_df, find_columns_matching, reorder_cols_df, compute_best_diag

# For multi DB comparison
import re
from csg_fileutil_libs.distance import distance
from csg_fileutil_libs.aux_funcs import distance_jaccard_words_split


In [None]:
# PARAMETERS

# Input databases
# Paths are built with os.path.join instead of hard-coded backslashes, so the same
# notebook works on Windows (identical result) as well as on POSIX systems.
# Sarah's database
sarah_csv = os.path.join('databases_original', 'sarah_BDD_rCRS-R_.csv')
# FileMakerPro (clinical) database, will be used as a reference to cleanup Sarah's database
fmp_csv = os.path.join('databases_output', 'fmp_db_subjects_crsr.csv')

In [None]:
# AUX FUNCTIONS

# Moved to aux file

In [None]:
import pandas as pd
import numpy as np

# Load Sarah's database (semicolon-separated CSV) and discard fully empty rows only.
# Empty columns are deliberately kept: a column such as 13_since_onset may never have
# been filled but is still necessary for the stacking done further down.
cf = pd.read_csv(sarah_csv, sep=';', low_memory=False)
cf = cf.dropna(axis=0, how='all')
# Get rid of every column matched by the 'Unnamed' pattern
cf = cf.drop(columns=find_columns_matching(cf, 'Unnamed'))
cf

In [None]:
# Extract the CRS-R fields into a separate dataframe indexed by subject name
# The patterns '1_' to '18_' select the per-session CRS-R columns.
# range() is used instead of xrange() so that the cell runs on both Python 2 and Python 3.
cf_crsr_columns = find_columns_matching(cf, ['%i_' % i for i in range(1, 19)])
print(cf_crsr_columns)
cf_crsr = cf[['Name'] + cf_crsr_columns].set_index('Name')
cf_crsr

In [None]:
# Stack the per-session CRS-R column groups (prefixes '1_' to '18_') as rows,
# keyed by subject name and assessment date (the date of each CRS-R is carried over).
# range() is used instead of xrange() so that the cell runs on both Python 2 and Python 3.
cf_crsr_columns_all = [find_columns_matching(cf_crsr, '%i_' % i, startswith=True) for i in range(1, 19)]
cf_crsr_allblocks = []
for i, cols in enumerate(cf_crsr_columns_all):
    # Extract the columns of this session as a separate dataframe
    cf_crsr_temp = cf_crsr[cols]
    # Drop empty rows
    cf_crsr_temp = cf_crsr_temp.dropna(axis=0, how='all')
    # Rename columns: strip only the first occurrence of the session prefix (count=1),
    # so the same text appearing again later in a column name is left intact
    prefix = '%i_' % (i + 1)
    cf_crsr_temp.columns = [x.replace(prefix, '', 1).lower() for x in cf_crsr_temp.columns]
    # Set assessment date as key too to allow for concatenation later on
    cf_crsr_temp = cf_crsr_temp.reset_index().set_index(['Name', 'date_assess'])
    # Add to the stack of dataframes, we will concatenate after
    cf_crsr_allblocks.append(cf_crsr_temp)

# Sanity check: all stacking dataframes should have exactly the same columns.
# An explicit raise is used rather than assert, which is skipped when Python runs with -O;
# the exception type (AssertionError) and the start of the message are unchanged.
# NOTE(review): block index 2 is the reference, as in the original; the reason for picking
# that block is not visible here. The message now reports the reference index actually used.
ref_block = 2
start = cf_crsr_allblocks[ref_block].columns.tolist()
for i, df in enumerate(cf_crsr_allblocks):
    if df.columns.tolist() != start:
        raise AssertionError('Cannot stack the columns as they have changing names: %i, %s vs %i, %s' % (i, df.columns.tolist(), ref_block, start))

# Stack/Concatenate vertically all CRS-Rs
cf_crsr_all = pd.concat(cf_crsr_allblocks).sort_index()
# Drop empty CRS-Rs (rows without an assessment date)
cf_crsr_all = cf_crsr_all.reset_index().dropna(subset=['date_assess']).set_index(['Name', 'date_assess']).sort_index()
#Display!
cf_crsr_all

In [None]:
# Isolate the sessions whose date_assess is the placeholder 'XXX', then remove them from cf_crsr_all
# Flatten to a Name-only index so that date_assess can be tested as a regular column
cf_crsr_weird_dates = cf_crsr_all.reset_index().set_index('Name')
cf_crsr_weird_dates = cf_crsr_weird_dates[cf_crsr_weird_dates['date_assess'] == 'XXX']
# Back to the (Name, date_assess) key, which is what drop() needs below
cf_crsr_weird_dates = cf_crsr_weird_dates.reset_index().set_index(['Name', 'date_assess'])
cf_crsr_all = cf_crsr_all.drop(cf_crsr_weird_dates.index)
cf_crsr_weird_dates

In [None]:
# Isolate the sessions whose date_assess cannot be parsed as a date, save them and remove them
# Flatten to a Name-only index so that date_assess is a regular column
cf_crsr_buggy_dates = cf_crsr_all.reset_index().set_index('Name')
# errors='coerce' turns unparseable dates into nulls instead of raising
dates = pd.to_datetime(cf_crsr_buggy_dates['date_assess'], dayfirst=True, errors='coerce')
cf_crsr_buggy_dates = cf_crsr_buggy_dates[dates.isnull()]
# Back to the (Name, date_assess) key, which is what drop() needs below
cf_crsr_buggy_dates = cf_crsr_buggy_dates.reset_index().set_index(['Name', 'date_assess'])
cf_crsr_all = cf_crsr_all.drop(cf_crsr_buggy_dates.index)
save_df_as_csv(cf_crsr_buggy_dates, 'SarahBDD_buggy_crsr_dates.csv', fields_order=cf_crsr_buggy_dates.columns, keep_index=True)
cf_crsr_buggy_dates

In [None]:
# Last step on dates: store every date_assess of the CRS-R sessions as a parsed datetime
# Switch to a Name-only index so that date_assess becomes a regular, replaceable column
cf_crsr_all = cf_crsr_all.reset_index().set_index('Name')
# Parse day-first dates; no error is expected at this point, else add errors='coerce'
dates = pd.to_datetime(cf_crsr_all['date_assess'], dayfirst=True)
# Overwrite the raw date column with the parsed one
cf_crsr_all['date_assess'] = dates
# Rebuild the hierarchical name/date index and sort
cf_crsr_all = cf_crsr_all.reset_index()
cf_crsr_all = cf_crsr_all.set_index(['Name', 'date_assess']).sort_index()
# Display!
cf_crsr_all

------------------
## Checking misdiagnosis and typos and maximum limits (sanity check of db), based on Sarah's rules

In [None]:
# Blank out the non-numeric CRS-R subscores and list the affected sessions
# Columns whose name starts with 'crs' are the CRS-R subscore related columns
crs_cols = find_columns_matching(cf_crsr_all, 'crs', startswith=True)
# Work on a copy so that cf_crsr_all keeps the raw values
cf_crsr_all_nostr = cf_crsr_all.copy()
# In the copy, strings found in these columns are removed (replaced by nan)
cf_crsr_all_nostr[crs_cols] = remove_strings_from_df(cf_crsr_all[crs_cols])
# A session is reported when at least one of its subscore columns is null after the cleanup
cf_crsr_buggy2 = cf_crsr_all.loc[cf_crsr_all_nostr[crs_cols].isnull().any(axis=1)]
save_df_as_csv(cf_crsr_buggy2, 'SarahBDD_buggy_crsr_nonnumeric.csv', fields_order=cf_crsr_buggy2.columns, keep_index=True)
cf_crsr_buggy2

In [None]:
# Check if subscores are outside limits
#
#* au: 0-4
#* vis: 0-5
#* mot: 0-6
#* oromot/verbal: 0-3
#* communication: 0-2
#* arousal: 0-3 (attention being binary, maximum score)

def find_outside_range_df(df, col, mini, maxi):
    """Return the rows of df whose value in column col is strictly below mini or strictly above maxi."""
    values = df[col]
    below = values < mini
    above = values > maxi
    return df[above | below]

# One (column, minimum, maximum) triplet per CRS-R subscale, bounds inclusive
limits_check = [
    ('crs_au', 0, 4),
    ('crs_vis', 0, 5),
    ('crs_mot', 0, 6),
    ('crs_ver', 0, 3),
    ('crs_com', 0, 2),
    ('crs_ar', 0, 3),
]

# For each subscale, save and print the sessions scored outside the allowed bounds (if any)
for col, mini, maxi in limits_check:
    res = find_outside_range_df(cf_crsr_all_nostr, col, mini, maxi)
    if len(res) == 0:
        # Nothing outside the limits for this subscale
        continue
    save_df_as_csv(res, 'SarahBDD_buggylimits_%s.csv' % col, fields_order=res.columns, keep_index=True)
    print('Found an outside limits for %s:' % col)
    print(res)

In [None]:
# Check impossible combinations of scores
#
# Impossible scorings:
# 1. auditory 0-2 + visual 5
# 2. auditory 0-2 + communication 1-2
#
# Odd scorings:
# 3. arousal 3 + UWS

# Diagnosis labels (after lower/strip) matched by rule 3.
# 'uws' and 'vs/uws' are matched in addition to 'vs': the diagnosis cells of this notebook map
# 'vs' and 'uws' to 'vs/uws', so all three denote the same diagnosis and matching only 'vs'
# missed sessions labelled with the other spellings. 'coma' is kept from the original check.
uws_labels = ['vs', 'uws', 'vs/uws', 'coma']
diag_norm = cf_crsr_all_nostr['diagn_crs'].str.lower().str.strip()

cf_crsr_impossible1 = cf_crsr_all_nostr[(cf_crsr_all_nostr['crs_au'] <= 2) & (cf_crsr_all_nostr['crs_vis'] >= 5)]
cf_crsr_impossible2 = cf_crsr_all_nostr[(cf_crsr_all_nostr['crs_au'] <= 2) & (cf_crsr_all_nostr['crs_com'] >= 1)]
cf_crsr_impossible3 = cf_crsr_all_nostr[(cf_crsr_all_nostr['crs_ar'] >= 3) & diag_norm.isin(uws_labels)]
save_df_as_csv(cf_crsr_impossible1, 'SarahBDD_impossible_aud0-2_vis5.csv', fields_order=cf_crsr_impossible1.columns, keep_index=True)
save_df_as_csv(cf_crsr_impossible2, 'SarahBDD_impossible_aud0-2_com1-2.csv', fields_order=cf_crsr_impossible2.columns, keep_index=True)
save_df_as_csv(cf_crsr_impossible3, 'SarahBDD_impossible_ar3_uws.csv', fields_order=cf_crsr_impossible3.columns, keep_index=True)
print('Scorages impossibles:')
print('1. auditif 0-2 + visuel 5')
print(cf_crsr_impossible1)
print('2. auditif 0-2 + com 1-2')
print(cf_crsr_impossible2)
print('Scorages bizarres')
print('3. éveil 3 + UWS')
print(cf_crsr_impossible3)

---------------------------------
## Merge with FMP database

In [None]:
# TODO: merge names first, then rename names of one or the other, then compare sessions

In [None]:
# Load the FileMakerPro export (semicolon-separated CSV), discarding fully empty rows only (columns are kept)
cfmp = pd.read_csv(fmp_csv, sep=';', low_memory=False)
cfmp = cfmp.dropna(axis=0, how='all')
# Give the FMP CRS-R date column the same name as in Sarah's database
cfmp = cfmp.rename(columns={'CRSr::Date of CRSr': 'date_assess'})
cfmp

In [None]:
# Store the FMP date_assess as parsed datetimes and key the table by (Name, date_assess)
# Switch to a Name-only index first (reset_index() keeps the previous index as a regular column)
cfmp = cfmp.reset_index().set_index('Name')
# Parse day-first dates; no error is expected here, else add errors='coerce'
dates = pd.to_datetime(cfmp['date_assess'], dayfirst=True)
# Overwrite the raw date column with the parsed one
cfmp['date_assess'] = dates
# Rebuild the hierarchical name/date index and sort
cfmp = cfmp.reset_index()
cfmp = cfmp.set_index(['Name', 'date_assess']).sort_index()
# Display!
cfmp

In [None]:
# Merge csv and fmp final diagnoses if name matches
# Both tables are flattened with reset_index() first, so Name and date_assess are passed as regular columns.
# NOTE(review): the meaning of mode=1 and skip_sanity=True is defined by merge_two_df in
# csg_fileutil_libs.aux_funcs and is not visible here - confirm there. The following cells read
# the 'Name' and 'Name2' columns of the result.
cmerge = merge_two_df(cf_crsr_all_nostr.reset_index(), cfmp.reset_index(), mode=1, skip_sanity=True)
cmerge

In [None]:
# List the subjects found in only one of the two databases and save each list as a csv
# Rows of the merge with a null 'Name2' give the names missing in FMP
missing_in_fmp = cmerge.loc[cmerge['Name2'].isnull(), 'Name'].tolist()
# Rows of the merge with a null 'Name' give the names missing in Sarah's database
missing_in_sarah = cmerge.loc[cmerge['Name'].isnull(), 'Name2'].tolist()
save_df_as_csv(
    pd.DataFrame(missing_in_fmp, columns=['Name']),
    'SarahBDD_subjects_missing_in_fmp.csv',
    csv_order_by='Name',
)
save_df_as_csv(
    pd.DataFrame(missing_in_sarah, columns=['Name']),
    'SarahBDD_subjects_missing_in_sarah.csv',
    csv_order_by='Name',
)
print('Missing subjects saved in SarahBDD_subjects_missing_in_fmp.csv and SarahBDD_subjects_missing_in_sarah.csv')

In [None]:
# Remap names in Sarah's db to match FMP db (so we can compare sessions by subject name)
def replace_nonnull_df(x, repmap):
    """Return the replacement registered for x in repmap, or x itself when there is none.

    x is returned unchanged when it has no entry in repmap, or when its entry is null.
    Null covers both None and NaN: a mapping built from a pandas column with to_dict()
    stores missing values as NaN, which an `is not None` test does not catch.

    Parameters:
        x: the value to remap (here, a subject name).
        repmap: dict mapping original values to their replacement.

    Returns:
        The non-null replacement found in repmap, else x.
    """
    # .get() so that a value absent from the mapping is kept as-is instead of raising KeyError
    replacement = repmap.get(x)
    if pd.isnull(replacement):
        return x
    return replacement

# Switch between the two remapping strategies below
keep_nulls = False
# Flat copy of Sarah's cleaned sessions: Name and date_assess become regular columns
cf_crsr_all_ren = cf_crsr_all_nostr.copy().reset_index()
# Name in Sarah's db -> matching name in the FMP db, as found by the merge
repmap = cmerge.set_index('Name')['Name2'].to_dict()
if not keep_nulls:
    # Slower: goes through replace_nonnull_df for every name, so that only non-null remaps are applied
    cf_crsr_all_ren['Name'] = cf_crsr_all_ren['Name'].apply(replace_nonnull_df, args=(repmap,))
else:
    # Much faster, but a name whose mapping is null gets overwritten by that null
    cf_crsr_all_ren['Name'] = cf_crsr_all_ren['Name'].map(repmap)
# Restore the hierarchical name/date index
cf_crsr_all_ren = cf_crsr_all_ren.set_index(['Name', 'date_assess'])
cf_crsr_all_ren

In [None]:
# CRS-R sessions present in Sarah's db but absent from the FMP db
# Step 1: (Name, date_assess) keys of Sarah's db that do not exist in the FMP index
crsr_missing_in_fmp = cf_crsr_all_ren.index.difference(cfmp.index)
# Step 2: fetch the corresponding sessions
crsr_missing_in_fmp = cf_crsr_all_ren.loc[crsr_missing_in_fmp]
save_df_as_csv(
    crsr_missing_in_fmp,
    'SarahBDD_crsr_missing_in_fmp.csv',
    fields_order=crsr_missing_in_fmp.columns,
    keep_index=True,
)
print('Missing CRS-Rs in FMP that are available in SarahBDD saved in SarahBDD_crsr_missing_in_fmp.csv')
crsr_missing_in_fmp

In [None]:
# Best diagnosis of each patient in Sarah's db
# First map the 'vs', 'vs ' and 'uws' spellings onto the single label 'vs/uws'
cf_crsr_bestdiags = cf_crsr_all_ren['diagn_crs'].replace({'vs':'vs/uws', 'vs ':'vs/uws', 'uws':'vs/uws'})
# Then let compute_best_diag pick the best diagnosis, given the list of labels passed as diag_order
cf_crsr_bestdiags = compute_best_diag(
    cf_crsr_bestdiags,
    diag_order=['coma', 'vs/uws', 'mcs', 'mcs-', 'mcs+', 'emcs', 'lis', 'lis incomplete', 'mcs-/lis??'],
)
cf_crsr_bestdiags

In [None]:
# Find any CRS-R session that has a different diagnosis in both db:
# a (Name, date_assess) key is a conflict as soon as one Sarah/FMP pair of diagnoses differs.

# Normalize case and whitespace BEFORE mapping the vs/uws aliases, otherwise variants such as
# 'VS' or ' uws' are lower-cased/stripped too late and never mapped to 'vs/uws'.
diags_sarah = cf_crsr_all_ren['diagn_crs'].str.lower().str.strip().replace({'vs': 'vs/uws', 'uws': 'vs/uws'})
diags_fmp = cfmp['CRSr::Computed Outcome'].str.lower().str.strip()
# (Name, date_assess) keys present in both databases
diags_intersection_idxs = diags_sarah.index.intersection(diags_fmp.index)
# Need to do a manual loop because there are duplications (multiple CRS-Rs sessions on the same day, thus the same key/index...)
conflicts = []
for idx in diags_intersection_idxs:
    ds = diags_sarah.loc[idx]
    df = diags_fmp.loc[idx]
    # .loc gives a Series when several sessions share the key, but can give a bare scalar for a
    # unique key: always work on lists, so a lone string is never iterated character by character
    sarah_vals = ds.tolist() if isinstance(ds, pd.Series) else [ds]
    fmp_vals = df.tolist() if isinstance(df, pd.Series) else [df]
    # First (Sarah, FMP) pair that disagrees, if any.
    # NOTE: a missing (NaN) diagnosis never compares equal, so it is reported as a conflict.
    mismatch = next(((s, f) for s in sarah_vals for f in fmp_vals if s != f), None)
    if mismatch is not None:
        # idx is the (Name, date_assess) key itself
        conflicts.append((idx[0], idx[1]) + mismatch)
conflicts_any = pd.DataFrame(conflicts, columns=['Name', 'date_assess', 'Sarah_diag', 'FMP_diag']).set_index(['Name','date_assess'])
save_df_as_csv(conflicts_any, 'SarahBDD_any_conflicts_with_fmp.csv', keep_index=True)
print('Conflicts saved in SarahBDD_any_conflicts_with_fmp.csv')
conflicts_any

In [None]:
# Find the CRS-R sessions whose diagnosis differs between both db for ALL the sessions of the day:
# a (Name, date_assess) key is a conflict only when no Sarah/FMP pair of diagnoses agrees
# (eg, fmp can store more sessions on one day, which explains a different diagnosis on one of them).

# Normalize case and whitespace BEFORE mapping the vs/uws aliases, otherwise variants such as
# 'VS' or ' uws' are lower-cased/stripped too late and never mapped to 'vs/uws'.
diags_sarah = cf_crsr_all_ren['diagn_crs'].str.lower().str.strip().replace({'vs': 'vs/uws', 'uws': 'vs/uws'})
diags_fmp = cfmp['CRSr::Computed Outcome'].str.lower().str.strip()
# (Name, date_assess) keys present in both databases
diags_intersection_idxs = diags_sarah.index.intersection(diags_fmp.index)
# Need to do a manual loop because there are duplications (multiple CRS-Rs sessions on the same day, thus the same key/index...)
conflicts = []
for idx in diags_intersection_idxs:
    ds = diags_sarah.loc[idx]
    df = diags_fmp.loc[idx]
    # .loc gives a Series when several sessions share the key, but can give a bare scalar for a
    # unique key: always work on lists, so a lone string is never iterated character by character
    sarah_vals = ds.tolist() if isinstance(ds, pd.Series) else [ds]
    fmp_vals = df.tolist() if isinstance(df, pd.Series) else [df]
    if not sarah_vals or not fmp_vals:
        # Nothing to compare on one side
        continue
    # NOTE: a missing (NaN) diagnosis never compares equal to anything
    if not any(s == f for s in sarah_vals for f in fmp_vals):
        # idx is the (Name, date_assess) key itself; as before, the last diagnosis of each side is reported
        conflicts.append((idx[0], idx[1], sarah_vals[-1], fmp_vals[-1]))
conflicts_all = pd.DataFrame(conflicts, columns=['Name', 'date_assess', 'Sarah_diag', 'FMP_diag']).set_index(['Name','date_assess'])
save_df_as_csv(conflicts_all, 'SarahBDD_all_conflicts_with_fmp.csv', keep_index=True)
print('Conflicts saved in SarahBDD_all_conflicts_with_fmp.csv')
conflicts_all