In [None]:
# CSV Databases Merger (old)
# (see csg_fileutil_db_merger.ipynb for a more general purpose merger, this one is limited to only a few fields, whereas the new one can merge everything)
# By Stephen Larroque @ Coma Science Group, GIGA Research, University of Liege
# Creation date: 2017-02-07
# License: MIT
# v1.1.2
#
# INSTALL NOTE:
# You need to pip install pandas before launching this script.
# Tested on Python 2.7.11
#
# USAGE:
# Any two csv files can be used for the merge: you just need to ensure that both contain a 'name' field with the patient's name and a 'final_diagnosis' field with the CRS-R final score. Other fields may have different names; however, if you want them to be merged automatically, rename them so that both files use the same column names (columns that exist in only one csv are not a problem, they will simply be added to the final merged csv).
#
# TODO:
#

In [None]:
# Forcefully autoreload all python modules
# (autoreload mode 2 re-imports every module before a cell is executed, so edits made to the
# helper libraries on disk are picked up without restarting the kernel)
%load_ext autoreload
%autoreload 2

In [None]:
# AUX FUNCTIONS

import os, sys

# Absolute path of the current working directory (symlinks resolved)
cur_path = os.path.realpath('.')
# Make the content of the 'csg_fileutil_libs' subfolder importable as top-level modules
sys.path.append(os.path.join(cur_path, 'csg_fileutil_libs'))  # for unidecode and cleanup_name, because it does not support relative paths (yet?)

import re

from csg_fileutil_libs.distance import distance

from csg_fileutil_libs.aux_funcs import distance_jaccard_words_split, cleanup_name, cleanup_name_df, replace_buggy_accents, save_df_as_csv, _tqdm


In [None]:
# PARAMETERS

# Path of the reports database (semicolon-separated csv)
reports_csv = 'all_patients_fields.csv'

# Path of the FileMakerPro (FMP) database (semicolon-separated csv)
fmp_csv = 'db-fmp.csv'

# Translation of the final_diagnosis values found in the FMP database so that they match
# the nomenclature of the reports database. Each entry is (FMP value, reports value).
fmp_diag_translation = [
    ('VS/UWS', 'UWS'),
    ('LIS incomplete', 'partial LIS'),
    ('COMA', 'coma'),
]

# Optional: how to rename FMP columns (keyword) into reports columns (value) before merging.
# Disabled mapping kept for reference: etiology_specified -> accident_etiology
fmp_col_translation = dict(
    date_birth='birthdate',
    date_accident='accident_date',
    sedation_mri='mri_sedation',
    epilepsia='epileptic',
)

----------------------------------------
# Loading databases

In [None]:
import pandas as pd

# Load the reports database (semicolon-separated csv) and display it as the cell output
cf = pd.read_csv(reports_csv, sep=';')
cf

In [None]:
# Load the FMP database, discarding fully empty rows as well as rows without a name,
# and order the records by name
# (names are deliberately left as-is here: cdb = cleanup_name_df(cdb) is disabled)
cdb = (
    pd.read_csv(fmp_csv, sep=';')
    .dropna(how='all')
    .dropna(subset=['name'], how='all')
    .sort_values('name')
)
# Rename FMP columns so that they line up with the reports csv columns when merging later
if fmp_col_translation:
    cdb = cdb.rename(columns=fmp_col_translation)
# Rewrite FMP diagnoses using the nomenclature of the reports csv (exact value matches only)
for pattern, replacement in (fmp_diag_translation or ()):
    is_pattern = cdb['final_diagnosis'] == pattern
    cdb.loc[is_pattern, 'final_diagnosis'] = replacement
# Show the number of records, then the database itself
print(len(cdb))
cdb

In [None]:
# Summary of the reports database: diagnoses, records, unique diagnosed patients, unique patients
print('Reports database contains %i diagnoses over %i records in total (%i unique diagnoses for %i unique patients).' % (
    cf['final_diagnosis'].count(),
    len(cf),
    len(cf[cf['final_diagnosis'].notnull()]['name'].unique()),
    len(cf['name'].unique()),
))

In [None]:
# Summary of the FMP database: diagnoses, records, unique diagnosed patients, unique patients
print('FMP database contains %i diagnoses over %i records in total (%i unique diagnoses for %i unique patients).' % (
    cdb['final_diagnosis'].count(),
    len(cdb),
    len(cdb[cdb['final_diagnosis'].notnull()]['name'].unique()),
    len(cdb['name'].unique()),
))

In [None]:
# Display the 'name' column of the FMP database as the cell output
print('All names in FMP database:')
cdb['name']

------------------------
## Comparison of the two csv databases

In [None]:
# Merge csv and fmp final diagnoses if name matches
# TODO: replace by updated compute_names_distance_matrix() function
#
# Every reports record (cf) is compared against every FMP record (cdb). One output record is
# produced per matching (reports, FMP) pair; a reports record without any match still produces
# one output record with name_fmp and fmp_final_diagnosis set to None.

# Maximum distance (both metrics) under which two names are considered to be the same patient
dist_threshold = 0.2

# Clean up each FMP name only once: the cleaned name does not depend on the reports record it
# is compared with, so there is no need to recompute it for every reports row.
# NOTE(review): this assumes cleanup_name() and replace_buggy_accents() are deterministic and
# side-effect free -- confirm in csg_fileutil_libs.aux_funcs.
fmp_records = []
for cdrowid, cd in cdb.iterrows():
    if not cd['name']:
        continue
    pts_name = cleanup_name(replace_buggy_accents(cd['name']), 'utf-8')
    fmp_records.append((pts_name, cd['name'], cd['final_diagnosis']))

comp_res = []
for crowid, c in _tqdm(cf.iterrows(), total=len(cf), desc='MERGE'):
    found = False
    for pts_name, fmp_name, fmp_diag in fmp_records:
        # A pair matches when either the normalized levenshtein distance or the
        # words-split jaccard distance is within the threshold
        if (distance.nlevenshtein(pts_name, c['name'], method=1) <= dist_threshold
                or distance_jaccard_words_split(pts_name, c['name'], partial=True, norm=True, dist=dist_threshold) <= dist_threshold):
            comp_res.append({'name': c['name'], 'name_fmp': fmp_name, 'csv_final_diagnosis': c['final_diagnosis'], 'fmp_final_diagnosis': fmp_diag, 'report_path': c['report_path']})
            found = True
    if not found:
        # Same keys as for a matched record, so that the 'report_path' column always exists
        # in the resulting DataFrame (later cells select it), even if nothing matched at all
        comp_res.append({'name': c['name'], 'name_fmp': None, 'csv_final_diagnosis': c['final_diagnosis'], 'fmp_final_diagnosis': None, 'report_path': c['report_path']})
comp_res = pd.DataFrame(comp_res)
print('FMP and reports databases were merged successfully!')

In [None]:
print('List of all merged records (ie, records that are present in both databases):')
# Use None instead of NaN for missing values
comp_res = comp_res.where(comp_res.notnull(), None)
print('Total number of unique patients: %i' % len(comp_res['name'].unique()))
# Keep the records for which at least one of the two databases provides a diagnosis
comp_res_diag = comp_res[comp_res['csv_final_diagnosis'].notnull() | comp_res['fmp_final_diagnosis'].notnull()]
print('Total number of unique patients with at least a diagnosis: %i' % len(comp_res_diag['name'].unique()))
# Save the whole comparison table, then display it
comp_res.to_csv('diff_merge_test.csv', sep=';')
comp_res

In [None]:
print('List of unique patients in csv database but missing from fmp database (not accounted in subsequent analyses):')
# Comparison records for which no FMP name was matched
comp_miss_fmp = comp_res[comp_res['name_fmp'].isnull()]
# ... restricted to those having a diagnosis in the reports csv
comp_miss_fmp_with_diag = comp_miss_fmp[comp_miss_fmp['csv_final_diagnosis'].notnull()]
# Full reports records of these patients, ordered by name
comp_miss_fmp_full = cf[cf['name'].isin(comp_miss_fmp['name'])].sort_values(by='name')
with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
    print("Total new records: %i" % len(comp_miss_fmp_full))
    print("Total unique patients: %i (with diagnosis: %i)" % (
        len(comp_miss_fmp['name'].unique()),
        len(comp_miss_fmp_with_diag['name'].unique()),
    ))
    print(pd.Series(comp_miss_fmp['name'].unique()))
# Save result to a csv
comp_miss_fmp_full.to_csv('diff_reports_more.csv', sep=';', index=False)

In [None]:
print('List of records in csv database present in fmp database but without a diagnosis:')
# Names of the patients having at least one comparison record with an FMP diagnosis
comp_res_fmp_with_diag_unique = comp_res[comp_res['fmp_final_diagnosis'].notnull()]['name'].unique()
# Records matched to an FMP name, for patients that never got an FMP diagnosis
comp_miss_fmp2 = comp_res[~(comp_res['name'].isin(comp_res_fmp_with_diag_unique) | comp_res['name_fmp'].isnull())]
# Full reports records of these patients, ordered by name
comp_miss_fmp2_full = cf[cf['name'].isin(comp_miss_fmp2['name'])].sort_values(by='name')

print("Total number of fmp unique patients found also in csv: %i" % len(comp_res_fmp_with_diag_unique))
print("Total records matched in fmp but without a diagnosis: %i" % len(comp_miss_fmp2_full))
print("Total unique patients matched in fmp without a diagnosis: %i" % len(comp_miss_fmp2_full['name'].unique()))
print("Total unique patients with diag from csv: %i" % len(comp_miss_fmp2_full[comp_miss_fmp2_full['final_diagnosis'].notnull()]['name'].unique()))
# To also list the names, uncomment:
#with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
    #print(pd.Series(comp_miss_fmp2['name'].unique()))
# Save result to a csv
comp_miss_fmp2_full.to_csv('diff_reports_more_diag_only.csv', sep=';', index=False)

In [None]:
print('List of patients in fmp database but missing from csv database (not accounted in subsequent analyses):')
# FMP records whose name was never matched to any reports record
comp_miss_csv = cdb[~cdb['name'].isin(comp_res['name_fmp'])]
# ... restricted to those having a diagnosis in FMP
comp_miss_csv_with_diag = comp_miss_csv.dropna(subset=['final_diagnosis'], how='all')
with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
    # Both figures count unique patients (the second one used to count records, which
    # disagreed with the wording of the message and with final_db_stats['fmp_add'])
    print("%i unique patients, with %i having a diagnosis." % (len(comp_miss_csv['name'].unique()), len(comp_miss_csv_with_diag['name'].unique())))
    # print('') outputs an empty line whether print is a statement or a function,
    # whereas a bare `print` does nothing when print is a function
    print('')
    print("-> With diagnosis:")
    print(comp_miss_csv_with_diag.loc[:, ['name', 'final_diagnosis']])
    print('')
    print("-> Without diagnosis:")
    print(comp_miss_csv[comp_miss_csv['final_diagnosis'].isnull()]['name'])
# Save all FMP records of the unmatched patients that have a diagnosis
cdb[cdb['name'].isin(comp_miss_csv_with_diag['name'])].sort_values('name').to_csv('diff_fmp_more.csv', index=False, sep=';')

In [None]:
# Records of the comparison table for which the FMP database provides a diagnosis
# (independent copy, so that later edits do not touch comp_res)
comp_res2 = comp_res[comp_res['fmp_final_diagnosis'].notnull()].copy()
# Display ordered by name (comp_res2 itself is left unsorted)
comp_res2.sort_values(by=['name'])

In [None]:
print('List of all agreements in final diagnoses between databases (where FMP has a diagnosis):')
# Records where both databases give the very same diagnosis
comp_eqs = comp_res2[comp_res2['csv_final_diagnosis'].eq(comp_res2['fmp_final_diagnosis'])]
print('Total: %i records (%i unique patients)' % (len(comp_eqs), len(comp_eqs['name'].unique())))
with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
    print(comp_eqs.sort_values(by='name')[['name', 'csv_final_diagnosis', 'fmp_final_diagnosis']])

In [None]:
print('List of all differences in final diagnoses between databases (where FMP has a diagnosis):')
# Records where the reports diagnosis is not equal to the FMP one (a missing reports diagnosis counts as different)
comp_diffs = comp_res2[comp_res2['csv_final_diagnosis'].ne(comp_res2['fmp_final_diagnosis'])]
print(len(comp_diffs))
with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
    print(comp_diffs.sort_values(by='name')[['name', 'csv_final_diagnosis', 'fmp_final_diagnosis']])

In [None]:
print('List of all real differences in final diagnoses between databases (where FMP has a diagnosis AND not in agreement for this subject for another session):')
# Differing records of patients that do not have any agreeing record
comp_diffs2 = comp_res2[
    comp_res2['csv_final_diagnosis'].ne(comp_res2['fmp_final_diagnosis'])
    & ~comp_res2['name'].isin(comp_eqs['name'])
]
print(len(comp_diffs2))
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'expand_frame_repr', False):
    # Column names are shortened for display only
    # (to shorten each column name to its first 4 characters instead: .rename(columns=lambda x: x[:4]))
    short_names = {'csv_final_diagnosis': 'csv_diag', 'fmp_final_diagnosis': 'fmp_diag', 'report_path': 'path'}
    print(comp_diffs2.sort_values(by='name')[['name', 'csv_final_diagnosis', 'fmp_final_diagnosis', 'report_path']].rename(columns=short_names))
# Save the conflicts to a csv
comp_diffs2.sort_values(by='name').to_csv('diff_reports_vs_fmp_conflicts.csv', index=False, sep=';')

In [None]:
print('List of diffs because one report is incomplete but others are OK:')
# Differing records of patients that also have at least one agreeing record
comp_diffs3 = comp_res2[
    comp_res2['csv_final_diagnosis'].ne(comp_res2['fmp_final_diagnosis'])
    & comp_res2['name'].isin(comp_eqs['name'])
]
print(len(comp_diffs3))
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'expand_frame_repr', False):
    # Column names are truncated to 8 characters for display only
    print(comp_diffs3.sort_values(by='name').rename(columns=lambda col: col[:8]))

In [None]:
# Coarser diagnosis categories: (value to replace, value to use instead)
fmp_replace_reduce = [
    ('MCS+', 'MCS'),
    ('MCS-', 'MCS'),
    ('coma', 'UWS')
]
# Apply the reduction to both diagnosis columns of a copy of comp_res2
comp_res3 = comp_res2.copy()
for pattern, replacement in fmp_replace_reduce:
    for diag_col in ('fmp_final_diagnosis', 'csv_final_diagnosis'):
        comp_res3.loc[comp_res3[diag_col] == pattern, diag_col] = replacement

print('Removing differences between MCS+/- and coma/uws, here are the remaining differences:')
comp_diffs3 = comp_res3[comp_res3['csv_final_diagnosis'].ne(comp_res3['fmp_final_diagnosis'])]
print(len(comp_diffs3))
comp_diffs3.sort_values(by='name')[['name', 'csv_final_diagnosis', 'fmp_final_diagnosis', 'report_path']]

In [None]:
# Comparison records having a diagnosis in the reports csv
comp_res4 = comp_res.copy() # TODO: add also patient missing from fmp eventually
comp_res4 = comp_res4.dropna(subset=['csv_final_diagnosis'], how='all')

# Translate FMP diagnoses to the reports nomenclature, using the table of the PARAMETERS cell
# rather than a hard-coded copy of it, so that both always agree.
# (The two bare `comp_res4.sort_values(...)` calls that used to be here were removed: their
# result was neither assigned nor displayed, so they had no effect.)
fmp_replace = fmp_diag_translation or []
for pattern, replacement in fmp_replace:
    comp_res4.loc[comp_res4['fmp_final_diagnosis'] == pattern, 'fmp_final_diagnosis'] = replacement

print('List of all new entries (new from csv and not present in FMP):')
# Records whose diagnoses differ ...
comp_new = comp_res4[comp_res4['csv_final_diagnosis'] != comp_res4['fmp_final_diagnosis']]
# ... and whose matched FMP name is not an FMP name (ie, records without any FMP match)
comp_new = comp_new[~comp_new['name_fmp'].isin(cdb['name'])]
print("%s records (%s unique patients)" %(len(comp_new), len(comp_new['name'].unique())))
with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
    print(comp_new.sort_values('name').loc[:, ['name', 'csv_final_diagnosis', 'fmp_final_diagnosis']])

-----------------------------------
## Saving final database, merging as much as possible automatically (rejecting conflicts)

In [None]:
# Save full database: reports + additional records from fmp with diagnosis that are missing in reports + diagnosis from fmp where csv diagnosis is missing
cf_cdb = cf.copy()
# Extract only new fmp names
cdb_only_new = cdb[cdb['name'].isin(comp_miss_csv_with_diag['name'])].sort_values('name')
# Cleanup the names. The result must be stored back in cdb_only_new: it used to be assigned
# to a misspelled variable (cdb_only_now) that was never read again.
cdb_only_new = cleanup_name_df(cdb_only_new)
# Append new names from fmp (pd.concat rather than DataFrame.append, which no longer exists in pandas 2)
cf_cdb = pd.concat([cf_cdb, cdb_only_new])
# Assign fmp diagnosis for patients where there is no csv diagnosis (for any record)
names_with_diag = cf_cdb.dropna(subset=['final_diagnosis'], how='all')['name'].unique()
comp_no_csv_diag_but_fmp = comp_diffs2[~comp_diffs2['name'].isin(names_with_diag)]
# iterrows() yields (index, row) pairs, so the pair has to be unpacked to get the row
for _, c in comp_no_csv_diag_but_fmp.iterrows():
    # Select the patient's rows with a positional boolean mask: after the concatenation the
    # index labels of cf_cdb are not unique anymore, so selecting by label could also
    # overwrite rows of other patients. This also avoids itertuples() and its 255 columns limit.
    same_patient = (cf_cdb['name'] == c['name']).values
    cf_cdb.loc[same_patient, 'final_diagnosis'] = c['fmp_final_diagnosis']
# Save to csv
#cf_cdb.to_csv('db_reports_plus_fmp.csv', index=False, sep=';')
fields_order = ['name', 'gender', 'age', 'final_diagnosis', 'mri_sedation']
save_df_as_csv(cf_cdb, 'db_reports_plus_fmp.csv', fields_order, csv_order_by='name', verbose=True)
print('Full database (reports + additional records from fmp) was saved in db_reports_plus_fmp.csv.')
print('Full database has %i records (%i with diagnosis) and %i unique patients (%i with diagnosis)' % (
    len(cf_cdb),
    len(cf_cdb.dropna(subset=['final_diagnosis'], how='all')),
    len(cf_cdb['name'].unique()),
    len(cf_cdb.dropna(subset=['final_diagnosis'], how='all')['name'].unique())))

------------------------------------------------------
## Final stats and charts

In [None]:
# Number of unique patients for each provenance/outcome of the merge
final_db_stats = {'conflicts': len(comp_diffs2['name'].unique()),
                  'agreements': len(comp_eqs[comp_eqs['csv_final_diagnosis'].notnull()]['name'].unique()),
                  'fmp_add': len(comp_miss_csv_with_diag['name'].unique()),
                  'reports_add': len(comp_new[comp_new['csv_final_diagnosis'].notnull()]['name'].unique()),
                  'reports_add_just_diag': len(comp_miss_fmp2_full[comp_miss_fmp2_full['final_diagnosis'].notnull()]['name'].unique()),
                 }
print('Conflicts: %i subjects (with %i having no csv diagnosis so fmp diagnosis was copied)' % (
    final_db_stats['conflicts'], len(comp_no_csv_diag_but_fmp)))
print('Agreements: %i records (%i unique patients, %i with diagnosis)' % (
    len(comp_eqs), len(comp_eqs['name'].unique()), final_db_stats['agreements']))
print("FMP adds %i unique subjects having a diagnosis." % final_db_stats['fmp_add'])
print("CSV adds %s totally new records (%s unique patients with a diagnosis)" % (
    len(comp_new), final_db_stats['reports_add']))
print("CSV adds %i diagnoses to unique subjects present in both databases." % final_db_stats['reports_add_just_diag'])
print('Full database has %i records (%i with diagnosis) and %i unique patients (%i with diagnosis)' % (
    len(cf_cdb),
    len(cf_cdb.dropna(subset=['final_diagnosis'], how='all')),
    len(cf_cdb['name'].unique()),
    len(cf_cdb.dropna(subset=['final_diagnosis'], how='all')['name'].unique()),
))
# FMP records that were neither matched to a reports record nor added for having a diagnosis
cdb_missing_diag = cdb[~(cdb['name'].isin(comp_res['name_fmp'].unique()) | cdb['name'].isin(comp_miss_csv_with_diag['name'].unique()))]
print('FMP missing diagnosis (and not present in reports db): %i' % len(cdb_missing_diag))
cdb_missing_diag.to_csv('diff_fmp_missing_diags.csv', index=False, sep=';')
print('List of FMP subjects with no diagnosis and missing from reports was saved in diff_fmp_missing_diags.csv.')

In [None]:
%matplotlib inline
# Pie chart of the provenance of the subjects of the final database.
# The series name (shown by the plot) carries the total over all categories.
df_db_stats = pd.Series(final_db_stats, name='All unique subjects w/ diag (%i)' % sum(final_db_stats.values()))
# autopct receives the percentage of a wedge; the count is recovered from the series total.
# `lambda p:` instead of `lambda(p):`, as parenthesized lambda parameters are a syntax error in Python 3.
df_db_stats.plot(kind='pie', title='Final db subjects provenance', use_index=False, figsize=(6, 6), fontsize=10, pctdistance=1.15, labeldistance=1.3, autopct=lambda p: '%.0f%% (%i)' % (p, round(p / 100.0 * df_db_stats.sum())))

In [None]:
# Pie chart restricted to the subjects found in both databases: agreements vs conflicts
merge_stats = {'agreements': final_db_stats['agreements'],
             'conflicts': final_db_stats['conflicts'],
            }
# The total in the series name is the one of merge_stats (it used to be the total of
# final_db_stats, which also counts the subjects found in only one of the databases)
df_merge_stats = pd.Series(merge_stats, name='Merge (%i subjects)' % sum(merge_stats.values()))
# autopct receives the percentage of a wedge; the count is recovered from the series total.
# `lambda p:` instead of `lambda(p):`, as parenthesized lambda parameters are a syntax error in Python 3.
df_merge_stats.plot(kind='pie', title='Merge results', use_index=False, figsize=(6, 6), fontsize=10, pctdistance=0.6, labeldistance=1.1, autopct=lambda p: '%.0f%% (%i)' % (p, round(p / 100.0 * df_merge_stats.sum())), colors=['g', 'r'], startangle=292)

In [None]:
# One row per database, with: number of records, number of records with a diagnosis,
# number of unique patient names, number of unique patient names with a diagnosis
all_db_stats = pd.DataFrame(columns=['total', 'diag', 'unique', 'unique_w_diag'])
all_db_stats.loc['reports'] = [len(cf), cf['final_diagnosis'].count(), len(cf['name'].unique()), len(cf[~cf['final_diagnosis'].isnull()]['name'].unique())]
all_db_stats.loc['fmp'] = [len(cdb), cdb['final_diagnosis'].count(), len(cdb['name'].unique()), len(cdb[~cdb['final_diagnosis'].isnull()]['name'].unique())]
all_db_stats.loc['new_db'] = [len(cf_cdb),
                       len(cf_cdb.dropna(subset=['final_diagnosis'], how='all')),
                       len(cf_cdb['name'].unique()),
                       len(cf_cdb.dropna(subset=['final_diagnosis'], how='all')['name'].unique())]

# Plot!
ax = all_db_stats.plot(kind='bar')
# Place legend better
ax.legend(loc='upper right', bbox_to_anchor=(1.4, 1.0),
          ncol=1, fancybox=True, shadow=True)
# Annotate bar with values
# (offsets, in data coordinates, applied to the position of each label)
x_offset = -0.08
y_offset = -0.1
for p in ax.patches:
    # Bounding box of the bar
    b = p.get_bbox()
    val = "{:.0f}".format(b.y1 + b.y0) # use {:+.2f} for possibly negative values to get the sign
    # Label placed at the horizontal middle of the bar, at the height of its upper edge (plus offsets)
    ax.annotate(val, ((b.x0 + b.x1)/2 + x_offset, b.y1 + y_offset))

----------------------------------------
## Test

In [None]:
def compare_test(c1, c2, eq=False):
    """Filter the rows of c1 depending on whether their 'name' also appears in c2.

    c1, c2 -- DataFrames having a 'name' column.
    eq -- if true, keep the rows of c1 whose name is found in c2;
          otherwise (default) keep the rows of c1 whose name is NOT found in c2.

    Returns the selected rows of c1.
    """
    name_in_c2 = c1['name'].isin(c2['name'].unique())
    return c1[name_in_c2] if eq else c1[~name_in_c2]
def get_unique(c):
    """Return the distinct values of the 'name' column of c, in order of first appearance."""
    names = c['name']
    return names.unique()
# Number of unique names having a diagnosis in the final database that are not among the conflicts
print(len( get_unique(compare_test(cf_cdb.dropna(subset=['final_diagnosis'], how='all'), comp_diffs2)) ))
# Agreement records whose name also appears among the conflicts (not displayed: only the last expression of a cell is)
compare_test(comp_eqs, comp_diffs2, True)
# Conflict records whose name has no diagnosed record in the final database (displayed as the cell output)
compare_test(comp_diffs2, cf_cdb.dropna(subset=['final_diagnosis'], how='all'))

In [None]:
# Number of unique names of comp_miss_fmp2_full that also appear among the agreements
print(len(comp_miss_fmp2_full[comp_miss_fmp2_full['name'].isin(comp_eqs['name'].unique())]['name'].unique()))
# Number of records (not unique names) of comp_miss_fmp2_full whose name appears among the conflicts
print(len(comp_miss_fmp2_full[comp_miss_fmp2_full['name'].isin(comp_diffs2['name'].unique())]['name']))

In [None]:
# Unique names of comp_miss_fmp2_full that also appear among the agreements
pd.Series(comp_miss_fmp2_full[comp_miss_fmp2_full['name'].isin(comp_eqs['name'].unique())]['name'].unique())

In [None]:
# Names (one per record) of comp_miss_fmp2_full that also appear among the conflicts
comp_miss_fmp2_full[comp_miss_fmp2_full['name'].isin(comp_diffs2['name'].unique())]['name']