# CSV Databases Merger
By Stephen Karl Larroque @ Coma Science Group, GIGA Research, University of Liege
Creation date: 2018-05-27
License: MIT
v1.3.6
2018-2019

DESCRIPTION:
Generic tool to merge two CSV databases based on the subject's name (hence expecting a column 'name' in each csv file).
This script takes care of fuzzy matching the names and appends all columns of each csv file, hence centralizing all the information into one file.

Note: in case of multiple names/lines matching, they will all be concatenated into a single line: if one name in one of the two databases matches multiple names/lines of the second database, then the second database's lines will be concatenated into one. In the opposite case (a name in the second database matches multiple names in the first database), the same holds.

INSTALL NOTE:
You need to pip install pandas before launching this script.
Tested on Python 2.7.13

USAGE:

TODO:
* Nothing here!

In [None]:
# Forcefully autoreload all python modules
# (IPython "autoreload" extension, mode 2: modules are reloaded before each
# cell is executed)
%load_ext autoreload
%autoreload 2

In [None]:
# AUX FUNCTIONS
# Imports the standard modules and the project helper functions used by the
# cells of this notebook.

import os, sys

# Add the 'csg_fileutil_libs' folder of the current working directory to the
# module search path (the notebook must therefore be run from the folder that
# contains 'csg_fileutil_libs').
cur_path = os.path.realpath('.')
sys.path.append(os.path.join(cur_path, 'csg_fileutil_libs'))  # for unidecode and cleanup_name, because it does not support relative paths (yet?)

import re

from collections import OrderedDict
# Project helpers: merging, columns reordering, unicode conversion and csv saving
from csg_fileutil_libs.aux_funcs import save_df_as_csv, _tqdm, merge_two_df, df_remap_names, concat_vals, df_literal_eval, df_to_unicode_fast, reorder_cols_df


In [None]:
# PARAMETERS
# All user-editable settings for the merge are defined in this cell.

# First (ID) database to merge (both need to have a column 'name'). The merged 'name' column will use the names from this database.
id_db = r'databases_original\patients-sedation-2018-checked-by-Stephen-from-archives_v7_2018-10-18_full-names-merged.csv'
# Second (reference) database to merge. The names will be added as a new column 'name_altx'.
ref_db = r'databases_output\fmp_db_subjects_aggregated.csv_etiosedatfixed_dicomsdatediag_dicompathsedat.csv_acute.csv'
# Output database with the merge results
out_db = r'databases_output\merged_db.csv'

# Similarity search parameters (of names in both databases)
dist_threshold = 0.2 # character distance (normalized on 1 over the total number of characters = jaccard distance), lower is more similar, default: 0.2
dist_words_threshold = 0.4 # words distance (normalized likewise, but on the number of words), lower is more similar; documented default: 0.2, set to 0.4 here
# NOTE(review): keep_lastname_only is not forwarded to merge_two_df() in the merge cell of this notebook - confirm whether it should be
keep_lastname_only = False # keep only the lastname? (supposed to be the first word) - this can enhance the matching if there are too many false positives, particularly if one database only includes the last name but the other one has the full name

# Additional options
rename_columns_per_csv = ['steph.', 'fmpagg.']  # rename each column by prepending a prefix identifying the csv file the column comes from. True will use the first 3 letters of the filename, or a list of 2 string prefixes can be provided, or False to disable renaming
#rename_columns_per_csv = False
pdmerge_indicator = False  # pandas.merge() option, if True, a new column 'x_merge' will be created for every column, summarizing from what original database the info was merged from
pdmerge_validate = None  # pandas.merge() option, allows to ensure that the mapping is unique, for example with '1:1'. Can be: '1:1', '1:m', 'm:1', 'm:m'

# Multi-columns merging
# keys to use for the merge: by default only the 'name' column is used, but the merge can be done on multiple columns and with different types: 'id' or 'datetime' are supported
# the order of the columns must be the same, since the column names are not used: the id database's first column must have the same type as the ref database's first column, and so on for all subsequent columns
# format is: an OrderedDict with the column_name: column_type for each item
# ('datetime' types are followed by '|' and the date format of that column, as in the values below)
#id_db_keys = None
#ref_db_keys = None
id_db_keys = OrderedDict([('name', 'id'), ('MRI date (on mri paper)', 'datetime|%d/%m/%Y')])
ref_db_keys = OrderedDict([('name', 'id'), ('Dicom Date', 'datetime|%Y-%m-%d')])
shared_key_column = 'name'  # in case you set ref_db_keys and id_db_keys to None (to disable multi-columns merging), you can specify here the name of the single column that both databases share, and on which the merge will be done
fillna = True  # if merging on multiple key columns, if fillna=True this will try to fill nan fields from other filled fields from the same subject, eg: subject A has 2 sessions, one with lots of info, second session mostly empty, with fillna=True the second session will get all missing info copied over from the first (except for the key columns of course)


In [None]:
# PARAMETERS FOR 2ND MERGE (skip this cell if you did not do the first merge beforehand)
# Running this cell overrides every setting of the first PARAMETERS cell.

# First (ID) database to merge (both need to have a column 'name'). The merged 'name' column will use the names from this database.
# NOTE(review): the first merge writes 'merged_db.csv' whereas this reads 'merged_db_1.csv' - presumably the first output has to be renamed by hand beforehand; confirm
id_db = r'databases_output\merged_db_1.csv'
# Second (reference) database to merge. The names will be added as a new column 'name_altx'.
ref_db = r'databases_original\manon_Database_MRI_patients.csv'
# Output database with the merge results (same path as the first merge's output, which will hence be overwritten if it is still there)
out_db = r'databases_output\merged_db.csv'

# Similarity search parameters (of names in both databases)
dist_threshold = 0.2 # character distance (normalized on 1 over the total number of characters = jaccard distance), lower is more similar, default: 0.2
dist_words_threshold = 0.4 # words distance (normalized likewise, but on the number of words), lower is more similar; documented default: 0.2, set to 0.4 here
# NOTE(review): keep_lastname_only is not forwarded to merge_two_df() in the merge cell of this notebook - confirm whether it should be
keep_lastname_only = False # keep only the lastname? (supposed to be the first word) - this can enhance the matching if there are too many false positives, particularly if one database only includes the last name but the other one has the full name

# Additional options
rename_columns_per_csv = ['', 'manon.'] # rename each column by prepending a prefix identifying the csv file the column comes from. True will use the first 3 letters of the filename, or a list of 2 string prefixes can be provided, or False to disable renaming. The empty first prefix leaves the ID database's columns (already prefixed by the first merge) as they are
#rename_columns_per_csv = False
pdmerge_indicator = False  # pandas.merge() option, if True, a new column 'x_merge' will be created for every column, summarizing from what original database the info was merged from
pdmerge_validate = None  # pandas.merge() option, allows to ensure that the mapping is unique, for example with '1:1'. Can be: '1:1', '1:m', 'm:1', 'm:m'

# Multi-columns merging
# keys to use for the merge: by default only the 'name' column is used, but the merge can be done on multiple columns and with different types: 'id' or 'datetime' are supported
# the order of the columns must be the same, since the column names are not used: the id database's first column must have the same type as the ref database's first column, and so on for all subsequent columns
# format is: an OrderedDict with the column_name: column_type for each item
# ('datetime' types are followed by '|' and the date format of that column, as in the values below)
id_db_keys = OrderedDict([('name', 'id'), ('MRI date (on mri paper) + Dicom Date', 'datetime|%Y-%m-%d')])
ref_db_keys = OrderedDict([('name', 'id'), ('Date of RMN', 'datetime|%d/%m/%Y')])
shared_key_column = 'name'  # in case you set ref_db_keys and id_db_keys to None (to disable multi-columns merging), you can specify here the name of the single column that both databases share, and on which the merge will be done
fillna = True  # if merging on multiple key columns, if fillna=True this will try to fill nan fields from other filled fields from the same subject, eg: subject A has 2 sessions, one with lots of info, second session mostly empty, with fillna=True the second session will get all missing info copied over from the first (except for the key columns of course)


In [None]:
import pandas as pd

# Read the first (ID) database, a semicolon-separated csv file...
cid = pd.read_csv(id_db, sep=';')
# ...and discard the lines where every single field is empty
cid = cid.dropna(how='all')
# Display the loaded database
cid

In [None]:
# Read the second (reference) database, a semicolon-separated csv file...
cref = pd.read_csv(ref_db, sep=';')
# ...and discard the lines where every single field is empty
cref = cref.dropna(how='all')
# Display the loaded database
cref

In [None]:
# Prepare the prefixes for columns renaming if rename_columns_per_csv is enabled
# Renaming stays disabled (None) unless one of the two cases below applies
prependcols = None
if rename_columns_per_csv is True:
    # Automatic prefixes: the first 3 characters of each file name plus a dot
    prependcols = ['%s.' % os.path.basename(os.path.normpath(db))[:3] for db in (id_db, ref_db)]
elif isinstance(rename_columns_per_csv, list) and len(rename_columns_per_csv) == 2:
    # The user specified their own list of 2 prefixes
    prependcols = rename_columns_per_csv
# Display the prefixes that will be used
prependcols

In [None]:
# Prepare the key column(s) for the merge:
# - if both databases define their keys, merge on multiple columns (list of the two keys definitions)
# - otherwise fall back on the single column shared by both databases
use_multiple_keys = bool(id_db_keys and ref_db_keys)
keycol = [id_db_keys, ref_db_keys] if use_multiple_keys else shared_key_column
# Display the key(s) that will be used
keycol

In [None]:
# Merge both databases if name matches (here we extract the names/indices where they match)
# All the options are gathered first, then passed in one go to merge_two_df().
# NOTE(review): keep_lastname_only is set in the parameters cell but is not forwarded here - confirm whether merge_two_df() accepts it and whether it should be passed
merge_options = dict(
    col=keycol,  # key column(s) prepared above
    mode=0,
    # Names similarity thresholds
    dist_threshold=dist_threshold,
    dist_words_threshold=dist_words_threshold,
    skip_sanity=True,
    keep_nulls=True,
    returnmerged=True,  # presumably what makes the merged database be returned alongside the mapping - TODO confirm
    prependcols=prependcols,
    fillna=fillna,
    # pandas.merge() options set in the parameters cell
    indicator=pdmerge_indicator,
    validate=pdmerge_validate,
)
# cmerge is the merge mapping, cfinal is the merged result (unified database)
cmerge, cfinal = merge_two_df(cid, cref, **merge_options)


In [None]:
# Show the merge mapping (first value returned by merge_two_df), to review the matches that were made
cmerge

In [None]:
# Show the merged result (unified database, second value returned by merge_two_df)
cfinal

In [None]:
# Reorder so that the first columns are always the key columns
if isinstance(keycol, list):
    # Multiple key columns: gather the column names of every keys definition,
    # in order, skipping the ones already seen (to avoid duplicating columns!)
    keycol_unique = []
    for kcol in keycol:
        for colname in kcol:
            if colname not in keycol_unique:
                keycol_unique.append(colname)
    leading_cols = keycol_unique
else:
    # Only one key column, reorder according to it
    leading_cols = keycol
cfinal = reorder_cols_df(cfinal, leading_cols)
# Display the reordered database
cfinal

In [None]:
# Save the merge mapping and unified database as csv files
import os  # imported here too, so that this cell does not rely on the imports cell

# Path of the mapping file: same as out_db, with '_mapping.csv' in place of the extension.
# os.path.splitext() is used rather than out_db[:-4], which silently mangles the
# filename whenever out_db does not end with a 4-character extension such as '.csv'.
mapping_db = os.path.splitext(out_db)[0] + '_mapping.csv'

# Convert both dataframes to unicode before saving
cfinal_unicode = df_to_unicode_fast(cfinal)
cmerge_unicode = df_to_unicode_fast(cmerge)

# save_df_as_csv() result is used as a success flag; because of the `and`, the
# mapping is only saved if the merged database was saved successfully first
if save_df_as_csv(cfinal_unicode, out_db, fields_order=list(cfinal_unicode.columns), csv_order_by='name', blankna=True) and \
    save_df_as_csv(cmerge_unicode, mapping_db, fields_order=list(cmerge_unicode.columns), csv_order_by='name', blankna=True):
    print('Merged database successfully saved in %s and %s!' % (out_db, mapping_db))
else:
    print('ERROR: the merged database could not be saved!')