# CSV Databases Merger
By Stephen Karl Larroque @ Coma Science Group, GIGA Research, University of Liege
Creation date: 2018-05-27
License: MIT
v1.2.9
2018-2019

DESCRIPTION:
Generic tool to merge two CSV databases based on the subject's name (hence expecting a column 'name' in each csv file).
This script takes care of fuzzy matching the names and appends all the columns of each csv file, hence centralizing all the information into one file.

Note: in case of multiple names/lines matching, they will all be concatenated into a single line: if one name in one of the two databases matches multiple names/lines of the second database, then the second database's lines will be concatenated into one. The same holds in the opposite case (a name of the second database matches multiple names in the first database).

INSTALL NOTE:
You need to pip install pandas before launching this script.
Tested on Python 2.7.13

USAGE:

TODO:
* Allow multiple keys for merge (and different keys, eg one with name and the other with [name, dicom date])

In [None]:
# Forcefully autoreload all python modules
# (IPython magics: load the autoreload extension, then select mode 2 so that all imported modules
# are reloaded before each cell execution; edits to the helper library are thus picked up without restarting the kernel)
%load_ext autoreload
%autoreload 2

In [None]:
# AUX FUNCTIONS

import os, sys

# Resolve the current working directory and add its 'csg_fileutil_libs' subfolder to the module search path
cur_path = os.path.realpath('.')
sys.path.append(os.path.join(cur_path, 'csg_fileutil_libs'))  # for unidecode and cleanup_name, because it does not support relative paths (yet?)

import re

from csg_fileutil_libs.aux_funcs import save_df_as_csv, _tqdm, merge_two_df, df_remap_names, concat_vals, df_literal_eval, df_to_unicode_fast


In [None]:
# PARAMETERS
# Edit the values below before running the next cells; all the paths are raw strings so that backslashes are kept literally.

# First (ID) database to merge (both need to have a column 'name'). The merged 'name' column will use the names from this database.
id_db = r'databases_original\patients-sedation-2018-checked-by-Stephen-from-archives_v7_2018-10-18_full-names-merged.csv'
# Second (reference) database to merge. The names will be added as a new column 'name_altx'.
ref_db = r'databases_output\fmp_db_subjects_aggregated.csv_etiosedatfixed_dicomsdatediag_dicompathsedat.csv_acute.csv'
# Output database with the merge results
out_db = r'databases_output\merged_db.csv'

# Similarity search parameters (of names in both databases)
dist_threshold = 0.2 # character distance (normalized on 1 over the total number of characters = jaccard distance), lower is more similar, default: 0.2
dist_words_threshold = 0.4 # words distance (normalized likewise but over the number of words), lower is more similar. NOTE(review): set to 0.4 here although this comment used to state "default: 0.2" - confirm which one is the intended default
keep_lastname_only = False # keep only the lastname? (supposed to be the first word) - this can enhance the matching if there are too many false positives, particularly if one database only includes the last name but the other one has the full name

# Additional options
rename_columns_per_csv = ['steph.', 'fmpagg.']  # rename each column by prepending a prefix identifying the csv file the column comes from. True will use the first 3 letters of each filename, or a list of 2 string prefixes can be provided (first for id_db, second for ref_db), or False to disable renaming


In [None]:
import pandas as pd

# Load the second (reference) database from its semicolon-separated csv file,
# then discard the rows where every cell is empty
cref = pd.read_csv(ref_db, sep=';')
cref = cref.dropna(how='all')
cref

In [None]:
# Load the first (ID) database from its semicolon-separated csv file,
# then discard the rows where every cell is empty
cid = pd.read_csv(id_db, sep=';')
cid = cid.dropna(how='all')
cid

In [None]:
# Prepare the prefixes for columns renaming if rename_columns_per_csv is enabled:
# * True: use the first 3 characters of each database filename plus a dot
# * a list (or tuple) of 2 strings: use these user-specified prefixes as-is
# * anything else (eg, False): no renaming (prependcols is None)
import ntpath  # splits on both '\' and '/' whatever the host OS (on Windows, os.path already is ntpath)

if rename_columns_per_csv is True:
    # The database paths may be written with Windows separators: on a POSIX system, os.path.basename()
    # would not split them, and the prefix would then be cut from the folder name instead of the filename
    # (giving the same prefix for both databases when their folders start alike).
    prependcols = [ntpath.basename(ntpath.normpath(db))[:3] + '.' for db in (id_db, ref_db)]
elif isinstance(rename_columns_per_csv, (list, tuple)) and len(rename_columns_per_csv) == 2:
    # Always hand over a list, so a tuple of prefixes behaves exactly like a list of prefixes
    prependcols = list(rename_columns_per_csv)
else:
    prependcols = None
prependcols

In [None]:
# Merge both databases if name matches (here we extract the names/indices where they match)
# cmerge receives the first returned value (the merge mapping) and cfinal the second one (the merged result).
# keep_lastname_only is now forwarded: it was declared in the parameters but never passed on, so changing it had no effect.
# NOTE(review): assumes merge_two_df() accepts a keep_lastname_only keyword - confirm against csg_fileutil_libs.aux_funcs
cmerge, cfinal = merge_two_df(
    cid, cref, col='name', mode=0,
    dist_threshold=dist_threshold, dist_words_threshold=dist_words_threshold,
    skip_sanity=True, keep_nulls=True, returnmerged=True,
    keep_lastname_only=keep_lastname_only, prependcols=prependcols,
)

In [None]:
# Show the merge mapping (first value returned by merge_two_df); being the last expression of the cell, it gets displayed by the notebook
cmerge

In [None]:
# Show the merged result (second value returned by merge_two_df); being the last expression of the cell, it gets displayed by the notebook
cfinal

In [None]:
# Save the merged database to out_db and report whether it succeeded.
# The index is turned back into a regular column and the dataframe is passed through df_to_unicode_fast()
# before saving; the columns are written in the order they have in cfinal.
export_df = df_to_unicode_fast(cfinal.reset_index())
export_columns = list(cfinal.columns)
save_ok = save_df_as_csv(export_df, out_db, fields_order=export_columns, csv_order_by='name')
if not save_ok:
    print('ERROR: the merged database could not be saved!')
else:
    print('Merged database successfully saved in %s!' % out_db)