# CSV Databases Merger
By Stephen Larroque @ Coma Science Group, GIGA Research, University of Liege
Creation date: 2018-05-27
License: MIT
v1.2.4

DESCRIPTION:
Generic tool to merge two CSV databases based on the subject's name (hence expecting a column 'name' in each csv file).
This script takes care of fuzzy-matching the names and appends all the columns of both CSV files, hence centralizing all the information into one file.

Note: when a name matches several names/lines, all the matching lines are concatenated into a single line. If one name in the first database matches multiple names/lines in the second database, those lines of the second database are concatenated into one. The same holds in the opposite case (one name in the second database matching multiple names in the first database).

INSTALL NOTE:
You need to pip install pandas before launching this script.
Tested on Python 2.7.13

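USAGE:
Set `id_db`, `ref_db` and `out_db` (and, if needed, the similarity thresholds) in the PARAMETERS cell below, then run all the cells in order.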

TODO:
* Better support for accented characters when saving (they are replaced during merging, but they are then copied over from the original dataframe, and the encoding is incorrect when saving)

In [None]:
# Forcefully autoreload all python modules
# These two lines are IPython magics (not plain Python): the first loads the
# 'autoreload' extension, the second switches it to mode 2 so that edits made
# to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# AUX FUNCTIONS

import os, sys

# Add the 'csg_fileutil_libs' folder, resolved from the current working
# directory, to the module search path. This must run before the
# csg_fileutil_libs import below.
cur_path = os.path.realpath('.')
sys.path.append(os.path.join(cur_path, 'csg_fileutil_libs'))  # for unidecode and cleanup_name, because it does not support relative paths (yet?)

import ast
import re

# Project helpers used by the cells below (saving, fuzzy merging, name remapping and value concatenation)
from csg_fileutil_libs.aux_funcs import save_df_as_csv, _tqdm, merge_two_df, df_remap_names, concat_vals

def df_literal_eval(x):
    """Parse a value as a Python literal, falling back to the value itself.

    Used on the 'name' column to detect names that are the string form of a
    list (e.g. "['Doe John', 'Doe J']") left by a previous merge.

    Parameters:
        x: the value to parse, normally a string.

    Returns:
        The evaluated Python literal when `x` is a valid literal expression,
        otherwise `x` unchanged.
    """
    try:
        return ast.literal_eval(x)
    except (SyntaxError, ValueError, TypeError):
        # SyntaxError/ValueError: not a literal expression (e.g. a plain name).
        # TypeError: a literal that parses but cannot be built, such as a dict
        # or set with an unhashable member, e.g. "{[]: 1}".
        return x

In [None]:
# PARAMETERS

# --- Input and output files -------------------------------------------------
# Both input databases must have a 'name' column.

# ID database (first one): the merged 'name' column uses the names of this database.
id_db = r'latestdbs2018\patients-sedation-2018-checked-by-Stephen-from-archives_v7_2018-10-18_full-names-merged.csv'

# Reference database (second one): its names end up in a new 'name_altx' column.
ref_db = r'latestdbs2018\fmp_db_subjects_aggregated.csv_etiosedatfixed_dicomsdatediag.csv_acute_mergesedat.csv'

# Where the merged database is written.
out_db = r'latestdbs2018\merged_db.csv'

# --- Name similarity search (applied to the names of both databases) ---------

# Character distance, normalized to 1 over the total number of characters
# (described by the author as a jaccard distance). Lower means more similar.
# Default: 0.2
dist_threshold = 0.2

# Words distance, normalized the same way but over the number of words.
# NOTE(review): the previous comment gave 0.2 as the default although the value
# set here is 0.4 - confirm which one is intended.
dist_words_threshold = 0.4

# Keep only the last name (supposed to be the first word)? This can enhance the
# matching when there are too many false positives, particularly if one
# database only includes the last name while the other has the full name.
keep_lastname_only = False


In [None]:
import pandas as pd

# Load the reference database (ref_db, i.e. the second database of the PARAMETERS cell).
# Fully empty rows are dropped, then rows with an empty name (necessary, else
# this would produce an error later: the name is expected to exist).
cref = pd.read_csv(ref_db, sep=';').dropna(how='all').dropna(how='any', subset=['name'])
cref['name_orig'] = cref['name']  # make a backup of the original name

# If doing multiple consecutive merges, a name can in fact be the string form of
# a list of concatenated names: in that case keep the first name of the list.
# Each value is parsed only once, and a value that parses to an empty list is
# kept as is instead of raising an IndexError.
cref_names = []
for raw_name in cref['name']:
    parsed_name = df_literal_eval(raw_name)
    if isinstance(parsed_name, list) and parsed_name:
        cref_names.append(parsed_name[0])
    else:
        cref_names.append(raw_name)
cref['name'] = cref_names

if keep_lastname_only:  # keep only the lastname (supposed to be the first word)
    cref['name'] = cref['name'].apply(lambda x: x.split()[0])
cref

In [None]:
# Load the ID database (id_db, i.e. the first database of the PARAMETERS cell).
# Fully empty rows are dropped, then rows with an empty name.
cid = pd.read_csv(id_db, sep=';').dropna(how='all').dropna(how='any', subset=['name'])
cid['name_orig2'] = cid['name']  # make a backup of the original name

# If doing multiple consecutive merges, a name can in fact be the string form of
# a list of concatenated names: in that case keep the first name of the list.
# Each value is parsed only once, and a value that parses to an empty list is
# kept as is instead of raising an IndexError.
cid_names = []
for raw_name in cid['name']:
    parsed_name = df_literal_eval(raw_name)
    if isinstance(parsed_name, list) and parsed_name:
        cid_names.append(parsed_name[0])
    else:
        cid_names.append(raw_name)
cid['name'] = cid_names

if keep_lastname_only:  # keep only the lastname (supposed to be the first word)
    cid['name'] = cid['name'].apply(lambda x: x.split()[0])
cid

In [None]:
# Match the names of the reference and ID databases using the similarity
# thresholds from the PARAMETERS cell (this extracts the names/indices where
# they match).
cmerge = merge_two_df(
    cref,
    cid,
    col='name',
    mode=0,
    dist_threshold=dist_threshold,
    dist_words_threshold=dist_words_threshold,
    skip_sanity=True,
)
cmerge

In [None]:
# Where the merge left 'name' empty, fall back to the value of 'name2'
rows_without_name = cmerge['name'].isnull()
cmerge.loc[rows_without_name, 'name'] = cmerge['name2']
cmerge

In [None]:
# Remap the names of the ID database through the 'name' / 'name2' columns of cmerge
cid2 = df_remap_names(cid, cmerge, 'name', 'name2', keep_nulls=True)
# The 'index' column of the remapped table is not needed afterwards
cid2.drop('index', axis=1, inplace=True)
cid2

In [None]:
# Make sure each name is unique, else concatenate all rows for each name into one row
cref = cref.reset_index().groupby('name').agg(concat_vals)
cid2 = cid2.reset_index().groupby('name').agg(concat_vals)
# Turn 'name' back into a regular column so that it can be used as the join key
cref.reset_index(inplace=True)
cid2.reset_index(inplace=True)

# Join both databases (outer join: names present in only one database are kept)
cfinal = pd.merge(cid2, cref, how='outer', on='name')
# Reset the original name and keep the 1st database names
cfinal['name'] = cfinal['name_orig2']
del cfinal['name_orig2']

# If we do multiple merges, we will have multiple name_alt columns (name_alt0,
# name_alt1, etc.): look for the first suffix that is not used yet.
alt_idx = 0
while ('name_alt%i' % alt_idx) in cfinal.columns:
    alt_idx += 1

# Store the names of the 2nd database in their own column, inserted just after 'name' for ergonomy
cfinal.insert(1, ('name_alt%i' % alt_idx), cfinal['name_orig'])

# Create a new name column with either the name from the 1st db, or the one from the 2nd db if nan in the 1st.
# where() builds a new Series, so the 'name' column itself is never modified
# (a masked assignment on cfinal['name'] could also overwrite 'name', depending
# on the pandas version).
name_all = cfinal['name'].where(cfinal['name'].notnull(), cfinal['name_orig'])
cfinal.insert(2, ('name_all%i' % alt_idx), name_all)

# Finally delete the useless column (that we copied over to name_altX)
del cfinal['name_orig']

# Show the merged result
cfinal

In [None]:
# Write the merged table to out_db, keeping the current column order, and
# report whether the helper signalled success.
saved_ok = save_df_as_csv(
    cfinal.reset_index(),
    out_db,
    fields_order=list(cfinal.columns),
    csv_order_by='name',
)
if not saved_ok:
    print('ERROR: the merged database could not be saved!')
else:
    print('Merged database successfully saved in %s!' % out_db)