# CSV Database Shortener
By Stephen Larroque @ Coma Science Group, GIGA Research, University of Liege
Creation date: 2017-04-05
License: MIT
v1.2.0
2017-2019

## INSTALL NOTE:
You need to pip install pandas before launching this script.
Tested on Python 2.7.15

## DESCRIPTION:
Extracts a subset of rows from a csv database based on a list of names provided in a second csv file. In other words, we keep from the reference database only the records that have an id that can be found in the filter database.
You have two csv files: one being the full database containing the demographic information, the other one being the list of patient names for your study.
If you want to filter the full database to extract only the patients in your smaller list, then use this notebook.

## USAGE:
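Any two csv files can be used for the shortening, you just need to have a column containing the names in both (its header is configurable for each file via `ref_db_idcol` and `filt_db_idcol` in the parameters). The first csv will be used as the reference, and its rows will be extracted if matching names are found in the second database.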

## TODO:
* Nothing here!

In [None]:
# Forcefully autoreload all python modules:
# load the IPython autoreload extension, then switch it on in mode 2
%load_ext autoreload
%autoreload 2

In [None]:
# AUX FUNCTIONS
# Imports the helper functions used by the cells of this notebook.

import os, sys

# Append the csg_fileutil_libs folder found in the current working directory to the module search path
# (this must happen before the csg_fileutil_libs import below)
cur_path = os.path.realpath('.')
sys.path.append(os.path.join(cur_path, 'csg_fileutil_libs'))  # for unidecode and cleanup_name, because it does not support relative paths (yet?)

import re

# Project-local helpers from csg_fileutil_libs
from csg_fileutil_libs.aux_funcs import compute_names_distance_matrix, cleanup_name, cleanup_name_df, cleanup_name_customregex_df, replace_buggy_accents, save_df_as_csv, _tqdm, df_to_unicode, df_to_unicode_fast


In [None]:
# PARAMETERS

# -- Reference database --
# Records are extracted from this csv file; it must contain an id column holding all the patients names.
ref_db = r'databases_output\fmp_db_subjects_aggregated.csv_etiosedatfixed_dicomsdatediag_dicompathsedat.csv_acute.csv'
# Header of the id column in the reference database
ref_db_idcol = 'Name'

# -- Filter database --
# Its names decide which records of the reference database are kept; it must also contain an id column holding the patients names.
filt_db = r'databases_original\CSG_demographics_QC_2_final 36 subjects_FOR_Stephen (ENCRYPTED).csv'
# Header of the id column in the filter database (both databases are joined on their id column, then filtered)
filt_db_idcol = 'name'

# -- Output --
# Folder where the resulting CSV files get written
output_dir = r'databases_output'

# -- Name filtering --
# Useless terms to strip from the names of the filter database, given as (pattern, replacement) pairs.
# Patterns can be regular expressions.
filter_name = dict([
    ('_', ' '),
    ('repos', ''),
    ('ecg', ''),
    ('[0-9]+', ''),
])

----------------------------------------
# Loading databases

In [None]:
import pandas as pd

# Read the reference database (semicolon-separated csv) and discard every record
# whose name is empty: a record without id cannot be matched and would cause errors
cref = pd.read_csv(ref_db, sep=';').dropna(axis=0, subset=[ref_db_idcol])
# Back up the names as they appear in the file, before any cleanup
orig_col = ref_db_idcol + '_orig'
cref[orig_col] = cref[ref_db_idcol]
# Clean up the names of the reference database so that they are easier to compare
cref = cleanup_name_df(cref, col=ref_db_idcol)
# Display the result
cref

In [None]:
# Read the filter database (semicolon-separated csv)
cfilt = pd.read_csv(filt_db, sep=';')
# Discard the rows that are entirely empty, then those without a name
cfilt = cfilt.dropna(how='all')
cfilt = cfilt.dropna(subset=[filt_db_idcol], how='all')
# Sort the records by name
cfilt = cfilt.sort_values(filt_db_idcol)
# Strip the useless terms from the patients names (skipped when no filtering rule is configured)
if filter_name:
    cfilt = cleanup_name_customregex_df(cfilt, filter_name)
# Apply the same name cleanup as for the reference database
cfilt = cleanup_name_df(cfilt, col=filt_db_idcol)
# Report the size of the filter database and display it
print("Filter database contains %i rows." % len(cfilt))
cfilt

In [None]:
# Sanity check: count the subjects of the filter database whose id/name is missing
# (either null or an empty string) - an id is required to be able to filter!
# NOTE(review): this cell only reports and displays these rows, it does not remove them.
# Null names are already discarded when the filter database is loaded; confirm that
# empty-string names are handled by the matching step.
filt_names = cfilt[filt_db_idcol]
missing_id = filt_names.isnull() | (filt_names == '')
print('Filter database contains %i rows with a missing id/name, they will be dropped.' % missing_id.sum())
cfilt.loc[missing_id]

------------------------
## Comparison of the two csv databases

In [None]:
# Merging parameters - EDIT ME - try different values until the matching looks good to you
dist_threshold_letters = 0.2  # threshold on the percentage of matching letters
dist_threshold_words = 0.4  # threshold on the percentage of matching words
dist_threshold_words_norm = True  # whether to normalize the words jaccard distance: True, False or None
dist_minlength = 4  # minimum length of the words compared by the words jaccard distance

# Match the names of the filter database against the names of the reference database
filt_names = cfilt[filt_db_idcol]
ref_names = cref[ref_db_idcol]
dmat = compute_names_distance_matrix(
    filt_names,
    ref_names,
    dist_threshold_letters,
    dist_threshold_words,
    dist_threshold_words_norm,
    dist_minlength,
)
print('Reference & Filter databases were merged successfully!')
print('List of matchs (please check if this looks fine!):')
# Display the matches for a visual check
dmat

In [None]:
# Collect the names of the filter database that have no match in the reference database
# (these are the keys of dmat whose value is None), then save them in a csv file
missing_list = []
for key, val in dmat.items():
    if val is None:
        missing_list.append(key)
cmissing = pd.DataFrame(missing_list, columns=[ref_db_idcol])
missing_csv = os.path.join(output_dir, 'shorten_missing.csv')
save_df_as_csv(df_to_unicode_fast(cmissing), missing_csv, fields_order=False, keep_index=False)
print('Saved list of missing subjects in shorten_missing.csv')
print('Missing subjects (no demographics found in the reference database): %i' % len(missing_list))
# Display the missing subjects
cmissing

In [None]:
# Shorten (filter) reference demographics database
# In other words, we keep from the reference database only the records that have an id that can be found in the filter database

def _move_last_column_to_second(df):
    """Return df with its last column moved just after the first one (the other columns keep their order)."""
    ncols = len(df.columns)
    # list(range(...)) instead of a bare range(...): on Python 3, range() does not return a list
    # and cannot be concatenated to one (TypeError), whereas this form works on both Python 2 and 3
    new_order = [0, -1] + list(range(1, ncols - 1))
    return df[df.columns[new_order]]

# Keep the reference records whose (cleaned up) name is the first match of a filter name
# NOTE(review): only the first match of each filter name is kept here, whereas the mapping
# built below lists every match; because the merge is an outer join, additional matches would
# show up as rows with empty reference fields - confirm this is intended.
found_list = [item[0] for item in filter(None, dmat.values())]
cfound = cref[cref[ref_db_idcol].isin(found_list)]

# Add a column to show what was the filtering name:
# build the inverse mapping (matched reference name -> filter name), skipping filter names without match
filter_col = ref_db_idcol + '_filter'
dmat_inv = {ref_db_idcol: [], filter_col: []}
for key, vals in dmat.items():
    if vals is None:
        continue
    for v in vals:
        dmat_inv[ref_db_idcol].append(v)
        dmat_inv[filter_col].append(key)
# create a dataframe
df_dmat_inv = pd.DataFrame(dmat_inv)
df_dmat_inv[ref_db_idcol] = df_dmat_inv[ref_db_idcol].apply(str)
# merge on name column
cfound2 = pd.merge(cfound, df_dmat_inv, how='outer', on=ref_db_idcol)
# reorder columns to place name_filter just after name
cfound2 = _move_last_column_to_second(cfound2)
# Restore original name (without cleanup)
cfound2[ref_db_idcol+'_clean'] = cfound2[ref_db_idcol]  # make a backup of the cleaned up name first, so that we can easily compare and understand how the filtering worked
cfound2[ref_db_idcol] = cfound2[ref_db_idcol+'_orig']  # restore the original names
# reorder columns to place name_clean just after name
cfound2 = _move_last_column_to_second(cfound2)

# Save into a csv file
save_df_as_csv(df_to_unicode_fast(cfound2), os.path.join(output_dir, 'shorten_found.csv'), fields_order=False, keep_index=False, blankna=True)
print('Saved list of found subjects in shorten_found.csv')
print('Found subjects: %i' % len(found_list))
# Display the shortened database
cfound2

----------------------------------------------------
## Test

In [None]:
from csg_fileutil_libs.distance import distance
from csg_fileutil_libs.aux_funcs import distance_jaccard_words_split

# Manual check of the two distance measures on a pair of sample names
subj, c = 'de caliafiera', 'de caliafiera teng'

# Normalized Levenshtein distance between the two names
letters_dist = distance.nlevenshtein(subj, c, method=1)
print(letters_dist)

# Words-based jaccard distance, reusing the letters threshold set in the merging parameters
words_dist = distance_jaccard_words_split(subj, c, partial=True, norm=None, dist=dist_threshold_letters, minlength=3)
print(words_dist)