In [None]:
# CSV Database Shortener
# By Stephen Larroque @ Coma Science Group, GIGA Research, University of Liege
# Creation date: 2017-04-05
# License: MIT
# v1.1.0
# INSTALL NOTE:
# You need to pip install pandas before launching this script.
# Tested on Python 2.7.13
#
# DESCRIPTION:
# Extracts a subset of rows from a csv database based on a list of names provided in a second csv file.
# You have two csv files: one is the full database containing all the demographic information, the other is the list of patient names for your study.
# If you want to filter the full database to extract only the patients in your smaller list, then use this notebook.
#
# USAGE:
# Any two csv files can be used for the shortening; you just need a "name" column in both. The first csv is used as the reference: its rows are extracted whenever a matching name is found in the second csv.
#
# TODO:
#

In [None]:
# Forcefully autoreload all python modules
# (loads IPython's autoreload extension and sets it to mode 2, so that edits made to the imported
# helper modules are picked up without restarting the kernel)
%load_ext autoreload
%autoreload 2

In [None]:
# AUX FUNCTIONS
# Makes the local csg_fileutil_libs package importable and pulls in the helpers used by this notebook.

import os, sys

# Absolute path of the current working directory: the notebook is expected to be launched from the
# folder that contains csg_fileutil_libs
cur_path = os.path.realpath('.')
# Add the package folder itself to the module search path
sys.path.append(os.path.join(cur_path, 'csg_fileutil_libs'))  # for unidecode and cleanup_name, because it does not support relative paths (yet?)

import re

# Name matching, name cleanup and csv helpers (not all of them are used in the cells visible here)
from csg_fileutil_libs.aux_funcs import compute_names_distance_matrix, cleanup_name, cleanup_name_df, cleanup_name_customregex_df, replace_buggy_accents, save_df_as_csv, _tqdm


In [None]:
# PARAMETERS
# The paths below are relative to the notebook's working directory. They are built with os.path.join
# (os is imported in the AUX FUNCTIONS cell) instead of a hard-coded backslash, so that they resolve
# on any operating system and not only on Windows.

# Folder containing the two csv databases
db_dir = 'latestdbs2018'

# Reference database, from which records will be extracted (need to include a "name" column with all the patients names)
ref_db = os.path.join(db_dir, 'fmp_db_subjects_aggregated.csv_etiosedatfixed_dicomsdatediag.csv_acute_mergesedat_sedatmine - Copie.csv')

# Filter database, the one used to filter the reference database's records by matching names (need to include a "name" column with all the patients names)
filt_db = os.path.join(db_dir, 'CSG_demographics_QC_2_final 36 subjects_FOR_Stephen.csv')

# How to filter names in the filter database (remove useless terms) - can use regex
# Each key is a term/pattern to look for in the names and each value is presumably what it gets
# replaced with (the dict is handed as-is to cleanup_name_customregex_df). Set to an empty dict
# to skip this step.
filter_name = {'_': ' ',
               'repos': '',
               'ecg': '',
               '[0-9]+': '',
              }

----------------------------------------
# Loading databases

In [None]:
import pandas as pd

# Read the reference database (semicolon-separated csv) and keep only the rows that have a name:
# lines where the name is empty must be dropped to avoid errors in the following cells
cref = pd.read_csv(ref_db, sep=';').dropna(axis=0, subset=['name'])
cref

In [None]:
# Read the filter database (semicolon-separated csv)
cfilt = pd.read_csv(filt_db, sep=';')
# Discard the fully empty rows, then the rows without a name
cfilt = cfilt.dropna(how='all')
cfilt = cfilt.dropna(subset=['name'], how='all')
# Sort the records by name
cfilt = cfilt.sort_values('name')
# Strip the useless terms listed in filter_name from the patient names (skipped if filter_name is empty)
if filter_name:
    cfilt = cleanup_name_customregex_df(cfilt, filter_name)
# Clean up the names of both databases with the same helper: first the filter db, then the full db
cfilt = cleanup_name_df(cfilt)
cref = cleanup_name_df(cref)
# Show the number of records in the filter db, then the db itself
print(len(cfilt))
cfilt

------------------------
## Comparison of the two csv databases

In [None]:
# Merging parameters - EDIT ME - do not hesitate to try different parameters until the matching seems good to you
# percentage of letters matching
dist_threshold_letters = 0.2
# percentage of words matching
dist_threshold_words = 0.4
# normalize words jaccard distance? Can be True, False or None
dist_threshold_words_norm = True
# minimum length of words to compare distance jaccard words
dist_minlength = 4

# Match the names of the filter database against the names of the reference database
dmat = compute_names_distance_matrix(
    cfilt['name'],
    cref['name'],
    dist_threshold_letters,
    dist_threshold_words,
    dist_threshold_words_norm,
    dist_minlength,
)
print('Reference & Filter databases were merged successfully!')
print('List of matchs (please check if this looks fine!):')
# Display the matches so that they can be checked by eye
dmat

In [None]:
# Collect the names of the filter database that got no match in the reference database
# (their entry in dmat is None)
missing_list = []
for filt_name, matches in dmat.items():
    if matches is None:
        missing_list.append(filt_name)
# Save them as a one-column csv file
cmissing = pd.DataFrame(missing_list, columns=['name'])
cmissing.to_csv('shorten_missing.csv', index=False, sep=';')
print('Saved list of missing subjects in shorten_missing.csv')
print('Missing subjects (no demographics found in the reference database): %i' % len(missing_list))
cmissing

In [None]:
# Shorten reference demographics database
# dmat maps each name of the filter database to its matches in the reference database; names
# without any match hold None (those are reported as missing) and must be skipped here.
# Only the first match of each filter name is used to select the reference rows.
found_list = [matches[0] for matches in dmat.values() if matches]
cfound = cref[cref['name'].isin(found_list)]

# Add a column to show what was the filtering name
dmat_inv = {'name': [], 'name_filter': []}
for key, vals in dmat.items():
    if not vals:
        # no match for this filter name: there is nothing to iterate over (vals is None)
        continue
    for v in vals:
        dmat_inv['name'].append(v)
        dmat_inv['name_filter'].append(key)
# create a dataframe
df_dmat_inv = pd.DataFrame(dmat_inv)
df_dmat_inv['name'] = df_dmat_inv['name'].apply(str)
# merge on name column
# NOTE(review): the outer merge also keeps the matches beyond the first one of each filter name,
# as rows where only name and name_filter are filled - confirm that this is intended.
cfound2 = pd.merge(cfound, df_dmat_inv, how='outer', on='name')
# reorder columns to place name_filter (the last column after the merge) just after name
# (positions are built with list(range(...)) so that this also works on Python 3, where a range
# cannot be concatenated to a list; name is looked up instead of being assumed to be column 0)
n_cols = len(cfound2.columns)
name_pos = list(cfound2.columns).index('name')
col_order = list(range(name_pos + 1)) + [n_cols - 1] + list(range(name_pos + 1, n_cols - 1))
cfound2 = cfound2.iloc[:, col_order]

# Save into a csv file
cfound2.to_csv('shorten_found.csv', index=False, sep=';')
print('Saved list of found subjects in shorten_found.csv')
print('Found subjects: %i' % len(found_list))
cfound2

----------------------------------------------------
## Test

In [None]:
from csg_fileutil_libs.distance import distance
from csg_fileutil_libs.aux_funcs import distance_jaccard_words_split

# Manual check of the two distance measures on a single pair of names
subj = 'de caliafiera'
c = 'de caliafiera teng'
# Distance computed on the letters
letters_distance = distance.nlevenshtein(subj, c, method=1)
print(letters_distance)
# Distance computed on the words, reusing the letters threshold set in the merging parameters
words_distance = distance_jaccard_words_split(subj, c, partial=True, norm=None, dist=dist_threshold_letters, minlength=3)
print(words_distance)