# FMP+DICOMS stats analysis
By Stephen Larroque @ Coma Science Group, GIGA Research, University of Liege
Creation date: 2018-02-17
License: MIT
v1.3.1

DESCRIPTION:
Statistical analysis of the combined FMP and DICOMs databases. This notebook also saves merged csv files containing the data from both databases.

INSTALL NOTE:
You need to pip install pandas and matplotlib before launching this script.
Tested on Python 2.7.13

USAGE:
Input the cleaned FMP database (cleaned using fmp_db_cleaner.ipynb) and the DICOMs csv database.

TODO:
* Nothing here!

In [None]:
# Forcefully autoreload all python modules
# (IPython magics, only valid inside an IPython/Jupyter session: load the
# autoreload extension, then select mode 2 so that imported modules are
# reloaded before each cell execution)
%load_ext autoreload
%autoreload 2

In [None]:
# AUX FUNCTIONS
# Makes the bundled csg_fileutil_libs folder importable, then pulls in the helper functions used throughout this notebook

import os, sys

# Absolute path of the current working directory (the notebook is expected to be launched next to csg_fileutil_libs)
cur_path = os.path.realpath('.')
sys.path.append(os.path.join(cur_path, 'csg_fileutil_libs'))  # for unidecode and cleanup_name, because it does not support relative paths (yet?)

import re

# Project-local helpers (csv export, dataframes merging, names cleanup, dates conversion, unicode normalization)
from csg_fileutil_libs.aux_funcs import save_df_as_csv, _tqdm, merge_two_df, compute_best_diag, reorder_cols_df, find_columns_matching, cleanup_name, replace_buggy_accents, convert_to_datetype, df_drop_duplicated_index, df_to_unicode, df_to_unicode_fast, cleanup_name_df


In [None]:
# Nice plots!
# Use matplotlib's built-in 'ggplot' style sheet for every figure drawn below
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [None]:
# PARAMETERS
# Input paths are built with os.path.join (os is imported in the AUX FUNCTIONS
# cell) so that they resolve on any operating system; on Windows the resulting
# strings are identical to the previous hard-coded backslash paths.
# Note that the derived csv files are written next to fmp_agg_csv, since their
# names are built by appending a suffix to it.

# FileMakerPro (FMP) database, cleaned with the provided script
fmp_agg_csv = os.path.join('databases_output', 'fmp_db_subjects_aggregated.csv')
fmp_crsr_csv = os.path.join('databases_output', 'fmp_db_subjects_crsr.csv')

# DICOMS database extract, with provided script
dicoms_csv = os.path.join('databases_output', 'dicoms_db_infos_new.csv')

# Hide null values in plots? (passed as dropna= to value_counts() before each pie plot)
plot_hide_nan = True

In [None]:
# Additional AUX functions
#def convert_to_datetype(df, col, dtformat, **kwargs):
#    """Convert a column of a dataframe to date type with the given format"""
#    if not df.index.name is None:
#        df = df.reset_index()
#    df[col] = pd.to_datetime(df[col], format=dtformat, **kwargs)
#    return df

#def df_drop_duplicated_index(df):
#    """Drop all duplicated indices in a dataframe or series"""
#    return df[~df.index.duplicated(keep='first')]

In [None]:
# Load the three semicolon-separated csv databases as dataframes
import pandas as pd
import numpy as np

# Aggregated FMP subjects: discard fully empty lines, then blank out the remaining nulls
cf_agg = (
    pd.read_csv(fmp_agg_csv, sep=';', low_memory=False)
    .dropna(axis=0, how='all')
    .fillna('')
)
# FMP CRS-R sessions: discard fully empty lines only (remaining nulls stay as NaN)
cf_crsr = pd.read_csv(fmp_crsr_csv, sep=';', low_memory=False).dropna(axis=0, how='all')
# DICOMs sessions: same cleanup as cf_agg, then remove exact duplicate rows
# and rename 'name' to 'Name' to get the same column name as in the FMP tables
cdicom = (
    pd.read_csv(dicoms_csv, sep=';', low_memory=False)
    .dropna(axis=0, how='all')
    .fillna('')
    .drop_duplicates()
    .rename(columns={'name': 'Name'})
)

In [None]:
# Dicoms availability relative to FMP: print the headline counts, then draw them as a pie chart
dicom_subjects = cdicom['Name'].unique()
print('The dicoms db includes %i different MRI sessions and %i unique subjects.' % (len(cdicom), len(dicom_subjects)))
print('The FMP db includes %i different CRS-Rs and %i unique subjects.' % (len(cf_crsr), len(cf_agg)))
# Subjects counted as unavailable = FMP subjects minus unique dicoms subjects
available = len(dicom_subjects)
unavailable = len(cf_agg) - available
availability = pd.Series(
    [available, unavailable],
    index=['Available (%i)' % available, 'Unavailable (%i)' % unavailable],
)
fig = plt.figure()
availability.plot(
    kind='pie',
    title='Dicoms Availability (total: %i patients)' % (available+unavailable),
    autopct='%.1f%%',
    figsize=(6, 6),
)
plt.axis('off')  # hide axis "None" label
fig.savefig('fig_dicoms-availability.png', bbox_inches='tight', dpi=600)

In [None]:
# Build the table mapping dicoms names to their FMP counterpart (to ease comparison);
# with returnmerged=True the helper also hands back the merged dataframe
mapping, cdicom_merged = merge_two_df(
    cdicom,
    cf_agg,
    col='name',
    mode=1,
    skip_sanity=True,
    keep_nulls=True,
    returnmerged=True
)
mapping

In [None]:
# Translate the dicoms names into their FMP spelling. Names without an entry in
# the mapping (or mapped to a null FMP name) become null and are dropped here:
# we have no FMP info for them anyway, and they are listed separately below.
dicom_to_fmp_name = mapping.set_index('name')['name2'].to_dict()
cdicom_ren = cdicom.copy()
cdicom_ren['Name'] = cdicom_ren['Name'].map(dicom_to_fmp_name)
cdicom_ren = cdicom_ren.dropna(subset=['Name'])  # drop empty names (no mapping in FMP)
cdicom_ren

In [None]:
# Subjects present in the FMP db but without any (renamed) dicoms session: these dicoms still need to be fetched
fmp_names = cf_agg.set_index('Name').index
dicom_names = cdicom_ren.set_index('Name').index
missing_dicoms = list(set(fmp_names.difference(dicom_names).tolist()))
missing_dicoms

In [None]:
# Export the list of subjects lacking dicoms to missing_dicoms.csv (ordering requested on the Name column)
cf_missing_dicoms = pd.DataFrame(missing_dicoms, columns=['Name'])
missing_dicoms_unicode = df_to_unicode(cf_missing_dicoms)
save_df_as_csv(missing_dicoms_unicode, 'missing_dicoms.csv', csv_order_by='Name')
print('Total missing dicoms: %i, saved in missing_dicoms.csv' % len(cf_missing_dicoms))

In [None]:
# Dicoms subjects for which the mapping holds no FMP name (typo? bug?)
print('List of subjects present in dicoms but could not find the name in FMP db (typos or longer name in FMP?):')
unmatched = mapping['name2'].isnull()
cf_missing_fmp = mapping.loc[unmatched, 'name']
# Export them to csv and report how many there are
missing_fmp_unicode = df_to_unicode(cf_missing_fmp.to_frame())
save_df_as_csv(missing_fmp_unicode, 'missing_fmp_but_have_dicoms.csv')
print('Total missing demographics but has dicoms: %i, saved in missing_fmp_but_have_dicoms.csv' % len(cf_missing_fmp))
cf_missing_fmp

In [None]:
# Re-append the dicoms sessions of subjects that could not be found in the FMP database.
# cdicom_ren_all may look like cdicom (the original input), but its Name column is
# synchronized with the FMP Name column wherever a match was found, which eases
# the comparisons done later.
cleaned_dicom_names = cleanup_name_df(cdicom, 'Name')['Name']
not_in_fmp = cleaned_dicom_names.isin(cf_missing_fmp)
cf_missing_fmp_full = cdicom.loc[not_in_fmp, :]
cdicom_ren_all = pd.concat([cdicom_ren, cf_missing_fmp_full])
cdicom_ren_all

In [None]:
# Plot the diagnosis repartition
# cf_agg is loaded with fillna(''), so a missing diagnosis is an empty string,
# which value_counts(dropna=...) does not treat as null. Empty strings are
# therefore turned back into NaN (on the counted series only, cf_agg itself is
# untouched) so that plot_hide_nan really hides subjects without a diagnosis.
fig = plt.figure()
tmp = cf_agg['CRSr::Best Computed Outcome'].replace('', np.nan).value_counts(dropna=plot_hide_nan)
tmp.plot(kind='pie', title='CRS-R diagnosis repartition (%i patients)' % tmp.sum(), autopct='%.0f%%', figsize=(6, 6))
plt.axis('off')  # hide the axis label
fig.savefig('fig_crsr-diag-repartition.png', bbox_inches='tight', dpi=600)
tmp

In [None]:
# Plot the EPI sedation repartition
# Normalize the column in place in cf_agg: collapse the aggregated
# "['both', 'yes']" value into 'both' and trim surrounding whitespace
cf_agg.loc[cf_agg['MRI::Sedation EPI'] == "['both', 'yes']", 'MRI::Sedation EPI'] = 'both'
cf_agg['MRI::Sedation EPI'] = cf_agg['MRI::Sedation EPI'].str.strip()
fig = plt.figure()
# The column is already stripped at this point, so no second strip is needed.
# cf_agg is loaded with fillna(''), hence missing values are empty strings that
# value_counts(dropna=...) would count as a category: map them back to NaN on
# the counted series only, so that plot_hide_nan works as intended.
tmp = cf_agg['MRI::Sedation EPI'].replace('', np.nan).value_counts(dropna=plot_hide_nan)
tmp.plot(fig=fig, kind='pie', title='MRI EPI sedation (%i patients)' % tmp.sum(), autopct='%.0f%%', figsize=(6, 6))
fig.savefig('fig_mri-epi-sedation.png', bbox_inches='tight', dpi=600)
tmp

In [None]:
# Plot the etiology repartition
def replace_nonnull_df(x, repmap, cleanup=False):
    """Return the replacement of x found in repmap, or x itself when there is none.

    x: value to translate.
    repmap: mapping of values to their replacement; a None replacement means "keep x".
    cleanup: if True and x is a str, x is first normalized with
        replace_buggy_accents() then cleanup_name(); the normalized value is
        used both for the lookup and as the fallback return value.
    """
    if cleanup and isinstance(x, str):
        x = cleanup_name(replace_buggy_accents(x))
    # A missing key and a None replacement are handled the same way: x is kept
    substitute = repmap[x] if x in repmap else None
    return x if substitute is None else substitute

cf_agg_etio = cf_agg.copy()
# For missing Etiology cells, fill in with the Etiology specified column's value.
# cf_agg is loaded with fillna(''), so missing cells are empty strings rather
# than NaN: both are tested, otherwise the fallback below would never trigger.
etio_missing = cf_agg_etio['Etiology'].isnull() | (cf_agg_etio['Etiology'] == '')
cf_agg_etio.loc[etio_missing, 'Etiology'] = cf_agg_etio.loc[etio_missing, 'Etiology specified.']
# Rename the multitude of etiologies descriptions into a few, to better plot
# (keys are looked up after the cleanup done by replace_nonnull_df(..., cleanup=True))
# NOTE(review): hemorrhages are mapped to two different labels
# ('stroke - hemorrhage' and 'nt stroke - hemorrhage'), which end up as two
# separate categories in the plot and in the saved csv - confirm whether this is intended.
mapping_etio = {'traumatiques accident de la circulation accident du travail chute violence et autres': 'traumatic',
 'pas traumatiques - anoxie ex arrt cardiaque noyade pendaison intoxication co ranimation': 'nt anoxic',
 'pas traumatiques avc ex hmorragie infarctus aneurysme ischmie hypertension': 'nt stroke - ischemic',
 'pas traumatiques - infection encphalite': 'nt infection',
 'pas traumatiques - mtaboliques ex hypoglycmie hyperglycmie': 'metabolic',
 'pas traumatiques - autre intoxication': 'intoxication',
 'pas traumatiques - autre': 'other',
 'ischemic stroke': 'nt stroke - ischemic',
 'tbi': 'traumatic',
 'hemorrhagic stroke': 'stroke - hemorrhage',
 'anoxia': 'nt anoxic',
 'mix trau anox': 'mixte trauma anoxia',
 'avc': 'nt stroke - ischemic',
 'arca': 'nt anoxic',
 'anoxic': 'nt anoxic',
 'arca sur infarctus': 'nt anoxic',
 'arrt respiratoire': 'nt anoxic',
 'hematoma': 'nt stroke - hemorrhage',
 'anevrysm': 'nt stroke - hemorrhage',
 'mix traumatic anoxic': 'mixte trauma anoxia',
 'mixte trauma anoxie': 'mixte trauma anoxia',
 'arca coronarien - ranim': 'nt anoxic',
 'pas traumatiques - tumeur crbrale': 'nt tumor',
 'pas traumatiques - autre epilepsie': 'nt epilepsia',
 'mix other': 'other',
 'nt anoxia': 'nt anoxic',
 'mixed traumatic anoxic': 'mixte trauma anoxia',
 'pas traumatiques - autre ams': 'nt ams',
 'infection': 'nt infection',
 'subarachnoid hemorrhage': 'stroke - hemorrhage',
 'hmatome capsulo-thalamique': 'stroke - hemorrhage',
 'infarctus ischmique': 'nt stroke - ischemic',
 'nt intoxication': 'intoxication',
}
cf_agg_etio['Etiology'] = cf_agg_etio['Etiology'].apply(lambda x: replace_nonnull_df(x, mapping_etio, cleanup=True))

# Plot!
fig = plt.figure()
# Empty strings are mapped back to NaN on the counted series only (cf_agg_etio
# keeps them as is), so that plot_hide_nan hides the subjects without etiology
tmp = cf_agg_etio['Etiology'].replace('', np.nan).value_counts(dropna=plot_hide_nan)
# Only the 11 most frequent etiologies are drawn, but the title reports the total over all of them
tmp.sort_values(ascending=False)[:11].plot(kind='pie', title='Etiology (%i patients)' % tmp.sum(), autopct='%.0f%%', figsize=(6, 6))
plt.axis('off')
fig.savefig('fig_etiology.png', bbox_inches='tight', dpi=600)
tmp

In [None]:
# Export the aggregated FMP dataframe with its normalized etiology/sedation columns;
# the output file name is the input csv path plus a suffix
cf_agg_etio_norm = df_to_unicode_fast(cf_agg_etio.reset_index())
etio_fixed_csv = fmp_agg_csv+'_etio-sedation-fixed.csv'
save_df_as_csv(
    cf_agg_etio_norm,
    etio_fixed_csv,
    fields_order=cf_agg_etio_norm.columns,
    keep_index=False
)
cf_agg_etio

------------------
## Find all patients whose diagnosis on the MRI scan date differs from (or is the same as) their best diagnosis

In [None]:
# Dicoms sessions: rename the acquisition date column to 'Date', parse it as a
# date and index the sessions by (Name, Date) for easier comparison
cdicom_date = cdicom_ren_all.rename(columns={'AcquisitionDate': 'Date'})
cdicom_date = convert_to_datetype(cdicom_date, 'Date', '{%Y%m%d}')
cdicom_date = cdicom_date.set_index(['Name', 'Date'])
cdicom_date

In [None]:
# Same treatment for the subjects having dicoms but missing from the FMP db;
# here the columns directly get the names used in the final merged table
cf_missing_fmp_full_date = cf_missing_fmp_full.rename(
    columns={'AcquisitionDate': 'Dicom Date', 'path': 'Dicom Path Sedation'}
)
cf_missing_fmp_full_date = convert_to_datetype(cf_missing_fmp_full_date, 'Dicom Date', '{%Y%m%d}')
cf_missing_fmp_full_date = cf_missing_fmp_full_date.set_index(['Name', 'Dicom Date'])
cf_missing_fmp_full_date

In [None]:
# Same treatment for the CRS-R sessions (their dates use the day/month/year format)
cf_crsr_date = cf_crsr.rename(columns={'CRSr::Date of CRSr': 'Date'})
cf_crsr_date = convert_to_datetype(cf_crsr_date, 'Date', '%d/%m/%Y')
cf_crsr_date = cf_crsr_date.set_index(['Name', 'Date'])
cf_crsr_date

In [None]:
# (Name, Date) pairs that exist both as a CRS-R session and as a dicoms session
crsr_sessions = cf_crsr_date.index
mri_sessions = cdicom_date.index
dicom_sessions_idxs = crsr_sessions.intersection(mri_sessions)
dicom_sessions_idxs

In [None]:
# Extract the sessions infos
# i.e. keep only the CRS-R rows whose (Name, Date) index is also a dicoms session
cf_crsr_date_dicoms = cf_crsr_date.loc[dicom_sessions_idxs,:]
cf_crsr_date_dicoms

In [None]:
# Compute the diagnosis on dicom/scan day (get the best one, because there can be multiple CRS-Rs on one day, or they are buggy duplicated entries in FMP...)
# Make a copy
cf_agg_dicoms = cf_agg_etio.copy()
# Add dicom MRI scan/acquisition date
# (an empty series is built on the common sessions index only to turn its (Name, Date) levels into a Name-indexed series of dates)
dicoms_dates_only = pd.Series(index=dicom_sessions_idxs).reset_index().set_index('Name')['Date']
#cf_agg_dicoms.loc[:, 'Dicom Date'] = df_drop_duplicated_index(dicoms_dates_only)  # WRONG: will lose all dicom dates but the first acquisition if there are multiple for one subject
dicoms_dates_only_dedup = df_drop_duplicated_index(dicoms_dates_only.to_frame().rename(columns={'Date': 'Dicom Date'}).reset_index().set_index(['Name', 'Dicom Date'])).reset_index().set_index('Name')  # deduplicate indexes (on both name and date)
cf_agg_dicoms = pd.merge(cf_agg_dicoms, dicoms_dates_only_dedup, how='outer', on='Name')  # Correct: do a cartesian product so that columns but Dicom Date are duplicated if there are multiple Dicom Date
# Set index on Name and Date
cf_agg_dicoms = cf_agg_dicoms.reset_index().set_index(['Name', 'Dicom Date'])
# Add best dicom diagnosis
# NOTE(review): this assignment relies on index alignment between cf_agg_dicoms (Name, Dicom Date) and the result of compute_best_diag - presumably indexed by (Name, Date), to be confirmed against the helper
cf_agg_dicoms.loc[:, 'Dicom Diagnosis'] = compute_best_diag(cf_crsr_date_dicoms.loc[:, 'CRSr::Computed Outcome'], persubject=False)  # need to compute the best diagnosis because there can be multiple CRS-R on one day (or duplicates entries in the CRS-R database due to funky FileMakerPro behavior)
# Add the missing subjects in FMP but who have DICOMs
cf_agg_dicoms = pd.concat([cf_agg_dicoms, cf_missing_fmp_full_date], sort=False)
# Reorder the columns for nicer display
cf_agg_dicoms = reorder_cols_df(cf_agg_dicoms, ['Dicom Diagnosis', 'CRSr::Best Computed Outcome'])
# Save as csv
# (nulls are written as empty strings; the output file name is the input csv path plus a suffix)
cf_agg_dicoms_norm = df_to_unicode_fast(cf_agg_dicoms.reset_index())  # convert strings to unicode
save_df_as_csv(cf_agg_dicoms_norm.fillna(''), fmp_agg_csv+'_etiosedatfixed_dicomsdatediag.csv', fields_order=cf_agg_dicoms_norm.columns, keep_index=False)
# Display results!
cf_agg_dicoms

In [None]:
# Add the sedation assumed in DICOM path
# Make a copy (so that cf_agg_dicoms itself does not get the extra column)
cf_agg_sedat = cf_agg_dicoms.copy()
# Helper deducing the sedation status from the wording of a dicom path
def df_path_to_sedation(x):
    """Return the sedation status spelled out in the path string x (case-insensitive).

    Returns 'no' if x contains 'non_sedated', 'yes' if it contains 'sedated',
    and 'unknown' otherwise.
    """
    lowered = x.lower()
    # 'non_sedated' must be tested first since it also contains 'sedated'
    if 'non_sedated' in lowered:
        return 'no'
    return 'yes' if 'sedated' in lowered else 'unknown'
# Index the copy by (Name, Dicom Date) so that it lines up with the dicoms sessions
cf_agg_sedat = cf_agg_sedat.reset_index().set_index(['Name', 'Dicom Date'])
# Sedation status of every dicoms session, deduced from its path; only the first
# entry per index is kept (the same dicom can be stored several times in different paths = files duplicates)
path_sedation = cdicom_date.loc[:, 'path'].apply(df_path_to_sedation)
cdicom_path_sedat = df_drop_duplicated_index(path_sedation)
# Store the sedation info as a column of the main dataframe
cf_agg_sedat.loc[:, 'Dicom Path Sedation'] = cdicom_path_sedat
# Export the result as csv (nulls written as empty strings)
cf_agg_sedat_norm = df_to_unicode_fast(cf_agg_sedat.reset_index())  # convert strings to unicode
sedat_csv = fmp_agg_csv+'_etiosedatfixed_dicomsdatediag_dicompathsedat.csv'
save_df_as_csv(cf_agg_sedat_norm.fillna(''), sedat_csv, fields_order=cf_agg_sedat_norm.columns, keep_index=False)
# Display the result
cf_agg_sedat

In [None]:
# Sessions where the diagnosis on scan day is not the patient's final (best) diagnosis;
# rows where either of the two diagnoses is null are discarded
diag_cols = ['Dicom Diagnosis', 'CRSr::Best Computed Outcome']
diag_differs = cf_agg_dicoms['Dicom Diagnosis'] != cf_agg_dicoms['CRSr::Best Computed Outcome']
cf_agg_diffdiagdicom = cf_agg_dicoms[diag_differs].dropna(how='any', subset=diag_cols)
# Export as csv (nulls written as empty strings)
cf_agg_diffdiagdicom_norm = df_to_unicode_fast(cf_agg_diffdiagdicom.reset_index())
save_df_as_csv(
    cf_agg_diffdiagdicom_norm.fillna(''),
    fmp_agg_csv+'_diffdiagdicom.csv',
    fields_order=cf_agg_diffdiagdicom_norm.columns,
    keep_index=True
)
cf_agg_diffdiagdicom

In [None]:
# Find the opposite list (patients with the same diagnosis on scan day and final diagnosis)
# Rows where either of the two diagnoses is null are discarded
cf_agg_samediagdicom = cf_agg_dicoms[cf_agg_dicoms['Dicom Diagnosis'] == cf_agg_dicoms['CRSr::Best Computed Outcome']].dropna(how='any', subset=['Dicom Diagnosis', 'CRSr::Best Computed Outcome'])
cf_agg_samediagdicom_norm = df_to_unicode_fast(cf_agg_samediagdicom.reset_index())
# The columns order is taken from this very dataframe (it used to be read from
# cf_agg_diffdiagdicom_norm, a variable belonging to the "different diagnosis"
# cell, which made this cell fail when run on its own)
save_df_as_csv(cf_agg_samediagdicom_norm.fillna(''), fmp_agg_csv+'_samediagdicom.csv', fields_order=cf_agg_samediagdicom_norm.columns, keep_index=True)
cf_agg_samediagdicom

In [None]:
# Pie chart of the final diagnosis of the sessions whose scan-day diagnosis differs from it
fig = plt.figure()
tmp = cf_agg_diffdiagdicom['CRSr::Best Computed Outcome'].value_counts(dropna=plot_hide_nan)
diffdiag_title = 'Diagnosis repartition for fluctuating patients\n(different diagnosis on scan day)\n%i sessions' % sum(tmp)
tmp.plot(fig=fig, kind='pie', title=diffdiag_title, autopct='%.1f%%', figsize=(6,6))
plt.axis('off')
fig.savefig('fig_diffdiag.png', bbox_inches='tight', dpi=600)

In [None]:
# Pie chart of the final diagnosis of the sessions whose scan-day diagnosis is the same
fig = plt.figure()
tmp = cf_agg_samediagdicom['CRSr::Best Computed Outcome'].value_counts(dropna=plot_hide_nan)
samediag_title = 'Diagnosis repartition for stable patients\n(same diagnosis on scan day)\n%i sessions' % sum(tmp)
tmp.plot(fig=fig, kind='pie', title=samediag_title, autopct='%.1f%%', figsize=(6,6))
plt.axis('off')
fig.savefig('fig_samediag.png', bbox_inches='tight', dpi=600)

In [None]:
# Plot the EPI sedation repartition using Dicom Path Sedation
# (a leftover expression statement that selected the 'yes' rows without using
# the result was removed: it had no effect)
# Trim the labels in place; the column is then counted as is, a second strip is not needed
cf_agg_sedat['Dicom Path Sedation'] = cf_agg_sedat['Dicom Path Sedation'].str.strip()
fig = plt.figure()
tmp = cf_agg_sedat['Dicom Path Sedation'].value_counts(dropna=plot_hide_nan)
tmp.plot(fig=fig, kind='pie', title='Dicom Path Sedation (%i patients)' % tmp.sum(), autopct='%.0f%%', figsize=(6, 6))
fig.savefig('fig_mri-epi-dicompathsedation.png', bbox_inches='tight', dpi=600)
tmp