# NIfTI modular reorganizer (aka NeuroDataset Builder)
By Stephen Larroque @ Coma Science Group, GIGA Research, University of Liege
Creation date: 2019-03-21
License: MIT
v1.0.2

DESCRIPTION:
This tool automatically organizes (copies) NIfTI folders (one per subject/session) into neatly organized folders according to a supplied demographics file.

The goal of this modular reorganizer is to allow updating the demographics and/or adding more niftis without having to redo the whole conversion process, which is both quite time consuming and requires reorienting all structural and functional images again! Here you reorient/coregister all subjects once, and THEN you build your dataset given your demographics csv file and selection/filtering criteria of choice.



INSTALL NOTE:
You need to pip install pandas before launching this script.
Tested on Python 2.7.15
You also need mcverter, which is part of [MRIConvert](https://lcni.uoregon.edu/downloads/mriconvert).

USAGE:
Input:
* the final unified and postprocessed database (merged_fmp_steph_manon_sarah_dicom_ecg_reports_unifiedall.csv), resulting from using [csg_datafusion_finaldbunification.ipynb](csg_datafusion_finaldbunification.ipynb), or another csv file containing the same info as can be found in the nifti folder names (eg, subject name and study date: with 'bernard-dupont_2019-01-01' as the nifti folder name, the database has a 'name' column containing "Bernard Dupont" and a 'StudyDate' column containing "01/01/2019" - everything can be configured in the parameters below).
* a rootpath folder where each folder = one subject/session, with the folders being named either according to the demographics file (use [csg_datafusion_dicoms_to_nifti.ipynb](csg_datafusion_dicoms_to_nifti.ipynb)) or from the dicom infos (use dcm2niix with specific formatting to save the subject id/name and study date in the folder name).

TODO:
* Nothing here!

In [None]:
# Forcefully autoreload all python modules
%load_ext autoreload
%autoreload 2

In [None]:
# AUX FUNCTIONS

import os, sys

cur_path = os.path.realpath('.')
sys.path.append(os.path.join(cur_path, 'csg_fileutil_libs'))  # for unidecode and cleanup_name, because it does not support relative paths (yet?)

import re
import shutil
import subprocess
import zipfile

from collections import OrderedDict
from tempfile import mkdtemp

from csg_fileutil_libs.aux_funcs import save_df_as_csv, _tqdm, reorder_cols_df, find_columns_matching, cleanup_name, df_to_unicode, df_to_unicode_fast, cleanup_name_df, df_literal_eval, reorder_cols_df, create_dir_if_not_exist, copy_any, get_list_of_folders, merge_two_df
from IPython.display import display


In [None]:
# PARAMETERS

# Unified post-processed demographics database (path to the csv file; it is read with ';' as the separator)
unified_csv = r'databases_output\merged_fmp_steph_manon_sarah_dicom_ecg_reports_unifiedall_nifti_mov.csv'
# Input folder where all nifti folders are located (one folder per subject/session, they must ALL be stored at the same level, side by side)
input_dir = r'G:\hyperc_doc\niftis2'
# Output folder for the reorganized nifti folders (subfolders are created from the values of the hierarchy columns)
output_dir = r'G:\hyperc_doc\nhdoc4'

# Mode selector
# script_mode can be 'demographics' or 'niftis' (any other value makes the main loop raise an error):
# * if 'demographics', the demographics are used to build the path to the niftis (the folder name is derived from key_columns);
# * if 'niftis', infos are extracted from the nifti folder names using folder_template, then compared/merged against the key_columns_merge columns of the demographics database.
# In other words: 'demographics' uses the demographics to find the niftis, whereas 'niftis' starts from the niftis and then tries to find the corresponding demographics entries.
# Use 'demographics' if you followed the whole csg_datafusion pipeline; if you converted from dicoms to niftis by yourself (eg, using dcm2niix without csg_datafusion), use 'niftis'.
script_mode = 'demographics'

# -- DEMOGRAPHICS MODE
# Columns in the demographics that were used to generate the nifti folder names (use the same here as in dicoms_to_nifti.ipynb)
# Note 1: rows are filtered out if any of these key columns is empty
# Note 2: the resulting rows need to be unique: no two rows should have the same key columns (all combined)
# Note 3: later, for automatic dataset reorganization, you will need to input the same key columns
key_columns = ['name', 'StudyDate']  # only for script_mode == 'demographics'

# -- NIFTIS MODE
# Naming template for the nifti folders, used to extract the pertinent variables. The regex group names should be the same as the keys of key_columns_merge used to merge these infos with the demographics (so they should be names of columns in the demographics csv file).
# Note: this template is also applied in 'demographics' mode when save_nifti_list is enabled (to build the saved list of nifti folders).
folder_template = r'(?P<name>[^_]+)_(?P<StudyDate>[^_]+)'  # only for script_mode == 'niftis'
# Name of the columns (keys) and their type (values) for the merge with the demographics csv file
# NOTE(review): the type strings ('id', 'datetime|<format>') are interpreted by merge_two_df, which is not visible here - confirm the accepted values there
key_columns_merge = OrderedDict([('name', 'id'), ('StudyDate', 'datetime|%Y-%m-%d')])  # only for script_mode == 'niftis'
# Save the list of nifti folders as a csv file? (saved next to unified_csv, with a '_niftis.csv' suffix)
save_nifti_list = True

# -- FOR BOTH MODES
# Filter function
# define here what rows will be selected for the reorganization. This will filter out all the rows you don't want to keep.
# in other words, this is where you define which subjects/sessions you select for your study.
# this should return the dataframe filtered by any condition you want (make sure to return at least the key_columns and hierarchy_cols for the rest of the script to work)
def my_filter_func(cf_unified):
    """Select the rows (sessions) to include in the reorganized dataset.

    Keeps the rows where both 'nifti.struct OK (for fmri)' and 'nifti.func OK' are
    either 'O' or 'M', and where 'unified.episedationsimple' is either 'no' or 'both'.
    All columns are kept; only rows are filtered.

    Other example criteria that can be combined with the masks below using `&`:
        cf_unified['unified.diagnosis_best'] == 'emcs'
        cf_unified['unified.diagnoses_count'] >= 3
    """
    struct_ok = cf_unified['nifti.struct OK (for fmri)'].isin(['O', 'M'])
    func_ok = cf_unified['nifti.func OK'].isin(['O', 'M'])
    sedation_ok = cf_unified['unified.episedationsimple'].isin(['no', 'both'])
    # A row is selected only if it satisfies all three criteria
    return cf_unified[struct_ok & func_ok & sedation_ok]
filter_func = my_filter_func

# Hierarchy columns
# define here which columns are used to create the subdirectory tree, in the order of the list (ie, the 1st column's values are the top-level folders, the 2nd column's values are the subdirectories, the 3rd column's values are the sub-subdirectories, etc)
# alternative example: hierarchy_cols = ['unified.episedationsimple', 'unified.etiology']
hierarchy_cols = ['unified.diagnosis_best']
# prepend the column name before the value in the folder name, as '<column>_<value>' (be careful that the output filepath does not get too long, or you might run into errors!)
hierarchy_prepend_colname = False
# Folder name to use when the value of a hierarchy column is missing or empty
placeholder_value = 'unknown'

# Skip conversion errors?
# NOTE(review): this flag is not read by any of the cells reviewed here - confirm whether it is still needed
skip_errors = True
# Cleanup names to replace accentuated and special characters? (advised, please use the same setting as in dicoms_to_nifti.ipynb)
clean_names = True

# Special parameters
# verbose: print a warning for each missing/empty hierarchy value in the main loop
verbose = False
# debug: stop the main loop after the first folder that is found and copied
debug = False

In [None]:
# Load the demographics csv database into a dataframe
import pandas as pd

# The csv file uses ';' as the field separator
cf_unified = pd.read_csv(unified_csv, sep=';', low_memory=False)
# Drop the lines that are entirely empty
cf_unified = cf_unified.dropna(axis=0, how='all')
# Remaining missing cells become empty strings
cf_unified = cf_unified.fillna('')
# Convert to unicode (can fix issues with accentuated characters)
cf_unified = df_to_unicode_fast(cf_unified, progress_bar=True)
# Display the loaded database
cf_unified

In [None]:
# Extract the subset of rows where every key column is filled (neither null nor an empty string)
if script_mode == 'niftis':
    # In 'niftis' mode, the key columns are the names declared for the merge.
    # Materialize them as a list: on Python 3, dict.keys() returns a view that cannot be indexed,
    # whereas key_columns[0] is used further down in this notebook.
    key_columns = list(key_columns_merge.keys())
# Boolean table flagging, for each row and key column, whether the value is missing
key_is_empty = cf_unified[key_columns].isnull() | (cf_unified[key_columns] == '')
# Keep the rows without any missing key. The explicit copy makes this an independent dataframe,
# so that adding the id column later on does not write into a slice of cf_unified (SettingWithCopyWarning).
cf_unified_nonempty = cf_unified[~key_is_empty.any(axis=1)].copy()
cf_unified_nonempty

In [None]:
# Create an id for each subject/session (will be used as the output folder name)
# from dicoms_to_nifti.ipynb
# TODO: make a function in aux.py to be shared
# TODO: useless in script_mode niftis?
def df_concat_cols(x):
    """Join the string values of x with underscores, trim the surrounding whitespace and replace the remaining spaces with dashes"""
    joined = '_'.join(x)
    trimmed = joined.strip()
    return trimmed.replace(' ', '-')

# Name of the id column: the key column names concatenated (eg, 'name_StudyDate')
idcol = df_concat_cols(key_columns)
# Value of the id for each row: the key column values concatenated the same way
id_values = cf_unified_nonempty.loc[:, key_columns].apply(df_concat_cols, axis=1)
cf_unified_nonempty.loc[:, idcol] = id_values
if clean_names:
    # Clean up the ids with cleanup_name, then replace any remaining space with an underscore
    cleaned_ids = cf_unified_nonempty.loc[:, idcol].apply(cleanup_name)
    cf_unified_nonempty.loc[:, idcol] = cleaned_ids.apply(lambda s: s.replace(' ', '_'))
# Keep only the rows for which an id could be generated
has_id = cf_unified_nonempty[idcol].notnull()
cf_clean = cf_unified_nonempty[has_id]
cf_clean[idcol]

In [None]:
# Filter to keep only the rows we are interested in
# (filter_func is the user-defined selection function set in the parameters; it receives the rows that have a generated id)
cf_filtered = filter_func(cf_clean)
# Display the selected rows
cf_filtered

In [None]:
# Extract metadata from nifti folders names (using the provided regex)
# This runs in 'niftis' mode (the metadata is needed for the merge) and whenever the list of nifti folders must be saved
if script_mode == 'niftis' or save_nifti_list:
    # Names of the regex groups to extract (as a list, so that it can be indexed on Python 3 too)
    nifti_key_columns = list(key_columns_merge.keys())
    if script_mode == 'niftis':
        # Only in 'niftis' mode do the merge columns become the key columns: in 'demographics' mode,
        # the user-supplied key_columns must be left untouched, even when only saving the nifti list
        key_columns = nifti_key_columns
    RE_folder = re.compile(folder_template, re.I)  # precompile the regex to speed up calculations
    # List of dictionaries, one per nifti folder
    niftis_metadata = []
    # For each nifti folder
    for p in get_list_of_folders(input_dir):
        try:
            # Match the template once per folder (None if the folder name does not match, which raises below)
            folder_match = RE_folder.search(p)
            # Store the nifti folder
            folder_metadata = {'niftifolder': p}
            # And add each specified metadata (with the name of the regex named group being the column/metadata name)
            for k in nifti_key_columns:
                folder_metadata[k] = folder_match.group(k)
        except Exception:
            print('ERROR: the provided regex template (in folder_template) does not match some nifti folders names! Please check your regex template and retry!')
            print('Choked on nifti folder: %s' % p)
            # Bare raise: re-raise the original error with its traceback intact
            raise
        niftis_metadata.append(folder_metadata)

    # Convert to a DataFrame
    cf_niftis = pd.DataFrame(niftis_metadata)
    # Save list of nifti in a csv file
    if save_nifti_list:
        cf_niftis_unicode = df_to_unicode_fast(cf_niftis)
        if save_df_as_csv(cf_niftis_unicode, unified_csv[:-4]+'_niftis.csv', fields_order=False, date_format='%Y-%m-%d'):
            print('Input niftis list saved in %s!' % (unified_csv[:-4]+'_niftis.csv'))
        else:
            print('ERROR: the input niftis list could not be saved!')
    # Display all input niftis
    display(cf_niftis)

In [None]:
# Match the selected demographics rows with the nifti folders, using the key columns (usually name and StudyDate)
if script_mode == 'niftis':
    # The same column specification is used for both dataframes
    merge_cols = [key_columns_merge, key_columns_merge]
    merge_result = merge_two_df(cf_filtered, cf_niftis, col=merge_cols,
                                skip_sanity=True, keep_nulls=False,
                                returnmerged=True,
                                join_on_shared_keys=False)
    # With returnmerged=True, both the mapping and the merged dataframe are returned
    cf_merge_mapping, cf_merge = merge_result
    print('Done!')
    display(cf_merge)

In [None]:
# MAIN LOOP
# Copying the nifti folders with the generated hierarchy
# Select the rows to loop over: the filtered demographics, or the demographics merged with the nifti folders list
if script_mode == 'demographics':
    cf_loop = cf_filtered
elif script_mode == 'niftis':
    cf_loop = cf_merge
else:
    raise Exception('ERROR: script_mode %s is undefined! Cannot continue!' % script_mode)

# Pairs of [input, output] paths for which the output already existed before copying
conflicts = []
# Input paths that could not be found on disk
missing = []
# For each row
for idx, row in _tqdm(cf_loop.iterrows(), total=len(cf_loop), desc='REORG', unit='sessions'):
    # Get the name of the nifti folder: generated from the demographics, or as found on disk
    if script_mode == 'demographics':
        foldername = row[idcol]
    elif script_mode == 'niftis':
        foldername = row['niftifolder']
    # Build the input path from the folder name of the current mode
    # (in 'niftis' mode, the folder on disk is not necessarily named like the id generated from the demographics)
    input_filepath = os.path.join(input_dir, foldername)

    if not os.path.exists(input_filepath):
        # Missing input file, we skip!
        missing.append(input_filepath)
        continue

    # Organize per the specified hierarchy
    outpath = []
    for hcol in hierarchy_cols:  # select the columns to use as hierarchy
        try:
            # Get the value for this column
            v = row[hcol]
            # If empty, raise an error (a value that is not a string also ends up in the except branch, since it has no strip())
            if not v.strip():
                raise Exception('empty value')
        except Exception as exc:
            # If error (value empty or inexistent), we use a placeholder value
            if verbose:
                print('Warning: no or empty value for hierarchical column %s for row id %s' % (hcol, row[idcol]))
            v = placeholder_value
        # Prepend the column name if option enabled
        if hierarchy_prepend_colname:
            v = '%s_%s' % (hcol, v)
        # Add the value to the list of subfolders
        outpath.append(v)
    # Append the subject name as the final subfolder
    outpath.append(foldername)
    # Build the final path, prepending the output directory
    output_filepath = os.path.join(output_dir, *outpath)
    # Check if there is a conflict (output already exists): it is recorded, but the copy is still done
    if os.path.exists(output_filepath):
        conflicts.append([input_filepath, output_filepath])
    # Copy recursively!
    try:
        copy_any(input_filepath, output_filepath)
    except Exception as exc:
        # Best effort: report the error and continue with the next row
        print('Error when copying: maybe the constructed path is too long for your OS? Then please revise your parameters (reduce hierarchy for example). Full error:')
        print(exc)
    # Debug stuff: stop after the first folder that was found
    if debug:
        break

print('All done!')

In [None]:
# Save the lists of missing and conflicting nifti folders (if any) as text files in the current directory
import pprint
# The filenames are defined once, so that the printed messages always name the files that are actually written
missing_report = 'niftis_missing.txt'
conflicts_report = 'niftis_conflicts.txt'
if missing:
    with open(missing_report, 'w') as f:
        f.write(pprint.pformat(missing, indent=4, width=80))
    print('\nSome nifti folders were not found, the list is saved in %s' % missing_report)
else:
    print('\nAll nifti folders were processed!')
if conflicts:
    # Each entry is an [input path, output path] pair
    with open(conflicts_report, 'w') as f:
        f.write(pprint.pformat(conflicts, indent=4, width=80))
    print('\nSome nifti folders were in conflicts and got overwritten, the list is saved in %s' % conflicts_report)
else:
    print('\nNo conflicts found!')

In [None]:
# Save the subset of selected entries into demographics csv files
# Output files, saved next to the input demographics csv file
subset_csv = unified_csv[:-4]+'_reorganizedsubset.csv'
subset_extended_csv = unified_csv[:-4]+'_reorganizedsubsetextended.csv'
# First key column (usually the name); list() makes this work even if key_columns is not indexable (eg, a Python 3 keys view)
first_key = list(key_columns)[0]
cf_loop_unicode = df_to_unicode_fast(cf_loop)
# Extended subset: all the rows of the full database sharing their first key value with a selected entry
cf_loop_extended = cf_unified[cf_unified[first_key].isin(cf_loop[first_key])]
cf_loop_extended_unicode = df_to_unicode_fast(cf_loop_extended)
# Success is reported only if both files were saved (the extended subset is not attempted if the subset fails)
if not save_df_as_csv(cf_loop_unicode, subset_csv, fields_order=False, date_format='%Y-%m-%d'):
    print('ERROR: the subset demographics for the reorganized database could not be saved!')
elif not save_df_as_csv(cf_loop_extended_unicode, subset_extended_csv, fields_order=False, date_format='%Y-%m-%d'):
    print('ERROR: the extended subset demographics for the reorganized database could not be saved!')
else:
    print('Subset demographics for the reorganized database successfully saved in %s and %s!' % (subset_csv, subset_extended_csv))

In [None]:
# Save a second extended subset of the selected entries into a demographics csv file:
# names are first disambiguated against each other, then any entry that can be relevant is saved.
# This way, we make sure we don't miss any oddly named (eg, typo in name) entries

# The first key column holds the names to disambiguate
name_col = key_columns[0]
# Disambiguate names, by merging the database with itself on the name column
cf_filtered_extended2_mapping = merge_two_df(cf_unified, cf_unified, col=name_col, returnmerged=False, skip_sanity=True)
# Filter the name mapping, to keep only the names of the selected entries
selected_names = cf_filtered[name_col].unique()
is_selected = cf_filtered_extended2_mapping[name_col].isin(selected_names)
cf_filtered_extended2_mapping2 = cf_filtered_extended2_mapping.loc[is_selected, :]
# Get all entries from cf_unified that match either of the filtered disambiguated names
matches_name = cf_unified[name_col].isin(cf_filtered_extended2_mapping2[name_col])
matches_name2 = cf_unified[name_col].isin(cf_filtered_extended2_mapping2[name_col+'2'])
cf_filtered_extended2 = cf_unified.loc[matches_name | matches_name2, :]
# Save extended infos about these entries (and convert to unicode first)
cf_filtered_extended2_unicode = df_to_unicode_fast(cf_filtered_extended2)
saved = save_df_as_csv(cf_filtered_extended2_unicode, unified_csv[:-4]+'_reorganizedsubsetextended2.csv', fields_order=False, csv_order_by=key_columns, date_format='%Y-%m-%d')
if saved:
    print('Subset demographics for the reorganized database successfully saved in %s!' % (unified_csv[:-4]+'_reorganizedsubsetextended2.csv'))
else:
    print('ERROR: the extended subset demographics for the reorganized database could not be saved!')
# Display the entries
cf_filtered_extended2