# DICOMs to NIfTI converter
By Stephen Larroque @ Coma Science Group, GIGA Research, University of Liege
Creation date: 2019-03-08
License: MIT
v1.1.1

DESCRIPTION:
A converter-agnostic tool to convert DICOM folders and zip files into NIfTI, using a specific naming scheme synchronized with a demographics file, so as to ease subsequent manipulation of the NIfTI dataset according to the demographics data.

Any command/converter can be used: this tool only provides the framework to automate the conversion process over many folders/zip files (and performs a few sanity checks too).

Note: a good alternative to this script is to use dcm2niix directly, but it works only on DICOM folders (not zips). Example of commandline: `dcm2niix.exe -f %s_%p_%d_%z/%s_%n_%t_%p_%d_%z_%r -d 9 -o output_dir input_dir`

INSTALL NOTE:
You need to pip install pandas before launching this script.
Tested on Python 2.7.15
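You also need a converter executable, e.g. mcverter, which is part of [MRIConvert](https://lcni.uoregon.edu/downloads/mriconvert); note that the default command template in the PARAMETERS cell calls dcm2niix.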

USAGE:
Input: the final unified and postprocessed database (`merged_fmp_steph_manon_sarah_dicom_ecg_reports_unifiedall.csv`), resulting from using [csg_datafusion_finaldbunification.ipynb](csg_datafusion_finaldbunification.ipynb). The DICOM folders/zip files path should be included in the demographics file. If not, please use [csg_datafusion_dicoms_extract.ipynb](csg_datafusion_dicoms_extract.ipynb) to generate a new database (and merge it using [csg_datafusion_db_merger.ipynb](csg_datafusion_db_merger.ipynb)). If you have issues with DICOMs duplicates/conflicts, please use [csg_datafusion_dicoms_reorganizer.ipynb](csg_datafusion_dicoms_reorganizer.ipynb).

TODO:
* When an error happens, try to get back the original row from original database, and save as new csv, so that retrying the failed entries is easier.

In [None]:
# Forcefully autoreload all Python modules (IPython autoreload extension, mode 2),
# so that edits to the helper libraries are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
# AUX FUNCTIONS

import os, sys

cur_path = os.path.realpath('.')
sys.path.append(os.path.join(cur_path, 'csg_fileutil_libs'))  # for unidecode and cleanup_name, because it does not support relative paths (yet?)

import re
import shutil
import subprocess
import zipfile

from tempfile import mkdtemp

from csg_fileutil_libs.aux_funcs import save_df_as_csv, _tqdm, reorder_cols_df, find_columns_matching, cleanup_name, df_to_unicode, df_to_unicode_fast, cleanup_name_df, df_literal_eval, reorder_cols_df, create_dir_if_not_exist, df_literal_eval, get_list_of_folders


In [None]:
# PARAMETERS

# Unified post-processed demographics database (read below as a semicolon-separated csv file)
unified_csv = r'databases_output\merged_fmp_steph_manon_sarah_dicom_ecg_reports_unifiedall.csv'
# Columns in the unified demographics database to use as keys and also to name the output folder of each converted DICOM folder/zip file
# Note 1: rows will be filtered out if any of these key columns is empty
# Note 2: the resulting rows need to be unique: no two rows should share the same combination of key column values
# Note 3: later, for automatic dataset reorganization, you will need to input the same key columns
key_columns = ['name', 'StudyDate']
# Column in unified_csv that contains the path to the DICOM file/folder
dicom_column = 'dicom.path'
# Replace part of the dicom path with a new path (eg, if the path changed since generating the database). If None, no replacement will happen.
# Format: [pattern, replacement, use_regex], where the 3rd value specifies whether the pattern is a regular expression (True).
dicom_path_replace = None # [r'H:\\ALLDICOMS\\PATIENTS\\', r'C:\\git\\datatest\\', True]
# Command template to launch for the conversion. You can use whatever you want, just make sure to include the variables %(inputpath)s and %(outputpath)s
# Note: don't forget to double % if it's part of the command and not meant to be replaced by a Python variable (eg, instead of %t, use %%t)
# Note 2: in the default dcm2niix command, the patient's details are anonymized. If you want to change that, set -ba n -t y
# Note 3: the paths are substituted as-is into the command line, which is run through the shell: if your paths may contain spaces, surround the placeholders with double quotes in the template
cmd_template = r'C:\git\dcm2niix_11-Apr-2019_win\dcm2niix.exe -z 3 -f %%s_%%p_%%d_%%z/%%s_%%t_%%p_%%d_%%z -d 9 -b y -ba y -t n -w 2 -o %(outputpath)s %(inputpath)s'

# Output folder for converted NIfTI files (a subfolder for each key will be created)
# WARNING: please check the output path does not exist before launching this script, to avoid any conflict!
output_dir = r'G:\hyperc_doc\niftis2kpacsadd'

# Skip conversion errors? If True, a failing conversion command is reported and listed at the end instead of interrupting the whole conversion loop
skip_errors = True
# Cleanup names to replace accented and special characters? (advised)
clean_names = True

# Special parameters
# verbose: print each conversion command before launching it
verbose = False
# debug: turn the first entry into a list of paths (folder + zip) to test these code paths, and stop the conversion loop after the first row
debug = False

In [None]:
# Load the unified demographics database (semicolon-separated csv) as a DataFrame
import pandas as pd

cf_unified = pd.read_csv(unified_csv, sep=';', low_memory=False)
# Drop the lines that are entirely empty, then replace the remaining missing values by empty strings
cf_unified = cf_unified.dropna(axis=0, how='all')
cf_unified = cf_unified.fillna('')
# Convert to unicode (can fix issues with accented characters)
cf_unified = df_to_unicode_fast(cf_unified, progress_bar=True)
cf_unified

In [None]:
# Extract subset with non empty key columns and dicom column (ie, dicom is available)
cf_required = cf_unified[key_columns + [dicom_column]]
# A row is incomplete as soon as one of the required columns is null or an empty string
rows_incomplete = (cf_required.isnull() | (cf_required == '')).any(axis=1)
# Take an explicit copy: this DataFrame is modified in place by the next cells (via .loc assignments),
# which on a mere slice of cf_unified would be a chained assignment (SettingWithCopyWarning, ambiguous result)
cf_unified_dicoms = cf_unified[~rows_incomplete].copy()
cf_unified_dicoms

In [None]:
# Just out of curiosity: all entries sharing the same value in the last key column
# (with the default key_columns this is StudyDate, ie, patients acquired on the same day)
last_key = [key_columns[-1]]
shares_last_key = cf_unified_dicoms.duplicated(subset=last_key, keep=False)
cf_unified_dicoms[shares_last_key].sort_values(by=last_key)

In [None]:
# Sanity check: find any duplicated keys or dicom paths, either of which is an issue to fix before converting
# Note: the duplicates are only reported (printed and saved as csv), the execution itself is not interrupted
cf_duplicated_keys = cf_unified_dicoms[cf_unified_dicoms.duplicated(subset=key_columns, keep=False)]
cf_duplicated_dicompath = cf_unified_dicoms[cf_unified_dicoms.duplicated(subset=[dicom_column], keep=False)]
# Each check: (duplicated rows, columns to sort by, csv report filename, error message, message once the report is saved)
for dup_rows, sort_by, report_csv, error_msg, saved_msg in (
        (
            cf_duplicated_keys,
            key_columns,
            'duprows_keys.csv',
            'ERROR: rows with duplicated keys were found! Please ensure your key columns are unique!',
            'List of duplicated rows were saved in duprows_keys.csv, please fix the dicom manually or change the key columns you use to ensure they are unique.',
        ),
        (
            cf_duplicated_dicompath,
            dicom_column,
            'duprows_dicompath.csv',
            'ERROR: multiple entries share the same dicom path! Please ensure the dicom path is unique for each entry!',
            'List of duplicated rows were saved in duprows_dicompath.csv, please fix them manually (by moving dicoms in separate folders) before restarting this script.',
        ),
):
    if not len(dup_rows):
        # Nothing duplicated for this check
        continue
    print(error_msg)
    # Only show the columns that matter to locate and fix the duplicates
    duprows = dup_rows[key_columns + [dicom_column]].sort_values(by=sort_by)
    print(duprows)
    if save_df_as_csv(duprows, report_csv, fields_order=False):
        print(saved_msg)


In [None]:
# Replace dicom paths if provided a replacement
# dicom_path_replace is [pattern, replacement, use_regex]; the matching is case-insensitive in both modes
if dicom_path_replace:
    path_pattern = dicom_path_replace[0]
    path_replacement = dicom_path_replace[1]
    if not dicom_path_replace[2]:
        # Literal mode: escape the pattern and return the replacement through a callable, so that both are
        # taken verbatim (backslashes of Windows paths included) while still matching case-insensitively.
        # This avoids depending on how the installed pandas version combines case=False with regex=False.
        path_pattern = re.escape(path_pattern)
        path_replacement = lambda _match, _literal=path_replacement: _literal
    cf_unified_dicoms.loc[:, dicom_column] = cf_unified_dicoms[dicom_column].str.replace(path_pattern, path_replacement, case=False, regex=True)
# Display the resulting paths (as a top-level expression, else the notebook does not show anything)
cf_unified_dicoms[dicom_column]

In [None]:
# Evaluate the literals found in the dicom paths column, in case an entry holds
# multiple paths written as a list/set
dicom_paths_evaluated = cf_unified_dicoms.loc[:, dicom_column].apply(df_literal_eval)
cf_unified_dicoms.loc[:, dicom_column] = dicom_paths_evaluated
cf_unified_dicoms[dicom_column]

In [None]:
# Create an id for each subject/session (will be used as the output folder name)
def df_concat_cols(x):
    """Join the string values of x (eg, one row over several columns) into a single id.

    Values are joined with an underscore, the result is stripped of its
    leading/trailing whitespace and the remaining spaces become hyphens.
    """
    joined = '_'.join(x)
    trimmed = joined.strip()
    return trimmed.replace(' ', '-')

# Name of the id column: the names of the key columns joined together (eg, 'name_StudyDate')
idcol = df_concat_cols(key_columns)
# Select the key columns
cf_unified_keycols = cf_unified_dicoms.loc[:, key_columns]
if clean_names:
    # Cleanup name per column (else we might get buggy spaces if we apply on concatenated idcol)
    cf_unified_keycols = cf_unified_keycols.applymap(cleanup_name)
# Merge key columns in one (concatenated with an underscore)
cf_unified_dicoms.loc[:, idcol] = cf_unified_keycols.apply(df_concat_cols, axis=1)
# Clean name again, this time on the concatenated id, and get rid of any remaining space
if clean_names:
    ids_cleaned = cf_unified_dicoms.loc[:, idcol].apply(cleanup_name)
    cf_unified_dicoms.loc[:, idcol] = ids_cleaned.apply(lambda cleaned: cleaned.replace(' ', '_'))
# Make a new DataFrame with only the clean id column and dicom column
# (explicit copy, because the DEBUG cell may write into it, which must not be a chained assignment on a slice)
cf_clean = cf_unified_dicoms[[idcol, dicom_column]].copy()
cf_clean

In [None]:
# DEBUG
# Test if lists and zip files are supported: the dicom path of the first entry is replaced
# by a list holding the original path and the same path with a '.zip' extension appended
if debug and len(cf_clean):
    # Address the first remaining row by its actual index label: label 0 might have been
    # filtered out earlier (empty key/dicom columns), in which case .loc[0, ...] raises a KeyError.
    # This is also the row processed by the conversion loop in debug mode (it stops after the first row).
    first_label = cf_clean.index[0]
    first_path = cf_clean.loc[first_label, dicom_column]
    cf_clean.loc[first_label, dicom_column] = [first_path, '%s%s' % (first_path, '.zip')]

In [None]:
# MAIN CONVERSION LOOP
def launch_cmd(inputpath, outputpath, robust=False, verbose=False):
    """Launch the conversion command on one input and return its output.

    The command line is built by filling the global `cmd_template` with
    `inputpath` and `outputpath`, and is executed through the shell.

    inputpath: path substituted for %(inputpath)s in the template.
    outputpath: path substituted for %(outputpath)s in the template.
    robust: if True, a failure of the command is printed and -1 is returned;
        if False (default), the exception is propagated to the caller.
    verbose: if True, print the command line before launching it.

    Returns the captured standard output of the command, or -1 if the
    command failed and robust is True.
    """
    cmd = cmd_template % {'inputpath': inputpath, 'outputpath': outputpath}
    if verbose:
        print(cmd)
    try:
        res = subprocess.check_output(cmd, shell=True)
    except Exception as exc:
        if not robust:
            # Bare raise, to propagate the exception with its original traceback
            raise
        print('ERROR: the command returned an error, the DICOM acquisition might have been partially or not at all converted to NIfTI, please check manually:')
        print(exc)
        # When the command exited with an error code, the exception carries what the
        # command printed before failing, which is usually the most useful diagnostic
        cmd_output = getattr(exc, 'output', None)
        if cmd_output:
            print(cmd_output)
        return -1
    if not res:
        # The command succeeded but did not print anything, which is suspicious for a converter
        print('Error when converting:')
        print(res)
    return res

# Lists filled during the conversion, for the final report:
missing = []  # input dicom paths that do not exist
conflicts = []  # input dicom paths whose output folder already existed
errorslist = []  # [input path, output path] of the conversions that failed
# For each row
for idx, row in _tqdm(cf_clean.iterrows(), total=len(cf_clean), desc='CONVERT', unit='sessions'):
    # Build the input and output paths
    input_filepaths = row[dicom_column]
    output_filepath = os.path.join(output_dir, row[idcol])
    if not isinstance(input_filepaths, (list, set, tuple)):
        input_filepaths = [input_filepaths]
    # If DICOM path for this row contains multiple paths (there are multiple DICOMs for this subject/session, might be duplicates or the acquisition was split in several DICOM directories/zip files), we loop on all of them
    for input_filepath in input_filepaths:
        if not os.path.exists(input_filepath):
            # The input dicom path does not exist, we have a missing dicom, we cannot convert (but save in a list for manual inspection)
            missing.append(input_filepath)
            continue
        # If the output folder already exists, then there is a conflict, we will overwrite but we save in a list all conflicts
        if os.path.exists(output_filepath):
            conflicts.append(input_filepath)
        # Create the output folder for this row
        create_dir_if_not_exist(output_filepath)
        # The extension check is case-insensitive, so that archives named eg '.ZIP' are not mistaken for folders
        if input_filepath.lower().endswith('.zip'):
            # It is a zip, we first need to unzip it in a temporary folder
            temp_dir = mkdtemp()
            try:
                # The context manager closes the archive even if the extraction fails
                with zipfile.ZipFile(input_filepath, 'r') as ziph:
                    ziph.extractall(temp_dir)
                # Convert
                rtncode = launch_cmd(temp_dir, output_filepath, robust=skip_errors, verbose=verbose)
            except zipfile.BadZipfile as exc:
                # An unreadable archive is handled like a failed conversion: skipped and reported if skip_errors, else raised
                if not skip_errors:
                    raise
                print('ERROR: the zip file could not be read, the DICOM acquisition was not converted, please check manually: %s' % input_filepath)
                print(exc)
                rtncode = -1
            finally:
                # Finally we delete the temporary directory
                shutil.rmtree(temp_dir)
        else:
            # If not a zip, we can directly process the whole folder
            rtncode = launch_cmd(input_filepath, output_filepath, robust=skip_errors, verbose=verbose)
        if rtncode == -1:
            errorslist.append([input_filepath, output_filepath])
    if debug:
        # In debug mode, only the first row is processed
        break

# Final report: list everything that needs a manual review after the conversion
print('All done!')
# Input paths that could not be found on disk
if missing:
    print('\n')
    print('Some input DICOMs were missing:')
    print(missing)
# Inputs converted into an output folder that already existed
if conflicts:
    print('\n')
    print('Some conflicts (probably duplicates) were found and overwritten, but please make sure to review the list below. If you have an issue, check the key_columns you set ensures no loss/mixing of DICOMs by using uniquely identifying key_colums! Here is the list of conflicts:')
    print(conflicts)
    print('Note: these were reported as conflicts because the output folder already existed before. There might not have been any overwriting if for example a subject examination was split in two, but with the same name and studydate, then the sequences will have a different id and everything is fine, but please review manually to ensure that.')
# Conversions for which the command failed (only filled when errors are skipped)
if errorslist:
    print('\n')
    print('Some DICOM folders could not be completely processed, the result might be partial or inexistent, please check manually the following entries:')
    for failed_input, failed_output in errorslist:
        print('* Input: %s -> Output: %s' % (failed_input, failed_output))


In [None]:
# Sanity check: an output nifti subfolder without anything inside means that the subject/session
# was not converted at all (dcm2niix crash)
empty_conversions = [
    subfolder for subfolder in get_list_of_folders(output_dir)
    if not os.listdir(os.path.join(output_dir, subfolder))
]

if not empty_conversions:
    print('All subjects/sessions could be partially or completely converted, congratulations!')
else:
    print('Some subjects/sessions could not be converted at all (probably because of converter crashing silently), here is the list:')
    for empty_subfolder in empty_conversions:
        print('* %s' % empty_subfolder)
    print('Please just retry converting these subjects/sessions manually, sometimes this fixes the issue.')
