# DICOMs reorganizer
By Stephen Larroque @ Coma Science Group, GIGA Research, University of Liege
Creation date: 2019-03-09
License: MIT
v1.2.7

DESCRIPTION:
Reorganize DICOM folders and zip files into neatly named DICOM folders. This also allows deduplication, since it's based on DICOM fields (it is suggested to use ids: study id, machine id, etc).

As an alternative to this script, you can use dcm2niix renaming functionality, but it works only on DICOM folders (not zips). Example of commandline: `dcm2niix.exe -f %n_%t/%s_%p_%d_%z/%s_%n_%t_%p_%d_%z_%r.dcm -r y -d 9 -o output_dir input_dir`

INSTALL NOTE:
You need to pip install pandas before launching this script.
Tested on Python 2.7.15

USAGE:
Input: a root folder (or a list of root folders) containing your DICOMs; each one is processed recursively.

TODO:
* Zip each folder/subfolder when done? (at the end because duplicates might overwrite)
* Check if the file already exists before copying; if it does, add it to a list of conflicts so we know
* copy/manage DICOMDIR files? https://www.medicalconnections.co.uk/kb/DICOMDIR/
* add series number to be able to disambiguate multiple acquisitions of the same sequence (and also in what order, nice to guess if there was sedation)

In [None]:
# Forcefully autoreload all python modules
%load_ext autoreload
%autoreload 2

In [None]:
# Imports
import collections
import math  # to know the number of digits in a number
import os, sys
import re
import shutil
import zipfile

cur_path = os.path.realpath('.')
sys.path.append(os.path.join(cur_path, 'csg_fileutil_libs'))  # for unidecode and cleanup_name, because it does not support relative paths (yet?)

# For DB reorganization
from csg_fileutil_libs.aux_funcs import save_dict_as_csv, save_df_as_csv, _tqdm, df_to_unicode, create_dir_if_not_exist, real_copy, recwalk_dcm, generate_path_from_dicom_fields

# For Dicom reading
from csg_fileutil_libs.aux_funcs import cleanup_name, recwalk, _StringIO
import csg_fileutil_libs.pydicom as pydicom
from csg_fileutil_libs.pydicom import config as pydicomconfig
from csg_fileutil_libs.pydicom.filereader import InvalidDicomError

pydicomconfig.enforce_valid_values = False  # to allow more resilience against malformatted dicom fields

In [None]:
# PARAMETERS
# User-editable settings read by the reorganization cell further down.

# Input rootpath(s) where all the DICOM folders/files/zipfiles are located
# Can be a single path or a list of folders. When it is a list, output_dirs must be a list of the same length:
# inputs and outputs are paired by position (1st input -> 1st output, etc.), so as to specify a different output dir for each input
rootpaths_to_dicoms = [r'H:\ALLDICOMS\DICOMS\PATIENTS\NON_SEDATED',
                       r'H:\ALLDICOMS\DICOMS\PATIENTS\SEDATED',
                       r'H:\ALLDICOMS\DICOMS\PATIENTS\UNKNOWN',
                       r'H:\ALLDICOMS\DICOMS\CONTROLS\Controls',
                       r'H:\ALLDICOMS\DICOMS\CONTROLS\Controls_new_dti',

                       #r'C:\git\datatest\input',
                     ]
# Where to copy the reorganized dicom files (one entry per entry of rootpaths_to_dicoms, in the same order)
output_dirs = [r'C:\git\datatest\output\PATIENTS\NON_SEDATED',
               r'C:\git\datatest\output\PATIENTS\SEDATED',
               r'C:\git\datatest\output\PATIENTS\UNKNOWN',
               r'C:\git\datatest\output\CONTROLS\Controls',
               r'C:\git\datatest\output\CONTROLS\Controls_new_dti',

               #r'C:\git\datatest\output\test',
             ]
# DICOM fields that will be used to name the output folders
# Can be a nested list: the top list is converted to a folder hierarchy, inner lists are concatenated as the name of one folder, eg: [['PatientName', 'AcquisitionDate'], 'StudyID'] will result in PatientName_20190309/982VRSTI/...
# Note: StudyDate might be more reliably present than AcquisitionDate (which might be missing, particularly for post-acquisition reconstructed sequences...), but AcquisitionDate is more precise (sometimes different acquisitions on multiple days can be grouped under the same StudyDate)
# Note2: However, we finally chose StudyDate in order to keep in the same folder the dicom series that are post-processed, for example the phoenixzip reports, which are useful to export the sequence parameters to another MRI machine, or simply to have a look. You should check on your database that there are not too many DICOMs with a different AcquisitionDate and StudyDate (use dicoms_extract.ipynb which does that automatically)
key_dicom_fields = [['PatientName', 'StudyDate'], 'SeriesDescription']
# Cleanup the key dicom fields used for naming the output folders? This will remove any accented character
cleanup_dicom_fields = True
# In case the value for a field is missing, what should we replace it with?
# NOTE(review): this value is not referenced anywhere else in the visible notebook code (it is not passed to generate_path_from_dicom_fields) - confirm whether it is actually taken into account
placeholder_value = 'unknown'

# Verbose mode (forwarded to recwalk_dcm)
verbose = False

In [None]:
# ADDITIONAL AUX FUNCTIONS

def getIntegerPlaces(theNumber):
    """Get the number of decimal digits in an integer
    adapted from https://stackoverflow.com/a/28883802
    DEPRECATED

    The sign is ignored and 0 counts as one digit. (The previous version
    raised a ValueError from math.log10 for 0 and for negative numbers.)

    Parameters
    ----------
    theNumber : int
        The number whose digits are counted.

    Returns
    -------
    int
        Number of digits of abs(theNumber), eg: 0 -> 1, 999 -> 3, -42 -> 2.
    """
    # The count of digits does not depend on the sign, and log10 is undefined for negative numbers
    theNumber = abs(theNumber)
    # log10(0) is undefined, but 0 is written with one digit
    if theNumber == 0:
        return 1
    if theNumber <= 999999999999997:
        # Small enough for the floating point logarithm to be exact enough
        return int(math.log10(theNumber)) + 1
    # Beyond that, float rounding makes log10 unreliable: count with exact integer arithmetic instead
    counter = 15
    while theNumber >= 10**counter:
        counter += 1
    return counter

In [None]:
# REORGANIZE DICOM FILES
# This copies the dicom files to a new path according to the specified dicom metadata fields
# Results:
#   conflicts: list of [destination file path, original file path] pairs, one for each destination file that already existed (and hence got overwritten)
#   unprocessed: list of the paths of the files that were skipped because a KeyError was raised while processing them

# Ensure paths are contained in a sequence (a single path gets wrapped in a list)
if not isinstance(rootpaths_to_dicoms, (list, tuple)):
    rootpaths_to_dicoms = [rootpaths_to_dicoms]
if not isinstance(output_dirs, (list, tuple)):
    output_dirs = [output_dirs]
# Inputs and outputs are paired by position: zip() would silently ignore the extra entries of the longest list, so we fail early instead
if len(rootpaths_to_dicoms) != len(output_dirs):
    raise ValueError('rootpaths_to_dicoms and output_dirs must have the same number of entries (got %d and %d)' % (len(rootpaths_to_dicoms), len(output_dirs)))

# Main loop
conflicts = []
unprocessed = []
for rootpath_to_dicoms, output_dir in zip(rootpaths_to_dicoms, output_dirs):
    for dcmfile in recwalk_dcm(rootpath_to_dicoms, verbose=verbose):  # recursively fetch any dicom file/zip file member (ie, file inside a zip)
        try:
            # Load the dicom file data
            filename = dcmfile['filename']
            dirpath = dcmfile['dirpath']
            dcmdata = dcmfile['data']
            # Generate the destination folder from dicom fields (same rule for plain files and for zip members)
            finalpathdir = generate_path_from_dicom_fields(output_dir, dcmdata, key_dicom_fields, cleanup_dicom_fields=cleanup_dicom_fields)
            # Generate the new filename, based on a unique UID to avoid overwriting
            # To ensure there is no duplicates and that we do not unduly overwrite dicom files, we use the SOP Instance UID which is unique for every DICOM volume
            # This can fail as some dicoms are malformatted (normally the field should always be accessible)
            newfilename = "%s.dcm" % str(dcmdata.data_element('SOPInstanceUID').value)  # we should use MediaStorageSOPInstanceUID and not SOPInstanceUID but can't find the tag: https://forum.dcmtk.org/viewtopic.php?t=3405
            newfilepath = os.path.join(finalpathdir, newfilename)
            if not filename.endswith('.zip'):
                # Plain dicom file on disk
                oldfilepath = os.path.join(dirpath, filename)
                if os.path.exists(newfilepath):  # conflict detected!
                    conflicts.append([newfilepath, oldfilepath])
                # Make the directory if necessary
                create_dir_if_not_exist(finalpathdir)
                # Copy the dicom file (directly at the root of the newly created path, so we effectively destroy any previous folder naming scheme, but that's a feature since we WANT to reorganize)
                real_copy(oldfilepath, newfilepath)
                # Note: if there is a .bmp file alongside the .dcm, it is NOT copied on purpose, .bmp files are not necessary
            else:
                # Dicom file stored inside a zip archive: load additional zip file data
                zipfh = dcmfile['ziphandle']
                zfile = dcmfile['zipfilemember']
                if os.path.exists(newfilepath):  # conflict detected!
                    try:
                        oldfilepath = os.path.join(dirpath, filename, cleanup_name(zfile.filename))
                    except UnicodeDecodeError:
                        # The member name cannot be decoded, fall back to reporting only the zip archive path
                        oldfilepath = os.path.join(dirpath, filename)
                    # Record the destination FILE path (and not only its folder), consistently with the plain file case above
                    conflicts.append([newfilepath, oldfilepath])
                # Change the filename of the zipfile member directly to avoid extracting the full path stored in the archive
                zfile.filename = newfilename
                # Make the directory if necessary
                create_dir_if_not_exist(finalpathdir)
                # Extract the dicom file (directly at the root of the newly created path, so we effectively destroy any previous folder naming scheme, but that's a feature since we WANT to reorganize)
                zipfh.extract(zfile, finalpathdir)  # extract zipfile member with metadata (contrary to zipfh.read())
        except KeyError:
            # The SOPInstanceUID tag (or another required key) cannot be found: the DICOM is malformatted and unreadable (by pydicom as of March 2019), we simply skip, even if it means losing a few subjects...
            unprocessed.append(os.path.join(dcmfile['dirpath'], dcmfile['filename']))
            if 'zipfilemember' in dcmfile:
                # For zip members, the member name is stored as a separate entry right after the path of its archive
                unprocessed.append(dcmfile['zipfilemember'].filename)
            continue
        except Exception:
            # Unexpected error: print which file is the culprit, then abort
            print('ERROR: chocked on file %s' % os.path.join(dcmfile['dirpath'], dcmfile['filename']))
            if 'zipfilemember' in dcmfile:
                print('More precisely on zipfile member: %s' % dcmfile['zipfilemember'].filename)
            import traceback
            print(traceback.format_exc())
            raise  # bare raise re-raises the current exception with its original traceback (raise(exc) would lose it on Python 2)

print('All done!')

In [None]:
import pprint

# Summary of the reorganization: one report per kind of issue.
# Each row: (collected entries, report file, message if there are entries, message if there are none)
report_specs = (
    (unprocessed,
     'dicom_unprocessed.txt',
     '\nSome files could not be processed because of being malformatted, the list is saved in dicom_unprocessed.txt',
     '\nAll files were processed!'),
    (conflicts,
     'dicom_conflicts.txt',
     '\nSome files were in conflicts and got overwritten, the list is saved in dicom_conflicts.txt',
     '\nNo conflicts found!'),
)
for entries, report_path, found_msg, none_msg in report_specs:
    if not entries:
        # Nothing to report for this kind of issue
        print(none_msg)
        continue
    # Dump the pretty-printed list to a text file in the current working directory
    with open(report_path, 'w') as report_file:
        report_file.write(pprint.pformat(entries, indent=4, width=80))
    print(found_msg)