# Dicom reorganizer
By Stephen Larroque @ Coma Science Group, GIGA Research, University of Liege
Creation date: 2019-03-09
License: MIT
v1.2.3

DESCRIPTION:
Reorganize DICOM folders and zip files into neatly named DICOM folders (or DICOM zip files). This also allows deduplication, since it's based on DICOM fields (it is suggested to use ids: study id, machine id, etc).

INSTALL NOTE:
You need to pip install pandas before launching this script.
Tested on Python 2.7.15

USAGE:
Input: a root folder containing your DICOMs; it will be processed recursively.

TODO:
* Zip each folder/subfolder when done? (at the end because duplicates might overwrite)
* Check if a file already exists before copying; if it does, add it to a list of conflicts so we know which files were overwritten
* copy/manage DICOMDIR files? https://www.medicalconnections.co.uk/kb/DICOMDIR/

In [None]:
# Forcefully autoreload all python modules
%load_ext autoreload
%autoreload 2

In [None]:
# Imports
import collections
import math  # to know the number of digits in a number
import os, sys
import re
import shutil
import zipfile

cur_path = os.path.realpath('.')
sys.path.append(os.path.join(cur_path, 'csg_fileutil_libs'))  # for unidecode and cleanup_name, because it does not support relative paths (yet?)

# For DB reorganization
from csg_fileutil_libs.aux_funcs import save_dict_as_csv, save_df_as_csv, _tqdm, df_to_unicode

# For Dicom reading
from csg_fileutil_libs.aux_funcs import cleanup_name, recwalk, _StringIO
import csg_fileutil_libs.pydicom as pydicom
from csg_fileutil_libs.pydicom import config as pydicomconfig
from csg_fileutil_libs.pydicom.filereader import InvalidDicomError

pydicomconfig.enforce_valid_values = False  # to allow more resilience against malformatted dicom fields

In [None]:
# PARAMETERS
# User-editable settings read by the main loop and the helper functions of this notebook.

# Input root paths where all the DICOM folders/files/zipfiles are located (each one is walked recursively)
# Can be a single path or a list of folders. When it is a list, output_dirs must be a list of the same length:
# inputs and outputs are paired by position, so that each input can be reorganized into its own output dir
rootpaths_to_dicoms = [r'H:\ALLDICOMS\DICOMS\PATIENTS\NON_SEDATED',
                       r'H:\ALLDICOMS\DICOMS\PATIENTS\SEDATED',
                       r'H:\ALLDICOMS\DICOMS\PATIENTS\UNKNOWN',
                       r'H:\ALLDICOMS\DICOMS\CONTROLS\Controls',
                       r'H:\ALLDICOMS\DICOMS\CONTROLS\Controls_new_dti',

                       #r'C:\git\datatest\input',
                     ]
# Where to copy the reorganized dicom files (one output dir per input root path, in the same order)
output_dirs = [r'C:\git\datatest\output\PATIENTS\NON_SEDATED',
               r'C:\git\datatest\output\PATIENTS\SEDATED',
               r'C:\git\datatest\output\PATIENTS\UNKNOWN',
               r'C:\git\datatest\output\CONTROLS\Controls',
               r'C:\git\datatest\output\CONTROLS\Controls_new_dti',

               #r'C:\git\datatest\output\test',
             ]
# DICOM fields that will be used to name the output folders
# Can be a nested list: the top list is converted to a folder hierarchy, inner lists are concatenated (with '_') into the name of one folder, eg: [['PatientName', 'AcquisitionDate'], 'StudyId'] will result in PatientName_20190309/982VRSTI/...
key_dicom_fields = [['PatientName', 'StudyDate'], 'SeriesDescription']  # StudyDate is more reliable than AcquisitionDate (which might be missing, particularly for post-acquisition reconstructed sequences...)
# Cleanup the key dicom fields used for naming the output folders? When enabled, each value goes through cleanup_name(), which is meant to remove any accented character
cleanup_dicom_fields = True
# In case the value for a field is missing from a DICOM file, what should we replace it with in the folder name?
placeholder_value = 'unknown'

# Verbose mode (forwarded to recwalk_dcm, which then prints each file it tries to read and each bad zip file)
verbose = False

In [None]:
# ADDITIONAL AUX FUNCTIONS

def create_dir_if_not_exist(path):
    """Create a directory (and any missing parent) if it does not already exist.

    If `path` already exists, nothing is done and no error is raised.

    Args:
        path: path of the directory to create.

    Raises:
        OSError: if the directory cannot be created for any reason other than
            it already existing (e.g. permission denied).
    """
    if os.path.exists(path):
        return
    try:
        os.makedirs(path)
    except OSError as exc:
        # The path may have been created by another process between the
        # exists() check and makedirs(): that is not an error for us.
        import errno  # local import, only needed on this rare path
        if exc.errno != errno.EEXIST:
            raise

def real_copy(srcfile, dstfile):
    """Duplicate a file for real: content first, then its stat information.

    The content of `srcfile` is written to `dstfile` with shutil.copyfile(),
    then shutil.copystat() replicates the stat metadata of the source onto
    the copy.
    """
    for transfer in (shutil.copyfile, shutil.copystat):
        transfer(srcfile, dstfile)

def symbolic_copy(srcfile, dstfile):
    """Make `dstfile` a symbolic (soft) link pointing to `srcfile`, instead of duplicating the data"""
    link_target, link_path = srcfile, dstfile
    os.symlink(link_target, link_path)

def getIntegerPlaces(theNumber):
    """Get the number of digits in an integer
    originally adapted from https://stackoverflow.com/a/28883802
    DEPRECATED

    The digits are counted on the decimal representation of the absolute
    value, which is exact for arbitrarily large integers and, unlike a
    log10-based computation, also works for 0 (1 digit) and for negative
    numbers (the sign is not counted).

    Args:
        theNumber: the number to measure; it is truncated to an integer first.

    Returns:
        The number of decimal digits, as an int (always >= 1).
    """
    return len(str(abs(int(theNumber))))

In [None]:
def generate_path_from_dicom_fields(output_dir, dcmdata, key_dicom_fields, cleanup_dicom_fields=True, placeholder=None):
    """Build the output directory path of a DICOM file from the values of its DICOM fields.

    Args:
        output_dir: base directory under which the generated path is placed.
        dcmdata: the DICOM dataset to read the fields from. It must support `field in dcmdata`,
            `dcmdata.data_element(name)` and `dcmdata[tag]`, each element exposing a `.value`.
        key_dicom_fields: list describing the folder hierarchy. Each top-level item becomes one
            folder level; an item that is itself a list has the values of its fields joined
            with '_' to name that folder. A field is either a name (str) or a tag (eg, (0x0010, 0x2020)).
        cleanup_dicom_fields: if True, each value is passed through cleanup_name() before use.
        placeholder: value used when a field is absent from dcmdata. When None (default), the
            notebook-level `placeholder_value` parameter is used.

    Returns:
        The full directory path, with every run of whitespace in the generated part replaced
        by a dash. If key_dicom_fields is empty, output_dir is returned unchanged.
    """
    pathparts = []
    # For each outer list elements (will be concatenated with a directory separator like '/')
    for dfields in key_dicom_fields:
        if not isinstance(dfields, list):
            dfields = [dfields]
        innerpathparts = []
        # For each inner list elements (will be concatenated with '_')
        for dfield in dfields:
            # Extract the dicom field's value
            if dfield in dcmdata:
                if isinstance(dfield, str):
                    # If string (a named field)
                    dcmfieldval = dcmdata.data_element(dfield).value
                else:
                    # Else it's a coordinate field (no name, like (0010, 2020))
                    dcmfieldval = dcmdata[dfield].value
            else:
                # Only look up the notebook-level default when it is really needed
                dcmfieldval = placeholder_value if placeholder is None else placeholder
            # Cleanup the dicom field if enabled (this is meant to replace accented characters, which a lot of software does not support)
            if cleanup_dicom_fields:
                dcmfieldval = cleanup_name(dcmfieldval)
            # DICOM values are not always strings (numbers, person names...) and join() only accepts strings.
            # The 1-tuple ensures a value that is itself a tuple is formatted as a whole.
            innerpathparts.append('%s' % (dcmfieldval,))
        # Concatenate the inner path parts and add to the outer path parts list
        pathparts.append('_'.join(innerpathparts))
    # Nothing to build the hierarchy from: files go directly in the output dir
    if not pathparts:
        return output_dir
    # Build the full path from the outer path parts list
    pathpartsassembled = os.path.join(*pathparts)
    # Replace all spaces by dashes (so that programs that do not support spaces well won't be bothered)
    pathpartsassembled = re.sub(r'\s+', r'-', pathpartsassembled)
    # Join with output dir to get final path
    return os.path.join(output_dir, pathpartsassembled)

def recwalk_dcm(*args, **kwargs):
    """Recursive DICOM metadata reader, supporting zipfiles.
    Yields for each dicom file (whether normal or inside a zipfile) a dictionary filled with DICOM file metadata, path and zip handler if it is inside a zipfile.
    Comes with an integrated progress bar.

    All positional and keyword arguments are forwarded to recwalk(), except:
        verbose: (default False) print each file that is read and each unreadable zip file.
        nobar: (default False) disable the progress bar (and skip the file counting pass it requires).
    If no `filetype` keyword is given, ['.dcm', '', '.zip'] is passed to recwalk(). The same list is
    used here to select which members of a zipfile are read ('' meaning names without any dot).

    Yields:
        For a normal file: {'data': dataset, 'dirpath': ..., 'filename': ...}.
        For a zipfile member: the same keys, with 'filename' being the name of the zipfile, plus
        'ziphandle' (the opened ZipFile) and 'zipfilemember' (the ZipInfo of the member).
        The zip handle is closed when the generator moves past that zipfile, so it must be used
        before requesting the next item.

    Files that cannot be read as DICOM, DICOMDIR files and unreadable zipfiles are skipped.
    Any other exception is printed along with the offending file path, then re-raised.
    """
    # These options are consumed here and must not be forwarded to recwalk()
    verbose = kwargs.pop('verbose', False)
    nobar = kwargs.pop('nobar', False)
    kwargs.setdefault('filetype', ['.dcm', '', '.zip'])

    # Make list of filetypes for zipfile members
    # Process no extension separately (because else endswith() will accept any extension if we supply '')
    requested_filetypes = list(kwargs['filetype'])  # make a copy
    noextflag = '' in requested_filetypes
    # endswith() only supports str or tuple, so always convert (even when '' was not requested)
    filetypes = tuple(ftype for ftype in requested_filetypes if ftype != '')

    # Counting total number of files (to show a progress bar)
    filescount = 0
    if not nobar:
        for dirpath, filename in _tqdm(recwalk(*args, **kwargs), desc='PRECOMP', unit='files'):
            if not filename.endswith('.zip'):
                filescount += 1
                continue
            try:
                with zipfile.ZipFile(os.path.join(dirpath, filename), 'r') as zipfh:
                    # Count every member that is not a folder entry
                    filescount += sum(1 for item in zipfh.namelist() if not item.endswith('/'))
            except zipfile.BadZipfile:
                # If the zipfile is unreadable, just pass
                if verbose:
                    print('Error: Bad zip file: %s' % os.path.join(dirpath, filename))

    pbar = _tqdm(total=filescount, desc='REORG', unit='files', disable=nobar)
    try:
        for dirpath, filename in recwalk(*args, **kwargs):
            try:
                if not filename.endswith('.zip'):
                    if filename.lower() == 'dicomdir':  # pass DICOMDIR files
                        continue
                    try:
                        if verbose:
                            print('* Try to read fields from dicom file: %s' % os.path.join(dirpath, filename))
                        # Update progress bar
                        pbar.update()
                        # Read only the dicom metadata: stop_before_pixels allows for faster processing since we only need the fields, not the image.
                        # defer_size avoids reading big values into memory, which works around issues with some malformatted fields that are too long (OverflowError: Python int too large to convert to C long)
                        dcmdata = pydicom.read_file(os.path.join(dirpath, filename), stop_before_pixels=True, defer_size="512 KB", force=True)
                        yield {'data': dcmdata, 'dirpath': dirpath, 'filename': filename}
                    except (InvalidDicomError, AttributeError, OverflowError):
                        # Not a readable dicom file: skip it
                        pass
                else:
                    try:
                        zfilepath = os.path.join(dirpath, filename)
                        with zipfile.ZipFile(zfilepath, 'r') as zipfh:
                            # Keep only members that are not folders and that match the requested filetypes
                            # infolist() is better than namelist() because it will also work in case of duplicate filenames
                            zfiles = ( item for item in zipfh.infolist() if (not item.filename.endswith('/') and (item.filename.endswith(filetypes) or (noextflag and not '.' in item.filename))) )
                            for zfile in zfiles:
                                # Update progress bar
                                pbar.update()
                                zf = zfile.filename
                                if zf.lower().endswith('dicomdir'):  # pass DICOMDIR files
                                    continue
                                # Need to load the member in memory because pydicom does not support not having seek() (and zipfile in-memory does not provide seek())
                                # Do not use .extract() here, the path can be anything and it does not support unicode (so it can easily extract to the root instead of target folder!)
                                z = _StringIO(zipfh.read(zf))
                                # Try to open the extracted dicom
                                try:
                                    if verbose:
                                        print('* Try to decode dicom fields with zipfile member %s' % zf)
                                    # Read only the dicom metadata from memory (same options as for normal files above)
                                    dcmdata = pydicom.read_file(z, stop_before_pixels=True, defer_size="512 KB", force=True)
                                    yield {'data': dcmdata, 'dirpath': dirpath, 'filename': filename, 'ziphandle': zipfh, 'zipfilemember': zfile}
                                except (InvalidDicomError, AttributeError, OverflowError):
                                    # Not a readable dicom file: skip it
                                    pass
                                except IOError as exc:
                                    # Members without any tag to read are skipped, any other I/O error is a real problem
                                    if 'no tag to read' not in str(exc).lower():
                                        raise
                    except zipfile.BadZipfile:
                        # If the zipfile is unreadable, just pass
                        if verbose:
                            print('Error: Bad zip file: %s' % os.path.join(dirpath, filename))
            except Exception:
                print('ERROR: chocked on file %s' % os.path.join(dirpath, filename))
                import traceback
                print(traceback.format_exc())
                raise  # bare raise keeps the original traceback
    finally:
        # Always release the progress bar, even if the caller stops iterating early or an error occurs
        pbar.close()

In [None]:
# Ensure paths are contained in lists
if not isinstance(rootpaths_to_dicoms, list):
    rootpaths_to_dicoms = [rootpaths_to_dicoms]
if not isinstance(output_dirs, list):
    output_dirs = [output_dirs]
# Inputs and outputs are paired by position: zip() would silently ignore the extra entries of the longest list
if len(rootpaths_to_dicoms) != len(output_dirs):
    raise ValueError('rootpaths_to_dicoms and output_dirs must have the same number of entries (got %i and %i)' % (len(rootpaths_to_dicoms), len(output_dirs)))

# Main loop
conflicts = []  # list of [output file path, input file path] for each output file that already existed (and got overwritten)
unprocessed = []  # paths of the files that could not be processed
for rootpath_to_dicoms, output_dir in zip(rootpaths_to_dicoms, output_dirs):
    for dcmfile in recwalk_dcm(rootpath_to_dicoms, verbose=verbose):  # recursively fetch any dicom file/zip file member (ie, file inside a zip)
        try:
            # Load the dicom file data
            filename = dcmfile['filename']
            dirpath = dcmfile['dirpath']
            dcmdata = dcmfile['data']
            # Generate the path from dicom fields
            finalpathdir = generate_path_from_dicom_fields(output_dir, dcmdata, key_dicom_fields, cleanup_dicom_fields=cleanup_dicom_fields)
            # Generate the new filename, based on a unique UID to avoid overwriting
            # To ensure there is no duplicates and that we do not unduly overwrite dicom files, we use the SOP Instance UID which is unique for every DICOM volume
            # This can fail as some dicoms are malformatted (normally the field should always be accessible)
            newfilename = "%s.dcm" % str(dcmdata.data_element('SOPInstanceUID').value)  # we should use MediaStorageSOPInstanceUID and not SOPInstanceUID but can't find the tag: https://forum.dcmtk.org/viewtopic.php?t=3405
            newfilepath = os.path.join(finalpathdir, newfilename)
            if not filename.endswith('.zip'):
                oldfilepath = os.path.join(dirpath, filename)
                if os.path.exists(newfilepath):  # conflict detected!
                    conflicts.append([newfilepath, oldfilepath])
                # Make the directory if necessary
                create_dir_if_not_exist(finalpathdir)
                # Copy the dicom file (directly at the root of the newly created path, so we effectively destroy any previous folder naming scheme, but that's a feature since we WANT to reorganize)
                real_copy(oldfilepath, newfilepath)
                # Note: the .bmp file that may come along a .dcm file is deliberately NOT copied, it is not necessary
            else:
                # Load additional zip file data
                zipfh = dcmfile['ziphandle']
                zfile = dcmfile['zipfilemember']
                if os.path.exists(newfilepath):  # conflict detected!
                    try:
                        oldfilepath = os.path.join(dirpath, filename, cleanup_name(zfile.filename))
                    except UnicodeDecodeError:
                        # The member name cannot be decoded: fall back on the path of the zipfile only
                        oldfilepath = os.path.join(dirpath, filename)
                    # Record the full output file path, same as for normal files
                    conflicts.append([newfilepath, oldfilepath])
                # Change the filename of the zipfile member directly, to avoid extracting its full internal path
                zfile.filename = newfilename
                # Make the directory if necessary
                create_dir_if_not_exist(finalpathdir)
                # Extract the dicom file (directly at the root of the newly created path, so we effectively destroy any previous folder naming scheme, but that's a feature since we WANT to reorganize)
                zipfh.extract(zfile, finalpathdir)  # extract zipfile member with metadata (contrary to zipfh.read())
        except KeyError:
            # The SOPInstanceUID tag cannot be found: the DICOM is malformatted and unreadable (by pydicom as of March 2019), we simply skip, even if it means losing a few subjects...
            unprocessed.append(os.path.join(dcmfile['dirpath'], dcmfile['filename']))
            if 'zipfilemember' in dcmfile:
                # For a zipfile member, the member name is recorded right after the path of its zipfile
                unprocessed.append(dcmfile['zipfilemember'].filename)
            continue
        except Exception:
            print('ERROR: chocked on file %s' % os.path.join(dcmfile['dirpath'], dcmfile['filename']))
            if 'zipfilemember' in dcmfile:
                print('More precisely on zipfile member: %s' % dcmfile['zipfilemember'].filename)
            import traceback
            print(traceback.format_exc())
            raise  # bare raise keeps the original traceback

print('All done!')

In [None]:
import pprint

# Final report: for each kind of problem, either save the collected list
# (pretty-printed) in a text file in the current directory and tell the user
# about it, or print that nothing was found.
for records, report_path, found_msg, clean_msg in (
    (unprocessed,
     'dicom_unprocessed.txt',
     '\nSome files could not be processed because of being malformatted, the list is saved in dicom_unprocessed.txt',
     '\nAll files were processed!'),
    (conflicts,
     'dicom_conflicts.txt',
     '\nSome files were in conflicts and got overwritten, the list is saved in dicom_conflicts.txt',
     '\nNo conflicts found!'),
):
    if not records:
        print(clean_msg)
        continue
    with open(report_path, 'w') as f:
        f.write(pprint.pformat(records, indent=4, width=80))
    print(found_msg)