In [None]:
# Dicoms anonymizer
# By Stephen Larroque @ Coma Science Group, GIGA Research, University of Liege
# Creation date: 2017-02-07
# License: MIT
# v1.3.0
#
# INSTALL NOTE:
# Tested on Python 2.7.11
#
# TODO:
# * unify dicom names (if not already done)
# * unify demographics names (if not already done)
# * check if recursion ok (to anonymize MRI & PET at the same time for example).
# * convert cells to functions
# * put in a python script and use gooey (except if --cmd passed as argument)
# * freeze using pyinstaller
# * make a nice progress bar in gooey? add support in tqdm?
#

In [None]:
# Forcefully autoreload all python modules
%load_ext autoreload
%autoreload 2

# AUX FUNCS

In [None]:
# Auxiliary libraries and necessary functions

import os
import re
import shutil
import sys

cur_path = os.path.realpath('.')
sys.path.append(os.path.join(cur_path, 'csg_fileutil_libs'))  # for pydicom, because it does not support relative paths (yet?)

import csg_fileutil_libs.pydicom as dicom
from csg_fileutil_libs.pydicom.filereader import InvalidDicomError
from csg_fileutil_libs.distance import distance

from csg_fileutil_libs.aux_funcs import recwalk, replace_buggy_accents, _unidecode, _tqdm, cleanup_name, save_dict_as_csv

from csg_fileutil_libs.aux_funcs import distance_jaccard_words_split
def disambiguate_names(L, dist_threshold=0.2, verbose=False):
    '''Disambiguate names in a list (ie, find all duplicate names with switched words or typos, and unify them).

    Input: list of names, or list of dicts with a "name" field (eg, rows from a csv.DictReader).
    dist_threshold is the maximum distance (compared with <=) under which two different names are considered the same; verbose prints each merged pair.
    Output: list of dicts with a "name" field, plus an "alt_names" field on every entry that was renamed. Alt names can then be used to do a mapping.
    An empty input returns an empty list.
    Note: when given dicts, the list is copied but not the dicts themselves, so the input rows are updated in place.'''
    # Nothing to disambiguate (and L[0] below would raise an IndexError)
    if not L:
        return []
    # It's a list of dict (straight from a csv.DictReader)
    if isinstance(L[0], dict):
        res = list(L)  # shallow copy: new list, same dicts (and thus all fields are kept)
        vals = [c['name'] for c in L]  # extract names
    else:  # Convert input list to a list of dicts
        res = [{'name': name} for name in L]
        vals = L
    for idx, c in _tqdm(enumerate(vals), total=len(vals), desc='DISAMB', unit='names'):
        # Only compare with the names located after the current one, so that each pair is tested once
        for idx2, c2 in enumerate(vals[idx+1:]):
            if c != c2 and \
            (distance.nlevenshtein(c, c2, method=1) <= dist_threshold or distance_jaccard_words_split(c2, c, partial=True, norm=True, dist=dist_threshold) <= dist_threshold): # use shortest distance with normalized levenshtein
                if verbose:
                    print(c, c2, distance.nlevenshtein(c, c2, method=1))
                # idx2 is relative to the slice, so idx+idx2+1 is the position of the second entry in the full list
                # Replace the name of the second entry with the name of the first entry
                res[idx+idx2+1]['name'] = c
                # Add the other name as an alternative name (appended to the alt names of the first entry, if any), just in case we did a mistake for example
                res[idx+idx2+1]['alt_names'] = res[idx]['alt_names'] + '/' + c2 if 'alt_names' in res[idx] else c2
    return res

def get_list_of_folders(rootpath):
    '''Return the names (not the full paths) of the directories located directly inside rootpath.'''
    folders = []
    for entry in os.listdir(rootpath):
        if os.path.isdir(os.path.join(rootpath, entry)):
            folders.append(entry)
    return folders

def get_list_of_zip(rootpath):
    '''Return the names (not the full paths) of the files located directly inside rootpath whose name ends with '.zip' (case-sensitive).'''
    zips = []
    for entry in os.listdir(rootpath):
        if not os.path.isfile(os.path.join(rootpath, entry)):
            continue
        if entry.endswith('.zip'):
            zips.append(entry)
    return zips

def get_dcm_names_from_dir(rootpath, dcm_subj_list=None, folder_to_name=None, verbose=False):
    '''Extract the dicom patient name of each subject folder located directly inside rootpath.

    For each folder, the files walked by recwalk are tried in turn until one can be read as a dicom with a PatientName field; the name is then passed through cleanup_name.
    dcm_subj_list and folder_to_name, if given, are extended in place (new containers are created otherwise), so calls can be chained.
    Returns (dcm_subj_list, folder_to_name): the list of names that could be read, and the folder name -> patient name mapping (None for folders where no dicom could be read).'''
    if dcm_subj_list is None:
        dcm_subj_list = []  # store list of subjects names from dicom files (useful for csv filtering)
    if folder_to_name is None:
        folder_to_name = {}  # store the name of the patient stored in each root folder (useful for anonymization later on)
    for subject in get_list_of_folders(rootpath):
        if verbose:
            # os.listdir returns unicode names when given a unicode path, and decoding an unicode object again raises a TypeError, so only decode byte strings
            subject_disp = subject if isinstance(subject, unicode) else unicode(subject, 'latin1')
            print('- Processing subject %s' % subject_disp)
        fullpath = os.path.join(rootpath, subject)
        if not isinstance(fullpath, unicode):
            fullpath = unicode(fullpath, 'latin1')
        pts_name = None
        # presumably walks the files with a .dcm extension or no extension at all -- TODO confirm against recwalk
        for dirpath, filename in recwalk(fullpath, filetype=['.dcm', '']):
            try:
                dcmdata = dicom.read_file(os.path.join(dirpath, filename), stop_before_pixels=True)  # stop_before_pixels allow for faster processing since we do not read the full dicom data, and here we can use it because we do not modify the dicom, we only read it to extract the dicom patient name
                pts_name = cleanup_name(dcmdata.PatientName)
                dcm_subj_list.append( pts_name )
                break  # one readable dicom per folder is enough
            except (InvalidDicomError, AttributeError):
                # Not a dicom, or no PatientName field: try the next file
                pass
        folder_to_name[subject] = pts_name
    return dcm_subj_list, folder_to_name

from tempfile import mkdtemp, mkstemp
def get_dcm_names_from_zip(rootpath, dcm_subj_list=None, folder_to_name=None, verbose=False):
    '''Extract the dicom patient name from each zip file located directly inside rootpath.

    For each zip, files are extracted one at a time to a temporary file until one can be read as a dicom with a PatientName field; the name is then passed through cleanup_name.
    dcm_subj_list and folder_to_name, if given, are extended in place (new containers are created otherwise), so calls can be chained.
    Returns (dcm_subj_list, folder_to_name): the list of names that could be read, and the mapping from the top folder found inside each zip to its patient name (None when no dicom could be read).
    Raises AttributeError (or IndexError for an empty zip) when a zip contains no folder at all, and re-raises any IOError other than 'no tag to read'.'''
    if dcm_subj_list is None:
        dcm_subj_list = []  # store list of subjects names from dicom files (useful for csv filtering)
    if folder_to_name is None:
        folder_to_name = {}  # store the name of the patient stored in each root folder (useful for anonymization later on)
    # Create a temporary file (to extract one dicom at a time from the zip)
    dcmfilefh, dcmfilepath = mkstemp(suffix='.dcm')
    os.close(dcmfilefh)  # close the descriptor, the file is reopened by path for each extraction

    try:
        # Extract names from zipped dicom files (extract the first dicom file we can read and use its fields)
        for zipfilename in get_list_of_zip(rootpath):
            zfilepath = os.path.join(rootpath, zipfilename)
            if verbose:
                print('- Processing file %s' % zipfilename)
            with zipfile.ZipFile(zfilepath, 'r') as zipfh:
                # Separate directories from files (directories end with '/', this is standard detection in zipfile)
                zfolder = (item for item in zipfh.namelist() if item.endswith('/'))
                zfiles = (item for item in zipfh.namelist() if not item.endswith('/'))
                # Get first top folder inside zip to extract folder name (because when we will extract the zip, we need the folder name)
                try:
                    folder_name = next(zfolder).strip('/')
                except StopIteration:
                    # No explicit directory entry: use the first path component of the first file
                    folder_name = re.search('^([^\\/]+)[\\/]', zipfh.namelist()[0]).group(1)
                # Get first dicom file we can find
                pts_name = None
                for zf in zfiles:
                    # Need to extract because pydicom does not support not having seek() (and zipfile in-memory does not provide seek())
                    z = zipfh.read(zf) # do not use .extract(), the path can be anything and it does not support unicode (so it can easily extract to the root instead of target folder!)
                    with open(dcmfilepath, 'wb') as dcmf:
                        dcmf.write(z)
                    # Try to open the extracted dicom
                    try:
                        if verbose:
                            print('Try to decode dicom fields with file %s' % zf)
                        dcmdata = dicom.read_file(dcmfilepath, stop_before_pixels=True)
                        pts_name = cleanup_name(dcmdata.PatientName)
                        dcm_subj_list.append( pts_name )
                        break  # one readable dicom per zip is enough
                    except (InvalidDicomError, AttributeError):
                        # Not a dicom, or no PatientName field: try the next file
                        continue
                    except IOError as exc:
                        if 'no tag to read' in str(exc).lower():
                            continue
                        else:
                            raise
                # Add to the folder name -> dicom patient name mapping
                folder_to_name[folder_name] = pts_name
    finally:
        # Always delete the temporary file, including when no dicom could be read, when there is no zip at all or when an error is raised (else it is left behind in the temp folder)
        if os.path.exists(dcmfilepath):
            os.remove(dcmfilepath)
    return dcm_subj_list, folder_to_name

def dist_matrix(list1, list2, dist_threshold=0.2):
    '''Map each item of list1 to the list of items of list2 that are within dist_threshold of it (None when there is none).

    Two items match when either the normalized levenshtein distance or the words jaccard distance is <= dist_threshold.'''
    matches_by_name = {}
    for name in list1:
        similar = [
            candidate for candidate in list2
            if distance.nlevenshtein(name, candidate, method=1) <= dist_threshold
            or distance_jaccard_words_split(name, candidate, partial=True, norm=True, dist=dist_threshold) <= dist_threshold
        ]
        if similar:
            # A name present several times in list1 accumulates its matches in the same list
            matches_by_name.setdefault(name, []).extend(similar)
        else:
            matches_by_name[name] = None
    return matches_by_name

In [None]:
import zipfile
import os
import cStringIO

def zipwalk(zfilename):
    """Zip file tree generator.

    For each file entry in a zip archive, this yields
    a two tuple of the zip information and the data
    of the file as a StringIO object.

    zipinfo, filedata

    zipinfo is an instance of zipfile.ZipInfo class
    which gives information of the file contained
    in the zip archive. filedata is a StringIO instance
    representing the actual file data.

    If the entry is itself a zip file, the generator extracts
    it to a temporary file and walks its contents instead of
    yielding the nested archive itself.

    A RuntimeError raised while reading the archive stops the walk
    with a 'Runtime Error' message; zipfile errors are propagated.

    Inspired by os.walk .
    Source: by Anand http://code.activestate.com/recipes/425840-zip-walker-zip-file-tree-generator/
    """
    import tempfile  # local import, so that this function does not depend on the imports of another cell

    tempdir = os.environ.get('TEMP', os.environ.get('TMP', os.environ.get('TMPDIR', '/tmp')))

    try:
        # The context manager closes the archive when the walk is over (or is abandoned)
        with zipfile.ZipFile(zfilename, 'r') as z:
            for info in z.infolist():
                fname = info.filename
                data = z.read(fname)
                extn = os.path.splitext(fname)[1].lower()

                if extn != '.zip':
                    yield (info, cStringIO.StringIO(data))
                    continue

                # Nested zip: write it to a uniquely named temporary file. Reusing the entry's basename would overwrite the archive being walked when a nested zip has the same name as its parent
                tmpfpath = None
                try:
                    try:
                        tmpfd, tmpfpath = tempfile.mkstemp(suffix='.zip', dir=tempdir)
                        with os.fdopen(tmpfd, 'w+b') as tmpfh:
                            tmpfh.write(data)
                    except (IOError, OSError) as e:
                        # Best effort: report and skip the nested archive
                        print(e)

                    if tmpfpath is not None and zipfile.is_zipfile(tmpfpath):
                        for x in zipwalk(tmpfpath):
                            yield x
                finally:
                    # Remove the temporary copy whatever happened
                    if tmpfpath is not None:
                        try:
                            os.remove(tmpfpath)
                        except OSError:
                            pass
    except RuntimeError:
        print('Runtime Error')


----------------------------------------
# Part 1
## Extract dicom names

In [None]:
# Params: inputs of the dicom names extraction (Part 1)
rootpath = 'dicoms'  # path to dicom folders and zipfiles
demo_csv = 'db_reports_plus_fmp.csv'  # path to the demographics csv file
verbose = True  # print the detected folders and the per-subject progress messages

In [None]:
# -- Get the list of dicoms (they must all be at the first level, one folder per subject)

# Get unzipped dicom folders list
print('Constructing list of DICOM subjects through folders names, please wait...')
subjects_list = get_list_of_folders(rootpath)

# Show the detected subjects folders
if verbose:
    print('Found subjects dicom folders: %s' % ', '.join(subjects_list))

# Extract the patient name from the first readable dicom of each folder
# (dcm_subj_list only gets the names that could be read, folder_to_name gets one entry per folder, None when no dicom could be read)
dcm_subj_list, folder_to_name = get_dcm_names_from_dir(rootpath, verbose=verbose)
print('Total dicom subjects: %i. Detailed list: %s' % (len(dcm_subj_list), ', '.join(dcm_subj_list)))

In [None]:
# -- Extracting subjects names from zip files
# NOTE: anonymization is NOT supported on zip files, only on unzipped dicom folders!

# Extract list of zip files
subjects_zip_list = get_list_of_zip(rootpath)
print(subjects_zip_list)

# Extend the names list and the folder -> name mapping built from the unzipped folders with the names found in the zip files
dcm_subj_list, folder_to_name = get_dcm_names_from_zip(rootpath, dcm_subj_list, folder_to_name, verbose=verbose)
print('Total dicom subjects: %i. Detailed list: %s' % (len(dcm_subj_list), ', '.join(dcm_subj_list)))

In [None]:
# Display the folder -> patient name mapping for a visual check (contains real patient names: clear this output before sharing the notebook)
folder_to_name

In [None]:
# Save all extracted fields to a csv file!
from csg_fileutil_libs.aux_funcs import save_dict_as_csv

output_file = 'dicom_names.csv'
# Pair each patient name with its folder using folder_to_name, and not by zipping dcm_subj_list with the folders and zips lists:
# dcm_subj_list only gets an entry when a dicom could be read, so a single folder without readable dicom would shift all the following names onto the wrong path
# Note: for zip files, the path is the top folder found inside the zip (the key used by get_dcm_names_from_zip)
dicom_names_rows = [{'name': name, 'path': path} for path, name in folder_to_name.items() if name is not None]
save_dict_as_csv(dicom_names_rows, output_file, csv_order_by='name', verbose=True)
print('Dicom patients names saved to csv file: %s' % output_file)

---------------------------------
# Part 2: Generate anonymization mapping
## Anonymization initialization (generate anonymized ids)

In [None]:
# Params: settings of the anonymized ids generation (Part 2)
anon_prefix = 'subj_'  # prefix of the generated anonymized ids

anon_salt = 'some random string'  # set this to a string of your choice to generate unique hashes, this adds protection against decrypting the ids (but keep the same if you want to be able to update anonymized data)
anon_permanent_ids = True  # if you want the anonymized id to always be the same (useful if you want to add new subjects and keep the same ids for old ones, eg, if you have multiple datasets with overlapping subjects but with different ones as well), but at the expense of security (the anonymized id can potentially be decrypted). If set to False, you will get near impossible decryption and a nice simple id scheme (from 1 to the number of subjects)
anon_hash_algo = 'md5'  # hashing algorithm name passed to get_hash ('md5' or 'sha1')
anon_length = 8 # length of the id. Set to None to disable. Shortening might be an added security if anon_permanent_ids == True, but raises the risks of collisions (two names having same id). You can try, there will be an exception anyway if there is a collision.


In [None]:
import csv

# Reload the dicom patients names from dicom_names.csv (the file written in Part 1), only the "name" column is read, the file is semicolon-delimited
with open("dicom_names.csv") as f:
    dcm_subj_list = [row['name'] for row in csv.DictReader(f, delimiter=';')]
# Display the loaded names for a visual check
dcm_subj_list

In [None]:
# Generate disambiguated list of dicom names
# Disambiguate dicom names (names that are similar enough are given the same name)
cd_unique = disambiguate_names(dcm_subj_list, verbose=True)
# Extract list of unique dicom names (a set, so each name appears once)
dcm_unique = set([c['name'] for c in cd_unique])

#cd_unique
print(dcm_unique)

In [None]:
# Unique id extractor from name, insensitive to accented characters, non-alphabetical characters and firstname/lastname position switching
import hashlib
import re

from csg_fileutil_libs.aux_funcs import sort_list_a_given_list_b, replace_buggy_accents, _unidecode

def clean_name(name):
    '''Return name lowercased and stripped of every non-word character (regex \\W), after decoding it from utf8 and passing it through replace_buggy_accents and _unidecode.'''
    decoded = name.decode('utf8')
    # presumably fixes mis-encoded accents, then transliterates to ascii -- TODO confirm against aux_funcs
    transliterated = _unidecode(replace_buggy_accents(decoded, 'utf8'))
    return re.sub(r'\W', r'', transliterated.lower())
def extract_ordered_letters(name):
    '''Return the characters of the cleaned name as one string, reordered by sort_list_a_given_list_b against the reference alphabet (letters, then digits, then dash).'''
    alphabet = list('abcdefghijklmnopqrstuvwxyz1234567890-')
    letters = list(clean_name(name))
    ordered_letters = sort_list_a_given_list_b(letters, alphabet)
    return ''.join(ordered_letters)
def get_hash(string, algo=None):
    '''Return the hexadecimal digest of string using the given algorithm.

    algo can be 'md5' (also used when algo is None) or 'sha1'.
    Raises NameError for any other algorithm name.'''
    if algo is None or algo == 'md5':
        return hashlib.md5(string).hexdigest()
    elif algo == 'sha1':
        # This branch previously called md5 by mistake, so ids generated with algo='sha1' before this fix were md5 digests
        return hashlib.sha1(string).hexdigest()
    else:
        raise NameError('Hash algorithm not recognized: %s' % algo)
def get_ordered_hash(hash_func, string, salt=None, algo=None):
    '''Hash the ordered letters of string (with salt appended, if any) using hash_func(letters, algo=algo).

    Since the letters are cleaned and reordered first, the hash is insensitive to accents, non-alphabetical characters and words position switching.'''
    salted = string + (salt or '')  # the salt is appended before the ordering, so its letters are mixed with those of the name
    return hash_func(extract_ordered_letters(salted), algo=algo)
def get_duplicates(d):
    '''Yield the (key, value) pairs of dict d whose value was already met on a previously iterated key.'''
    known_values = set()
    for key, value in d.items():
        if value not in known_values:
            known_values.add(value)
            continue
        yield key, value

# Unit test: the same name with switched words, an extra space and a trailing symbol must give the same hash
name1 = 'rajaé chatila'
name2 = 'chatila  rajaé|'
assert(get_ordered_hash(get_hash, name1) == get_ordered_hash(get_hash, name2))

In [None]:
##### Generate anonymization scheme from dicoms patients names #####

# Generate unique hashes from each dicom's patient name
# Generate an anonymized id resilient to spaces and non letters characters and words switching
# to do that, we take the name, and reorder all letters (and remove any non-letter symbol) by alphabetical order, which gives us simply the ordered sequence of letters composing each name
anon_hashes = {name: get_ordered_hash(get_hash, name, anon_salt, algo=anon_hash_algo) for name in dcm_unique}
# Shortening is an added security, so that if someone tries to bruteforce, there will be missing info to reconstitute the original name that gave this hash (because we are missing parts of the hash, so lots of dissimilar names will have the same shortened hash)
if anon_length:
    anon_hashes = {name: h[:anon_length] for name, h in anon_hashes.items()}
# There can be collisions in hashes, then check that there is none
anon_dups = dict(get_duplicates(anon_hashes))
if anon_dups:
    anon_dups_print = {h: [name for name, h2 in anon_hashes.items() if h2 == h] for h in anon_dups.values()}
    raise ValueError('Two names have the same id! Please use another hashing algorithm or raise hash length or another salt or turn off anon_permanent_ids. Here is the list of names with same ids: %s' % anon_dups_print)
del anon_dups

# Generate the final id (second step)
if anon_permanent_ids:
    # generate a straightforward id from a shortened hash
    names_and_ids = anon_hashes
else:
    # generate a unique id based on order (simply the order number when ordered by the hash - since we use the "ordered hash", we get the same properties: the same set of patients names will always generate the same order, and the order cannot be traced back, since it depends both on the hash AND the exact set of patients names to get the exact same order)
    # the big advantage of this approach is that it is nearly impossible to decrypt the original name, since the id gives strictly no information at all
    # the disadvantage is that it is dependent on the subjects names list, so if you add a subject, nearly all ids will change
    id_width = anon_length or 0  # anon_length can be None (disabled) but zfill(None) raises a TypeError; a width of 0 means no zero-padding
    names_and_ids = {name: str(rank+1).zfill(id_width) for rank, name in enumerate(sorted(anon_hashes, key=anon_hashes.get))}
# Prepend prefix and save the anonymized ids (mapping: anonymized id -> name)
anon_ids = {("%s%s" % (anon_prefix, anon_id)): name for name, anon_id in names_and_ids.items()}
anon_ids

In [None]:
# Write anonymization csv: one row per anonymized id (sorted by id) with the patient name it stands for
# (this file allows to de-anonymize the subjects, so keep it private)
ids_to_name = [{'id': pts_id, 'name': anon_ids[pts_id]} for pts_id in sorted(anon_ids)]
save_dict_as_csv(ids_to_name, 'idtoname.csv', fields_order=['id', 'name'], csv_order_by='id', verbose=False)
print('Conversion list (id -> name) saved to idtoname.csv.')

------------------------------------
# Part 3: applying anonymization
TODO: rework this part so that the dicom names are extracted from the dicoms found in the specified rootpath, instead of being read from dicom_names.csv!
## Merging dicom names and demographics csv names

In [None]:
# Params: inputs used to apply the anonymization (Part 3)
rootpath = 'dicoms'  # path to dicom folders and zipfiles, this can be another folder than the one you used to generate the anonymization mapping
# TODO: auto add new dicom names to anonymization mapping (we have the list of names, we can generate a new mapping!)
demo_csv = 'db_reports_plus_fmp.csv'  # path to the demographics csv file
cols_drop = ['report_path', 'alt_names']  # columns to drop from the demographics csv because they might contain the patient name
verbose = True  # print progress messages and the disambiguated names

In [None]:
import csv

# Load all the rows of the demographics csv (semicolon-delimited) as a list of dicts
with open(demo_csv) as f:
    cf = list(csv.DictReader(f, delimiter=';'))

# Load the dicom patients names from dicom_names.csv (only the "name" column is read)
with open("dicom_names.csv") as f:
    dcm_subj_list = [row['name'] for row in csv.DictReader(f, delimiter=';')]

In [None]:
# Generate disambiguated list of dicom names
# Disambiguate dicom names (names that are similar enough are given the same name)
cd_unique = disambiguate_names(dcm_subj_list, verbose=verbose)
# Extract list of unique dicom names
dcm_unique = set([c['name'] for c in cd_unique])

print(dcm_unique)

# Generate dicom name to unique name mapping
# (the key is the alt_names field for the entries that were renamed, else the name itself)
dcmname_to_uniquename = {(c['alt_names'] if 'alt_names' in c else c['name']): c['name'] for c in cd_unique}

print(dcm_unique)
# Display the mapping for a visual check
dcmname_to_uniquename

In [None]:
# Clean up then disambiguate the names of the demographics csv
from csg_fileutil_libs.distance import distance
from csg_fileutil_libs.aux_funcs import distance_jaccard_words_split
dist_threshold = 0.2

# Normalize every name with cleanup_name
for row in cf:
    row['name'] = cleanup_name(row['name'])

# Give the same name to the entries that only differ by typos or by an inversed firstname/lastname
cf = disambiguate_names(cf, dist_threshold=dist_threshold, verbose=verbose)
# Display the names that were disambiguated, along with their alternative names
[{row['name']: row['alt_names']} for row in cf if row.get('alt_names')]

In [None]:
# Computing distance matrix (ie, finding similar names between dicoms and demographics csv)
from csg_fileutil_libs.aux_funcs import distance_jaccard_words_split
dist_threshold = 0.2 # normalized distance threshold to match similar names. 0.0 is no difference, 1.0 is everything different.

print('Computing distance matrix (finding similar names) between dicoms and demographics, please wait...')
# Reverse mapping: patient name -> anonymized id
name_to_anon_ids = dict((name, anon_id) for anon_id, name in anon_ids.items())
dist_matches = {}
for subj in _tqdm(dcm_unique, desc='distmat', unit='subj'):
    # A csv name matches when either distance is under the threshold (use shortest distance with normalized levenshtein)
    similar_names = [
        row['name'] for row in cf
        if distance.nlevenshtein(subj, row['name'], method=1) <= dist_threshold
        or distance_jaccard_words_split(subj, row['name'], partial=True, norm=True, dist=dist_threshold) <= dist_threshold
    ]
    # Remove duplicate values (ie, csv names); a dicom name without any match is recorded as None
    dist_matches[subj] = list(set(similar_names)) if similar_names else None

# Find missing subjects (ie, dicom name present but missing in csv database)
missing_subj = dict((name, matches) for name, matches in dist_matches.items() if not matches)
# Print results
if missing_subj:
    print('Missing subjects from csv database (saved in missing_demo.csv): %i, names: %s' % (len(missing_subj), ', '.join(sorted(missing_subj.keys()))))
    save_dict_as_csv([{'name': msubj, 'id': name_to_anon_ids[msubj]} for msubj in missing_subj.keys()], 'missing_demo.csv', fields_order=['name', 'id'], csv_order_by='name', verbose=False)
    save_dict_as_csv([{'id': name_to_anon_ids[msubj]} for msubj in missing_subj.keys()], 'missing_demo_anonymized.csv', fields_order=['id'], csv_order_by='id', verbose=False)
else:
    print('No missing subject, congratulations!')

print('\nList of all matches (dicom : csv):')
print(dist_matches)

In [None]:
# Reverse the distance matrix: map each matched csv name to its dicom unique name
csvname_to_uniquename = {}
for uniquename, csv_names in dist_matches.items():
    # dicom names without any match have None instead of a list
    for csv_name in (csv_names or []):
        csvname_to_uniquename[csv_name] = uniquename
csvname_to_uniquename

In [None]:
def flatten_gen(L):
    '''Yield the items of L one by one, replacing each nested list by its own (recursively flattened) items. Only lists are flattened, other containers are yielded as is.'''
    for element in L:
        if not isinstance(element, list):
            yield element
            continue
        for nested in flatten(element):
            yield nested

def flatten(L):
    '''Return a flat list of the items of L, nested lists being replaced by their items (see flatten_gen).'''
    return [element for element in flatten_gen(L)]

def get_unique_names(L):
    '''Return the list of the unique items of L, without the empty ones (None, empty string, etc). The order is not guaranteed.'''
    # Build a real list instead of using filter(): on Python 3 filter() returns a one-shot iterator, which would be exhausted by the first membership test done on the result
    return [item for item in set(L) if item]

In [None]:
demo_short_csv = 'demographics_shortened.csv'
# Unique demographics csv names that matched with at least one dicom in the distance matrix
demo_names = get_unique_names(flatten(dist_matches.values()))
# Keep only the demographics rows whose name is present in dicoms
cf_short = []
for row in cf:
    if row['name'] in demo_names:
        cf_short.append(row)
# Save shortened demographics
save_dict_as_csv(cf_short, demo_short_csv, fields_order=['name'], csv_order_by='name', verbose=False)
print('Shortened demographics (to only the dicoms available) were saved to %s.' % demo_short_csv)

In [None]:
# Display the shortened demographics for a visual check (contains real patient names: clear this output before sharing the notebook)
cf_short

-------------------------------
## Anonymizing demographics csv

In [None]:
import csv
# Reload the anonymized id -> patient name mapping from idtoname.csv (the file written at the end of Part 2, semicolon-delimited)
with open('idtoname.csv', mode='r') as f:
    reader = csv.DictReader(f, delimiter=';')
    anon_ids = {row['id']: row['name'] for row in reader}
# Display the mapping for a visual check
anon_ids

In [None]:
# Reverse mapping: patient name -> anonymized id
name_to_anon_ids = dict((name, anon_id) for anon_id, name in anon_ids.items())
name_to_anon_ids

In [None]:
# Anonymize demographics csv

demo_anon_csv = 'demographics_anonymized.csv'

# Make a new table from the shortened demographics csv
# Each row is copied too (copying only the list would keep the same dicts, so the real names in cf and cf_short would be overwritten by the ids, and this cell would fail if run a second time)
cf_anon = [dict(row) for row in cf_short]
# Anonymize names by using the csv name -> unique name -> anonymized id mapping
# TODO: might be better to use pandas join?
for row in cf_anon:
    row['name'] = name_to_anon_ids[csvname_to_uniquename[row['name']]]
# Drop columns that we cannot anonymize but might give away subjects infos (like report_path and alt_names)
if cols_drop:
    for row in cf_anon:
        for col in cols_drop:
            if col in row:
                del row[col]
# Save anonymized demographics
save_dict_as_csv(cf_anon, demo_anon_csv, fields_order=['name'], csv_order_by='name', verbose=False)
print('Anonymized demographics successfully saved to %s.' % demo_anon_csv)

-----------------
## Anonymizing dicoms

In [None]:
# Unzip all into folders, because we cannot anonymize zip files (ie, can't modify files inside a zip)
# NOTE: ZIP FILES WILL BE DELETED!
import zipfile

def unzip(zipfilepath, outputpath):
    '''Extract the whole content of the zip file zipfilepath into the folder outputpath.'''
    # The context manager closes the archive even if the extraction fails
    with zipfile.ZipFile(zipfilepath, 'r') as zip_ref:
        zip_ref.extractall(outputpath)

# Delete all the .DS_Store files beforehand (else we get permission denied IOError)
count_dstore = 0
for dirpath, filename in _tqdm(recwalk(rootpath, topdown=False, folders=True), unit='files', desc='DSTOREDEL'):
    # The name is compared case-insensitively
    if filename.lower() != '.ds_store':
        continue
    os.remove(os.path.join(dirpath, filename))
    count_dstore += 1
print('Total .DS_Store files deleted: %i.' % count_dstore)

# Unzip files and delete zip
# (only the zip files located directly inside rootpath are processed)
subjects_list_zip = get_list_of_zip(rootpath)
count_zip = 0
for z in _tqdm(subjects_list_zip, unit='files', desc='UNZIP'):
    zipfilepath = os.path.join(rootpath, z)
    if verbose:
        print('- Unzipping file: %s' % z)
    # Unzip file into the same root directory as other dicom folders
    unzip(zipfilepath, rootpath)
    # Delete the zip (since we cannot anonymize it), this is only reached if the extraction did not raise an error
    os.remove(zipfilepath)
    count_zip += 1
print('Total zipfiles unzipped: %i.' % count_zip)

In [None]:
# -- Anonymization of dicom files
# Note: this will anonymize only the dicoms fields (name of patient, whatever field it is found in).
# If there are any other file containing the patient's name (such as .txt, .csv, .xls, etc), the files might be deleted if you want (add the extension in the list) or they will stay.
#from dicom.filebase import DicomFileLike  # fix for IOError access denied, see https://github.com/darcymason/pydicom/issues/69
import re

def find_hidden_name_fields(dcmdata, dcm_pts_names, hidden_name_fields=None):
    '''From a pydicom object, return all fields where one of the dcm_pts_names (a list) is present.
    This eases the detection of additional fields where the patient name was stored.
    Input: dcmdata is a pydicom dataset (any object with keys() and item access returning elements with a .value works),
    dcm_pts_names is a list of names (words separated by spaces), hidden_name_fields is an optional set which is
    updated in place (to accumulate the fields found over several dicoms).
    Output: the set of keys of dcmdata whose value contains at least one of the names.
    Matching is case insensitive, the words of a name are matched literally (regex special characters are escaped)
    and any run of non-alphanumeric characters is accepted between two words. Empty names are ignored (an empty
    pattern would match every field). For list values, each string item is searched (non-string items are skipped).
    Raises AttributeError (after printing the offending field) when a value is not a list, a number or a string.'''
    if hidden_name_fields is None:
        hidden_name_fields = set()
    string_types = (str, type(u''))  # str and unicode on Python 2, str on Python 3
    # Convert names to regexes (because dicoms often replace spaces by ^)
    name_patterns = []
    for pts_name in dcm_pts_names:
        words = pts_name.split() if pts_name else []
        if words:
            name_patterns.append(re.compile(r'[\W]+'.join(re.escape(word) for word in words), flags=re.I))
    if not name_patterns:
        # Nothing to look for
        return hidden_name_fields
    # Walk through each dicom field
    for dcmfield in dcmdata.keys(): # different from dir()?
        if dcmdata[dcmfield]:
            try:
                dcmfieldval = dcmdata[dcmfield].value
                if isinstance(dcmfieldval, list):
                    # Multi-valued field: search inside each string item
                    texts = [s for s in dcmfieldval if isinstance(s, string_types)]
                elif isinstance(dcmfieldval, (int, float)):
                    # A number cannot contain a name
                    texts = []
                else:
                    # Anything else is expected to be a string (else AttributeError is raised by lower())
                    texts = [dcmfieldval.lower()]
                if any(pattern.search(text) for text in texts for pattern in name_patterns):
                    hidden_name_fields.add(dcmfield)
            except AttributeError:
                print('Error with field: %s' % str(dcmfield))
                raise
    return hidden_name_fields


# -- Options of the dicom anonymization pass
reports_delete = True  # automatically delete report-like files (pdf/doc/docx/txt...)?
skip_already_processed = True  # skip folders and dicoms that already carry an anonymized id
remove_private_tags = False  # also call remove_private_tags() on each dicom?
fields_to_del = ['PatientAddress', 'PatientBirthTime', 'PatientTelephoneNumbers', 'OtherPatientNames']  # fields deleted from each dicom
print('-- Anonymization started, please wait, this might take a while (also make sure you unzipped all dicoms into folders)...')
print('Note: if you get an IOError permission denied error, make sure you close any file explorer or application using any of the subjects folder (including Windows Explorer, else folders cannot be renamed).')
print('Note2: JPEG2000 compressed dicom files are unsupported, please uncompress them beforehand (eg, using dcmdjpeg).')
print('Note3: in case of an Access Error, you can continue the anonymization, it will restart from the start but it will skip already processed dicom files.')
# Counters reported once the pass is over
count_anon = count_files = count_delete = count_files_skipped = 0
# List the 1st level folders from a unicode root path, so that os.listdir and recwalk hand back unicode strings too
# (else folders with accentuated characters cannot be entered)
uni_rootpath = unicode(rootpath, 'latin1')
subjects_list = get_list_of_folders(uni_rootpath)
# Count all entries (files and folders) beforehand, to give a total to the progressbar
print('Precomputing total number of files, please wait...')
for subject in _tqdm(subjects_list, unit='folders', desc='PRECOMP'):
    fullpath = os.path.join(uni_rootpath, subject)
    count_files += sum(1 for _entry in recwalk(fullpath, topdown=False, folders=True))
# Build the folder_to_name mapping, first from the folders and then completed from the zip files
_, folder_to_name = get_dcm_names_from_dir(uni_rootpath)
_, folder_to_name = get_dcm_names_from_zip(uni_rootpath, folder_to_name=folder_to_name)
# Everything is ready for the rewriting loop: progressbar and accumulator of hidden name fields
print('Launching anonymization of dicoms fields, please wait...')
tbar = _tqdm(total=count_files, unit='files', desc='ANON')
hfields = set()
def _compile_name_patterns(names):
    '''Return one compiled, case insensitive regex per non-empty name of the list names.
    Each word of a name is escaped (so that regex special characters are matched literally) and any run of
    non-alphanumeric characters is accepted between two words (because dicoms often replace spaces by ^).'''
    patterns = []
    for name in names:
        words = name.split() if name else []
        if words:
            patterns.append(re.compile(r'[\W]+'.join(re.escape(word) for word in words), flags=re.I))
    return patterns

def _scrub_names(text, patterns, replacement):
    '''Return text where every match of any of the compiled patterns is replaced by replacement (inserted literally).'''
    for pattern in patterns:
        text = pattern.sub(lambda match: replacement, text)
    return text

report_suffixes = ('pdf', 'doc', 'docx', 'txt', 'csv', 'xls', 'xlsx')  # filename endings considered as reports
text_types = (str, type(u''))  # str and unicode on Python 2
for folder in subjects_list:
    # Already processed folder and there are several sessions, extract the id from folder name
    subject = folder
    session_match = re.match(r'(^%s.+)_s\d+$' % anon_prefix, subject)
    if session_match:
        subject = session_match.group(1)
    # Already processed folder, then retrieve back the patient's name from the anonymized id
    if subject in anon_ids:
        pts_name = anon_ids[subject]
        if skip_already_processed:
            fullpath = os.path.join(uni_rootpath, folder)
            c = 0
            for _, _ in recwalk(fullpath, topdown=False, folders=True):
                c += 1
            tbar.update(c)
            count_files_skipped += c
            continue
    # Partially processed folder, get the original name and continue
    elif folder_to_name[subject] in anon_ids:
        pts_name = anon_ids[folder_to_name[subject]]
    else:
        pts_name = folder_to_name[subject]
    # Special case: no dicom with a patient name can be found inside the folder (might be nifti files instead?), so we just skip
    if pts_name is None:
        # Still account for the entries of this folder, else the progressbar never reaches its total
        tbar.update(sum(1 for _entry in recwalk(os.path.join(uni_rootpath, folder), topdown=False, folders=True)))
        continue
    # Fetch the anonymized id from folder name (because we already looked inside to get the first dicom's patientname)
    anon_id = name_to_anon_ids[dcmname_to_uniquename[pts_name]]
    if verbose:
        print('- Processing subject %s -> %s in folder %s' % (pts_name, anon_id, folder))
    fullpath = os.path.join(uni_rootpath, folder)  # no need to use unicode(str, 'latin1') here because rootpath and folder were both converted to unicode before
    # Loop through each subfiles and subfolders for this subject (we assume all dicoms are for one subject, so we rename them all to this subject)
    for dirpath, filename in recwalk(fullpath, topdown=False, folders=True):
        fullfilepath = os.path.join(dirpath, filename)
        # Folders are tested first: a folder whose name ends like a report (eg, "doc") must never reach os.remove()
        if os.path.isdir(fullfilepath):
            tbar.update()
            continue
        # Report file: delete if option enabled (case insensitive, so that eg ".PDF" files are deleted too)
        if reports_delete and filename.lower().endswith(report_suffixes):
            os.remove(fullfilepath)
            count_delete += 1
            tbar.update()
            continue
        # Dicom file: change PatientName field
        try:
            #TODO: autodetect if name is in filename and change!
            # Read dicom's file data
            dcmdata = dicom.read_file(fullfilepath, stop_before_pixels=False)  # need to read the full dicom here since we will modify it, so stop_before_pixels must be False
            # Store current name (to check at the end if we correctly cleaned up the name)
            try:
                dcm_pts_name = _unidecode(dcmdata.PatientName.decode('latin1').replace('^', ' ')).lower().strip()
                # Already anonymized dicom? Get the original patient's name from the anonymized id
                if dcm_pts_name in anon_ids:
                    dcm_pts_name = anon_ids[dcm_pts_name]
                    if skip_already_processed:
                        tbar.update()
                        continue
            except AttributeError as exc:
                if filename.upper() == 'DCMDIR' or filename.upper() == 'DICOMDIR':
                    os.remove(fullfilepath)  # DICOMDIR files are useless, they are only descriptive files for CD/DVD of dicoms
                    # DOES NOT WORK: pydicom can read and edit dicomdir files but cannot save them yet!
                    tbar.update()
                    continue
                else:
                    raise
            # Anonymize
            dcmdata.PatientName = anon_id
            dcmdata.PatientID = anon_id
            if [0x33,0x1013] in dcmdata:  # custom patientname field...
                dcmdata[0x33,0x1013].value = anon_id
            # Delete private fields
            for field in fields_to_del:
                if field in dcmdata:
                    if isinstance(field, str):
                        del dcmdata[dcmdata.data_element(field).tag]
                    else:
                        del dcmdata[field]
            if remove_private_tags:
                dcmdata.remove_private_tags()
            # Names to remove from this dicom. Empty names are dropped: an empty name matches everywhere, which
            # would corrupt every value and make the final check below always fail (eg, dicom with a blank PatientName)
            names_to_scrub = [name for name in (dcm_pts_name, pts_name) if name]
            name_patterns = _compile_name_patterns(names_to_scrub)
            # Try to anonymize hidden name fields (hfields accumulates the fields found in all previous dicoms)
            hfields = find_hidden_name_fields(dcmdata, names_to_scrub, hfields)
            for dcmfield in hfields:
                if dcmfield in dcmdata:
                    fieldval = dcmdata[dcmfield].value
                    if isinstance(fieldval, text_types):
                        dcmdata[dcmfield].value = _scrub_names(fieldval, name_patterns, anon_id)
                    elif isinstance(fieldval, list):
                        # Multi-valued field: clean each string item, and write back only if something changed
                        changed = False
                        scrubbed = []
                        for item in fieldval:
                            if isinstance(item, text_types):
                                new_item = _scrub_names(item, name_patterns, anon_id)
                                changed = changed or (new_item != item)
                                item = new_item
                            scrubbed.append(item)
                        if changed:
                            dcmdata[dcmfield].value = scrubbed
                    # any other type of value (eg, a number) cannot hold a name and is left untouched
            # Last check just in case we could not remove the name everywhere!
            dcm_data_str = _unidecode(str(dcmdata).decode('latin1').replace('^', ' ')).lower().strip()  # read all fields at once
            if any(name in str(dcm_data_str) for name in names_to_scrub):  # if patient's name is still in the file, that's bad!
                print('Hidden name fields found: %s' % hfields)
                print('names: %s - %s' %(dcm_pts_name, pts_name))  # debugline
                print(str(dcm_data_str))
                raise ValueError('Error: could not remove name totally (there must be an additional non-standard PatientName field) from file: %s' % fullfilepath)
            # Save anonymized dicom file
            dcmdata.save_as(fullfilepath)
            # Release the dicom data
            del dcmdata
            count_anon += 1
        except (InvalidDicomError) as exc:
            # Not a dicom file: deliberately left as is
            pass
        except AttributeError as exc:
            print(fullfilepath)
            raise
        tbar.update()  # update progressbar
tbar.close()

print('Hidden name fields found (and automagically anonymized): %s' % hfields)
print('Total dicom anonymized: %i over %i total. Total dicom files skipped: %i. Total reports/non-dicom files deleted: %i.' % (count_anon, count_files, count_files_skipped, count_delete))

In [None]:
# Rename files and subfolders whose name includes a patient's name

# Compile regex to find any patient name (of any patient!) in a string. Non-alphabetical characters are ignored.
# Each name is reduced to its alphabetical words, joined by "any run of non-alphabetical characters". Names without
# any letter are dropped, because their pattern would match the empty string at every position of every filename.
# Longest patterns come first, so that the alternation prefers the most complete name over a shorter one.
name_regexes = set()
for s in dcm_unique:
    name_words = re.findall('[a-zA-Z]+', s)
    if name_words:
        name_regexes.add('[^a-zA-Z]*'.join(name_words))
if name_regexes:
    filename_patterns = re.compile('(' + '|'.join(sorted(name_regexes, key=lambda p: (-len(p), p))) + ')', flags=re.I)
else:
    # No usable name at all: use a pattern that never matches (empty negative lookahead)
    filename_patterns = re.compile('((?!))')

# List the 1st level folders from a unicode root path, so that os.listdir and recwalk hand back unicode strings too
# (else folders with accentuated characters cannot be entered)
uni_rootpath = unicode(rootpath, 'latin1')
subjects_list = get_list_of_folders(uni_rootpath)

# Count all entries (files and folders) beforehand, to give a total to the progressbar
print('Precomputing total number of files, please wait...')
count_files = 0
for subject in _tqdm(subjects_list, unit='folders', desc='PRECOMP'):
    fullpath = os.path.join(uni_rootpath, subject)
    count_files += sum(1 for _entry in recwalk(fullpath, topdown=False, folders=True))

# Rename files if they have a patient's name
count_moved = 0
tbar = _tqdm(total=count_files, unit='files', desc='ANONFN')
print('Anonymizing of file/folder names, please wait...')
for folder in subjects_list:  # do not rename the top directories, this will be done separately
    if verbose:
        print('- Processing top folder %s' % (folder))
    fullpath = os.path.join(uni_rootpath, folder)
    for dirpath, filename in recwalk(fullpath, topdown=False, folders=True):
        # Find any name (of any patient) in the filename
        # TODO: construct re all permutations of all names, and re.compile, it will be fast
        # TODO: try to do levenshtein distance on names? (but just with current patient name, else it will take too much time with all patients...) it will considerably slow down the anonymization... Is there a faster way?
        # If found, we find the anonymized id for each match to replace
        to_replace = []
        for m in filename_patterns.finditer(filename):
            # An empty match carries no name (else an id would be inserted between the characters of the filename)
            if m.start(1) == m.end(1):
                continue
            # Clean up the name
            pts_name_in_filename = re.sub('[^a-zA-Z]+', ' ', m.group(1).lower())
            # Find the closest unique name
            dst_mat = dist_matrix([pts_name_in_filename], dcm_unique)
            # Get the anonymized id from unique name
            if dst_mat[pts_name_in_filename]:
                anon_id = name_to_anon_ids[dst_mat[pts_name_in_filename][0]]
            else:  # could not find an id, just anonymize with a random name
                anon_id = 'anon'
            # Add slice index and anonymized id to replace all at once later
            to_replace.append( ( anon_id, slice(m.start(1), m.end(1)) ) )
        # Replace all matchs at once
        if to_replace:
            # Can't modify strings, need to convert to a list
            filename_anon = list(filename)
            # For each match, replace with anonymized id
            for anon_id, slidx in to_replace[::-1]:  # reverse list because else the subsequent items won't be aligned anymore when we will replace the first items in the list
                filename_anon[slidx] = anon_id
            # Convert back to a string
            filename_anon = ''.join(filename_anon)
            src_path = os.path.join(dirpath, filename)
            dst_path = os.path.join(dirpath, filename_anon)
            if dst_path != src_path:
                # Never move onto an existing entry: a file would be overwritten, and a folder would be moved
                # inside the existing one while keeping its original name. Append "_x" (x is a number) instead,
                # like the "_sx" suffix used when renaming the top folders.
                if os.path.isdir(src_path):
                    base, ext = filename_anon, ''
                else:
                    base, ext = os.path.splitext(filename_anon)
                i = 2
                while os.path.exists(dst_path):
                    dst_path = os.path.join(dirpath, '%s_%i%s' % (base, i, ext))
                    i += 1
                # Rename the file/folder
                shutil.move(src_path, dst_path)
                count_moved += 1
        tbar.update()
tbar.close()
print('Total dicom files/folders moved: %i over %i total.' % (count_moved, count_files))

In [None]:
# Rename folders if enabled
rename_folders = True  # rename root folder # TODO: rename any subfolder with patient's name
delete_empty = True  # delete empty folders (or not containing any DICOM, such as nifti folders)?

count_folder = 0
count_skipped = 0
count_empty = 0
uni_rootpath = unicode(rootpath, 'latin1')  # convert rootpath to unicode before walking with os.listdir and recwalk, so we get back unicode strings too (else we won't be able to enter folders with accentuated characters)
if rename_folders:
    print('Launching anonymization of dicom folders, please wait...')
    # Get folder_to_name mapping
    _, folder_to_name = get_dcm_names_from_dir(uni_rootpath)
    _, folder_to_name = get_dcm_names_from_zip(uni_rootpath, folder_to_name=folder_to_name)
    # Get list of folders
    subjects_list = get_list_of_folders(uni_rootpath)
    for subject in _tqdm(subjects_list, unit='folder', desc='RENAME'):
        # Already anonymized folder, just skip
        if subject in anon_ids or re.match(r'(^%s.+)_s\d+$' % anon_prefix, subject):
            count_skipped += 1
            continue
        pts_name = folder_to_name[subject]
        # Already partially anonymized, we get the original name from the anonymization mapping
        if pts_name in anon_ids:
            pts_name = anon_ids[pts_name]
        # Special case: no dicom with a patient name can be found inside the folder (might be nifti files instead?)
        if pts_name is None:
            if delete_empty:
                # we just remove this folder!
                filepath = os.path.join(uni_rootpath, subject)
                if os.path.isdir(filepath):
                    shutil.rmtree(filepath)
                else:
                    os.remove(filepath)
                count_empty += 1
            else:
                # no name means no anonymized id to rename to (looking up None would raise a KeyError), so leave the folder as is
                count_skipped += 1
            continue
        # Fetch the anonymized id from folder name (because we already looked inside to get the first dicom's patientname)
        anon_id = name_to_anon_ids[dcmname_to_uniquename[pts_name]]
        if verbose:
            print('- Processing subject %s -> %s in folder %s' % (pts_name, anon_id, subject))
        fullpath = os.path.join(uni_rootpath, subject)
        # Rename subject directory
        new_folder_name = os.path.join(uni_rootpath, anon_id)
        if not os.path.exists(new_folder_name):
            os.rename(fullpath, new_folder_name)
        else:
            # if new folder already exists, find a new name (append "_sx" where x is a number)
            for i in range(2, 1000):
                alt_folder_name = "%s_s%i" % (new_folder_name, i)
                if not os.path.exists(alt_folder_name):
                    os.rename(fullpath, alt_folder_name)
                    break
            else:
                # no free name left: the folder keeps its original name, so report it instead of counting it as renamed
                print('Warning: could not find a free anonymized name for folder %s, it was NOT renamed!' % subject)
                count_skipped += 1
                continue
        count_folder += 1

print('Total dicom folders renamed (anonymized): %i over %i total. Skipped: %i. Empty folders (or containing non-dicom files) and thus deleted: %i.' % (count_folder, len(subjects_list), count_skipped, count_empty))

In [None]:
# Restrict (again) the anonymized demographics to the subjects for which a dicom folder exists
demo_anon_csv = 'demographics_anonymized.csv'
# Load the anonymized demographics, with missing values turned into empty strings
cf_anon = pd.read_csv(demo_anon_csv, sep=';')
cf_anon = cf_anon.fillna('')
# First item returned by get_dcm_names_from_dir: the (anonymized) names found in the dicoms
dcm_ids, _ = get_dcm_names_from_dir(rootpath)
# Keep only the rows whose name is one of these ids
has_dicom = cf_anon['name'].isin(dcm_ids)
cf_anon = cf_anon[has_dicom]
# Overwrite the csv file with the shortened table
cf_anon.to_csv(demo_anon_csv, sep=';', na_rep='NA', index=False)
print('Shortened anonymized demographics (to only the dicoms available) were saved to %s.' % demo_anon_csv)

In [None]:
# TODO: add new ids (new dicom folders) even if not in demographics

-------------------------------------------------------
## Test

In [None]:
# Scratch test: read a single dicom without its pixel data (stop_before_pixels=True).
# NOTE(review): hard-coded absolute local path which appears to embed a subject's name -- replace it with a
# configurable, anonymized sample before sharing this notebook.
dcmdata = dicom.read_file(r'G:\Topreproc\ReportsTun\dicoms2\DeBeleyr_Jesse\1.3.12.2.1107.5.2.32.35033.2013091614000026078119909.0.0.0\1.3.12.2.1107.5.2.32.35033.2013091614054181995757948.dcm', stop_before_pixels=True)

In [None]:
# Find the fields of the test dicom containing the name 'jesse', then print each field with its value
hfields = find_hidden_name_fields(dcmdata, ['jesse'])
for dcmfield in hfields:
    print(dcmfield)
    print(dcmdata[dcmfield].value)

In [None]:
# Replace the name by anon_id (case insensitive) in every field found before, then search again:
# the last expression displays the fields that still contain the name.
# NOTE: anon_id is not defined in this cell, it must come from a previously executed cell.
dcm_pts_name = pts_name = 'jesse'
for dcmfield in hfields:
    if dcmfield in dcmdata:
        dcmdata[dcmfield].value = re.sub(dcm_pts_name, anon_id, dcmdata[dcmfield].value, flags=re.I)
        dcmdata[dcmfield].value = re.sub(pts_name, anon_id, dcmdata[dcmfield].value, flags=re.I)
find_hidden_name_fields(dcmdata, ['jesse'])


In [None]:
# Position of 'jesse' in the cleaned up text dump of the dicom (-1 if the name is not found)
str(_unidecode(str(dcmdata).decode('latin1').replace('^', ' ')).lower().strip()).find('jesse')  # read all fields at once

In [None]:
# Display the text dump of the whole dicom
str(dcmdata)

In [None]:
# Display the value of the element (0029,1010)
dcmdata[0x0029, 0x1010].value

In [None]:
# Display the value of the first key of the dicom
# NOTE(review): indexing keys() presumably relies on Python 2 returning a list -- confirm before porting
a = dcmdata.keys()[0]
dcmdata[a].value

In [None]:
# Test values: the name to look for, and the id to put in its place
dcm_pts_name = 'jesse'
anon_id = 'subj003'

def find_hidden_name_fields(dcmdata, dcm_pts_name, hidden_name_fields=None):
    '''Test variant (plain substring search, no regex): return all fields of dcmdata where a name is present.
    WARNING: this definition replaces the one of the anonymization section once this cell is executed. This is why
    dcm_pts_name accepts either a single name or a list/tuple/set of names, so that the anonymization cell
    (which gives a list) still works afterwards.
    Input: dcmdata is a pydicom dataset (any object with keys() and item access returning elements with a .value works),
    hidden_name_fields is an optional set which is updated in place.
    Output: the set of keys of dcmdata whose value contains at least one of the names.
    The search is case insensitive, and empty names are ignored (an empty name is contained in every string).
    Raises AttributeError (after printing the offending field) when a value is not a list, a number or a string.'''
    if hidden_name_fields is None:
        hidden_name_fields = set()
    string_types = (str, type(u''))  # str and unicode on Python 2, str on Python 3
    # Normalize the input to a list of lowercased, non-empty names
    if isinstance(dcm_pts_name, string_types):
        names = [dcm_pts_name]
    else:
        names = list(dcm_pts_name or [])
    names = [name.lower() for name in names if name]
    if not names:
        # Nothing to look for
        return hidden_name_fields
    for dcmfield in dcmdata.keys(): # different from dir()?
        if dcmdata[dcmfield]:
            try:
                dcmfieldval = dcmdata[dcmfield].value
                if isinstance(dcmfieldval, list):
                    # Multi-valued field: search inside each string item
                    texts = [s.lower() for s in dcmfieldval if isinstance(s, string_types)]
                elif isinstance(dcmfieldval, (int, float)):
                    # A number cannot contain a name
                    texts = []
                else:
                    # Anything else is expected to be a string (else AttributeError is raised by lower())
                    texts = [dcmfieldval.lower()]
                if any(name in text for text in texts for name in names):
                    hidden_name_fields.add(dcmfield)
            except AttributeError:
                print(dcmfield)
                raise
    return hidden_name_fields

# Find the fields containing the test name, print them, then replace the name by anon_id (case insensitive) in each
hfields = find_hidden_name_fields(dcmdata, dcm_pts_name)
print(hfields)
for dcmfield in hfields:
    dcmdata[dcmfield].value = re.sub(dcm_pts_name, anon_id, dcmdata[dcmfield].value, flags=re.I)
In [None]:
# Display all the keys of the test dicom
dcmdata.keys()

In [None]:
# Print one by one the lines given by dcmdata.formatted_lines()
for s in dcmdata.formatted_lines():
    print(s)

In [None]:
# Display the result of dcmdata.dir()
dcmdata.dir() # dcmdata[0x0029, 0x1010] or CSA Image Header

In [None]:
# Overwrite then delete the element (0033,1013) (handled as a custom patient name field by the anonymization cell);
# the last expression displays whether the element is still in the dicom
dcmdata[0x33,0x1013].value = 'haha'
del dcmdata[0x33,0x1013]
[0x33,0x1013] in dcmdata

In [None]:
# Access PatientWeight (only the last expression of a cell is displayed), then call dir() with 'OtherPatientNames'
# (presumably a filter on the names returned -- TODO confirm in pydicom)
dcmdata.PatientWeight
dcmdata.dir('OtherPatientNames')

In [None]:
# Drop the test dicom.
# NOTE: the next cell reads dcmdata again, so a dicom must be loaded again before running it (else NameError)
del dcmdata

In [None]:
# Clean up the patient name and the text dump of the dicom in the same way (latin1 decoding, ^ replaced by a space,
# unidecode, lowercase, strip), then check if the name can be found in the dump (only this last expression is displayed)
pts_name = _unidecode(dcmdata.PatientName.decode('latin1').replace('^', ' ')).lower().strip()
dcm_data_str = _unidecode(str(dcmdata).decode('latin1').replace('^', ' ')).lower().strip()
#pts_name = dcmdata.PatientName.decode('latin1')
pts_name
pts_name in dcm_data_str

In [None]:
# Display the value of the element (0033,1013), decoded from latin1 and lowercased
dcmdata[0x0033, 0x1013].value.decode('latin1').lower()

In [None]:
# Is the cleaned up patient name contained in the raw value of the element (0033,1013)?
pts_name in dcmdata[0x0033, 0x1013].value

In [None]:
# Display the type of the cleaned up patient name
type(pts_name)

In [None]:
# Read a DICOMDIR file (hard-coded relative test path) with read_dicomdir and display it
dcmdata = dicom.read_dicomdir(r'dicoms\ANTOINE_el\EPI_T1\DICOMDIR')
dcmdata
# if [0x0004, 0x1130] in dcmdata

In [None]:
# Read the DICOMDIR file again and overwrite PatientName in each of its patient records (in memory only)
dcmdata = dicom.read_dicomdir(r'dicoms\ANTOINE_el\EPI_T1\DICOMDIR')
for record in dcmdata.patient_records:
    record.PatientName = 'hehe'

In [None]:
# Print PatientName of each patient record, to check the modification done in the previous cell
for record in dcmdata.patient_records:
    print(record.PatientName)

In [None]:
# Try to write the modified DICOMDIR back over the original file.
# NOTE: according to the comment in the anonymization cell, pydicom can read and edit dicomdir files but cannot save them yet.
dcmdata.save_as(r'dicoms\ANTOINE_el\EPI_T1\DICOMDIR')