In [1]:
import os
import subprocess
import glob
import pandas as pd
import heudiconv
import sys
import config.core as cfg

## Heudiconv wrapper script
This script runs heudiconv on raw MR data (dicom files), generating an overview tsv file that lists metadata for each image type (e.g. sequence that was used to generate it). <br>
Output files can be found in the ./data directory after running the script.

Requirements:
- *config vars* as specified in the config module (imported automatically once the data root has been specified):
    - DICOM_ROOT_PATH
    - DCM_PATTERN

Output:
- *.heudiconv folder* containing the summaries, one for each session (most importantly: the dicominfo.tsv file with the sequence information, or a modified version thereof)

In [3]:
# Initialise the project configuration before any cfg.* variable is read.
# NOTE(review): presumably the keyword is resolved to a base directory by config.core
# and force=True re-initialises an existing configuration - confirm in config.core.
# Alternative call that passes a root path instead of a keyword:
#cfg.initialise_config("/mnt/labdrive", force = True)
cfg.initialise_config(keyword = "Work", force = True)

Configuration initialised with base directory: /home/luisass/Work


In [4]:
# Destination of all generated files: a "data" folder inside the current working directory.
OUTPUT_DIR = os.path.join(os.getcwd(), "data")

# Ignore the scans that have the word "pilot" in their name.
IGNORE_PILOTS = True
# Ignore hidden folders starting with a '.', e.g. path/to/.thisisafoldertoignore
IGNORE_HIDDEN = True

print(f"OUTPUT FILES to '{OUTPUT_DIR}'")

# Show the config variables this notebook relies on.
cfg.print_config_vars(["DCM_PATTERN", "DICOM_ROOT_PATH"])

OUTPUT FILES to '/home/luisass/Work/MR/MR-sequence-inspection/data'

=== from /home/luisass/Work/MR/MR-sequence-inspection/config/cfgvars.py
DCM_PATTERN               = /mnt/labdrive/this/is/a/path/to/root/{subject}/SCANS/*/DICOM/*.dcm
DICOM_ROOT_PATH           = /mnt/labdrive/this/is/a/path/to/root


In [3]:
# Make sure the output folder is present before anything is written into it.
output_dir_is_missing = not os.path.exists(OUTPUT_DIR)
if output_dir_is_missing:
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"Created {OUTPUT_DIR}")

Created /home/luisass/Work/drive_backup_git/MR-analysis/MR-Sequence-inspection/data


In [None]:
# Collect all session folders below DICOM_ROOT_PATH. A folder counts as a session
# when it directly contains a sub-folder named "SCANS"; the session name is the
# folder's base name.
if not os.path.isdir(cfg.DICOM_ROOT_PATH):
    # os.walk silently yields nothing for a missing root, so say it out loud
    print(f"WARNING: DICOM_ROOT_PATH '{cfg.DICOM_ROOT_PATH}' is not an existing directory, no sessions can be found.\n")

session_dirs = list()
for root, dirs, _files in os.walk(cfg.DICOM_ROOT_PATH, topdown=True):
    # With topdown=True, pruning `dirs` in place keeps os.walk from descending
    # into the removed folders.
    #exclude hidden folders and pilot scans
    if IGNORE_HIDDEN:
        dirs[:] = [d for d in dirs if not d.startswith('.')]
    if IGNORE_PILOTS:
        dirs[:] = [d for d in dirs if "pilot" not in d.lower()]
    # Membership test instead of looping over `dirs` while removing from it
    # (also avoids leaving a variable named `dir` that shadows the builtin).
    if "SCANS" in dirs:
        session_dirs.append(root)
        # do not walk into the scan folders of a session that was already found
        dirs.remove("SCANS")

sessions = [os.path.basename(p) for p in session_dirs]

print(f"Ignoring Pilots: {IGNORE_PILOTS}\nIgnoring hidden folders: {IGNORE_HIDDEN}\n")
print(f"Found following sessions at path {cfg.DICOM_ROOT_PATH}:")
sessions

Ignoring Pilots: True
Ignoring hidden folders: True

Found following sessions at path /home/luisass/this/is/a/path/to/root:


[]

In [5]:
# Optional cell: not necessary to run, it only prints the command-line help of the heudiconv tool :)
!heudiconv --help

usage: heudiconv [-h] [--version]
                 [-d DICOM_DIR_TEMPLATE | --files [FILES ...]]
                 [-s [SUBJS ...]] [-c {dcm2niix,none}] [-o OUTDIR]
                 [-l LOCATOR] [-a CONV_OUTDIR] [--anon-cmd ANON_CMD]
                 [-f HEURISTIC] [-p] [-ss SESSION]
                 [-b [BIDSOPTION1 [BIDSOPTION2 ...]]] [--overwrite]
                 [--datalad] [--dbg]
                 [--command {heuristics,heuristic-info,ls,populate-templates,sanitize-jsons,treat-jsons,populate-intended-for}]
                 [-g {studyUID,accession_number,all,custom}] [--minmeta]
                 [--random-seed RANDOM_SEED] [--dcmconfig DCMCONFIG]
                 [-q {SLURM,None}] [--queue-args QUEUE_ARGS]

Example: heudiconv -d 'rawdata/{subject}' -o . -f heuristic.py -s s1 s2 s3

options:
  -h, --help            show this help message and exit
  --version             show program's version number and exit
  -d DICOM_DIR_TEMPLATE, --dicom_dir_template DICOM_DIR_TEMPLATE
          

In [None]:
#============================
FORCE_HEUDICONV = False #change this flag if you want to re-run all subjects, even if a tsv file already exists
#============================

# Check which sessions already have a dicominfo*.tsv below OUTPUT_DIR.
# The session name is taken from the folder two levels above the tsv file.
dicominfo_file_paths = glob.glob(r'**/dicominfo*.tsv', root_dir= OUTPUT_DIR, recursive= True, include_hidden=True)
already_processed_sessions = [os.path.basename(os.path.dirname(os.path.dirname(p))) for p in dicominfo_file_paths]

if len(sessions) == 0:
    print("... nothing to process!")
else:
    failed_sessions = []

    # run heudiconv on each session
    for i, session in enumerate(sessions):
        if session in already_processed_sessions and not FORCE_HEUDICONV:
            print(f"=== SESSION ({i+1}/{len(sessions)}): {session:>20s} --> dicominfo tsv file exists. Won't run heudiconv.")
            continue

        print(f"==== SESSION ({i+1}/{len(sessions)}): {session:>20s} --> RUNNING HEUDICONV!")
        # Argument list without a shell: the '*' and '{subject}' placeholders of
        # DCM_PATTERN reach heudiconv untouched, and paths or session names
        # containing spaces stay single arguments.
        cmd = [
            "heudiconv",
            "-d", str(cfg.DCM_PATTERN),
            "-s", str(session),
            "-o", str(OUTPUT_DIR),
            "-c", "none",
            "-f", "convertall",
        ]
        print(f"Command: {' '.join(cmd)}")

        try:
            # merge stderr into stdout so we only read one stream;
            # the context manager closes the pipe once the process is done
            with subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
            ) as process:
                # Stream all output live
                for line in process.stdout:
                    print(line.strip())
                    sys.stdout.flush()

                # Wait for process to finish and get exit code
                returncode = process.wait()
        except OSError as err:
            # e.g. the heudiconv executable is not on PATH; keep going with the other sessions
            print(f"--- heudiconv could not be started for session {session}: {err} ---")
            failed_sessions.append(session)
            continue

        if returncode != 0:
            print(f"--- heudiconv FAILED for session {session} with exit code {returncode} ---")
            failed_sessions.append(session)

    # Only claim success when no session failed
    if failed_sessions:
        print(f"\n... finished, but heudiconv FAILED for {len(failed_sessions)} of {len(sessions)} sessions: {failed_sessions}")
    else:
        print("\n... success, all sessions processed!")


... nothing to process!


# Delete sensitive columns

In [None]:
# Remove sensitive columns from every dicominfo*.tsv below OUTPUT_DIR. A file that
# contains such columns is rewritten without them under the name
# cfg.DICOMINFO_TSV_NAME (same folder), and the unsanitized file is deleted.
dicominfo_file_paths = glob.glob(r'**/dicominfo*.tsv', root_dir= OUTPUT_DIR, recursive= True, include_hidden=True)
dicominfo_file_paths = [os.path.join(OUTPUT_DIR, p) for p in dicominfo_file_paths]

#to read out scan IDs of dicominfos (just for sanity check)
dicominfo_scan_IDs = list()

columns_to_delete = ["patient_age", "patient_sex"]
for info_file_path in dicominfo_file_paths:
    # a file may already be gone, e.g. replaced earlier in this loop
    if not os.path.exists(info_file_path):
        continue

    # scan ID = name of the folder two levels above the tsv file
    dicominfo_scan_IDs.append(os.path.basename(os.path.dirname(os.path.dirname(info_file_path))))
    # NOTE: on_bad_lines='skip' silently drops malformed rows
    tsv_file = pd.read_csv(info_file_path, sep='\t', encoding='latin-1', on_bad_lines='skip')

    if not any(c in tsv_file.columns for c in columns_to_delete):
        print(f"No sensitive columns found for {info_file_path}")
        continue

    # errors="ignore": only some of the listed columns may be present
    sanitized_tsv = tsv_file.drop(columns=columns_to_delete, errors="ignore")
    # Write the modified DataFrame back to a new TSV file
    new_file_path = os.path.join(os.path.dirname(info_file_path), cfg.DICOMINFO_TSV_NAME)
    sanitized_tsv.to_csv(new_file_path, sep='\t', index=False)
    print(f"deleted sensitive columns for {info_file_path}")

    # Delete the unsanitized source, but never the file that was just written
    # (happens when the source already carries the target name).
    if os.path.abspath(new_file_path) != os.path.abspath(info_file_path):
        os.remove(info_file_path)

# Check for missing data or missing info sheet

In [8]:
# Sanity check: every scan session needs a dicominfo sheet and vice versa.
# The comparison is done on sets, so a session that contributed more than one
# dicominfo*.tsv (e.g. a raw and a sanitized file) does not trigger a mismatch
# with nothing to report.
no_info = sorted(set(sessions) - set(dicominfo_scan_IDs))
no_data = sorted(set(dicominfo_scan_IDs) - set(sessions))

# Session names come from folder base names, so two folders can share a name;
# report that instead of hiding it in the set comparison.
duplicate_sessions = sorted({s for s in sessions if sessions.count(s) > 1})
if duplicate_sessions:
    print(f"Note: session names found more than once: {duplicate_sessions}")

if no_info or no_data:
    print(f"Existing scans that have no dicominfo.tsv: {no_info}")
    print(f"Existing dicominfo.tsv where no scan is found: {no_data}")
    raise ValueError("Found mismatch in found Scan data and existing summary sheets. See print output above.")
else:
    print("For all scans a dicominfo sheet was found, and for all dicominfo sheets a scan was found.")

For all scans a dicominfo sheet was found, and for all dicominfo sheets a scan was found.
