In [1]:
from pathlib import Path
import tarfile
import os
import subprocess
import shlex
import json

from curriculum_deeplab.utils.common_utils import get_script_dir
THIS_SCRIPT_DIR = get_script_dir()

In [2]:
# Constants and preparation
TCIA_DATA_DIR = Path(THIS_SCRIPT_DIR, "tmp/tcia_data")
TCIA_DATA_CONVENIENT_DIR = Path(THIS_SCRIPT_DIR, "tmp/tcia_data_convenient")
TCIA_DATA_BASIC_PREPROCESSING_DIR = Path(THIS_SCRIPT_DIR, "tmp/tcia_data_preprocessed")
CROSSMODA_DATALOADER_DIR = Path(THIS_SCRIPT_DIR, "tmp/crossmoda_data_dataloader")

tcia_file_path = TCIA_DATA_DIR.joinpath("VS.tcia")
tcia_contours_path = TCIA_DATA_DIR.joinpath("Vestibular-Schwannoma-SEG_contours_Mar_2021.zip")
# tcia_reg_matrices_path = TCIA_DATA_DIR.joinpath("Vestibular-Schwannoma-SEG_matrices_Mar_2021.zip")
TCIA_DATA_DIR.mkdir(parents=True, exist_ok=True)
TCIA_DATA_CONVENIENT_DIR.mkdir(parents=True, exist_ok=True)
TCIA_DATA_BASIC_PREPROCESSING_DIR.mkdir(parents=True, exist_ok=True)
CROSSMODA_DATALOADER_DIR.mkdir(parents=True, exist_ok=True)

URL_TCIA = '"https://wiki.cancerimagingarchive.net/download/attachments/70229053/Vestibular-Schwannoma-SEG%20Feb%202021%20manifest.tcia?api=v2"'
URL_TCIA_CONTOURS = '"https://wiki.cancerimagingarchive.net/download/attachments/70229053/Vestibular-Schwannoma-SEG%20contours%20Mar%202021.zip?api=v2"'
# URL_TCIA_REGISTION_MATRICES = '"https://wiki.cancerimagingarchive.net/download/attachments/70229053/Vestibular-Schwannoma-SEG_matrices%20Mar%202021.zip?api=v2"'

### Download TCIA data

In [5]:
URL_NBIA_DEB = "https://cbiit-download.nci.nih.gov/nbia/releases/ForTCIA/NBIADataRetriever_4.2/nbia-data-retriever-4.2.deb"
nbia_deb_path = Path(THIS_SCRIPT_DIR, "tools/nbia/nbia-data-retriever-4.2.deb")
nbia_bin_path = Path(nbia_deb_path.parent, "opt/nbia-data-retriever/nbia-data-retriever")

In [4]:
# Download and extract nbia data retriever


nbia_deb_path.parent.mkdir(parents=True, exist_ok=True)
!curl {URL_NBIA_DEB} -o {nbia_deb_path}

os.chdir(nbia_deb_path.parent)
!ar -x {nbia_deb_path}
!tar -xf data.tar.xz
os.chdir(THIS_SCRIPT_DIR)


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
 57 65.5M   57 37.9M    0     0  1123k      0  0:00:59  0:00:34  0:00:25 1166k^C
ar: /share/data_supergrover1/weihsbach/shared_data/tmp/curriculum_deeplab/curriculum_deeplab/preprocessing/tools/nbia/nbia-data-retriever-4.2.deb is not a valid archive
tar: data.tar.xz: Cannot open: No such file or directory
tar: Error is not recoverable: exiting now


In [7]:
# Download TCIA manifest file 
# https://wiki.cancerimagingarchive.net/pages/viewpage.action?pageId=70229053
tcia_file_path.parent.mkdir(exist_ok=True, parents=True)
!curl -k {URL_TCIA} -o {tcia_file_path}
!curl -k {URL_TCIA_CONTOURS} -o {tcia_contours_path}
# !curl {URL_TCIA_REGISTION_MATRICES} -o {tcia_reg_matrices_path}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  120k  100  120k    0     0  82369      0  0:00:01  0:00:01 --:--:-- 82424
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 15.9M  100 15.9M    0     0  1330k      0  0:00:12  0:00:12 --:--:-- 2400k


In [6]:
# Download TCIA VS data 
# https://wiki.cancerimagingarchive.net/pages/viewpage.action?pageId=70229053
nbia_libs_path = Path(nbia_deb_path.parent, "opt/nbia-data-retriever/runtime/lib")
if str(nbia_libs_path) not in os.environ['LD_LIBRARY_PATH']:
    os.environ['LD_LIBRARY_PATH'] = f"{os.environ['LD_LIBRARY_PATH']}:{nbia_libs_path}"
!echo "Y" | {nbia_bin_path} --cli {tcia_file_path} -d {TCIA_DATA_DIR} -u {input("username")} -p {input("password")}

The download log can be found at /share/data_supergrover1/weihsbach/shared_data/tmp/curriculum_deeplab/curriculum_deeplab/preprocessing/tmp/tcia_data/NBIADataRetrieverCLI-20223906113950.log
2022-04-06 11:39:50: INFO: Using manifiest file: /share/data_supergrover1/weihsbach/shared_data/tmp/curriculum_deeplab/curriculum_deeplab/preprocessing/tmp/tcia_data/VS.tcia

2022-04-06 11:39:50: INFO: Running with option: quiet = false; verbose = false; force = false

2022-04-06 11:39:50: INFO: The type of data downloading is DICOM

Data Usage Policy

Any user accessing TCIA data must agree to:
- Not use the requested datasets, either alone or in concert with any other information, to identify or contact individual participants from whom data and/or samples were collected and follow all other conditions specified in the TCIA Site Disclaimer. Approved Users also agree not to generate and use information (e.g., facial images or comparable representations) in a manner that could allow the identities o

In [37]:
# # https://wiki.cancerimagingarchive.net/display/Public/NBIA+Advanced+REST+API+Guide
# import re
# import zipfile
# series_pattern = re.compile(r"^[0-9\.]+$")
# def get_list_of_series(tcia_manifest_file_path):
#     series_ids = []
#     with open(tcia_manifest_file_path, 'r') as tcia_manifest_file:
#         for line in tcia_manifest_file:
#             for match in re.finditer(series_pattern, line.strip()):
#                 series_ids.append(match.group())
#     return series_ids

# def request_token(username, password, client_id, client_secret):
#     request_token_address = f"'username={username}&password={password}&client_id={client_id}&client_secret={client_secret}&grant_type=password'"
    
#     query_output = subprocess.check_output(shlex.split(f"curl -s -d {request_token_address} -X POST -k 'https://services.cancerimagingarchive.net/nbia-api/oauth/token'"))
#     try:
#         token = json.loads(query_output)['access_token']
#     except (json.JSONDecodeError, KeyError):
#         raise ValueError("TCIA token could not be fetched. Please check your credentials.")

#     return token

# def get_metadata(token, series_id):
#     rest_download_address = f"https://services.cancerimagingarchive.net/nbia-api/services/v1/getSeriesMetaData?SeriesInstanceUID={series_id}"
#     authorization_request = f'"Authorization:Bearer {token}"'
#     query_output = subprocess.check_output(shlex.split(f"curl -s -k {rest_download_address}"))

#     try:
#         metadata = json.loads(query_output)[0]
#     except (json.JSONDecodeError):
#         return None

#     return metadata
#     # !curl -k {rest_download_address} --output {output_path}

# def download_restricted_data(token, series_id, output_file_path):
#     rest_download_address = f"https://services.cancerimagingarchive.net/nbia-api/services/v1/getImage?SeriesInstanceUID={series_id}"
#     output_file_path.parent.mkdir(parents=True, exist_ok=True)
#     authorization_request = f'"Authorization:Bearer {token}"'
#     !curl -H {authorization_request} -k {rest_download_address} --output {output_file_path}

# # https://wiki.cancerimagingarchive.net/plugins/personalaccesstokens/usertokens.action
# all_series_ids = [get_list_of_series(tcia_file_path)[0]]
# # all_series_ids = ["1.3.6.1.4.1.9590.100.1.2.374115997511889073021386151921807063992"]

# token = request_token(input("Enter TCIA username"), input("Enter TCIA password"), input("Enter TCIA client_id from helpdesk"), input("Enter TCIA client_secret from helpdesk"))

# for sid in all_series_ids:
#     metadata = get_metadata(token, sid)
#     if not metadata: continue

#     zip_data_path = tcia_data_path.joinpath(metadata['Subject ID'] + ".zip")
#     try:
#         with zipfile.ZipFile(zip_data_path) as zf:
#             pass
#     except (FileNotFoundError, zipfile.BadZipfile):
#         download_restricted_data(token, sid, zip_data_path)
    

### Preprocessing step 1: Run VS_Seg convenient filenames preprocessing

In [11]:
import zipfile

# Extract contours
with zipfile.ZipFile(tcia_contours_path, 'r') as contours_file:
    contours_file.extractall(tcia_contours_path.parent)

# Merge data and contours directories
contours_trailing_path = str(Path(tcia_contours_path.parent, "contours")) + "/"
tcia_convenient_data_trailing_path = str(Path(TCIA_DATA_CONVENIENT_DIR)) + "/"
!rsync -a {contours_trailing_path} {tcia_convenient_data_trailing_path}

In [12]:
CONVENIENT_SCRIPT = Path(THIS_SCRIPT_DIR, "./tools/VS_Seg/preprocessing/TCIA_data_convert_into_convenient_folder_structure.py")
tcia_data_entrypoint = Path(TCIA_DATA_DIR, "VS/Vestibular-Schwannoma-SEG")
!python {CONVENIENT_SCRIPT} --input {tcia_data_entrypoint} --output {TCIA_DATA_CONVENIENT_DIR}

['/share/data_supergrover1/weihsbach/shared_data/tmp/curriculum_deeplab/curriculum_deeplab/preprocessing/tmp/tcia_data/VS/Vestibular-Schwannoma-SEG/LICENSE', '/share/data_supergrover1/weihsbach/shared_data/tmp/curriculum_deeplab/curriculum_deeplab/preprocessing/tmp/tcia_data/VS/Vestibular-Schwannoma-SEG/VS-SEG-001', '/share/data_supergrover1/weihsbach/shared_data/tmp/curriculum_deeplab/curriculum_deeplab/preprocessing/tmp/tcia_data/VS/Vestibular-Schwannoma-SEG/VS-SEG-002', '/share/data_supergrover1/weihsbach/shared_data/tmp/curriculum_deeplab/curriculum_deeplab/preprocessing/tmp/tcia_data/VS/Vestibular-Schwannoma-SEG/VS-SEG-003', '/share/data_supergrover1/weihsbach/shared_data/tmp/curriculum_deeplab/curriculum_deeplab/preprocessing/tmp/tcia_data/VS/Vestibular-Schwannoma-SEG/VS-SEG-004', '/share/data_supergrover1/weihsbach/shared_data/tmp/curriculum_deeplab/curriculum_deeplab/preprocessing/tmp/tcia_data/VS/Vestibular-Schwannoma-SEG/VS-SEG-005', '/share/data_supergrover1/weihsbach/shared

### Preprocessing step 2: Run VS_Seg Slicer.org preprocessing

In [7]:
# Build Slicer.org docker
build_dir = Path(THIS_SCRIPT_DIR, "tools/SlicerDockers")
subprocess.call(build_dir.joinpath("build.sh"), cwd=build_dir)

#1 [internal] load build definition from Dockerfile
#1 sha256:31623e871b26b3b65716d27c66688f5c8eed2f36d4eafbd9c4493d9c433e33a3
#1 transferring dockerfile: 37B 0.0s done
#1 DONE 0.1s

#2 [internal] load .dockerignore
#2 sha256:61fa21e5ef6e9df0d0003d2f22598bb778488a061dd97e7ed285ce6dd4b12e46
#2 transferring context: 2B done
#2 DONE 0.0s

#3 [internal] load metadata for docker.io/stevepieper/x11:latest
#3 sha256:c0789341f0873bc22b92299d0986138851e1fdf9c0d4ad5eb04f3910bd2a78a2
#3 DONE 1.9s

#4 [ 1/23] FROM docker.io/stevepieper/x11:latest@sha256:bcf1410f89f11e3c8693fa2e8d6b352da6c1938bf520895cf1d6340188d37999
#4 sha256:de6b7259924fadff7cfb60df7a5b7074f93c5170c4a27b26f8682a1d0806eb21
#4 DONE 0.0s

#15 [internal] load build context
#15 sha256:414e3f4757c29b8febbed34f0543be8ebeb0b18344ecee32ee7ebcefc771273a
#15 transferring context: 1.12kB done
#15 DONE 0.0s

#18 [14/23] COPY /usr /usr
#18 sha256:8deab03101a283a003753f43e505648052edfed4ec94552530a199779148c433
#18 CACHED

#11 [ 8/23] RUN apt-


Use 'docker scan' to run Snyk tests against images to find vulnerabilities and learn how to fix them


0

In [9]:
SHARED_DIR = Path(THIS_SCRIPT_DIR, "tools/docker_shared")
SHARED_INPUT_DIR = TCIA_DATA_CONVENIENT_DIR
SHARED_INPUT_DIR = Path("/Users/christianweihsbach/tcia_mangling_tmp/tcia_crossmoda_001_250_convenient") # TODO remove
SHARED_OUTPUT_DIR = TCIA_DATA_BASIC_PREPROCESSING_DIR

SLICER_PREPROCESSING_SCRIPT_PATH = Path(THIS_SCRIPT_DIR, "./tools/VS_Seg/preprocessing/data_conversion.py")
!cp {SLICER_PREPROCESSING_SCRIPT_PATH} {SHARED_DIR}

# Run docker container and script. 
# Running this script will take time. Connect to localhost:8080/ to see process
# Run built docker file
docker_cmd = f'docker run -v {SHARED_DIR}:/tmp/shared -v {SHARED_INPUT_DIR}:/tmp/shared_input -v {SHARED_OUTPUT_DIR}:/tmp/shared_output -p 8080:8080 --rm -it deep_staple/slicer'
!{docker_cmd}

2022-04-04 20:06:54,129 CRIT Supervisor running as root (no user in config file)
2022-04-04 20:06:54,131 INFO Included extra file "/etc/supervisord.d/easydav.conf" during parsing
2022-04-04 20:06:54,131 INFO Included extra file "/etc/supervisord.d/gotty.conf" during parsing
2022-04-04 20:06:54,131 INFO Included extra file "/etc/supervisord.d/nginx.conf" during parsing
2022-04-04 20:06:54,131 INFO Included extra file "/etc/supervisord.d/websockify.conf" during parsing
2022-04-04 20:06:54,131 INFO Included extra file "/etc/supervisord.d/x11vnc.conf" during parsing
2022-04-04 20:06:54,131 INFO Included extra file "/etc/supervisord.d/xdummy.conf" during parsing
2022-04-04 20:06:54,132 INFO Included extra file "/etc/supervisord.d/xinitrc.conf" during parsing
2022-04-04 20:06:54,136 INFO Creating socket unix:///var/run/easydav.sock
2022-04-04 20:06:54,137 INFO Closing socket unix:///var/run/easydav.sock
2022-04-04 20:06:54,160 INFO RPC interface 'supervisor' initialized
2022-04-04 20:06:54,1

### Preprocessing step 3: Prepare dataloader file structure

In [23]:
TCIA_TO_CROSSMODA_SCRIPT = Path(THIS_SCRIPT_DIR, "./tools/dataloader_preparation/tcia_to_crossmoda.py")

run_cmd = f"{TCIA_TO_CROSSMODA_SCRIPT} --input {TCIA_DATA_BASIC_PREPROCESSING_DIR} --output {CROSSMODA_DATALOADER_DIR}"
!python {run_cmd}

100%|███████████████████████████████████████| 1434/1434 [06:49<00:00,  3.50it/s]


### Preprocessing step 4: Add dataloader data levels L2 to L4

In [11]:
BUILD_LEVELS_SCRIPT = Path(THIS_SCRIPT_DIR, "tools/dataloader_preparation/build_levels.py")
COCHLEA_CENTERS_PATH = Path(THIS_SCRIPT_DIR, "tools/dataloader_preparation/l3_cochlea_centers.pth")

!python {BUILD_LEVELS_SCRIPT} --input {CROSSMODA_DATALOADER_DIR} --cochlea-centers {COCHLEA_CENTERS_PATH}

Building L2 ...
Processing /Users/christianweihsbach/code/curriculum_deeplab/curriculum_deeplab/preprocessing/tmp/crossmoda_data_dataloader/L1_original/__omitted_labels_target_training__
 32%|█████████████▌                            | 34/105 [01:38<03:18,  2.80s/it]