<a href="https://colab.research.google.com/github/pieper/Notebooks/blob/master/TCIA_to_NII.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook was updated March, 2021 for IDC-v2.  See github.com/pieper/Notebooks for earlier versions.

In [1]:
# Authenticate the gcloud CLI interactively, then make idc-sandbox-000 the default project.
!gcloud auth login
!gcloud config set project idc-sandbox-000


Go to the following link in your browser:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=cLwP5Ucgbe1et220x8V3Ps81J9hO3s&prompt=consent&access_type=offline&code_challenge=0LTdZOaUMOYIy6dQf814EJvTWoDLsBYg4q268Wc60sQ&code_challenge_method=S256

Enter verification code: <redacted>

You are now logged in as [pieper@isomics.com].
Your current project is [None].  You can change this setting by running:
  $ gcloud config set project PROJECT_ID


To take a quick anonymous survey, run:
  $ gcloud survey

Updated property [core/project].

Convert each series into a NIfTI file

In [2]:
# TODO: make a small lib in a github repo


import subprocess

def run(cmd):
  """Execute a command line and return what it wrote to stdout.

  The command is split on whitespace (no shell is involved, so quoting and
  shell metacharacters are not interpreted). Both stdout and stderr are
  captured.

  Args:
    cmd: the command and its arguments as a single string.

  Returns:
    The decoded stdout with leading/trailing whitespace removed.

  Raises:
    subprocess.CalledProcessError: if the command exits with a nonzero status.
  """
  argv = cmd.split()
  completed = subprocess.run(argv, check=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
  output = completed.stdout.decode()
  return output.strip()

def freshToken():
  """Return the access token printed by `gcloud auth print-access-token`."""
  return run('gcloud auth print-access-token')

def makeBucket(bucket, project="idc-sandbox-000"):
  """Create a Cloud Storage bucket with `gsutil mb`, printing the outcome.

  This is best-effort: a failure is reported but never raised, so the cell
  keeps running when the bucket is already there.

  Args:
    bucket: bucket URL, e.g. "gs://my-bucket".
    project: project passed to `gsutil mb -p`.
  """
  try:
    run(f"gsutil mb -p {project} {bucket}")
    print(f'bucket {bucket} created')
  except subprocess.CalledProcessError as error:
    # run() captures stderr, so the reason for the failure is available here.
    details = (error.stderr or b"").decode(errors="replace").strip()
    if "409" in details or "already exists" in details.lower():
      print(f'bucket {bucket} already exists')
    else:
      # Do not mislabel auth/permission/project errors as "already exists".
      print(f'bucket {bucket} could not be created: {details}')

# Make sure the bucket used for the conversion script, logs and results exists.
makeBucket("gs://dev-isomics-idc-nii")

bucket gs://dev-isomics-idc-nii already exists


In [3]:

#
# create a python script for processing
# TODO: this should be pulled from a github repo
#
pythonScript = """


import dicomweb_client
import os
import pydicom
import random
import subprocess
import sys
import time

def run(cmd):
  # Execute a whitespace-separated command line without a shell, echo its
  # captured stdout and stderr, and return the stripped stdout.
  # Raises subprocess.CalledProcessError when the command exits nonzero.
  completed = subprocess.run(cmd.split(), check=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
  out = completed.stdout.decode().strip()
  err = completed.stderr.decode().strip()
  print(out)
  print(err)
  return out

def freshToken():
    # Ask the gcloud CLI for an access token and hand it straight back.
    return run('gcloud auth print-access-token')

def freshClient(url):
    # Build a DICOMwebClient for url that sends a newly issued bearer token.
    # The token is requested once and reused for the header (the previous
    # version asked gcloud twice and threw the first token away).
    token = freshToken()
    headers = {
        "Authorization" : "Bearer %s" % token
    }
    client = dicomweb_client.api.DICOMwebClient(url, headers=headers)
    # retry settings applied to every client handed out by this function
    client.set_http_retry_params(retry = True, max_attempts = 20,
                                 wait_exponential_multiplier = 2,
                                 wait_random_min=100,
                                 wait_random_max=5000)
    return client

def all_studies(client, offset=0, limit=None):
  # Page through client.search_for_studies starting at offset and return the
  # collected studies, truncated to limit entries when limit is given.
  # Paging stops on the first empty page or as soon as limit studies have
  # been collected (no extra request is made once the limit is reached).
  # Note: this replaces the retry parameters on the client that is passed in.

  # be more error tolerant here since there could be contention at startup
  client.set_http_retry_params(retry = True, max_attempts = 100,
                               wait_exponential_multiplier = 2,
                               wait_random_min=100,
                               wait_random_max=3000)

  studies = []
  while limit is None or len(studies) < limit:
    subset = client.search_for_studies(offset=offset)
    if len(subset) == 0:
      break
    studies.extend(subset)
    offset += len(subset)
  return studies[:limit]

def log(message, end="\\n", flush=False):
  # Print message and append the same text to the web-served log page.
  # message is converted with str(); end and flush are passed on to print,
  # and end is also appended to the text written to the file.
  text = str(message)
  print(text, end=end, flush=flush)
  # the with-block closes the file even when the write fails
  with open("/var/www/html/index.html", "a") as fp:
    fp.write(text + end)

idcv2BaseURL = "https://healthcare.googleapis.com/v1beta1/projects/idc-dev-etl/locations/us-central1/datasets"

# command line: <dataset> <dicomstore> <offset> <limit>
dataset = sys.argv[1]
dicomstore = sys.argv[2]
offset = sys.argv[3]
limit = sys.argv[4]

bucket = "gs://dev-isomics-idc-nii"

# a study that has failed this many times is dropped instead of re-queued
maxTries = 5

# False: upload everything in /converted with one parallel gsutil call
# True: upload file by file, logging each copy command
cp = False

idcv2URL = os.path.join(idcv2BaseURL, dataset, "dicomStores", dicomstore, "dicomWeb")
idcv2Client=freshClient(idcv2URL)
log(f"Getting up to {limit} studies starting at {offset}")
studies = all_studies(idcv2Client, offset=int(offset), limit=int(limit))
log(str(len(studies)) + " studies in " + dicomstore)

# failures so far per StudyInstanceUID; kept outside the loop so the count
# survives when a study is put back on the queue
failuresByStudy = {}

while len(studies) > 0:
  study = studies.pop()
  log(f"Processing study {study}")
  idcv2Client=freshClient(idcv2URL)
  studyMetadata = dicomweb_client.api.load_json_dataset(study)
  studyUID = studyMetadata.StudyInstanceUID
  log('processing %s' % studyUID)
  series = idcv2Client.search_for_series(studyUID)
  log("  len(series) = %d" % len(series))
  try:
    for serie in series:
      # new client (and token) for every series - presumably so a long study
      # does not outlive its token; confirm before changing
      idcv2Client=freshClient(idcv2URL)
      # start each series with empty scratch directories
      run("rm -rf /seriesInstances")
      run("rm -rf /converted")
      run("mkdir /seriesInstances")
      run("mkdir /converted")
      seriesMetadata = dicomweb_client.api.load_json_dataset(serie)
      seriesUID = seriesMetadata.SeriesInstanceUID
      instances = idcv2Client.search_for_instances(
        study_instance_uid=studyUID,
        series_instance_uid=seriesUID
      )
      # download every instance of the series as a .dcm file
      lastInstance = None
      for instanceSummary in instances:
        lastInstance = idcv2Client.retrieve_instance(
          study_instance_uid=studyUID,
          series_instance_uid=seriesUID,
          sop_instance_uid=instanceSummary['00080018']['Value'][0]
        )
        log(lastInstance.InstanceNumber, end=", ", flush=True)
        pydicom.dcmwrite("/seriesInstances/%s.dcm" % lastInstance.SOPInstanceUID, lastInstance)
      if lastInstance is None:
        # nothing was downloaded, so there is no header to sample and
        # nothing to convert
        log('no instances in series %s, skipping' % seriesUID)
        continue
      # keep one header per series (bulk data blanked out) next to the result
      sampleInstanceJSON = lastInstance.to_json(bulk_data_element_handler=lambda x: "")
      with open(f"/converted/{seriesUID}-sampleHeader.json", "w") as fp:
        fp.write(sampleInstanceJSON)
      log('done')
      cmd = "./dcm2niix "
      cmd += "-o /converted " # output dir
      cmd += "-f %s " % seriesUID # output format
      cmd += "-c %s " % dataset # comment in nii file
      cmd += "-z o " # optimize compression (with pigz)
      cmd += "/seriesInstances " # source directory
      log(cmd)
      log(run(cmd))
      if cp:
        files = run("find /converted -type f").split('\\n')
        log(files)
        for filePath in files:
          fileName = os.path.basename(filePath)
          cmd = "gsutil cp %s %s/%s/%s/%s/%s" % (filePath, bucket, dataset, dicomstore, studyUID, fileName)
          log(cmd)
          run(cmd)
      else:
        # the wildcard is expanded by gsutil itself (run does not use a shell);
        # the trailing slash makes the destination a folder even when only
        # one file was produced
        cmd = f"gsutil -m cp /converted/* {bucket}/{dataset}/{dicomstore}/{studyUID}/"
        log(cmd)
        run(cmd)

      log('finished series')
  except Exception:
    log(f'error in study {study} ')
    log(sys.exc_info())
    log(sys.exc_info()[0])
    failuresByStudy[studyUID] = failuresByStudy.get(studyUID, 0) + 1
    if failuresByStudy[studyUID] >= maxTries:
      # drop only this study; the remaining studies are still processed
      log(f"Giving up on study {studyUID} after {maxTries} tries")
    else:
      # queue it up to re-run and sleep for a bit
      studies.append(study)
      time.sleep(random.randint(1,5))
  log('finished study')
  log(f'{len(studies)} remaining')
log('finished')

"""
# end of python template

#
# create a startup script template
# - this is configured by the notebook
#   and then passed as metadata to the vm
#   to use as a boot script
# - %%OFFSET%% and %%LIMIT%% are replaced per VM before the script is written
# - the log page is served by apache, so nothing secret may be written to it
#
startupScriptTemplate = """#! /bin/bash

function log() { echo $(date) $* >> /var/www/html/index.html; }


# work
apt-get update
apt-get install -y apache2
chmod a+w /var/www/html/index.html 
echo Processing %%LIMIT%% at %%OFFSET%% > /var/www/html/index.html

  log Starting
# check that a token can be issued, but never write the token itself to the
# web-served log page
if gcloud auth print-access-token > /dev/null 2>&1; then
  log token ok
else
  log token FAILED
fi
gsutil cp gs://dev-isomics-idc-nii/convertStudies.py convertStudies.py
  log Installing
apt-get install -y wget unzip
wget https://github.com/rordenlab/dcm2niix/releases/download/v1.0.20201102/dcm2niix_lnx.zip
unzip dcm2niix_lnx.zip

# add swap
fallocate -l 100G /swapfile
chmod 600 /swapfile
mkswap /swapfile
swapon /swapfile

apt-get install -y git python3-pip
pip3 install git+https://github.com/pieper/dicomweb-client@add-retry

  log running conversion...

python3 convertStudies.py idc v2 %%OFFSET%% %%LIMIT%% >> /var/www/html/index.html

log ------
log syslog
log ------

cat /var/log/syslog >> /var/www/html/index.html

  log done, shutting down

gsutil cp /var/www/html/index.html gs://dev-isomics-idc-nii/idc/v2/convert-log-%%OFFSET%%-%%LIMIT%%.html


## finally have the machine delete itself
# from: https://cloud.google.com/community/tutorials/create-a-self-deleting-virtual-machine
export NAME=$(curl -X GET http://metadata.google.internal/computeMetadata/v1/instance/name -H 'Metadata-Flavor: Google')
export ZONE=$(curl -X GET http://metadata.google.internal/computeMetadata/v1/instance/zone -H 'Metadata-Flavor: Google')
gcloud --quiet compute instances delete $NAME --zone=$ZONE
"""
### end of boot script template

#
# gcloud Templates
#
# - for deleting any lingering machines
# Command templates; %%INSTANCE_NAME%% is substituted for each VM.
# The pieces are joined with two spaces so the resulting command lines are
# character-for-character what the notebook has always produced.

# - removes a VM with the same name left over from an earlier run
deleteTemplate = "  ".join([
  "gcloud compute instances delete %%INSTANCE_NAME%%",
  "--quiet --project idc-sandbox-000 --zone us-central1-a",
])

# - launches the VM that runs the actual job
# - the cloud-platform scope is needed to access the healthcare API
# - make sure the service account is whitelisted for public healthcare dataset access
# -- https://services.google.com/fb/forms/cloudhealthcarepublicdatasetaccess/
runTemplate = "  ".join([
  "gcloud compute instances create %%INSTANCE_NAME%%",
  "--machine-type=e2-custom-2-16384 --async",
  "--boot-disk-size=500GB",
  "--scopes=https://www.googleapis.com/auth/cloud-platform",
  "--service-account=spd-dicom-service@idc-sandbox-000.iam.gserviceaccount.com",
  "--tags http-server",
  "--project idc-sandbox-000",
  "--zone us-central1-a",
  "--metadata-from-file startup-script=startup-%%INSTANCE_NAME%%.sh",
])

# Split the studies evenly across the worker VMs; the last VM also takes the
# remainder so every study is assigned exactly once.
totalStudyCount = 41553 # calculated for v2
instanceCount = 10
studiesPerInstance = totalStudyCount // instanceCount
extraStudies = totalStudyCount % instanceCount


# Write the conversion script and publish it where the boot script fetches it.
# The with-block guarantees the file is flushed and closed before the upload.
with open('convertStudies.py', 'w') as scriptFile:
    scriptFile.write(pythonScript)
run("gsutil cp convertStudies.py gs://dev-isomics-idc-nii/convertStudies.py")

totalStudies = 0
for instanceIndex in range(instanceCount):
    studyLimit = studiesPerInstance + extraStudies if instanceIndex == instanceCount-1 else studiesPerInstance
    studyOffset = studiesPerInstance * instanceIndex
    totalStudies += studyLimit
    instanceName = f"sdp-idc-nii-{instanceIndex}"

    # Remove a VM of the same name left over from a previous run; a failing
    # delete command is taken to mean there was none.
    deleteCommand = deleteTemplate.replace("%%INSTANCE_NAME%%", instanceName)
    try:
        run(deleteCommand)
    except subprocess.CalledProcessError:
        print('nothing to delete')

    # Fill in this VM's slice of the study list and save its boot script,
    # which runTemplate passes to gcloud via --metadata-from-file.
    startupScript = startupScriptTemplate.replace("%%INSTANCE_NAME%%", instanceName).replace("%%OFFSET%%", str(studyOffset)).replace("%%LIMIT%%", str(studyLimit))
    with open(f"startup-{instanceName}.sh", 'w') as startupFile:
        startupFile.write(startupScript)

    runCommand = runTemplate.replace("%%INSTANCE_NAME%%", instanceName)
    print(runCommand)
    print(studyOffset, studyLimit)
    run(runCommand)

# sanity check on the partition arithmetic above
assert totalStudies == totalStudyCount
print(f"\n - Should be {instanceCount} machines running")

nothing to delete
gcloud compute instances create sdp-idc-nii-0  --machine-type=e2-custom-2-16384 --async  --boot-disk-size=500GB  --scopes=https://www.googleapis.com/auth/cloud-platform  --service-account=spd-dicom-service@idc-sandbox-000.iam.gserviceaccount.com  --tags http-server  --project idc-sandbox-000  --zone us-central1-a  --metadata-from-file startup-script=startup-sdp-idc-nii-0.sh
0 4155
nothing to delete
gcloud compute instances create sdp-idc-nii-1  --machine-type=e2-custom-2-16384 --async  --boot-disk-size=500GB  --scopes=https://www.googleapis.com/auth/cloud-platform  --service-account=spd-dicom-service@idc-sandbox-000.iam.gserviceaccount.com  --tags http-server  --project idc-sandbox-000  --zone us-central1-a  --metadata-from-file startup-script=startup-sdp-idc-nii-1.sh
4155 4155
nothing to delete
gcloud compute instances create sdp-idc-nii-2  --machine-type=e2-custom-2-16384 --async  --boot-disk-size=500GB  --scopes=https://www.googleapis.com/auth/cloud-platform  --ser

In [4]:
# List the Compute Engine VMs in the project to check on the sdp-idc-nii-* workers.
!gcloud compute instances list --project idc-sandbox-000

### delete all of them:
# gcloud compute instances delete $(gcloud compute instances list --filter="name~'sdp-idc-nii*'" --format="value(name)")


NAME                             ZONE           MACHINE_TYPE   PREEMPTIBLE  INTERNAL_IP  EXTERNAL_IP      STATUS
af-gpu-feb-temp                  us-central1-a  n1-standard-4               10.128.0.81                   TERMINATED
af-wsi                           us-central1-a  e2-highmem-4                10.128.0.42                   TERMINATED
sdp-april                        us-central1-a  e2-standard-8               10.128.0.16  34.68.87.171     RUNNING
sdp-idc-nii-0                    us-central1-a  e2-highmem-2                10.128.0.23  35.193.173.139   RUNNING
sdp-idc-nii-1                    us-central1-a  e2-highmem-2                10.128.0.24  34.71.39.192     RUNNING
sdp-idc-nii-2                    us-central1-a  e2-highmem-2                10.128.0.26  35.202.227.79    RUNNING
sdp-idc-nii-3                    us-central1-a  e2-highmem-2                10.128.0.27  35.188.213.188   RUNNING
sdp-idc-nii-4                    us-central1-a  e2-highmem-2                10.128.

***Other Experiments***

# Feature Extraction

```
time  gsutil ls gs://dev-isomics-idcv2-nii/\*/\* | tee series

# extract.sh
#!/bin/bash

  echo $*
  gsutil cp $1 .
  export name=$(basename $1)  
  gunzip ${name}
  export name=$(basename -s .gz ${name})  
  ./featExtract1.6/featExtract.ubu -w $name ${name}.key
  gsutil cp ${name}.key gs://dev-isomics-features
  rm ${name}
  rm ${name}.key

cat series | grep .nii.gz | xargs -n 1 -P 4 ./extract.sh

```


In [None]:
# Print the total size of each collection's public bucket, with timestamps.
# NOTE(review): healthcareDatasets() is not defined in any cell visible in this
# notebook - confirm where it comes from before running on a fresh kernel.
# run() is the subprocess helper defined in the earlier setup cell.
print(run("date"))
collections = healthcareDatasets()
print(collections)

for collection in collections:
    print(run("date"))
    print(collection)
    # convert this inner loop into a compute instance launch
    # -u bills the request to idc-sandbox-000; du -sh prints one summarized, human-readable total
    print( run("gsutil -u idc-sandbox-000 du -sh gs://gcs-public-data--healthcare-idcv2-%s" % collection) )

In [None]:
%%bash

# sudo apt-get install parallel

#time  gsutil ls gs://dev-isomics-idcv2-nii/\*/\* | tee series
# 

# Write the per-file worker script.
# The heredoc delimiter is quoted so that $*, $1, $(...) and ${name} are
# written to extract.sh literally; unquoted, this cell's shell would expand
# them (to empty strings) while creating the file.
cat << 'EOF' > extract.sh
#!/bin/bash
# usage: extract.sh gs://bucket/path/to/file.nii.gz
# Downloads one compressed volume, runs the feature extractor on it,
# uploads the resulting .key file and removes the local copies.
  echo $*
  gsutil cp "$1" .
  export name=$(basename "$1")
  gunzip "${name}"
  export name=$(basename -s .gz "${name}")
  ./featExtract1.6/featExtract.ubu -w "${name}" "${name}.key"
  gsutil cp "${name}.key" gs://dev-isomics-features
  rm "${name}"
  rm "${name}.key"

EOF
# the script is invoked as ./extract.sh below, so it must be executable
chmod +x extract.sh

./extract.sh gs://dev-isomics-idcv2-nii/4d-lung/1.3.6.1.4.1.14519.5.2.1.6834.5010.101259051157428667137154755870/1.3.6.1.4.1.14519.5.2.1.6834.5010.110117618917781740286980097865.nii.gz


time cat series | grep ct-lymph | head -100 | grep .nii.gz | parallel -j 4 ./extract.sh

time cat series | grep .nii.gz | xargs -n 1 -P 4  ./extract.sh 2>&1 | tee runlog.txt 
