In [3]:
import os
import numpy as np
import pandas as pd
from google.cloud import bigquery

In [4]:
PROJECT_ID = ""

if not os.getenv("IS_TESTING"):
    # Get your Google Cloud project ID from gcloud
    shell_output=!gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID: ", PROJECT_ID)

Project ID:  medical-imaging-ai


In [6]:
# IDC collection whose DICOM instances we want to list.
collection_ID = "lctsc"
bq_client = bigquery.Client(project=PROJECT_ID)

# Pick 10 distinct studies from the collection, then list every instance
# (with its GCS URL) belonging to those studies.
# NOTE: LIMIT without ORDER BY does not guarantee which 10 studies are picked,
# so the selection may differ between runs.
# The collection is passed as a named query parameter (@collection_id) rather
# than being interpolated into the SQL text.
selection_query = """
WITH studies AS (
  SELECT DISTINCT StudyInstanceUID
  FROM `bigquery-public-data.idc_current.dicom_all`
  WHERE collection_id = @collection_id
  LIMIT 10
)
SELECT
  StudyInstanceUID,
  SeriesInstanceUID,
  SOPInstanceUID,
  gcs_url
FROM
  `bigquery-public-data.idc_current.dicom_all`
INNER JOIN studies USING (StudyInstanceUID)
"""

selection_job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter("collection_id", "STRING", collection_ID),
    ]
)

# Run the query and materialize the result as a pandas DataFrame.
selection_result = bq_client.query(selection_query, job_config=selection_job_config)
selection_df = selection_result.result().to_dataframe()

In [None]:
# Lift the column-width cap so long values (e.g. the gcs_url strings) are shown
# in full instead of being truncated with "...".
pd.options.display.max_colwidth = None

selection_df

In [8]:
# Save the list of GCS URLs into a manifest file, one URL per line.
idc_download_folder = "./IDC_downloads"
# exist_ok=True keeps this cell safe to re-run when the folder already exists.
os.makedirs(idc_download_folder, exist_ok=True)

selection_manifest = os.path.join(idc_download_folder, "lctsc_manifest.txt")
# No header and no index: the file must contain nothing but the URLs.
selection_df["gcs_url"].to_csv(selection_manifest, header=False, index=False)

In [10]:
# Build the staging bucket URL and the staging directory inside it.
# (This cell only derives the names; nothing is created here.)
bucket_name = f"gs://{PROJECT_ID}_lab1"
print("Bucket Name: ", bucket_name)

stage_dir = f"{bucket_name}/LCTSC"
print("Staging Directory: ", stage_dir)

Bucket Name:  gs://medical-imaging-ai_lab1
Staging Directory:  gs://medical-imaging-ai_lab1/LCTSC


In [None]:
#create the bucket
!gsutil mb $bucket_name

In [11]:
# copy files to staging area
# download is this simple!
!cat ./IDC_downloads/lctsc_manifest.txt | gsutil -m cp -I $stage_dir

Copying gs://public-datasets-idc/9f0e2b4c-28b3-400d-9f9a-a5e5cf0eda42.dcm [Content-Type=application/dicom]...
Copying gs://public-datasets-idc/e45977aa-91e1-4128-8f9b-99dcb584e174.dcm [Content-Type=application/dicom]...
Copying gs://public-datasets-idc/09213a10-090c-4b29-a36b-b7f2e9fbb679.dcm [Content-Type=application/dicom]...
Copying gs://public-datasets-idc/f2227611-e4ba-427c-9b16-0b6a8e86a244.dcm [Content-Type=application/dicom]...
Copying gs://public-datasets-idc/9420ab6a-9290-4af0-88b9-d5860116de34.dcm [Content-Type=application/dicom]...
Copying gs://public-datasets-idc/fe617655-9b3b-49a2-b94a-68b602f5f18c.dcm [Content-Type=application/dicom]...
Copying gs://public-datasets-idc/acc74947-4b98-4243-a479-10b3976dd234.dcm [Content-Type=application/dicom]...
Copying gs://public-datasets-idc/8ac8c2c6-00f6-488b-ba6f-44b637ed1b2a.dcm [Content-Type=application/dicom]...
Copying gs://public-datasets-idc/3d632b11-fe8a-45e3-b873-aac04fbbb27e.dcm [Content-Type=application/dicom]...
Copying gs