<a href="https://colab.research.google.com/github/myazann/Lung_Cancer/blob/main/Data_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git init
!git pull https://github.com/myazann/TCIA-API-SDK.git

!cp -r /content/tcia-rest-client-python/src/* /content

!rm -rf tcia-rest-client-java/
!rm -rf tcia-rest-client-python/

!cp /content/drive/MyDrive/Annotation.zip /content
!unzip Annotation.zip > /dev/null
!rm Annotation.zip

!pip install xmltodict
!rm -rf sample_data
!rm sample.py
!rm README.md

!pip install pip install pandasql
from tciaclient import TCIAClient
import json
import os
import numpy as np
from xml.etree import cElementTree as ElementTree
import xmltodict
import urllib.request, urllib.error, urllib.parse
import subprocess
import pandas as pd
import pandasql as ps

# Shared TCIA REST client (v4 endpoint) used for the series, SOP and image queries.
tc = TCIAClient(
  baseUrl="https://services.cancerimagingarchive.net/services/v4",
  resource="TCIA",
)

## Get Annotations, Patients, Series IDs and SOP IDs

In [8]:
## Get the bounding boxes associated with the images
# Walks Annotation/<folder>/<file> and parses every file as an XML annotation.
folders = np.array(os.listdir("Annotation"))

# Maps "<folder>-<file name up to the first '.x'>" to the parsed annotation payload.
bboxs = {}

for folder in folders:
  path = "Annotation/" + folder
  for img in os.listdir(path):
    file_path = path + "/" + img
    try:
      root = ElementTree.parse(file_path).getroot()
      dt = xmltodict.parse(ElementTree.tostring(root))["annotation"]["object"]
      # xmltodict yields a dict for a single <object> and a list for repeated
      # <object> elements. A single object is reduced to its "bndbox"; a list
      # is stored unchanged.
      # NOTE(review): the two stored shapes differ (bare box vs. list of full
      # object records) -- confirm what the consumers of bboxs.json expect
      # before normalising them.
      if not isinstance(dt, list):
        dt = dt["bndbox"]
      img_name = folder + "-" + img.split(".x")[0]
      bboxs[img_name] = dt
    except Exception as e:
      # Best effort: report the file that could not be used and carry on.
      print(f"{file_path}: {e!r}")

with open('bboxs.json', "w") as json_file:
  json.dump(bboxs, json_file)


## Get, for each patient, the ID of the chest CT series with the most images
clctn = "Lung-PET-CT-Dx"
series = json.loads(tc.get_series(collection=clctn, modality="CT").read())

# patient id -> (series UID, image count) of the largest chest series seen so far.
# The series UID and the image count must come from the same record: taking the
# two maxima independently would pair a patient with an unrelated series UID.
largest_series = {}

for elem in series:
  try:
    if elem.get("BodyPartExamined") != "CHEST":
      continue
    # Read every field before updating the result so that a malformed record
    # is skipped as a whole instead of leaving partial data behind.
    patient = elem["PatientID"].split("-")[1]
    series_uid = elem["SeriesInstanceUID"]
    count = int(elem["ImageCount"])
  except (AttributeError, IndexError, KeyError, TypeError, ValueError) as e:
    print(f"Skipping series record: {e!r}")
    continue
  # Strict comparison: on a tie the first series encountered is kept.
  if patient not in largest_series or count > largest_series[patient][1]:
    largest_series[patient] = (series_uid, count)

# Plain {patient id: series UID} mapping, ordered by patient id.
ptnt_to_series_ids = {p: largest_series[p][0] for p in sorted(largest_series)}

## Get the image IDs (SOP instance UIDs) of each selected series,
## keeping only the images that have a bounding-box annotation
# bboxs keys are "<folder>-<image name>"; the part after the first "-" is
# compared against the SOP instance UIDs below.
imgs_with_bboxs = [box.split("-")[1] for box in bboxs.keys()]
annotated_imgs = set(imgs_with_bboxs)  # set: constant-time membership tests

# SOP instance UID -> UID of the series it belongs to
sop_to_series_ids = {}

for series_uid in ptnt_to_series_ids.values():
  sop = json.loads(tc.get_SOP_instance(series_uid).read())
  for instance in sop:
    sop_uid = instance["SOPInstanceUID"]
    if sop_uid in annotated_imgs:
      sop_to_series_ids[sop_uid] = series_uid

with open('ptnt_to_series_ids.json', 'w') as json_file:
  json.dump(ptnt_to_series_ids, json_file)

with open('sop_to_series_ids.json', 'w') as json_file:
  json.dump(sop_to_series_ids, json_file)

## Download images

In [None]:
# Restore previously downloaded images from the Google Drive backup, if any.
if not os.path.exists("images"):
  os.mkdir("images")
  if os.path.exists("./drive/MyDrive/images.zip"):
    subprocess.call(["cp", "./drive/MyDrive/images.zip", "."])
    # The backup is created below with `zip -r images.zip images`, so its
    # entries already start with "images/". Extract into the current directory;
    # extracting into images/ would produce images/images/... and the restored
    # files would not be found.
    subprocess.call(["unzip", "images.zip", "-d", "."])

# Images already on disk are not downloaded again.
existing_images = [img.split(".d")[0] for img in os.listdir("images")]
already_downloaded = set(existing_images)  # set: constant-time membership tests
sop_to_series_ids = {
  img: series_uid
  for img, series_uid in sop_to_series_ids.items()
  if img not in already_downloaded
}

for img, series_uid in sop_to_series_ids.items():
  download_path = "./images/" + img + ".dcm"
  tc.get_single_image(series_uid, img, downloadPath=download_path)
  # Checkpoint to Drive whenever the number of entries in images/ reaches a multiple of 1000.
  if len(os.listdir("images")) % 1000 == 0:
    print("Saving images!")
    subprocess.call(["zip", "-r", "images.zip", "images"])
    subprocess.call(["mv", "images.zip", "./drive/MyDrive/"])


# Final backup. zip's output is discarded through stdout=DEVNULL: with an
# argument list no shell is involved, so ">" and "/dev/null" would be handed to
# zip as file names instead of acting as a redirection.
subprocess.call(["zip", "-r", "images.zip", "images"], stdout=subprocess.DEVNULL)
subprocess.call(["mv", "images.zip", "./drive/MyDrive/"])