<a href="https://colab.research.google.com/github/myazann/Lung_Cancer/blob/main/Data_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment setup for a Colab session: fetch the TCIA REST client, copy the
# annotation data from Google Drive, and install the Python dependencies.
# NOTE(review): the commands mix relative paths with /content paths, so they
# presumably assume the working directory is /content — confirm.

# Pull the TCIA SDK repository into the current directory and copy the Python
# client sources to /content (presumably so `tciaclient` can be imported later).
!git init
!git pull https://github.com/myazann/TCIA-API-SDK.git
!cp -r /content/tcia-rest-client-python/src/* /content

# The SDK checkouts are not needed once the sources have been copied.
!rm -rf tcia-rest-client-java/
!rm -rf tcia-rest-client-python/

# Copy the annotation archive and the cached JSON files from Google Drive, then
# unpack the archive quietly and delete it.
# NOTE(review): assumes Drive is already mounted at /content/drive — confirm.
!cp /content/drive/MyDrive/Annotation.zip /content
!cp /content/drive/MyDrive/bboxs.json /content
!cp /content/drive/MyDrive/ptnt_to_series_ids.json /content
!unzip Annotation.zip > /dev/null
!rm Annotation.zip

# Third-party packages used by the notebook (versions are not pinned).
!pip install pydicom
!pip install xmltodict
!pip install pandasql
!pip install med2image

# Remove files that the notebook does not use.
!rm -rf sample_data
!rm sample.py
!rm README.md

In [2]:
import itertools
import json
import os
import shutil
import subprocess
import urllib.request, urllib.error, urllib.parse

# xml.etree.cElementTree was removed in Python 3.9; fall back to the regular
# ElementTree module, which uses the C accelerator automatically.
try:
  from xml.etree import cElementTree as ElementTree
except ImportError:
  from xml.etree import ElementTree

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandasql as ps
import xmltodict
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from matplotlib.figure import Figure
from PIL import Image
from pydicom import dcmread
from sklearn.model_selection import train_test_split

from tciaclient import TCIAClient


# Shared TCIA REST client; later cells use it to list series and download images.
TCIA_BASE_URL = "https://services.cancerimagingarchive.net/services/v4"
tc = TCIAClient(baseUrl=TCIA_BASE_URL, resource="TCIA")

## Get Bounding Boxes

In [3]:
## Get the bounding boxes associated with the images
#
# Result: `bboxs` maps an annotation file's base name (file name without its
# extension) to the annotation's "bndbox" dict, extended with two keys:
#   "class"   - first character of the annotation folder name
#   "patient" - the annotation folder name
# Annotation files that contain more than one object are skipped.

if os.path.exists("bboxs.json"):

  # Reuse the cached result instead of re-parsing every annotation file.
  with open("bboxs.json") as file:
    bboxs = json.load(file)

else:

  bboxs = {}

  # Sorted so that the parse order (and the written JSON) is the same on every run.
  for folder in sorted(os.listdir("Annotation")):
    path = os.path.join("Annotation", folder)
    if not os.path.isdir(path):
      # A stray file next to the patient folders would make os.listdir raise.
      continue

    for img in sorted(os.listdir(path)):
      file_path = os.path.join(path, img)
      try:
        root = ElementTree.parse(file_path).getroot()
        dt = xmltodict.parse(ElementTree.tostring(root))["annotation"]["object"]
        if isinstance(dt, list):
          # xmltodict returns a list when there are several <object> elements.
          continue

        dt = dt["bndbox"]
        dt["class"] = folder[0]
        dt["patient"] = folder
        img_name = os.path.splitext(img)[0]
        bboxs[img_name] = dt
      except Exception as e:
        # Best effort: report which file was unusable and keep going.
        print(f"Skipping {file_path}: {e!r}")

  with open("bboxs.json", "w") as json_file:
    json.dump(bboxs, json_file)


## Get Annotations, Patients, Series IDs and SOP IDs

In [4]:
## Get the series id of each patient with the most images

if os.path.exists("ptnt_to_series_ids.json"):
  with open("ptnt_to_series_ids.json") as file:
    ptnt_to_series_ids = json.load(file)

else:
  clctn = "Lung-PET-CT-Dx"
  series = json.loads(tc.get_series(collection=clctn, modality="CT").read())
  ptnt_to_series_ids = pd.DataFrame()

  pt_id = []
  series_id = []
  img_count = []

  for elem in series:
    try:
      if elem["BodyPartExamined"] == "CHEST":
        pt_id.append(elem["PatientID"].split("-")[1])
        series_id.append(elem["SeriesInstanceUID"])
        img_count.append(elem["ImageCount"])
    except Exception as e:
      print(e)

  ptnt_to_series_ids["Patient_ID"] = pt_id
  ptnt_to_series_ids["Series_ID"] = series_id
  ptnt_to_series_ids["Image_Count"] = img_count

  ptnt_to_series_ids = ps.sqldf(
                                 """SELECT Patient_ID,MAX(Image_Count) Image_Count ,
                                    MAX(Series_ID) Series_ID FROM ptnt_to_series_ids
                                    GROUP BY Patient_ID"""
                                    )

  ptnts_with_bboxs = [box.split("-")[0] for box in bboxs.keys()]
  ptnt_to_series_ids = ptnt_to_series_ids.loc[ptnt_to_series_ids["Patient_ID"].isin(ptnts_with_bboxs)]

  _, ptnt_to_series_ids = train_test_split(ptnt_to_series_ids, test_size=0.25, random_state=0)

  ptnt_to_series_ids = dict((p,s) for p,s in ptnt_to_series_ids[["Patient_ID","Series_ID"]].values)

  with open('ptnt_to_series_ids.json', 'w') as json_file:
    json.dump(ptnt_to_series_ids, json_file)

  !cp ptnt_to_series_ids.json /content/drive/MyDrive

## Download images

In [None]:
# Download and unpack the selected series of every patient into
# dcm_images/<patient>/. A patient that already has a folder is treated as
# downloaded, which lets an interrupted run resume where it stopped.

os.makedirs("dcm_images", exist_ok=True)

downloaded_patients = set(os.listdir("dcm_images"))
counter = len(downloaded_patients)
ptnt_to_series_ids_nd = {
    p: s for p, s in ptnt_to_series_ids.items() if p not in downloaded_patients
}

for ptnt, series_uid in ptnt_to_series_ids_nd.items():

  print(ptnt)
  ptnt_folder = "dcm_images/" + ptnt
  zip_folder = ptnt_folder + "/" + ptnt + ".zip"
  os.mkdir(ptnt_folder)

  try:
    tc.get_image(series_uid, ptnt_folder, ptnt + ".zip")
    subprocess.check_call(["unzip", zip_folder, "-d", ptnt_folder])
    os.remove(zip_folder)
  except BaseException:
    # A failed or interrupted download (including a manual kernel interrupt)
    # must not leave a folder behind, otherwise the resume logic above would
    # count this patient as done and never retry it.
    shutil.rmtree(ptnt_folder, ignore_errors=True)
    raise

  counter += 1

  print(counter)


## Convert DICOM files to JPG images

In [None]:
# Convert every DICOM slice that has a bounding box into an 8-bit grayscale JPG
# stored as jpg_images/<patient>/<SOPInstanceUID>.jpg.

os.makedirs("jpg_images", exist_ok=True)

for ptnt in os.listdir("dcm_images"):
  print(ptnt)
  for dcm in os.listdir("dcm_images/" + ptnt):
    file_name = "dcm_images/" + ptnt + "/" + dcm
    try:
      dcm_file = dcmread(file_name)

      # Only slices with an annotation and an ORIGINAL image type are converted.
      if dcm_file.SOPInstanceUID not in bboxs or "ORIGINAL" not in dcm_file.ImageType:
        continue

      pixel_array = dcm_file.pixel_array
      if pixel_array.ndim == 3:
        pixel_array = cv2.cvtColor(pixel_array, cv2.COLOR_RGB2GRAY)

      # Min-max scale to [0, 255]. Work in float so that the subtraction cannot
      # overflow the native integer dtype of the pixel data.
      pixel_array = pixel_array.astype(np.float64)
      lowest = pixel_array.min()
      highest = pixel_array.max()
      if highest > lowest:
        scaled = 255.0 * (pixel_array - lowest) / (highest - lowest)
      else:
        # A constant image has no range to stretch; avoid dividing by zero.
        scaled = np.zeros_like(pixel_array)
      jpg_pixels = np.rint(scaled).astype(np.uint8)

      os.makedirs("jpg_images/" + ptnt, exist_ok=True)
      new_path = "jpg_images/" + ptnt + "/" + dcm_file.SOPInstanceUID + ".jpg"

      # cv2.imwrite reports failure through its return value, not an exception.
      if not cv2.imwrite(new_path, jpg_pixels):
        print(f"Could not write {new_path}")

    except Exception as e:
      # Best effort: report the unreadable file and continue with the next one.
      print(f"Skipping {file_name}: {e!r}")

## Split files into train and validation sets (75%-25%). A patient in the training set must not appear in the validation set, and every class should be split in the same proportion

In [None]:
# Patient-level, per-class train/validation split, followed by archiving both
# sets to Google Drive.
# NOTE(review): relative paths are used throughout, so the working directory is
# assumed to be /content with Drive mounted at ./drive — confirm.

VAL_FRACTION = 0.25  # share of each class that goes to the validation set
SPLIT_SEED = 0       # fixed seed so that the split is reproducible
split_rng = np.random.default_rng(SPLIT_SEED)

# Group patients by class. The class is the first character of the patient ID,
# the same rule used for the "class" field of the bounding boxes.
ptnt_classes = {}
for ptnt in sorted(ptnt_to_series_ids.keys()):
  ptnt_classes.setdefault(ptnt[:1], []).append(ptnt)

train_ptnts = []
val_ptnts = []

for cls in ptnt_classes.keys():
  permutated_list = [str(p) for p in split_rng.permutation(ptnt_classes[cls])]
  val_len = int(len(permutated_list) * VAL_FRACTION)

  val_ptnts.extend(permutated_list[0:val_len])
  train_ptnts.extend(permutated_list[val_len:])


os.makedirs("lung_ct_train", exist_ok=True)
os.makedirs("lung_ct_val", exist_ok=True)

# Move each patient folder into its split. Any folder that is not assigned to
# the training set (including patients unknown to the split) goes to validation.
train_lookup = set(train_ptnts)
for ptnt in os.listdir("jpg_images"):
  cur_path = os.path.join("jpg_images", ptnt)
  split_dir = "lung_ct_train" if ptnt in train_lookup else "lung_ct_val"
  new_path = os.path.join(split_dir, ptnt)
  os.rename(cur_path, new_path)


# Archive both splits and move the archives to Drive. Output is silenced with
# stdout=DEVNULL: without a shell, ">" and "/dev/null" would be handed to zip as
# file names instead of acting as a redirection.
for split_dir in ("lung_ct_train", "lung_ct_val"):
  archive_name = split_dir + ".zip"
  subprocess.check_call(["zip", "-r", archive_name, split_dir], stdout=subprocess.DEVNULL)
  subprocess.check_call(["mv", archive_name, "./drive/MyDrive/"])