# Cortar as imagens

A partir de um arquivo CSV (localizado em `data_csv/`), esse script identifica a região de interesse de todos os nódulos para cada fase (A, B e C) e recorta as imagens. 

Essa fase de pré-processamento é necessária para melhorar a acurácia da rede de predição proposta no trabalho. 

In [None]:
# Mount Google Drive in the Colab runtime at /content/gdrive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os
import pandas as pd
import numpy as np
from glob import glob
import matplotlib.pyplot as plt
#from PIL import Image
#import cv2
from tqdm import tqdm
#import math

In [None]:
pip install pydicom

Collecting pydicom
[?25l  Downloading https://files.pythonhosted.org/packages/f4/15/df16546bc59bfca390cf072d473fb2c8acd4231636f64356593a63137e55/pydicom-2.1.2-py3-none-any.whl (1.9MB)
[K     |▏                               | 10kB 13.4MB/s eta 0:00:01[K     |▍                               | 20kB 18.8MB/s eta 0:00:01[K     |▌                               | 30kB 23.3MB/s eta 0:00:01[K     |▊                               | 40kB 27.0MB/s eta 0:00:01[K     |▉                               | 51kB 28.2MB/s eta 0:00:01[K     |█                               | 61kB 29.8MB/s eta 0:00:01[K     |█▏                              | 71kB 30.7MB/s eta 0:00:01[K     |█▍                              | 81kB 31.4MB/s eta 0:00:01[K     |█▋                              | 92kB 31.5MB/s eta 0:00:01[K     |█▊                              | 102kB 32.1MB/s eta 0:00:01[K     |██                              | 112kB 32.1MB/s eta 0:00:01[K     |██                              | 122kB 32.1

In [None]:
import pydicom

## 1) Lendo o CSV para cada fase

Abrir os arquivos CSV, e interpretar o valor contido nas tabelas.

In [None]:
# Paths of the crop-region CSV files, one per phase (A, B and C)
csv_node_A = 'gdrive/Shareddrives/MO286 - Visualização de Dados/data_csv/crop_node_a.csv'
csv_node_B = 'gdrive/Shareddrives/MO286 - Visualização de Dados/data_csv/crop_node_b.csv'
csv_node_C = 'gdrive/Shareddrives/MO286 - Visualização de Dados/data_csv/crop_node_c.csv'

In [None]:
# Load the CSV of each phase into its own pandas DataFrame
nod_A = pd.read_csv(csv_node_A)
nod_B = pd.read_csv(csv_node_B)
nod_C = pd.read_csv(csv_node_C)

In [None]:
type(nod_A)

pandas.core.frame.DataFrame

In [None]:
nod_A.head()

Unnamed: 0,num_nod,tam_nod_tc_cm,invasao_microvascular,paciente_id,x0,y0,x1,y1,s0,s1
0,N1,1.9,1,88,,,,,12.0,20.0
1,N2,3.5,1,88,,,,,30.0,40.0
2,N3,1.7,0,88,,,,,21.0,27.0
3,N4,2.1,1,6,,,,,3.0,8.0
4,N6,4.1,0,6,,,,,24.0,36.0


In [None]:
nod_A.iloc[10:20] 

Unnamed: 0,num_nod,tam_nod_tc_cm,invasao_microvascular,paciente_id,x0,y0,x1,y1,s0,s1
10,N15,1.3,0,62,87.0,257.0,140.0,300.0,24.0,40.0
11,N16,3.2,0,33,111.0,181.0,166.0,235.0,8.0,30.0
12,N18,3.0,0,138,81.0,230.0,131.0,274.0,14.0,38.0
13,N19,1.1,0,108,111.0,193.0,153.0,230.0,7.0,28.0
14,N20,1.5,0,108,102.0,175.0,154.0,241.0,11.0,28.0
15,N21,1.3,1,108,123.0,260.0,169.0,294.0,16.0,35.0
16,N22,2.6,0,144,115.0,209.0,167.0,253.0,6.0,16.0
17,N23,3.5,0,143,181.0,240.0,234.0,285.0,14.0,26.0
18,N24,3.1,0,66,142.0,263.0,190.0,293.0,10.0,17.0
19,N25,4.7,0,25,167.0,136.0,239.0,184.0,42.0,63.0


## 2) Encontrar as imagens correspondentes para cada ROI

Uma vez que sabemos as regiões, devemos associá-las às imagens que devem ser recortadas.

In [None]:
# Check that every phase lists the same number of nodules
if len({len(nod_A), len(nod_B), len(nod_C)}) > 1:
  # At least one phase disagrees: report the count of each phase
  print('Aviso: o número de nódulos está diferente para cada fase!')
  print('Número de nódulos: \n\tFase A:', len(nod_A))
  print('\tFase B:', len(nod_B))
  print('\tFase C:', len(nod_C))
else:
  print('Número de nódulos:', len(nod_A))

Número de nódulos: 200


In [None]:
# For each nodule, check that all the required information is present

print('Fase A - Arterial')
nod_list_A = []    # nodules whose ROI and slice range could be read
empty_list_A = []  # row positions of nodules with missing values
nan_cnt = 0
for idx in range(len(nod_A)):
  nod = nod_A.iloc[idx]
  try:
    # int() raises ValueError on NaN, which is how incomplete rows are detected
    roi = {field: int(nod[field])
           for field in ('x0', 'y0', 'x1', 'y1', 's0', 's1')}
    # NOTE(review): bool() of a missing (NaN) flag is True - confirm the
    # invasao_microvascular column has no gaps.
    nod_list_A.append({
        'paciente_id': int(nod['paciente_id']),
        'invasao_microvascular': bool(nod['invasao_microvascular']),
        'num_nod': nod['num_nod'],
        **roi,
    })
  except ValueError:
    empty_list_A.append(idx)
    nan_cnt += 1

print('')
if nan_cnt > 0:
  print(f"Aviso: Número de nódulos vazios: {nan_cnt}")


Fase A - Arterial

Aviso: Número de nódulos vazios: 178


In [None]:
# Approach 1: walk the original DataFrame and skip the rows flagged as empty
print('\nAbordagem 1: empty list')
skipped = set(empty_list_A)
for idx in range(len(nod_A)):
  if idx in skipped:
    continue
  row = nod_A.iloc[idx]
  print(f"{idx}\t{row['num_nod']}"
        f"\tslices:{row['s0']}-{row['s1']}"
        f"\tpx-init:({row['x0']},{row['y0']})"
        f"\tpx-final:({row['x1']},{row['y1']})")


Abordagem 1: empty list
8	N11	slices:16.0-29.0	px-init:(183.0,138.0)	px-final:(238.0,180.0)
9	N12	slices:8.0-31.0	px-init:(71.0,238.0)	px-final:(105.0,280.0)
10	N15	slices:24.0-40.0	px-init:(87.0,257.0)	px-final:(140.0,300.0)
11	N16	slices:8.0-30.0	px-init:(111.0,181.0)	px-final:(166.0,235.0)
12	N18	slices:14.0-38.0	px-init:(81.0,230.0)	px-final:(131.0,274.0)
13	N19	slices:7.0-28.0	px-init:(111.0,193.0)	px-final:(153.0,230.0)
14	N20	slices:11.0-28.0	px-init:(102.0,175.0)	px-final:(154.0,241.0)
15	N21	slices:16.0-35.0	px-init:(123.0,260.0)	px-final:(169.0,294.0)
16	N22	slices:6.0-16.0	px-init:(115.0,209.0)	px-final:(167.0,253.0)
17	N23	slices:14.0-26.0	px-init:(181.0,240.0)	px-final:(234.0,285.0)
18	N24	slices:10.0-17.0	px-init:(142.0,263.0)	px-final:(190.0,293.0)
19	N25	slices:42.0-63.0	px-init:(167.0,136.0)	px-final:(239.0,184.0)
20	N26	slices:6.0-19.0	px-init:(173.0,139.0)	px-final:(219.0,169.0)
21	N27	slices:11.0-38.0	px-init:(75.0,167.0)	px-final:(170.0,240.0)
22	N28	slices:32.0-3

In [None]:
# Approach 2: list only the dictionaries built from fully filled rows
print('\nAbordagem 2: criar um novo dicionário, apenas com os dados preenchidos')
for idx, entry in enumerate(nod_list_A):
  print(f"{idx}\t{entry['num_nod']}"
        f"\t{entry['paciente_id']}"
        f"\tslices:{entry['s0']}-{entry['s1']}"
        f"\tpx-init:({entry['x0']},{entry['y0']})"
        f"\tpx-final:({entry['x1']},{entry['y1']})")


Abordagem 2: criar um novo dicionário, apenas com os dados preenchidos
0	N11	116	slices:16-29	px-init:(183,138)	px-final:(238,180)
1	N12	124	slices:8-31	px-init:(71,238)	px-final:(105,280)
2	N15	62	slices:24-40	px-init:(87,257)	px-final:(140,300)
3	N16	33	slices:8-30	px-init:(111,181)	px-final:(166,235)
4	N18	138	slices:14-38	px-init:(81,230)	px-final:(131,274)
5	N19	108	slices:7-28	px-init:(111,193)	px-final:(153,230)
6	N20	108	slices:11-28	px-init:(102,175)	px-final:(154,241)
7	N21	108	slices:16-35	px-init:(123,260)	px-final:(169,294)
8	N22	144	slices:6-16	px-init:(115,209)	px-final:(167,253)
9	N23	143	slices:14-26	px-init:(181,240)	px-final:(234,285)
10	N24	66	slices:10-17	px-init:(142,263)	px-final:(190,293)
11	N25	25	slices:42-63	px-init:(167,136)	px-final:(239,184)
12	N26	52	slices:6-19	px-init:(173,139)	px-final:(219,169)
13	N27	126	slices:11-38	px-init:(75,167)	px-final:(170,240)
14	N28	126	slices:32-37	px-init:(79,309)	px-final:(116,340)
15	N29	26	slices:43-53	px-init:(147,22

In [None]:
print('Fase B - Portal')

Fase B - Portal


In [None]:
print('Fase C - Equilibrio / Excretor')

Fase C - Equilibrio / Excretor


## 3) Recortar as imagens

*   Abrir as imagens em DICOM
*   Converter para NUMPY
*   Recortar as matrizes



In [None]:
data_img = 'gdrive/Shareddrives/MO286 - Visualização de Dados/data_images/CASOS_CHC_DICOM/'

In [None]:
# Associate each region of interest with the images to be cropped (work in progress)
print('Fase A - Arterial')

stat_sli = []   # number of selected slices per nodule
stat_area = []  # ROI area in pixels per nodule

for idx, entry in enumerate(nod_list_A):
  # Phase-A folder of the patient this nodule belongs to
  data_img_A = f"{data_img}{entry['paciente_id']}/A/"
  print(f"{entry['num_nod']} directory: {data_img_A}")

  # Keep the files whose number lies strictly between s0 and s1.
  # NOTE(review): both bounds are exclusive, so slices s0 and s1 themselves
  # are left out - confirm this is intended.
  slices = [
      fname
      for _root, _dirs, fnames in os.walk(data_img_A)
      for fname in fnames
      if entry['s0'] < int(fname.replace('imagem', '').replace('.dcm', '')) < entry['s1']
  ]
  print("Slices list:", slices)
  num_sli = len(slices)
  print(f"Number of slices: {num_sli}")

  # Region of interest as (first line, last line, first column, last column)
  first_row, last_row = entry['y0'], entry['y1']
  first_col, last_col = entry['x0'], entry['x1']
  crop_px = (first_row, last_row, first_col, last_col)
  # NOTE(review): the area is negative when exactly one of the two
  # differences is negative (inverted coordinates in the CSV).
  area = (last_row - first_row) * (last_col - first_col)
  print(f"Region of interest: {crop_px} -> Total area={area}-px")

  # statistics
  stat_sli.append(num_sli)
  stat_area.append(area)
  print("")

  # TODO: crop the images


Fase A - Arterial
N11 directory: gdrive/Shareddrives/MO286 - Visualização de Dados/data_images/CASOS_CHC_DICOM/116/A/
Slices list: ['imagem19.dcm', 'imagem26.dcm', 'imagem20.dcm', 'imagem23.dcm', 'imagem24.dcm', 'imagem27.dcm', 'imagem28.dcm', 'imagem25.dcm', 'imagem18.dcm', 'imagem17.dcm', 'imagem22.dcm', 'imagem21.dcm']
Number of slices: 12
Region of interest: (138, 180, 183, 238) -> Total area=2310-px

N12 directory: gdrive/Shareddrives/MO286 - Visualização de Dados/data_images/CASOS_CHC_DICOM/124/A/
Slices list: ['imagem12.dcm', 'imagem10.dcm', 'imagem11.dcm', 'imagem9.dcm', 'imagem26.dcm', 'imagem30.dcm', 'imagem24.dcm', 'imagem29.dcm', 'imagem13.dcm', 'imagem18.dcm', 'imagem20.dcm', 'imagem25.dcm', 'imagem16.dcm', 'imagem19.dcm', 'imagem15.dcm', 'imagem14.dcm', 'imagem22.dcm', 'imagem27.dcm', 'imagem21.dcm', 'imagem23.dcm', 'imagem17.dcm', 'imagem28.dcm']
Number of slices: 22
Region of interest: (238, 280, 71, 105) -> Total area=1428-px

N15 directory: gdrive/Shareddrives/MO2

In [None]:
# Summary of the per-nodule statistics collected in stat_sli and stat_area.
# The averages use floor division (//), so they are rounded down.
print(f"Minimum slices = {min(stat_sli)}")
print(f"Maximum slices = {max(stat_sli)}")
print(f"Average slices = {sum(stat_sli)//len(stat_sli)}")
print("")
# NOTE(review): the recorded output shows a negative minimum area, which means
# one of the coordinate differences is negative for some nodule - check the CSV.
print(f"Minimum area = {min(stat_area)}")
print(f"Maximum area = {max(stat_area)}")
print(f"Average area = {sum(stat_area)//len(stat_area)}")

Minimum slices = 4
Maximum slices = 26
Average slices = 13

Minimum area = -2250
Maximum area = 6935
Average area = 1902


In [None]:
nod_list_A[idx]

{'invasao_microvascular': False,
 'num_nod': 'N35',
 'paciente_id': 36,
 's0': 29,
 's1': 40,
 'x0': 100,
 'x1': 137,
 'y0': 279,
 'y1': 292}

#### TESTES Cortando as Matrizes

In [None]:
def crop_img(xinterval, yinterval, sliceinterval, pacient_id):
  """List the slices selected for one nodule in a patient's phase-A folder.

  Work in progress: nothing is cropped yet, the function only prints each
  matching folder and delegates the slice listing to list_slices.

  Args:
    xinterval: 'start:end' string for the x range (currently unused).
    yinterval: 'start:end' string for the y range (currently unused).
    sliceinterval: 'start:end' string with the slice range of the nodule.
    pacient_id: patient identifier used to build the folder path.
  """
  path_img = data_img + str(pacient_id) + '/A/'

  for sli in glob(path_img):
    print(sli)
    # list_slices expects the slice interval first and the image path second;
    # passing the folder path as the interval made its int() parsing fail.
    list_slices(sliceinterval, sli)
    

def list_slices(sliceinterval, imgpath):
  """Print the slice numbers selected by a 'start:end' interval string.

  Debug helper: it parses the interval, prints the parsed values together
  with imgpath, prints the matching portion of the 1..299 slice numbers
  twice, and then prints every integer from start up to end - 1.

  Args:
    sliceinterval: interval in the form 'start:end'.
    imgpath: path printed next to the parsed interval.
  """
  sep = sliceinterval.find(':')
  first = int(sliceinterval[:sep])
  last = int(sliceinterval[sep + 1:])

  candidates = list(range(1, 300))
  chosen = candidates[first:last]

  print(imgpath, sliceinterval, first, last)
  print(chosen)
  print(candidates[first:last])
  for current in range(first, last):
    print(current)




In [None]:
dicom = pydicom.dcmread('gdrive/Shareddrives/MO286 - Visualização de Dados/data_images/CASOS_CHC_DICOM/116/A/imagem22.dcm')

In [None]:
# NOTE(review): the first axis is sliced with 183:238 (the x range listed for
# nodule N11) and the second with 138:180 (its y range), whereas crop_px in the
# phase-A loop puts y (lines) first and x (columns) second - confirm the order.
dicom.pixel_array[183:238,138:180]

array([[ 16, -37, -12, ...,  42,  48,  75],
       [-19, -20, -15, ...,  66,  91,  69],
       [-15, -15, -41, ...,  60,  68,  37],
       ...,
       [ 73,  81,  68, ...,  63,  63,  46],
       [ 64,  58,  63, ...,  78,  57,  64],
       [ 31,  69,  28, ...,  46,  65,  45]], dtype=int16)

In [None]:
np.array(dicom.pixel_array[183:238,138:180])

array([[ 16, -37, -12, ...,  42,  48,  75],
       [-19, -20, -15, ...,  66,  91,  69],
       [-15, -15, -41, ...,  60,  68,  37],
       ...,
       [ 73,  81,  68, ...,  63,  63,  46],
       [ 64,  58,  63, ...,  78,  57,  64],
       [ 31,  69,  28, ...,  46,  65,  45]], dtype=int16)

In [None]:
dicom

Dataset.file_meta -------------------------------
(0002, 0000) File Meta Information Group Length  UL: 184
(0002, 0001) File Meta Information Version       OB: b'\x00\x01'
(0002, 0002) Media Storage SOP Class UID         UI: CT Image Storage
(0002, 0003) Media Storage SOP Instance UID      UI: 1.2.392.200036.9116.2.6.1.48.1214214735.1278391298.308681
(0002, 0010) Transfer Syntax UID                 UI: Explicit VR Little Endian
(0002, 0012) Implementation Class UID            UI: 1.3.6.1.4.1.32203
(0002, 0013) Implementation Version Name         SH: 'PIXEON'
-------------------------------------------------
(0008, 0008) Image Type                          CS: ['ORIGINAL', 'PRIMARY', 'AXIAL']
(0008, 0016) SOP Class UID                       UI: CT Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.2.392.200036.9116.2.6.1.48.1214214735.1278391298.308681
(0008, 0020) Study Date                          DA: '20100706'
(0008, 0021) Series Date                         DA: '

In [None]:
list_slices('84:51','path')

path 84:51 84 51
[]
[]


In [None]:
# Call crop_img for every nodule that has a complete region of interest
for n in nod_list_A:
  xinterval = f"{n['x0']}:{n['x1']}"
  yinterval = f"{n['y0']}:{n['y1']}"
  sliceinterval = f"{n['s0']}:{n['s1']}"
  # The dictionaries in nod_list_A store the patient under 'paciente_id';
  # the former 'pacient_id' lookup raised KeyError.
  pacient_id = n['paciente_id']
  crop_img(xinterval, yinterval, sliceinterval, pacient_id)

['gdrive/Shareddrives/MO286 - Visualização de Dados/data_images/CASOS_CHC_DICOM/116/A/']
gdrive/Shareddrives/MO286 - Visualização de Dados/data_images/CASOS_CHC_DICOM/116/A/


ValueError: ignored

In [None]:
# For each entry of data_img_A, glob its basename and, for every match, open
# the file, convert it to grayscale, resize it to 128x128 and display it.
# NOTE(review): `Image` is not in scope - the `from PIL import Image` line at
# the top of the file is commented out.
# NOTE(review): the last module-level assignment to data_img_A visible in this
# file is a single path string, so iterating it yields characters - presumably
# a list of paths was intended; confirm.
for img in tqdm(data_img_A):
  img_id = os.path.basename(img)
  frame_path =  glob(img_id)
  #print(frame_path)
  #frame_path =  glob(img + '/C/*')
  # print('Qtd frames: ', len(frame_path))
  for frame in frame_path:
    # print(frame)
    # `img` is rebound here, shadowing the outer loop variable
    img = Image.open(frame)
    img = img.convert('L')
    img = img.resize((128, 128))
    img.show()

100%|██████████| 118/118 [00:00<00:00, 18162.49it/s]


In [None]:
# abrir as imagens em DICOM

In [None]:
# converter para NUMPY

In [None]:
# recortar as imagens

## 4) Salvar as imagens 

Opções para exportar as imagens dos nódulos recortados:

*   Salvar as imagens em DICOM, em uma nova pasta
*   Serializar e salvar em formato PICKLE

## N) Outros trechos de código (Archived)

Trechos extraídos do código `voxel_processing.ipynb`, e mantidos aqui apenas para referência.

In [None]:
# CSV mapping file and glob pattern for the entries under CASOS_CHC_CORTES
# (presumably one folder per patient - confirm)
data_map = 'gdrive/Shareddrives/MO286 - Visualização de Dados/data_csv/mapeamento_nomes_invasao.csv'
data_imgs = 'gdrive/Shareddrives/MO286 - Visualização de Dados/data_images/CASOS_CHC_CORTES/*'

# Load the mapping; its `key` and `imv` columns are read when building labels
dataset_map = pd.read_csv(data_map)

In [None]:
# Paths of the train / validation / test CSV files
train = 'gdrive/Shareddrives/MO286 - Visualização de Dados/data_csv/train.csv'
val = 'gdrive/Shareddrives/MO286 - Visualização de Dados/data_csv/val.csv'
test = 'gdrive/Shareddrives/MO286 - Visualização de Dados/data_csv/test.csv'

# NOTE(review): N_SLICES is not referenced in the visible code (N_CHUNCKS is
# used instead) - confirm whether it is still needed.
N_SLICES = 64

In [None]:
def get_maximum(list_matrix):
  """Return the element-wise maximum over a sequence of matrices.

  Args:
    list_matrix: non-empty sequence of arrays that np.maximum can combine.

  Returns:
    The only element itself when there is a single one; otherwise the
    running np.maximum over all elements.

  Raises:
    ValueError: if list_matrix is empty.
  """
  if len(list_matrix) == 0:
    # An empty input used to reach the return with `maximum` never assigned.
    raise ValueError('list_matrix must contain at least one matrix')

  maximum = list_matrix[0]
  for matrix in list_matrix[1:]:
    maximum = np.maximum(maximum, matrix)

  return maximum


In [None]:
def get_minimum(list_matrix):
  """Return the element-wise minimum over a sequence of matrices.

  Args:
    list_matrix: non-empty sequence of arrays that np.minimum can combine.

  Returns:
    The only element itself when there is a single one; otherwise the
    running np.minimum over all elements.

  Raises:
    ValueError: if list_matrix is empty.
  """
  if len(list_matrix) == 0:
    # An empty input used to reach the return with `minimum` never assigned.
    raise ValueError('list_matrix must contain at least one matrix')

  minimum = list_matrix[0]
  for matrix in list_matrix[1:]:
    minimum = np.minimum(minimum, matrix)

  return minimum

In [None]:
def get_chuncks(slices, n_chuncks):
  """Yield consecutive chunks of `slices`, producing at most `n_chuncks` chunks.

  The chunk size is ceil(len(slices) / n_chuncks), so fewer than n_chuncks
  chunks may be produced and the last chunk may be shorter than the others.

  Args:
    slices: sequence of images to split.
    n_chuncks: upper bound on the number of chunks.

  Yields:
    Slices of `slices`, in order.

  Raises:
    ValueError: on the first iteration, if `slices` is empty.
  """
  # Imported locally because the file-level `import math` is commented out.
  import math

  if len(slices) == 0:
    # Same exception type as the zero-step range() this case used to hit.
    raise ValueError('slices is empty; there is nothing to split into chunks')

  chunk_size = math.ceil(len(slices) / n_chuncks)

  for start in range(0, len(slices), chunk_size):
    yield slices[start:start + chunk_size]

In [None]:
# Number of chunks requested from get_chuncks and target length of each volume
N_CHUNCKS = 64

# Every path matching the data_imgs glob pattern
patients = glob(data_imgs)

In [None]:
# Build one volume per patient from the images in its 'C' folder: each image is
# converted to grayscale and resized to 128x128, then the images are grouped
# into chunks and every chunk is averaged into a single slice.
voxels = []
patients_ids = []
labels = []

for pat in tqdm(patients):
  pat_id = int(os.path.basename(pat))
  label = dataset_map.imv[dataset_map.key == pat_id].values[0]

  # NOTE(review): `Image` must be in scope - the `from PIL import Image` line
  # at the top of the file is commented out.
  slices = []
  for frame in glob(pat + '/C/*'):
    img = Image.open(frame)
    img = img.convert('L')
    img = img.resize((128, 128))
    slices.append(np.array(img))

  try:
    voxel = [np.average(chunck, axis=0)
             for chunck in get_chuncks(slices, N_CHUNCKS)]
  except ValueError:
    # get_chuncks fails with ValueError when the patient has no images
    print('Paciente {} não contém dados listados como Série Equilíbrio'.format(os.path.basename(pat)))
    continue

  # Record id, label and volume together, only on success, so the three lists
  # stay aligned when a patient is skipped.
  voxels.append(voxel)
  patients_ids.append(pat_id)
  labels.append(label)

100%|██████████| 126/126 [1:43:47<00:00, 49.43s/it]


In [None]:
new_data = pd.DataFrame({'paciente_id':patients_ids, 'label':labels})

In [None]:
# Pad every volume up to N_CHUNCKS slices by repeating its last slices
new_voxels = []

for v in voxels:
  padded = v
  # A single copy of the tail only fills the gap when the volume already has
  # at least half of N_CHUNCKS slices, so keep copying until it is full.
  # Empty volumes are left untouched.
  while padded and len(padded) < N_CHUNCKS:
    missing = N_CHUNCKS - len(padded)
    padded = padded + padded[-missing:]

  new_voxels.append(padded)
    

In [None]:
new_data['img_volume'] = new_voxels

In [None]:
new_data.to_pickle('gdrive/Shareddrives/MO286 - Visualização de Dados/data_images/train_val_test_datasets/crop_mean_64_slices/crop_mean_64_slices.pkl')