# Modification

**dicom full-preprocessing (misc1) v3 - interpolate only 2axes to inpaint**
1. We resample along the vertical and horizontal axes but not along the slice axis. This creates a smaller volume to apply inpainting to. The slice axis has to be resampled later.
1. We dilate the union of the segmentations
1. We dilate the lungs mask with kernel=1 (it was 5)

In [None]:
# Compared to the previous version (v2), this script removes the scans with bad
# slices (>2.5mm or inconsistency between spacing and thickness)

import os # module for interfacing with the os
import numpy as np # numpy for arrays etc
import pandas as pd # module for creating and querying data tables (databases) efficiently
import pylidc as pl # module for handling the LIDC dataset
import matplotlib.pyplot as plt # plotting utilities
import scipy.ndimage # 
import scipy.sparse
import scipy
from preprocessing.preprocess_functions import *
from utils_LIDC.utils_LIDC import *
from pylidc.utils import consensus
from skimage.morphology import ball, dilation

In [None]:
from scipy import sparse
from tqdm import tqdm_notebook

In [None]:
# Root directory of the original LIDC-IDRI download.
LIDC_PATH = '/data/datasets/LIDC-IDRI/'

# Keep only the directory entries whose name contains 'LIDC', sorted so that
# the integer positions used by the processing loop are stable between runs.
LIDC_IDs = np.sort([
    entry
    for entry in os.listdir(f'{LIDC_PATH}LIDC-IDRI')
    if 'LIDC' in entry
])

# Destination root for every output of this notebook; created on first use.
path_dest = f'/data/OMM/Datasets/LIDC_other_formats/LIDC_preprocessed_3D v4 - inpaint before preprocess/'
if not os.path.exists(path_dest):
    os.makedirs(path_dest)

In [None]:
# Annotation table for the whole dataset; it is filtered per patient inside
# the processing loop.
df = pd.read_csv('/data/datasets/LIDC-IDRI/annotations.csv')

#%%
# Per-run bookkeeping. Each list is paired with the text file it is dumped
# into once the processing loop has finished; the files are opened (and
# therefore truncated) up front and closed after the summary is written.

# patient ids whose nodule consensus computation failed
scans_with_errors = []
errorScansFile = open(f"{path_dest}scans_with_errors.txt", "w")

# number of non-zero voxels in each dilated lung mask
numVoxelsPerLungSeg = []

# patient ids skipped by the slice-quality checks
listOfRejectedPatients = []
rejectListFile = open(f"{path_dest}rejectedPatients.txt", "w")

# patient ids that passed the slice-quality checks
listOfUsedPatients = []
useListFile = open(f"{path_dest}usedPatients.txt", "w")

# second value returned by segment_lung_mask for each scan
requiredSelemWidth = []
selemZWidthFile = open(f"{path_dest}segmentationSelemZWidths.txt", "w")

# Main preprocessing loop: for every patient directory, load the first scan,
# reject it on slice-quality criteria, resample, segment and mask the lungs,
# rasterise the nodule annotations, and save every slice of every resulting
# volume as a compressed sparse matrix under `path_dest/<patient id>/`.
for idx, k in enumerate(LIDC_IDs):
    # SCAN idx==41, 61 has an error
    # NOTE(review): hard-coded resume point — entries 0..782 are skipped, so
    # only the tail of LIDC_IDs is processed. Confirm before a full re-run.
    if idx<=782:continue
#     if idx <=10:continue
#     if idx ==300:break

    # Redundant with the loop variable (same value); kept as in the original.
    k = LIDC_IDs[idx]

    #if idx>5:break
    print(f'preprocessing: {idx}, {k}')

    # The last four characters of the directory name are parsed as the
    # integer patient id used in the annotations table.
    df_patient = df.loc[df['patientid']==int(k[-4:])] 
    pid = k

    # query the LIDC images with patient_id = pid
    # HERE WE JUST USE THE FIRST ONE!! (any further scans of the same patient are ignored)
    idx_scan = 0 

    # get the scan object for this scan
    scan = pl.query(pl.Scan).filter(pl.Scan.patient_id == pid)[idx_scan] 

    # here we can reject according to any criteria we like
    # thickSlice: slice thickness or slice spacing above 3 (presumably mm).
    # NOTE(review): the notebook's header comment mentions 2.5mm — confirm which threshold is intended.
    thickSlice = (scan.slice_thickness > 3) | (scan.slice_spacing > 3)
    # missingSlices: the gaps between consecutive slice z-values, multiplied
    # by 100 and rounded, must all be identical; more than one distinct gap
    # is treated as missing slices.
    missingSlices = len(np.unique(np.round(100*np.diff(scan.slice_zvals)))) != 1
    if (thickSlice)  :
        # we want to reject this scan/patient
        print('Undesirable slice characteristics, rejecting')
        listOfRejectedPatients.append(pid)
        continue
    elif (missingSlices):
        print('Missing slices, rejecting')
        listOfRejectedPatients.append(pid)
        continue
    else:
        # we will use this scan
        # NOTE(review): the id is recorded as "used" before segmentation runs;
        # if segment_lung_mask raises IndexError below, the scan is skipped
        # but stays in this list.
        listOfUsedPatients.append(pid)
        #continue # this lets us quickly check the outcome of the selection

    print('Loading and converting to HU')
    curr_patient_pixels, spacing_orig = custom_load_scan_to_HU(scan)

    print('Resampling to isotropic resolution')
    # Project helper; per the notebook header it resamples the two in-plane
    # axes only and leaves the slice axis untouched — TODO confirm in
    # preprocessing.preprocess_functions.
    pix_resampled, spacing = resample_grid_except_slices(curr_patient_pixels, spacing_orig, [1,1,1])

    print('Segmenting the lungs and dilating the mask')
    try:
        segmented_lungs_fill, requiredSelemWidthTmp = segment_lung_mask(pix_resampled, True)
    except IndexError: continue
    requiredSelemWidth.append(requiredSelemWidthTmp)
    # Dilate the mask with a ball of radius 1 (the notebook header notes that
    # a radius of 5 was used previously).
    selem = ball(1)
    dilated = dilation(segmented_lungs_fill, selem) # dilate a bit according to the tut
    # Apply the mask: everything outside the dilated lungs becomes 0
    pix_resampled_to_use = pix_resampled*dilated
    # count the number of lung voxels to find those which are badly segmented
    numVoxelsPerLungSeg.append(np.count_nonzero(dilated))

    print('Finding nodule masks')
    # The nodule masks are first built on arrays with the same shape (and
    # dtype) as the original, un-resampled volume and resampled afterwards.
    one_segmentation_consensus = np.zeros_like(curr_patient_pixels)
    one_segmentation_maxvol = np.zeros_like(curr_patient_pixels)
    labelledNods = np.zeros_like(curr_patient_pixels)

    # get all the annotations for this scan
    ids = [i.id for i in scan.annotations] # this gives the annotation IDs (note that they are not in order in the annotations.csv)

    # we split the df for patient pid into the part for just this scan
    df_patient_partX = df_patient.loc[df_patient.annotation_id.isin(ids)]
    unique_nodules = np.unique(df_patient_partX['cluster_id'].values)
    nods = scan.cluster_annotations() # get the annotations for all nodules in this scan

    for unique_nodule in unique_nodules:
        # all annotation rows for this nodule (cluster); not used further below
        df_nodule = df_patient_partX.loc[df_patient_partX['cluster_id']==unique_nodule] # this gives all annotations for this nodule (cluster)

        # NOTE(review): assumes the cluster_id values from the CSV are valid
        # positions in the list returned by scan.cluster_annotations() — confirm.
        anns = nods[unique_nodule] # then choose the annotation for the nodule we are considering

        try:
            # cmask = consensus mask, cbbox = consensus bounding box, masks = original annotations
            cmask,cbbox,masks = consensus(anns, clevel=0.5, pad=[(0,0), (0,0), (0,0)])
        except NameError:
            # NOTE(review): NameError is an unusual exception to expect from
            # this call — confirm which failure this is meant to catch.
            scans_with_errors.append(pid)
            continue

        # we want to save the consensus AND the mask of all segmented voxels in all annotations
        one_mask_consensus = cmask
        # union (logical OR) of every individual annotation mask
        one_mask_maxvol = np.zeros_like(cmask)
        for mask in masks:
            one_mask_maxvol = (one_mask_maxvol > 0) | (mask > 0)    

        # pylidc loads in a different order to our custom 3D dicom reader, so need to swap dims:
        # the two swaps move the last axis to the front, (a, b, c) -> (c, a, b)
        one_mask_consensus = np.swapaxes(one_mask_consensus,1,2);one_mask_consensus = np.swapaxes(one_mask_consensus,0,1)
        one_mask_maxvol = np.swapaxes(one_mask_maxvol,1,2);one_mask_maxvol = np.swapaxes(one_mask_maxvol,0,1)

        # Dilate the union mask with skimage's default structuring element.
        # The dilation runs on the bounding-box-sized array, so the mask
        # cannot grow beyond the faces of the bounding box.
        one_mask_maxvol = dilation(one_mask_maxvol)

        # fill the consensus bounding box with the mask to get a nodule segmentation in original image space (presumably the cbbox is big enough for all the individual masks)
        # The bbox slices are indexed in the order [2], [0], [1] to match the axis reordering above.
        # NOTE(review): each assignment replaces the whole box, zeros included,
        # so a later nodule whose box overlaps an earlier one erases part of it.
        one_segmentation_consensus[cbbox[2].start:cbbox[2].stop,cbbox[0].start:cbbox[0].stop,cbbox[1].start:cbbox[1].stop] = one_mask_consensus
        one_segmentation_maxvol[cbbox[2].start:cbbox[2].stop,cbbox[0].start:cbbox[0].stop,cbbox[1].start:cbbox[1].stop] = one_mask_maxvol
        labelledNods[cbbox[2].start:cbbox[2].stop,cbbox[0].start:cbbox[0].stop,cbbox[1].start:cbbox[1].stop] = one_mask_maxvol * (unique_nodule + 1) # label each nodule with its 'cluster_id' + 1 so that 0 stays background

    # no-op statement, kept as in the original
    pass 

    labelledNods = labelledNods - 1 # to get background = - 1, and each nodule to contain its cluster id

    # Resample the nodule segmentations with the same helper as the image,
    # passing the ORIGINAL voxel spacing (the masks are still on the original
    # grid) and 'nearest' so that label values are not blended.
    mask_consensus_resampled, _ = resample_grid_except_slices(one_segmentation_consensus, spacing_orig, [1,1,1],'nearest')
    mask_maxvol_resampled, _ = resample_grid_except_slices(one_segmentation_maxvol, spacing_orig, [1,1,1],'nearest')
    labelledNods_resampled, _ = resample_grid_except_slices(labelledNods, spacing_orig, [1,1,1],'nearest')

    print('Saving...')
    # now we save the results, saving each slice as a sparse array to cut down on size!
    # The mask volumes hold every nodule written in the loop above (subject
    # to the bounding-box overwrite noted there).
    if not os.path.exists(f'{path_dest}{k}/scans'): os.makedirs(f'{path_dest}{k}/scans')
    if not os.path.exists(f'{path_dest}{k}/consensus_masks'): os.makedirs(f'{path_dest}{k}/consensus_masks')
    if not os.path.exists(f'{path_dest}{k}/maxvol_masks'): os.makedirs(f'{path_dest}{k}/maxvol_masks')
    if not os.path.exists(f'{path_dest}{k}/lung_masks'): os.makedirs(f'{path_dest}{k}/lung_masks')
    if not os.path.exists(f'{path_dest}{k}/cluster_id_images'): os.makedirs(f'{path_dest}{k}/cluster_id_images')


    # Walk the five volumes in lockstep along their first axis; zip stops at
    # the shortest one. Each 2D slice is written as a compressed CSC matrix
    # named by its zero-padded slice index.
    for idj,(slice_pix, slice_mask_consensus, slice_mask_maxvol,slice_lungseg, slice_cluster_id_image) in enumerate(zip(pix_resampled_to_use, mask_consensus_resampled, mask_maxvol_resampled,dilated,labelledNods_resampled)):
        sparse_matrix = scipy.sparse.csc_matrix(slice_pix)
        sparse_matrix2 = scipy.sparse.csc_matrix(slice_mask_consensus)
        sparse_matrix3 = scipy.sparse.csc_matrix(slice_mask_maxvol)
        sparse_matrix4 = scipy.sparse.csc_matrix(slice_lungseg)
        sparse_matrix5 = scipy.sparse.csc_matrix(slice_cluster_id_image)

        scipy.sparse.save_npz(f'{path_dest}{k}/scans/slice_{idj:04d}.npz', sparse_matrix, compressed=True)
        scipy.sparse.save_npz(f'{path_dest}{k}/consensus_masks/slice_m_{idj:04d}.npz', sparse_matrix2, compressed=True)
        scipy.sparse.save_npz(f'{path_dest}{k}/maxvol_masks/slice_m_{idj:04d}.npz', sparse_matrix3, compressed=True)
        scipy.sparse.save_npz(f'{path_dest}{k}/lung_masks/slice_m_{idj:04d}.npz', sparse_matrix4, compressed=True)
        scipy.sparse.save_npz(f'{path_dest}{k}/cluster_id_images/slice_m_{idj:04d}.npz', sparse_matrix5, compressed=True)

#%% save some summary output
# Lung-mask voxel counts go to their own file, written by path.
np.savetxt(path_dest + 'segmentation_results.dat', numVoxelsPerLungSeg)

# Every other bookkeeping list is dumped into the file handle that was opened
# for it before the processing loop, and that handle is closed straight after.
for handle, values, fmt in (
    (rejectListFile, listOfRejectedPatients, '%10s'),
    (useListFile, listOfUsedPatients, '%10s'),
    (selemZWidthFile, requiredSelemWidth, '%u'),
    (errorScansFile, scans_with_errors, '%10s'),
):
    np.savetxt(handle, values, fmt)
    handle.close()

#%% plot segmentation results
# ax = plt.hist(numVoxelsPerLungSeg,100)
# plt.xlabel('Number of voxels in segmentation')

In [None]:
# Visual sanity check on whatever scan the processing loop handled last:
# the loop variables (curr_patient_pixels, pix_resampled, ...) are reused here.
slice_n = 90
# NOTE(review): this prints `spacing` (the value returned by the resampling
# call) next to the shape of the un-resampled array; `spacing_orig` may have
# been intended — confirm.
print(f'original shape {np.shape(curr_patient_pixels), spacing}')
print(f'resampled shape {np.shape(pix_resampled)}')
print(np.shape(pix_resampled_to_use),np.shape(mask_maxvol_resampled))
# Left to right: original slice, lung-masked resampled slice, resampled maxvol nodule mask.
fig, ax = plt.subplots(1,3, figsize=(14,5))
ax[0].imshow(curr_patient_pixels[slice_n])
ax[1].imshow(pix_resampled_to_use[slice_n])
ax[2].imshow(mask_maxvol_resampled[slice_n])

---

---

In [None]:
def make3d_from_sparse(path):
    """Rebuild a dense 3D volume from a directory of per-slice sparse files.

    Every ``.npz`` file in ``path`` is loaded with ``scipy.sparse.load_npz``,
    converted to a dense 2D array and stacked along a new last axis, in
    lexicographic filename order.

    Args:
        path: Directory containing the slice files. A trailing path
            separator is accepted but not required.

    Returns:
        numpy.ndarray of shape ``(rows, cols, n_slices)``. The result is
        always 3D, also when the directory holds a single slice.

    Raises:
        ValueError: If ``path`` contains no ``.npz`` file.
    """
    # Ignore stray entries (e.g. notebook checkpoint folders) that would
    # otherwise make the loader fail.
    slice_names = sorted(
        entry for entry in os.listdir(path) if entry.endswith('.npz')
    )
    if not slice_names:
        raise ValueError(f'no .npz slices found in {path!r}')

    # Load every slice first and stack once: calling np.dstack inside the
    # loop would copy the growing volume on every iteration.
    slices_2d = [
        sparse.load_npz(os.path.join(path, entry)).toarray()
        for entry in slice_names
    ]
    return np.dstack(slices_2d)

In [None]:
# Patient directory name used by the path preview in the next cell.
name = 'LIDC-IDRI-0001'

In [None]:
# Preview the path of the `scans` folder for `name`.
# NOTE(review): `path_data` is not assigned anywhere in the visible notebook
# (only `path_dest` is) — confirm it is defined, otherwise this cell raises
# NameError.
f'{path_data}{name}/scans/'

In [None]:
def read_slices3D_v2(path_data, ii_ids):
    """Load one preprocessed scan and crop it to the box of its non-zero voxels.

    The scan volume, the maxvol nodule mask and the lung mask are rebuilt
    from their per-slice sparse files, reordered to slices-first, and cropped
    with one shared bounding box computed from the non-zero voxels of the
    scan volume. The consensus mask is not loaded because nothing derived
    from it is returned.

    Args:
        path_data: Root directory of the preprocessed dataset, including the
            trailing path separator (it is concatenated directly with
            ``ii_ids``).
        ii_ids: Name of the patient directory, e.g. ``'LIDC-IDRI-0001'``.

    Returns:
        tuple ``(vol_small, mask_maxvol_small, mask_maxvol_and_lungs,
        mask_lungs_small2)``, all cropped to the same box:

        * ``vol_small``: the scan volume.
        * ``mask_maxvol_small``: the maxvol nodule mask.
        * ``mask_maxvol_and_lungs``: ``1 - ((1 - lungs) | nodule)``; for
          binary masks this is 1 inside the lungs but outside the nodule.
        * ``mask_lungs_small2``: lungs OR nodule.

        The mask arithmetic uses bitwise operators, so the stored masks are
        assumed to have an integer or boolean dtype — TODO confirm.

    Raises:
        ValueError: If the scan volume contains no non-zero voxel.
    """
    print(f'reading scan {ii_ids}')
    scan_dir = f'{path_data}{ii_ids}'

    # make3d_from_sparse stacks the slices on the last axis; move that axis
    # to the front to get (slices, rows, cols).
    vol, mask_maxvol, mask_lungs = (
        np.moveaxis(make3d_from_sparse(f'{scan_dir}/{subdir}/'), -1, 0)
        for subdir in ('scans', 'maxvol_masks', 'lung_masks')
    )

    # Smallest box holding every non-zero voxel of the scan. The slice stop
    # is exclusive, hence max + 1: using the bare maximum would drop the last
    # non-zero slice, row and column.
    nonzero_per_axis = np.nonzero(vol)
    if nonzero_per_axis[0].size == 0:
        raise ValueError(f'scan {ii_ids} contains no non-zero voxels')
    box = tuple(
        slice(indices.min(), indices.max() + 1) for indices in nonzero_per_axis
    )

    vol_small = vol[box]
    mask_maxvol_small = mask_maxvol[box]
    mask_lungs_small = mask_lungs[box]

    # Combine the nodule mask with the lung mask.
    mask_maxvol_and_lungs = 1 - ((1 - mask_lungs_small) | mask_maxvol_small)
    mask_lungs_small2 = mask_lungs_small | mask_maxvol_small
    return vol_small, mask_maxvol_small, mask_maxvol_and_lungs, mask_lungs_small2

In [None]:
# Read back the first patient from the output directory of this notebook to
# check what was written.
vol_small, mask_maxvol_small, mask_maxvol_and_lungs_small, mask_lungs_small = read_slices3D_v2(path_dest, 'LIDC-IDRI-0001')

In [None]:
# Show slice 71 of the four cropped volumes side by side:
# scan, nodule mask, lungs-without-nodule mask, lungs-or-nodule mask.
ii = 59+12
fig, ax = plt.subplots(1,4,figsize=(14,4))
ax[0].imshow(vol_small[ii])
ax[1].imshow(mask_maxvol_small[ii])
ax[2].imshow(mask_maxvol_and_lungs_small[ii])
ax[3].imshow(mask_lungs_small[ii])

In [None]:
# Same four-panel view of slice 71 (scan, nodule mask, lungs-without-nodule
# mask, lungs-or-nodule mask); this cell repeats the previous one verbatim.
ii = 59+12
fig, ax = plt.subplots(1,4,figsize=(14,4))
ax[0].imshow(vol_small[ii])
ax[1].imshow(mask_maxvol_small[ii])
ax[2].imshow(mask_maxvol_and_lungs_small[ii])
ax[3].imshow(mask_lungs_small[ii])

In [None]:
# Four-panel preview of patients 1..9, each shown at the median slice index
# of its nodule mask.
# NOTE(review): `read_slices3D` is not defined in the visible notebook; it
# may come from one of the star imports, or `read_slices3D_v2(path_dest, name)`
# may have been intended — confirm.
for i in tqdm_notebook(np.arange(1,10)):
    name = f'LIDC-IDRI-{i:04d}'
    print(name)
    vol_small, mask_maxvol_small, mask_maxvol_and_lungs, mask_lungs_small = read_slices3D(name)
    z,x,y = np.where(mask_maxvol_small==1)
    if z.size == 0:
        # A scan without nodule voxels has no median slice: np.median would
        # return NaN and int(NaN) raises ValueError, aborting the whole loop.
        print(f'{name}: no nodule voxels in the maxvol mask, skipping plot')
        continue
    z_median = np.median(z)
    slice_n = int(z_median)
    fig, ax = plt.subplots(1,4, figsize=(14,5))
    ax[0].imshow(vol_small[slice_n], vmin=0, vmax=1)
    ax[1].imshow(mask_maxvol_small[slice_n])
    ax[2].imshow(mask_maxvol_and_lungs[slice_n])
    ax[3].imshow(mask_lungs_small[slice_n])

In [None]:
# Rebuild the dense volumes of the first patient directly from the sparse
# slice files; make3d_from_sparse returns them with the slice axis last.
i = 'LIDC-IDRI-0001'
lungs = make3d_from_sparse(f'{path_dest}{i}/scans/')
mask = make3d_from_sparse(f'{path_dest}{i}/maxvol_masks/')
mask_lungs = make3d_from_sparse(f'{path_dest}{i}/lung_masks/')

In [None]:
# Indices of the voxels whose value in the maxvol nodule mask equals 1.
np.where(mask==1)

In [None]:
# Rearrange the axes to slices-first: the volumes come out of
# make3d_from_sparse with the slice axis last.
# NOTE: this cell rebinds `lungs`, `mask` and `mask_lungs`, so running it a
# second time without reloading them moves the axes again.
vol = np.moveaxis(lungs, -1, 0)
mask = np.moveaxis(mask, -1, 0)
mask_lungs = np.moveaxis(mask_lungs, -1, 0)

# Find the minimum box that contains every non-zero voxel of the scan.
min_box = np.where(vol!=0)
min_box_c, min_box_x, min_box_y = min_box
# The slice stop is exclusive, hence max + 1: using the bare maximum would
# cut off the last non-zero slice, row and column.
crop = tuple(slice(np.min(indices), np.max(indices) + 1) for indices in min_box)

# Apply the same box to the scan and to both masks.
vol_small = vol[crop]
mask_small = mask[crop]
mask_lungs_small = mask_lungs[crop]

lungs = vol_small
mask = mask_small
mask_lungs = mask_lungs_small
np.shape(lungs), np.shape(mask), np.shape(mask_lungs)

In [None]:
# Slice 70 of the cropped scan, nodule mask and lung mask side by side.
slice_n=70
fig, ax = plt.subplots(1,3, figsize=(14,5))
ax[0].imshow(lungs[slice_n])
ax[1].imshow(mask[slice_n])
# mask_lungs_small is the same array that `mask_lungs` was rebound to in the crop cell
ax[2].imshow(mask_lungs_small[slice_n])