In [2]:
import numpy as np
import scipy.sparse as sparse
import os
import matplotlib.pyplot as plt
import time
from skimage import measure, morphology
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
%matplotlib inline

In [3]:
!pwd

/home/om18/Documents/KCL/Feb 5 19 - Region proposal nodule detection


In [4]:
path_data = f'/data/OMM/Datasets/LIDC_other_formats/LIDC_preprocessed_3D v2/'
ids = os.listdir(path_data)
ids = np.sort(ids)

## Functions

In [5]:
def make3d_from_sparse(path):
    slices_all = os.listdir(path)
    slices_all = np.sort(slices_all)
    for idx, i in enumerate(slices_all):
        sparse_matrix = sparse.load_npz(f'{path}{i}')
        array2d = np.asarray(sparse_matrix.todense())
        if idx == 0: 
            scan3d = array2d
            continue
        scan3d = np.dstack([scan3d,array2d])
    return scan3d

In [6]:
def plot_3d(image, threshold=-300, alpha=.70):
    
    # Position the scan upright, 
    # so the head of the patient would be at the top facing the camera
    p = image.transpose(2,1,0)
    
    verts, faces, x,y = measure.marching_cubes(p, threshold)

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')

    # Fancy indexing: `verts[faces]` to generate a collection of triangles
    mesh = Poly3DCollection(verts[faces], alpha=alpha)
    face_color = [0.45, 0.45, 0.75]
    mesh.set_facecolor(face_color)
    ax.add_collection3d(mesh)

    ax.set_xlim(0, p.shape[0])
    ax.set_ylim(0, p.shape[1])
    ax.set_zlim(0, p.shape[2])

    plt.show()

In [7]:
def merge_common_elements(repeated_pairs):
    # Merge common elements
    # https://stackoverflow.com/questions/4842613/merge-lists-that-share-common-elements
    out = []
    while len(repeated_pairs)>0:
        first, *rest = repeated_pairs
        first = set(first)
        lf = -1
        while len(first)>lf:
            lf = len(first)

            rest2 = []
            for r in rest:
                if len(first.intersection(set(r)))>0:
                    first |= set(r)
                else:
                    rest2.append(r)     
            rest = rest2
        out.append(first)
        repeated_pairs = rest
    return out

## Main loop

In [8]:
# There were some scans where the pre-processing segmentation did not work.
# Check function in curate the 3D reconstructed 80px data.ipynb
vols_with_segmentation_errors = ['LIDC-IDRI-0116', 'LIDC-IDRI-0136', 'LIDC-IDRI-0146', 
'LIDC-IDRI-0231', 'LIDC-IDRI-0304', 'LIDC-IDRI-0309', 'LIDC-IDRI-0332', 'LIDC-IDRI-0344', 
'LIDC-IDRI-0391', 'LIDC-IDRI-0473',  'LIDC-IDRI-0478', 'LIDC-IDRI-0537', 'LIDC-IDRI-0582', 
'LIDC-IDRI-0612', 'LIDC-IDRI-0621', 'LIDC-IDRI-0652', 'LIDC-IDRI-0655', 'LIDC-IDRI-0797', 
'LIDC-IDRI-0807', 'LIDC-IDRI-0864', 'LIDC-IDRI-0908', 'LIDC-IDRI-0918']

In [9]:
path_dest_data = '/data/OMM/Datasets/LIDC_other_formats/LIDC 3D reconstruction 80px v4 - with coords/'

In [10]:
ids_with_shape_error = []
for idx_ids, i_ids in enumerate(ids):
    if i_ids in vols_with_segmentation_errors: continue #don't process segmentation errors
    start = time.time()
#     if idx_ids<=96:continue # !!!!!!!!!!!!!!!!!!!!!!!! REMOVE !!!!!!!!!!!!!!!!!!!
    vol = make3d_from_sparse(f'{path_data}{i_ids}/scans/')
    mask = make3d_from_sparse(f'{path_data}{i_ids}/consensus_masks/') # modified v4
    # rearrange axes to slices first
    vol = np.swapaxes(vol,1,2)
    vol = np.swapaxes(vol,0,1)
    mask = np.swapaxes(mask,1,2)
    mask = np.swapaxes(mask,0,1)
    # Find the minimum box that contain the lungs 
    min_box = np.where(vol!=0)
    min_box_c = min_box[0]
    min_box_x = min_box[1]
    min_box_y = min_box[2]
    vol_small = vol[np.min(min_box_c):np.max(min_box_c),np.min(min_box_x):np.max(min_box_x),np.min(min_box_y):np.max(min_box_y)]
    # Apply the same minimum box to the mask
    mask_small = mask[np.min(min_box_c):np.max(min_box_c),np.min(min_box_x):np.max(min_box_x),np.min(min_box_y):np.max(min_box_y)]
    # Get small cubes of size X
    cubes = []
    cubes_mask = []
    cubes_coords = []
    cube_size = 80
    cubes_thresh_perc = .05 # Keep only those cubes where at least some % of the voxels != 0
    cubes_thres = (cube_size**3) * cubes_thresh_perc

    borders_i, borders_j, borders_k = [], [] ,[]
    borders_axis1 = range(0,np.shape(vol_small)[0],cube_size)
    borders_axis2 = range(0,np.shape(vol_small)[1],cube_size)
    borders_axis3 = range(0,np.shape(vol_small)[2],cube_size)
    for idx, i in enumerate(borders_axis1):
        if idx >= len(borders_axis1) - 1: continue #don't do the last iteration (border is close)
        borders_i.append(i)
        for idj, j in enumerate(borders_axis2):
            if idj >= len(borders_axis2) - 1: continue #don't do the last iteration (border is close)
            if i==0: borders_j.append(j)
            for idk, k in enumerate(borders_axis3):
                if idk >= len(borders_axis3) - 1: continue #don't do the last iteration (border is close)
                if i==0 and j==0: borders_k.append(k)
                cube = vol_small[i:i+cube_size,j:j+cube_size,k:k+cube_size]
                cube_mask = mask_small[i:i+cube_size,j:j+cube_size,k:k+cube_size]
                if np.sum(cube!=0) > cubes_thres:
                    cubes.append(cube)
                    cubes_mask.append(cube_mask)
                    cubes_coords.append(f'{idx}_{idj}_{idk}')
    # There might be regions where a complete cube did not fit (at the end of each axis). Get those
    last_cube_c = np.shape(vol_small)[0]-cube_size
    last_cube_x = np.shape(vol_small)[1]-cube_size
    last_cube_y = np.shape(vol_small)[2]-cube_size
    last_cube_c, last_cube_x, last_cube_y
    # One side
    cubes_last = []
    cubes_mask_last = []
    cubes_coords_last = []
    borders_i_last, borders_j_last, borders_k_last = [], [] ,[]
    # one side has to be still 
    for idx, i in enumerate(range(np.shape(vol_small)[0],0,-cube_size)):
        if idx==0:continue
        if idx>=2:continue
        borders_i_last.append(i)
        for idj, j in enumerate(range(np.shape(vol_small)[1],0,-cube_size)):
            if idj==0:continue
            borders_j_last.append(j)
            for idk, k in enumerate(range(np.shape(vol_small)[2],0,-cube_size)):
                if idk==0:continue
                if idj==1:borders_k_last.append(k)
                cube = vol_small[i:i+cube_size,j:j+cube_size,k:k+cube_size]
                cube_mask = mask_small[i:i+cube_size,j:j+cube_size,k:k+cube_size]
                # only save those cubes where there is something
                if np.sum(cube!=0) > cubes_thres:
                    cubes_last.append(cube)
                    cubes_mask_last.append(cube_mask)
                    cubes_coords_last.append(f'x{idx-1}_x{idj-1}_x{idk-1}')
    # second side
    borders_i_last, borders_j_last, borders_k_last = [], [] ,[]
    # one side has to be still 
    for idj, j in enumerate(range(np.shape(vol_small)[1],0,-cube_size)):
        if idj==0:continue
        if idj>=2:continue
        borders_j_last.append(j)
        for idx, i in enumerate(range(np.shape(vol_small)[0],0,-cube_size)):
            if idx==0:continue
            borders_i_last.append(i)
            for idk, k in enumerate(range(np.shape(vol_small)[2],0,-cube_size)):
                if idk==0:continue
                if idj==1:borders_k_last.append(k)
                cube = vol_small[i:i+cube_size,j:j+cube_size,k:k+cube_size]
                cube_mask = mask_small[i:i+cube_size,j:j+cube_size,k:k+cube_size]
                if np.sum(cube!=0) > cubes_thres:
                    cubes_last.append(cube)
                    cubes_mask_last.append(cube_mask)
                    cubes_coords_last.append(f'y{idx-1}_y{idj-1}_y{idk-1}')
    # Last side
    borders_i_last, borders_j_last, borders_k_last = [], [] ,[]
    # one side has to be still 
    for idk, k in enumerate(range(np.shape(vol_small)[2],0,-cube_size)):
        if idk==0:continue
        if idk>=2:continue
        borders_k_last.append(k)
        for idx, i in enumerate(range(np.shape(vol_small)[0],0,-cube_size)):
            if idx==0:continue
            borders_i_last.append(i)
            for idj, j in enumerate(range(np.shape(vol_small)[1],0,-cube_size)):
                if idj==0:continue
                if idx==1:borders_j_last.append(j)
                cube = vol_small[i:i+cube_size,j:j+cube_size,k:k+cube_size]
                cube_mask = mask_small[i:i+cube_size,j:j+cube_size,k:k+cube_size]
                if np.sum(cube!=0) > cubes_thres:
                    cubes_last.append(cube)
                    cubes_mask_last.append(cube_mask)
                    cubes_coords_last.append(f'z{idx-1}_z{idj-1}_z{idk-1}')
    
    # Check if there are cubes repeated (in the cubes added last)
    repeated_pairs = []
    for idx, i in enumerate(cubes_last):
        for idj, j in enumerate(cubes_last):
            # dont compare the same elements and dont compare the same pairs twice
            if idx<idj:
                if(np.array_equal(i,j)):
                    repeated_pairs.append([idx,idj])
    # Delete repeated cubes
        # From the pairs of repeated elements we merge those pairs that contain common elements
    out = merge_common_elements(repeated_pairs)
        # For all merged sets we keep all elements except the first one (we are going to keep the first ones)
    repeated_elements_except_firsts=[]
    for i in out:
        repeated_elements_except_firsts.extend(list(i)[1:])
        # Delete the elements kept from cubes and masks
    for i in sorted(repeated_elements_except_firsts, reverse=True):
        del cubes_last[i]
        del cubes_mask_last[i]
        del cubes_coords_last[i]
    
    # Put all cubes together
    cubes_all = cubes + cubes_last
    cubes_mask_all = cubes_mask + cubes_mask_last
    cubes_coords_all = cubes_coords + cubes_coords_last
    
    # Make sure the shapes are correct
    try:
        if np.shape(cubes_all)[1] != cube_size or np.shape(cubes_all)[2] != cube_size or np.shape(cubes_all)[3] != cube_size:
            ids_with_shape_error.append(i_ids)
            print(f'ids_with_shape_error: {i_ids}')
    except IndexError:
        ids_with_shape_error.append(i_ids)
        print(f'ids_with_shape_error: {i_ids}')
        continue
    
    # Save
    path_dest_lungs = f'{path_dest_data}lungs/'
    path_dest_masks = f'{path_dest_data}masks/'
    path_dest_coords = f'{path_dest_data}coords/'
    
    for idx, i in enumerate(cubes_all):
        np.savez_compressed(f'{path_dest_lungs}{i_ids}_cube_{idx:03d}',i)
    for idx, i in enumerate(cubes_mask_all):
        pass
        np.savez_compressed(f'{path_dest_masks}{i_ids}_cube_{idx:03d}',i)

    np.save(f'{path_dest_coords}{i_ids}_cube_coords',np.asarray(cubes_coords_all))
    np.save(f'{path_dest_coords}{i_ids}_cube_shape',np.asarray(np.shape(vol_small)))
    stop = time.time()
    print(f'done {i_ids} in {stop-start:.2f} s')

done LIDC-IDRI-0001 in 13.00 s
done LIDC-IDRI-0002 in 12.07 s
done LIDC-IDRI-0003 in 19.83 s
done LIDC-IDRI-0004 in 14.60 s
done LIDC-IDRI-0005 in 12.25 s
done LIDC-IDRI-0006 in 10.95 s
done LIDC-IDRI-0007 in 19.79 s
done LIDC-IDRI-0008 in 15.65 s
done LIDC-IDRI-0009 in 15.21 s
done LIDC-IDRI-0010 in 21.20 s
done LIDC-IDRI-0011 in 13.69 s
done LIDC-IDRI-0012 in 13.75 s
done LIDC-IDRI-0013 in 8.79 s
done LIDC-IDRI-0014 in 9.36 s
done LIDC-IDRI-0015 in 8.70 s
done LIDC-IDRI-0016 in 10.19 s
done LIDC-IDRI-0017 in 11.35 s
done LIDC-IDRI-0018 in 11.02 s
done LIDC-IDRI-0019 in 21.74 s
done LIDC-IDRI-0020 in 13.10 s
done LIDC-IDRI-0021 in 18.07 s
done LIDC-IDRI-0022 in 13.26 s
done LIDC-IDRI-0023 in 15.25 s
done LIDC-IDRI-0024 in 15.71 s
done LIDC-IDRI-0025 in 22.36 s
done LIDC-IDRI-0026 in 15.20 s
done LIDC-IDRI-0027 in 19.16 s
done LIDC-IDRI-0028 in 12.80 s
done LIDC-IDRI-0029 in 20.13 s
done LIDC-IDRI-0030 in 10.77 s
done LIDC-IDRI-0031 in 18.68 s
done LIDC-IDRI-0032 in 20.29 s
done LIDC-I

done LIDC-IDRI-0274 in 18.61 s
done LIDC-IDRI-0275 in 17.90 s
done LIDC-IDRI-0276 in 12.98 s
done LIDC-IDRI-0277 in 11.52 s
done LIDC-IDRI-0278 in 10.91 s
done LIDC-IDRI-0279 in 13.65 s
done LIDC-IDRI-0280 in 11.34 s
done LIDC-IDRI-0281 in 11.69 s
done LIDC-IDRI-0282 in 11.29 s
done LIDC-IDRI-0283 in 13.52 s
done LIDC-IDRI-0284 in 17.97 s
done LIDC-IDRI-0285 in 20.38 s
done LIDC-IDRI-0286 in 14.30 s
done LIDC-IDRI-0287 in 10.69 s
done LIDC-IDRI-0288 in 10.97 s
done LIDC-IDRI-0289 in 16.41 s
done LIDC-IDRI-0290 in 13.18 s
done LIDC-IDRI-0291 in 13.46 s
done LIDC-IDRI-0294 in 6.64 s
done LIDC-IDRI-0295 in 12.37 s
done LIDC-IDRI-0296 in 17.29 s
done LIDC-IDRI-0297 in 10.90 s
done LIDC-IDRI-0298 in 11.06 s
done LIDC-IDRI-0300 in 15.04 s
done LIDC-IDRI-0303 in 18.41 s
done LIDC-IDRI-0305 in 9.87 s
done LIDC-IDRI-0306 in 9.18 s
done LIDC-IDRI-0307 in 10.41 s
done LIDC-IDRI-0310 in 9.33 s
done LIDC-IDRI-0311 in 12.07 s
done LIDC-IDRI-0312 in 8.98 s
done LIDC-IDRI-0313 in 11.22 s
done LIDC-IDR

done LIDC-IDRI-0593 in 4.41 s
done LIDC-IDRI-0594 in 8.07 s
done LIDC-IDRI-0595 in 7.18 s
done LIDC-IDRI-0596 in 10.15 s
done LIDC-IDRI-0597 in 22.55 s
done LIDC-IDRI-0598 in 11.15 s
done LIDC-IDRI-0599 in 8.20 s
done LIDC-IDRI-0600 in 9.70 s
done LIDC-IDRI-0601 in 9.33 s
done LIDC-IDRI-0602 in 9.51 s
done LIDC-IDRI-0603 in 11.45 s
done LIDC-IDRI-0605 in 13.50 s
done LIDC-IDRI-0606 in 7.71 s
done LIDC-IDRI-0607 in 9.03 s
done LIDC-IDRI-0608 in 9.44 s
done LIDC-IDRI-0610 in 12.37 s
done LIDC-IDRI-0611 in 7.29 s
done LIDC-IDRI-0613 in 9.54 s
done LIDC-IDRI-0615 in 11.69 s
done LIDC-IDRI-0616 in 9.99 s
done LIDC-IDRI-0617 in 11.23 s
done LIDC-IDRI-0618 in 12.63 s
done LIDC-IDRI-0619 in 8.74 s
done LIDC-IDRI-0620 in 8.49 s
done LIDC-IDRI-0622 in 8.42 s
done LIDC-IDRI-0623 in 8.72 s
done LIDC-IDRI-0624 in 9.23 s
done LIDC-IDRI-0625 in 15.02 s
done LIDC-IDRI-0626 in 9.36 s
done LIDC-IDRI-0627 in 10.55 s
done LIDC-IDRI-0628 in 18.83 s
done LIDC-IDRI-0629 in 5.66 s
done LIDC-IDRI-0630 in 8.24 

done LIDC-IDRI-0928 in 8.05 s
done LIDC-IDRI-0929 in 12.49 s
done LIDC-IDRI-0930 in 9.28 s
done LIDC-IDRI-0931 in 17.29 s
done LIDC-IDRI-0932 in 18.29 s
done LIDC-IDRI-0933 in 11.66 s
done LIDC-IDRI-0934 in 9.55 s
done LIDC-IDRI-0935 in 13.70 s
done LIDC-IDRI-0936 in 7.13 s
done LIDC-IDRI-0937 in 6.34 s
done LIDC-IDRI-0938 in 12.54 s
done LIDC-IDRI-0941 in 14.47 s
done LIDC-IDRI-0942 in 8.27 s
done LIDC-IDRI-0944 in 10.47 s
done LIDC-IDRI-0945 in 15.12 s
done LIDC-IDRI-0946 in 13.22 s
done LIDC-IDRI-0948 in 7.25 s
done LIDC-IDRI-0949 in 9.18 s
done LIDC-IDRI-0950 in 13.36 s
done LIDC-IDRI-0951 in 8.83 s
done LIDC-IDRI-0952 in 8.50 s
done LIDC-IDRI-0954 in 13.63 s
done LIDC-IDRI-0956 in 8.58 s
done LIDC-IDRI-0958 in 18.72 s
done LIDC-IDRI-0959 in 8.50 s
done LIDC-IDRI-0961 in 18.12 s
done LIDC-IDRI-0962 in 12.83 s
done LIDC-IDRI-0964 in 9.40 s
done LIDC-IDRI-0965 in 13.95 s
done LIDC-IDRI-0966 in 10.85 s
done LIDC-IDRI-0967 in 9.00 s
done LIDC-IDRI-0968 in 12.61 s
done LIDC-IDRI-0969 in

NotADirectoryError: [Errno 20] Not a directory: '/data/OMM/Datasets/LIDC_other_formats/LIDC_preprocessed_3D v2/rejectedPatients.txt/scans/'

In [None]:
np.shape(vol_small)

In [None]:
ids_with_shape_error

In [None]:
i_ids

In [None]:
idx_ids

In [None]:
i_ids

In [None]:
index = 570
vol = make3d_from_sparse(f'{path_data}{ids[index]}/scans/')
mask = make3d_from_sparse(f'{path_data}{ids[index]}/masks/')

In [None]:
vol = np.swapaxes(vol,1,2)
vol = np.swapaxes(vol,0,1)
mask = np.swapaxes(mask,1,2)
mask = np.swapaxes(mask,0,1)
print(np.shape(vol),np.shape(mask))

In [None]:
fig, ax = plt.subplots(1,figsize=(6,6))
ax.imshow(vol[140,:,:])

In [None]:
plt.hist(vol.flatten(), bins=80);

In [None]:
np.sum(vol!=0)

In [None]:
plot_3d(vol)

In [None]:
print(np.unique(mask))
plot_3d(mask, threshold=0)

# Find number of nodules in mask

In [None]:
str_3D_strict=np.array([[[0, 0, 0],[0, 1, 0],[0, 0, 0]],
                        [[0, 1, 0],[1, 1, 1],[0, 1, 0]],
                        [[0, 0, 0],[0, 1, 0],[0, 0, 0]]], dtype='uint8')
str_3D=np.array([[[1, 1, 1],[1, 1, 1],[1, 1, 1]],
                 [[1, 1, 1],[1, 1, 1],[1, 1, 1]],
                 [[1, 1, 1],[1, 1, 1],[1, 1, 1]]], dtype='uint8')

In [None]:
labeled, nr_objects = ndimage.label(mask, structure=str_3D) 
nr_objects

# Find the minimum box that contain the lungs 

In [None]:
print(np.shape(vol))
min_box = np.where(vol!=0)
min_box_c = min_box[0]
min_box_x = min_box[1]
min_box_y = min_box[2]
vol_small = vol[np.min(min_box_c):np.max(min_box_c),np.min(min_box_x):np.max(min_box_x),np.min(min_box_y):np.max(min_box_y)]
print(np.min(min_box_c),np.max(min_box_c),np.min(min_box_x),np.max(min_box_x),np.min(min_box_y),np.max(min_box_y))

In [None]:
plot_3d(vol_small)

In [None]:
# Apply the same minimum box to the mask
mask_small = mask[np.min(min_box_c):np.max(min_box_c),np.min(min_box_x):np.max(min_box_x),np.min(min_box_y):np.max(min_box_y)]

In [None]:
plot_3d(mask_small, threshold=0)

# Get small cubes of size X

In [None]:
cubes = []
cubes_mask = []
cube_size = 80
borders_i, borders_j, borders_k = [], [] ,[]
for i in range(0,np.shape(vol_small)[0],cube_size):
    borders_i.append(i)
    for j in range(0,np.shape(vol_small)[1],cube_size):
        if i==0: borders_j.append(j)
        for k in range(0,np.shape(vol_small)[2],cube_size):
            if i==0 and j==0: borders_k.append(k)
            cube = vol_small[i:i+cube_size,j:j+cube_size,k:k+cube_size]
            cube_mask = mask_small[i:i+cube_size,j:j+cube_size,k:k+cube_size]
            if np.sum(cube!=0) > 0:
                cubes.append(cube)
                cubes_mask.append(cube_mask)

In [None]:
print(f'# lung cubes = {len(cubes)}, shape = {np.shape(cubes[0])}')
print(f'# mask cubes = {len(cubes_mask)}, shape = {np.shape(cubes_mask[0])}')


In [None]:
pixels_in_cubes_masks = [np.sum(i) for i in cubes_mask]
print(pixels_in_cubes_masks)

In [None]:
plot_3d(cubes[10])

In [None]:
borders_i, borders_j, borders_k

In [None]:
# Plot the regions where the cubes were obtained from
borders = np.zeros_like(vol_small)
for idx, i in enumerate(borders_i):
    if idx==0:continue
    borders[i,:,:]=1
for idx, i in enumerate(borders_j):
    if idx==0:continue
    borders[:,i,:]=1
for idx, i in enumerate(borders_k):
    if idx==0:continue
    borders[:,:,i]=1

In [None]:
plot_3d(borders, threshold=0, alpha=.3)

# There might be regions where a complete cube did not fit (at the end of each axis). Get those

In [None]:
last_cube_c = np.shape(vol_small)[0]-cube_size
last_cube_x = np.shape(vol_small)[1]-cube_size
last_cube_y = np.shape(vol_small)[2]-cube_size
last_cube_c, last_cube_x, last_cube_y

In [None]:
np.shape(vol_small)

In [None]:
# One side
cubes_last = []
cubes_mask_last = []
borders_i_last, borders_j_last, borders_k_last = [], [] ,[]
# one side has to be still 
for idx, i in enumerate(range(np.shape(vol_small)[0],0,-cube_size)):
    if idx==0:continue
    if idx>=2:continue
    borders_i_last.append(i)
    for idj, j in enumerate(range(np.shape(vol_small)[1],0,-cube_size)):
        if idj==0:continue
        borders_j_last.append(j)
        for idk, k in enumerate(range(np.shape(vol_small)[2],0,-cube_size)):
            if idk==0:continue
            if idj==1:borders_k_last.append(k)
            cube = vol_small[i:i+cube_size,j:j+cube_size,k:k+cube_size]
            cube_mask = mask_small[i:i+cube_size,j:j+cube_size,k:k+cube_size]
            # only save those cubes where there is something
            if np.sum(cube!=0) > 0:
                cubes_last.append(cube)
                cubes_mask_last.append(cube_mask)

In [None]:
len(cubes_last), len(cubes_mask_last)

In [None]:
borders_last = np.zeros_like(vol_small)

In [None]:
for idx, i in enumerate(borders_i_last):
    borders_last[i,:,:]=1
for idx, i in enumerate(borders_j_last):
    borders_last[borders_i_last[0]:,i,:]=1
for idx, i in enumerate(borders_k_last):
    borders_last[borders_i_last[0]:,:,i]=1

In [None]:
plot_3d(borders_last, threshold=0, alpha=.3)

In [None]:
# second side
borders_i_last, borders_j_last, borders_k_last = [], [] ,[]
# one side has to be still 
for idx, i in enumerate(range(np.shape(vol_small)[1],0,-cube_size)):
    if idx==0:continue
    if idx>=2:continue
    borders_j_last.append(i)
    for idj, j in enumerate(range(np.shape(vol_small)[0],0,-cube_size)):
        if idj==0:continue
        borders_i_last.append(j)
        for idk, k in enumerate(range(np.shape(vol_small)[2],0,-cube_size)):
            if idk==0:continue
            if idj==1:borders_k_last.append(k)
            cube = vol_small[i:i+cube_size,j:j+cube_size,k:k+cube_size]
            cube_mask = mask_small[i:i+cube_size,j:j+cube_size,k:k+cube_size]
            if np.sum(cube!=0) > 0:
                cubes_last.append(cube)
                cubes_mask_last.append(cube_mask)

In [None]:
len(cubes_last), len(cubes_mask_last)

In [None]:
borders_last = np.zeros_like(vol_small)

In [None]:
for idx, i in enumerate(borders_i_last):
    borders_last[i,borders_j_last[0]:,:]=1
for idx, i in enumerate(borders_j_last):
    borders_last[:,i,:]=1
for idx, i in enumerate(borders_k_last):
    borders_last[:,borders_j_last[0]:,i]=1

In [None]:
plot_3d(borders_last, threshold=0, alpha=.3)

In [None]:
# Last side
borders_i_last, borders_j_last, borders_k_last = [], [] ,[]
# one side has to be still 
for idx, i in enumerate(range(np.shape(vol_small)[2],0,-cube_size)):
    if idx==0:continue
    if idx>=2:continue
    borders_k_last.append(i)
    for idj, j in enumerate(range(np.shape(vol_small)[0],0,-cube_size)):
        if idj==0:continue
        borders_i_last.append(j)
        for idk, k in enumerate(range(np.shape(vol_small)[1],0,-cube_size)):
            if idk==0:continue
            if idj==1:borders_j_last.append(k)
            cube = vol_small[i:i+cube_size,j:j+cube_size,k:k+cube_size]
            cube_mask = mask_small[i:i+cube_size,j:j+cube_size,k:k+cube_size]
            if np.sum(cube!=0) > 0:
                cubes_last.append(cube)
                cubes_mask_last.append(cube_mask)

In [None]:
len(cubes_last), len(cubes_mask_last)

In [None]:
borders_last = np.zeros_like(vol_small)

In [None]:
for idx, i in enumerate(borders_i_last):
    borders_last[i,:,borders_k_last[0]:]=1
for idx, i in enumerate(borders_j_last):
    borders_last[:,i,borders_k_last[0]:]=1
for idx, i in enumerate(borders_k_last):
    borders_last[:,:,i]=1

In [None]:
plot_3d(borders_last, threshold=0, alpha=.3)

In [None]:
for i in cubes_mask_last:
    print(np.sum(i))

# Check if there are cubes repeated

In [None]:
for idx, i in enumerate(cubes_last):
    for idj, j in enumerate(cubes_last):
        # dont compare the same elements and dont compare the same pairs twice
        if idx<idj:
            if(np.array_equal(i,j)):
                print(idx,idj)

In [None]:
for idx, i in enumerate(cubes):
    for idj, j in enumerate(cubes):
        # dont compare the same elements and dont compare the same pairs twice
        if idx<idj:
            if(np.array_equal(i,j)):
                print(idx,idj)

# Put all cubes together

In [None]:
cubes_all = cubes + cubes_last
cubes_mask_all = cubes_mask + cubes_mask_last

In [None]:
len(cubes_all), len(cubes_mask_all)

# Save the cubes

In [None]:
path_dest_data = '/home/om18/Documents/KCL/Feb 5 19 - Region proposal nodule detection/data/'
path_dest_lungs = f'{path_dest_data}lungs/'
path_dest_masks = f'{path_dest_data}masks/'

In [None]:
scanId= ids[index]
for idx, i in enumerate(cubes_all):
    np.save(f'{path_dest_lungs}{scanId}_cube_{idx:03d}',i)
for idx, i in enumerate(cubes_mask_all):
    np.savez_compressed(f'{path_dest_masks}{scanId}_cube_{idx:03d}',i)

# Load cubes (check-up)

In [None]:
masks_with_pixels = []
for idx, i in enumerate(cubes_mask_all):
    mm = np.load(f'{path_dest_masks}{scanId}_cube_{idx:03d}.npz')
    arr = mm.items()[0][1]
    if np.sum(arr)>1:
        masks_with_pixels.append(np.sum(arr))
        print(idx, np.sum(arr))

### Viz lung with nodule

In [None]:
a = np.load(f'{path_dest_lungs}{scanId}_cube_{5:03d}.npy')
b = np.load(f'{path_dest_masks}{scanId}_cube_{5:03d}.npz')
arr = b.items()[0][1]

In [None]:
plot_3d(a)
plot_3d(arr, threshold=0)

### Viz of a nodule that was captured by two cubes (one time by cubes_last)

In [None]:
a = np.load(f'{path_dest_lungs}{scanId}_cube_{16:03d}.npy')
b = np.load(f'{path_dest_masks}{scanId}_cube_{16:03d}.npz')
arr = b.items()[0][1]
plot_3d(a)
plot_3d(arr, threshold=0)

In [None]:
dif0 = np.where(arr!=0)
a0 = dif0[0]; a1 = dif0[1]; a2 = dif0[2]
center_slice = int(np.median(a0))
fig, ax = plt.subplots(1,2,figsize=(14,5))
ax[0].imshow(a[center_slice,:,:])
ax[1].imshow(a[center_slice,:,:])
ax[1].imshow(arr[center_slice,:,:], alpha=.3)

In [None]:
a = np.load(f'{path_dest_lungs}{scanId}_cube_{41:03d}.npy')
b = np.load(f'{path_dest_masks}{scanId}_cube_{41:03d}.npz')
arr = b.items()[0][1]
plot_3d(a)
plot_3d(arr, threshold=0)

In [None]:
dif0 = np.where(arr!=0)
a0 = dif0[0]; a1 = dif0[1]; a2 = dif0[2]
center_slice = int(np.median(a0))
fig, ax = plt.subplots(1,2,figsize=(14,5))
ax[0].imshow(a[center_slice,:,:])
ax[1].imshow(a[center_slice,:,:])
ax[1].imshow(arr[center_slice,:,:], alpha=.3)

# Scans with problems

In [None]:
# find what scans had problems
scans_with_problems = []
for idx,i in enumerate(ids):
    #window = 578
    #if idx<=window:continue
    #if idx==window+20:break
    vol = make3d_from_sparse(f'{path_data}{i}/scans/')
    pixels_not_zero = np.sum(vol!=0)
    if pixels_not_zero<=1000000 or pixels_not_zero>=10000000:
        scans_with_problems.append(i)
    print(idx, i,  pixels_not_zero)

In [None]:
# Scan with errors
# NameError in full-preprocessing
error_NameError =['LIDC-IDRI-0344', 'LIDC-IDRI-0358', 'LIDC-IDRI-0365', 'LIDC-IDRI-0366', 'LIDC-IDRI-0395', 'LIDC-IDRI-0424', 'LIDC-IDRI-0430', 'LIDC-IDRI-0443', 'LIDC-IDRI-0456', 'LIDC-IDRI-0481', 'LIDC-IDRI-0490', 'LIDC-IDRI-0518', 'LIDC-IDRI-0525', 'LIDC-IDRI-0537', 'LIDC-IDRI-0557', 'LIDC-IDRI-0582', 'LIDC-IDRI-0582', 'LIDC-IDRI-0601', 'LIDC-IDRI-0608',
 'LIDC-IDRI-0617', 'LIDC-IDRI-0624', 'LIDC-IDRI-0641', 'LIDC-IDRI-0672', 'LIDC-IDRI-0678', 'LIDC-IDRI-0681', 'LIDC-IDRI-0720', 'LIDC-IDRI-0781', 'LIDC-IDRI-0787', 'LIDC-IDRI-0794', 'LIDC-IDRI-0837', 'LIDC-IDRI-0837', 'LIDC-IDRI-0838', 'LIDC-IDRI-0843', 'LIDC-IDRI-0843', 'LIDC-IDRI-0843', 'LIDC-IDRI-0851', 'LIDC-IDRI-0863', 'LIDC-IDRI-0898',
 'LIDC-IDRI-0978', 'LIDC-IDRI-0997', 'LIDC-IDRI-1002']
# Not correct segmentation (found counting pixels != 0) in build volume reconstruction
error_pixels_zeros = ['LIDC-IDRI-0018', 'LIDC-IDRI-0043', 'LIDC-IDRI-0066', 'LIDC-IDRI-0104', 'LIDC-IDRI-0109', 'LIDC-IDRI-0110', 'LIDC-IDRI-0134', 'LIDC-IDRI-0144', 'LIDC-IDRI-0181', 'LIDC-IDRI-0184', 'LIDC-IDRI-0218', 'LIDC-IDRI-0264', 'LIDC-IDRI-0305', 'LIDC-IDRI-0311', 'LIDC-IDRI-0313', 'LIDC-IDRI-0339', 'LIDC-IDRI-0341', 'LIDC-IDRI-0353', 'LIDC-IDRI-0361', 'LIDC-IDRI-0363', 'LIDC-IDRI-0368', 'LIDC-IDRI-0369', 'LIDC-IDRI-0372', 'LIDC-IDRI-0394', 'LIDC-IDRI-0395', 'LIDC-IDRI-0421',
 'LIDC-IDRI-0423', 'LIDC-IDRI-0426', 'LIDC-IDRI-0447', 'LIDC-IDRI-0467', 'LIDC-IDRI-0473', 'LIDC-IDRI-0476', 'LIDC-IDRI-0483', 'LIDC-IDRI-0514', 'LIDC-IDRI-0515', 'LIDC-IDRI-0523', 'LIDC-IDRI-0539', 'LIDC-IDRI-0552', 'LIDC-IDRI-0555', 'LIDC-IDRI-0574', 'LIDC-IDRI-0594', 'LIDC-IDRI-0597', 'LIDC-IDRI-0612', 'LIDC-IDRI-0614', 'LIDC-IDRI-0618', 'LIDC-IDRI-0619', 'LIDC-IDRI-0621', 'LIDC-IDRI-0628', 'LIDC-IDRI-0649', 'LIDC-IDRI-0655', 'LIDC-IDRI-0658',
 'LIDC-IDRI-0661', 'LIDC-IDRI-0665', 'LIDC-IDRI-0671', 'LIDC-IDRI-0697', 'LIDC-IDRI-0699', 'LIDC-IDRI-0714', 'LIDC-IDRI-0718', 'LIDC-IDRI-0723', 'LIDC-IDRI-0773', 'LIDC-IDRI-0785', 'LIDC-IDRI-0790', 'LIDC-IDRI-0796', 'LIDC-IDRI-0798', 'LIDC-IDRI-0803', 'LIDC-IDRI-0804', 'LIDC-IDRI-0828', 'LIDC-IDRI-0829', 'LIDC-IDRI-0833', 'LIDC-IDRI-0848', 'LIDC-IDRI-0850', 'LIDC-IDRI-0866', 'LIDC-IDRI-0877', 'LIDC-IDRI-0880', 'LIDC-IDRI-0882',
 'LIDC-IDRI-0890', 'LIDC-IDRI-0894', 'LIDC-IDRI-0910', 'LIDC-IDRI-0915', 'LIDC-IDRI-0928', 'LIDC-IDRI-0930', 'LIDC-IDRI-0938', 'LIDC-IDRI-0942', 'LIDC-IDRI-0945', 'LIDC-IDRI-0950', 'LIDC-IDRI-0954', 'LIDC-IDRI-0959', 'LIDC-IDRI-0969', 'LIDC-IDRI-0975', 'LIDC-IDRI-0984', 'LIDC-IDRI-0995', 'LIDC-IDRI-0999']
print(len(error_NameError), len(error_pixels_zeros))

---

# Rename files
In the early version of dicom full-preprocessing (misc1) - all patients we gave incorrect format to the sparse files names. This corrected the issue

In [None]:
path_to_rename = f'{path_data}{ids[index]}/scans/'
print(path_to_rename)
a = os.listdir(path_to_rename)
np.sort(a)[:20]

In [None]:
def rename_leading_zeros(old_name):    
    str_left = old_name.split('_')[0]
    str_key = old_name.split('_')[-1].split('.')[0]
    str_right = old_name.split('_')[-1].split('.')[-1]
    str_new = f'{str_left}_{int(str_key):04d}.{str_right}'
    return str_new

In [None]:
ids = os.listdir(path_data)
ids = np.sort(ids)
for one_id in ids:
    print(one_id)
    path_to_rename = f'{path_data}{one_id}/masks/'
    a = os.listdir(path_to_rename)
    a = np.sort(a)
    for old_name in a:
        new_name = rename_leading_zeros(old_name)
        os.rename(f'{path_to_rename}{old_name}',f'{path_to_rename}{new_name}')

In [None]:
path_to_rename = f'{path_data}{ids[index]}/scans/'
a = os.listdir(path_to_rename)
a = np.sort(a)
for old_name in a:
    new_name = rename_leading_zeros(old_name)
    os.rename(f'{path_to_rename}{old_name}',f'{path_to_rename}{new_name}')

In [None]:
a = os.listdir(path_to_rename)
np.sort(a)

In [None]:
# 'LIDC-IDRI-0018', 'LIDC-IDRI-0043', 'LIDC-IDRI-0066', 'LIDC-IDRI-0104', 'LIDC-IDRI-0109',
#  'LIDC-IDRI-0110', 'LIDC-IDRI-0134', 'LIDC-IDRI-0144', 'LIDC-IDRI-0181', 'LIDC-IDRI-0184',
#  'LIDC-IDRI-0218', 'LIDC-IDRI-0264', 'LIDC-IDRI-0305', 'LIDC-IDRI-0311', 'LIDC-IDRI-0313',
#  'LIDC-IDRI-0339', 'LIDC-IDRI-0341', 'LIDC-IDRI-0353', 'LIDC-IDRI-0361', 'LIDC-IDRI-0363',
#  'LIDC-IDRI-0368', 'LIDC-IDRI-0369', 'LIDC-IDRI-0372', 'LIDC-IDRI-0394', 'LIDC-IDRI-0395',
#  'LIDC-IDRI-0421', 'LIDC-IDRI-0423', 'LIDC-IDRI-0426', 'LIDC-IDRI-0447', 'LIDC-IDRI-0467',
#  'LIDC-IDRI-0473', 'LIDC-IDRI-0476', 'LIDC-IDRI-0483', 'LIDC-IDRI-0514', 'LIDC-IDRI-0515',
#  'LIDC-IDRI-0523', 'LIDC-IDRI-0539', 'LIDC-IDRI-0552', 'LIDC-IDRI-0555', 'LIDC-IDRI-0574',
#  'LIDC-IDRI-0594', 'LIDC-IDRI-0597', 'LIDC-IDRI-0612', 'LIDC-IDRI-0614', 'LIDC-IDRI-0618',
#  'LIDC-IDRI-0619', 'LIDC-IDRI-0621', 'LIDC-IDRI-0628', 'LIDC-IDRI-0649', 'LIDC-IDRI-0655',
#  'LIDC-IDRI-0658', 'LIDC-IDRI-0661', 'LIDC-IDRI-0665', 'LIDC-IDRI-0671', 'LIDC-IDRI-0697',
#  'LIDC-IDRI-0699', 'LIDC-IDRI-0714', 'LIDC-IDRI-0718', 'LIDC-IDRI-0723', 'LIDC-IDRI-0773',
#  'LIDC-IDRI-0785', 'LIDC-IDRI-0790', 'LIDC-IDRI-0796', 'LIDC-IDRI-0798', 'LIDC-IDRI-0803',
#  'LIDC-IDRI-0804', 'LIDC-IDRI-0828', 'LIDC-IDRI-0829', 'LIDC-IDRI-0833', 'LIDC-IDRI-0848',
#  'LIDC-IDRI-0850', 'LIDC-IDRI-0866', 'LIDC-IDRI-0877', 'LIDC-IDRI-0880', 'LIDC-IDRI-0882',
#  'LIDC-IDRI-0890', 'LIDC-IDRI-0894', 'LIDC-IDRI-0910', 'LIDC-IDRI-0915', 'LIDC-IDRI-0928',
#  'LIDC-IDRI-0930', 'LIDC-IDRI-0938', 'LIDC-IDRI-0942', 'LIDC-IDRI-0945', 'LIDC-IDRI-0950',
#  'LIDC-IDRI-0954', 'LIDC-IDRI-0959', 'LIDC-IDRI-0969', 'LIDC-IDRI-0975', 'LIDC-IDRI-0984',
#  'LIDC-IDRI-0995', 'LIDC-IDRI-0999'