**v4**
1. Also saves a version where the inpainted voxels are inserted into the original image (`inpainted inserted`).
**previous**
1. The images were finally resampled in the slice direction (dicom full-preprocessing (misc1) v3.6 - interpolate only 2axes to inpaint - get the pylidc characteristics)
1. The coordinates are taken from pylidc (after transforming them to the coordinates of the small cubes) rather than from the masks: the masks are dilated, so there are cases where two nodules merge and would be considered a single nodule.

In [1]:
import os
from statistics import mode, StatisticsError
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import ndimage
from scipy.stats import mode as stats_mode
from tqdm import tqdm_notebook
# import pylidc as pl # module for handling the LIDC dataset
%matplotlib inline

## Functions

In [2]:
def _axis_box_starts(center, dist, axis_len, lower, upper):
    '''Return, in search order, the (start, stop) box limits that are valid along one axis.

    A candidate is kept when it lies inside [0, axis_len] and spans [lower, upper].
    Candidates are the box centred on `center` shifted by -160, -158, ..., 158 voxels.
    '''
    limits = []
    for shift in range(-160, 160, 2):
        start = int(center - (dist // 2) + shift)
        stop = int(start + dist)
        if start < 0 or stop > axis_len:
            continue
        if start <= lower and stop >= upper:
            limits.append((start, stop))
    return limits


def box_with_masks_search(coords_Z, coords_X, coords_Y, mask_lungs_, min_coords, dist1 = 64, dist2 = 64, dist3 = 64):
    '''Finds the cube containing the nodule and as many voxels from inside the lung as possible.

    Parameters
    ----------
    coords_Z, coords_X, coords_Y : centre around which candidate boxes are shifted.
    mask_lungs_ : 3D array; the box that maximises np.sum over this array is selected.
    min_coords : six values (Zmin, Zmax, Xmin, Xmax, Ymin, Ymax) that every candidate
        box has to contain.
    dist1, dist2, dist3 : box size along the first, second and third axis.

    Returns
    -------
    (z_min, z_max, x_min, x_max, y_min, y_max) of the best box. When no candidate fits
    inside the volume, contains `min_coords` and has a mask sum greater than zero, the
    sentinel (-1, 1, 1, 1, 1, 1) is returned (callers test `z_min == -1`).
    '''
    coords_Z_min, coords_Z_max, coords_X_min, coords_X_max, coords_Y_min, coords_Y_max = min_coords
    vol_shape = np.shape(mask_lungs_)

    # The bounds and containment tests are independent per axis, so they are evaluated
    # once per axis instead of inside the innermost loop of a 160x160x160 search.
    z_limits = _axis_box_starts(coords_Z, dist1, vol_shape[0], coords_Z_min, coords_Z_max)
    x_limits = _axis_box_starts(coords_X, dist2, vol_shape[1], coords_X_min, coords_X_max)
    y_limits = _axis_box_starts(coords_Y, dist3, vol_shape[2], coords_Y_min, coords_Y_max)

    best_box = None
    max_sum = 0
    # Same z -> x -> y order as the exhaustive search; the strict '>' keeps the first
    # box found among boxes with an equal sum.
    for zmin, zmax in z_limits:
        for xmin, xmax in x_limits:
            for ymin, ymax in y_limits:
                this_sum = np.sum(mask_lungs_[zmin:zmax, xmin:xmax, ymin:ymax])
                if this_sum > max_sum:
                    max_sum = this_sum
                    best_box = (zmin, zmax, xmin, xmax, ymin, ymax)

    if best_box is None:
        return -1, 1, 1, 1, 1, 1
    return best_box

In [3]:
def from_five_to_two_classes(i):
    '''
    Map a 1-5 reviewer score to a binary label.

    v4: We change the output labels from  1 & 2 to 0 & 1
    Causey et al. Highly accurate model for prediction of lung nodule malignancy with CT scans
     We tested two designs: S1 versus S45, and S12 versus S45
     Here we discard all 3's and as long as there is another number the nodule is classified as malignant or benign

    Returns 0 for scores 1 and 2, 1 for scores 4 and 5, and an empty list for score 3
    (a placeholder for callers to remove).
    Raises ValueError for any other score (previously this surfaced as an
    UnboundLocalError because no branch assigned the result).
    '''
    if i == 1 or i == 2:
        return 0
    if i == 3:
        return []
    if i == 4 or i == 5:
        return 1
    raise ValueError(f'expected a score between 1 and 5, got {i!r}')

In [4]:
def texture3classes(array):
    '''Collapse five-level scores into three classes (used for texture).

    Scores 1 and 2 become 0, scores 3 and 4 become 1 and every other value becomes 2.
    Returns a new list with one class per input element.
    Adapted from /home/om18/Documents/KCL/18 Oct 9 - ladder on LIDC plus 70 nodules/
    CNN classifier based on LIDC texture v7 - working with nodules where there are no texture ties.ipynb'''
    def _to_class(score):
        if score == 1 or score == 2:
            return 0
        if score == 3 or score == 4:
            return 1
        return 2

    return [_to_class(score) for score in array]

In [5]:
def make_df_original_and_df_3_agree_3_classes(values, malignancies_, malignancies_mode_3_agree_, names_to_save_3_agree_, malignancies_original_):
    '''Append the three-class scores of one nodule to the accumulator lists.

    `values` are the original scores of one nodule. Their three-class version (see
    texture3classes) is appended to `malignancies_` and the untouched scores to
    `malignancies_original_`. When at least 2 scores share the most frequent class, that
    class is appended to `malignancies_mode_3_agree_` and the nodule name
    f'{i}_{n_ndl}' to `names_to_save_3_agree_`.

    NOTE: `i` and `n_ndl` are read from the notebook's global scope (set by the main loop).
    NOTE: despite the "3_agree" naming, the threshold used here is 2 reviewers.
    Returns the four accumulator lists (which are also modified in place).
    '''
    values3 = list(texture3classes(values))
    malignancies_.append(values3)
    malignancies_original_.append(values)

    if len(values3) > 0:
        # np.unique returns the classes sorted, and argmax returns the first maximum, so
        # with more than one mode the LOWEST one is taken (same rule as before). This
        # avoids indexing the result of scipy.stats.mode, whose shape depends on the
        # installed SciPy version.
        classes, counts = np.unique(values3, return_counts=True)
        top = np.argmax(counts)
        values_mode = classes[top]
        number_reviewers_agree = counts[top]
        if number_reviewers_agree >= 2:
            malignancies_mode_3_agree_.append(values_mode)
            names_to_save_3_agree_.append(f'{i}_{n_ndl}')

    return malignancies_, malignancies_mode_3_agree_, names_to_save_3_agree_, malignancies_original_

In [6]:
# df_one_nodule.texture.values, textures, textures_mode_3_agree, texture_names_to_save_3_agree, textures_original

In [7]:
# values3 = list(texture3classes(df_one_nodule.texture.values))
# textures.append(values3)
# textures_original.append(df_one_nodule.texture.values)
# values_mode = stats_mode(values3)[0][0]
# number_reviewers_agree = np.sum(np.asarray(values3) == [values_mode])

In [8]:
# values_mode, number_reviewers_agree

In [9]:
def plot_block_and_cube(orig, last, mask, coords_Z, orig_small, coords_Z_small, last_small, mask_small):
    '''Show one slice of the full block (top row) and of the small cube (bottom row).

    Columns are original, inpainted (`last`) and mask. The title of the first panel
    uses the notebook globals `idx`, `i` and `n_ndl`.
    '''
    fig, ax = plt.subplots(2, 3, figsize=(7,5))
    rows = (
        ((orig, last, mask), coords_Z),
        ((orig_small, last_small, mask_small), coords_Z_small),
    )
    for row, (volumes, z_slice) in enumerate(rows):
        for col, volume in enumerate(volumes):
            panel = ax[row, col]
            panel.imshow(volume[z_slice])
            if row == 0 and col == 0:
                panel.set_title(f'({idx}){i}, {n_ndl}')
            panel.axis('on')
    fig.tight_layout()
def plot_cube3(orig, last, mask, coords_Z, orig_small, coords_Z_small, last_small, mask_small):
    '''Show slice `coords_Z_small` of the small cube: original, inpainted and mask.

    Only the `*_small` arguments are plotted; the remaining parameters are accepted so the
    signature matches plot_block_and_cube. The title uses the notebook globals
    `idx`, `i` and `n_ndl`.
    '''
    fig, ax = plt.subplots(1, 3, figsize=(9,3))
    for col, volume in enumerate((orig_small, last_small, mask_small)):
        ax[col].imshow(volume[coords_Z_small])
        if col == 0:
            ax[col].set_title(f'({idx}){i}, {n_ndl}')
        ax[col].axis('off')
    fig.tight_layout()

In [10]:
def compare_labeled_and_df_coords(mask, coords_Z, coords_X, coords_Y, diff_thresh = 12):
    '''
    Match the DF coordinates of a nodule against the connected components of `mask`.

    v4: the comparison also includes the z direction (slices).
    Each component found by ndimage.label is visited in label order. The first one whose
    median voxel position is closer than `diff_thresh` to (coords_Z, coords_X, coords_Y)
    on all three axes is returned as [minZ, maxZ, minX, maxX, minY, maxY].
    Returns None when no component matches.'''
    labeled, n_items = ndimage.label(mask)
    for label_id in range(1, n_items + 1):
        z_idx, x_idx, y_idx = np.where(labeled == label_id)
        centre_z = int(np.median(z_idx))
        centre_x = int(np.median(x_idx))
        centre_y = int(np.median(y_idx))
        close_in_x = np.abs(coords_X - centre_x) < diff_thresh
        close_in_y = np.abs(coords_Y - centre_y) < diff_thresh
        close_in_z = np.abs(coords_Z - centre_z) < diff_thresh
        if not (close_in_x and close_in_y and close_in_z):
            continue
        # extent of the matching component along each axis
        return [z_idx.min(), z_idx.max(),
                x_idx.min(), x_idx.max(),
                y_idx.min(), y_idx.max()]
    return None

## Main

In [11]:
# Source folder and its sub-folders: pylidc characteristics (csv), inpainted
# reconstructions ("last"), original arrays, nodule masks and lung masks.
path_data = '/data/OMM/Datasets/LIDC_other_formats/LIDC_preprocessed_3D v5 - save pylidc chars only/v19/'
path_chars = path_data + 'pylidc_characteristics/'
path_last = path_data + 'arrays/last/'
path_orig = path_data + 'arrays/orig/'
path_mask = path_data + 'arrays/masks nodules/'
path_mask_lungs = path_data + 'arrays/masks lungs/'
# Destination folder for the cropped cubes and the classification dataframes.
path_dest = '/data/OMM/Datasets/LIDC_other_formats/LIDC_inpainted_multiple_classification_v6_ndl_not_centered/'

In [12]:
# Sorted file names of each input folder:
# DIP reconstructions, pylidc characteristics and nodule masks.
files_last = np.sort(os.listdir(path_last))
files_chars = np.sort(os.listdir(path_chars))
files_mask = np.sort(os.listdir(path_mask))

In [13]:
# Get the files that are common to the DIP reconstruction and the pylidc characteristics
files_last_cropped = [i.split('.npy')[0] for i in files_last]
files_chars_cropped = [i.split('.csv')[0] for i in files_chars]
files_last_cropped = list(np.unique(files_last_cropped))

files_common = list(set(files_last_cropped).intersection(set(files_chars_cropped)))
files_common = np.sort(files_common)

In [14]:
# Number of entries listed in path_last (the DIP reconstruction folder).
len(files_last)

1096

## Continue

In [15]:
def save_df_and_df_3_agree(path_dest_, names_to_save, feature, feature_names_to_save_3_agree, feature_mode_3_agree, feature_name, feature_original):
    '''Create and save DF of all scores and where 3 agree (of a single feature).

    Three csv files are written to f'{path_dest_}dataframes for classification/':
      df_classify_inpainted_<feature_name>.csv          names + transformed scores
      df_3_agree_classify_inpainted_<feature_name>.csv  names + agreed class
      df_original_classify_inpainted_<feature_name>.csv names + original scores
    The output folder is created when it does not exist yet.
    `path_dest_` is expected to end with a path separator (it is concatenated as-is).
    '''
    out_dir = f'{path_dest_}dataframes for classification/'
    os.makedirs(out_dir, exist_ok=True)

    # Build every frame before writing so a malformed input fails before any file is touched.
    frames = {
        'df_classify_inpainted': pd.DataFrame.from_dict({'names': names_to_save, feature_name: feature}),
        'df_3_agree_classify_inpainted': pd.DataFrame.from_dict({'names': feature_names_to_save_3_agree, feature_name: feature_mode_3_agree}),
        'df_original_classify_inpainted': pd.DataFrame.from_dict({'names': names_to_save, feature_name: feature_original}),
    }
    for prefix, frame in frames.items():
        frame.to_csv(f'{out_dir}{prefix}_{feature_name}.csv', index=False)

In [16]:
def make_df_original_and_df_3_agree(values, malignancies_, malignancies_mode_3_agree_, names_to_save_3_agree_, malignancies_original_):
    '''Append the two-class scores of one nodule to the accumulator lists.

    `values` are the original 1-5 scores of one nodule. Each score is mapped with
    from_five_to_two_classes; scores of 3 (mapped to an empty list) are discarded.
    The remaining labels are appended to `malignancies_` and the untouched scores to
    `malignancies_original_`. When at least 3 labels equal the mode, the mode is
    appended to `malignancies_mode_3_agree_` and the nodule name f'{i}_{n_ndl}' to
    `names_to_save_3_agree_`.

    NOTE: `i` and `n_ndl` are read from the notebook's global scope (set by the main loop).
    Returns the four accumulator lists (which are also modified in place).
    '''
    mapped = [from_five_to_two_classes(score) for score in values]
    # Drop only the [] placeholders produced for a score of 3. filter(None, ...) must not
    # be used here: it would also remove every label 0, which is a valid class.
    values3 = [label for label in mapped if not isinstance(label, list)]
    malignancies_.append(values3)
    malignancies_original_.append(values)

    try:
        values_mode = mode(values3)
    except StatisticsError:
        # no usable mode (e.g. every score was a 3): nothing to add to the "3 agree" lists
        return malignancies_, malignancies_mode_3_agree_, names_to_save_3_agree_, malignancies_original_

    # append to the "3 agree" lists only if at least 3 reviewers agree with the mode
    agree_with_mode = sum(1 for label in values3 if label == values_mode)
    if agree_with_mode >= 3:
        malignancies_mode_3_agree_.append(values_mode)
        names_to_save_3_agree_.append(f'{i}_{n_ndl}')

    return malignancies_, malignancies_mode_3_agree_, names_to_save_3_agree_, malignancies_original_

In [38]:
# names_to_save = []
# nodules_with_coords_errors = []
# malignancy_names_to_save_3_agree, malignancies, malignancies_mode_3_agree, malignancies_original = [], [], [], []
# lobulation_names_to_save_3_agree, lobulations, lobulations_mode_3_agree, lobulations_original = [], [], [], []
# sphericity_names_to_save_3_agree, sphericitys, sphericitys_mode_3_agree, sphericitys_original = [], [], [], []
# spiculation_names_to_save_3_agree, spiculations, spiculations_mode_3_agree, spiculations_original = [], [], [], []
# subtlety_names_to_save_3_agree, subtletys, subtletys_mode_3_agree, subtletys_original = [], [], [], []
# texture_names_to_save_3_agree, textures, textures_mode_3_agree, textures_original = [], [], [], []



# Main extraction loop. For every block name in files_common: load the inpainted ("last"),
# original, nodule-mask and lung-mask volumes, and for each nodule (cluster_id) of the
# block's pylidc csv crop a 64x64x64 cube, save the 3D arrays and one 2D slice, and
# collect the reviewers' scores.
# NOTE: the accumulator lists (names_to_save, malignancies, ...) are not reset in this cell
# (their initialisation is commented out), so they must already exist in the session and
# results are appended to them.
for idx, i in tqdm_notebook(enumerate(files_common), total=len(files_common)):
    # Blocks excluded by name -- presumably ones that failed in earlier runs; TODO confirm.
    if i in ['LIDC-IDRI-0124_block2_543210', 'LIDC-IDRI-0141_block1_43210', 'LIDC-IDRI-0149_block2_4320',
            'LIDC-IDRI-0179_block1_5', 'LIDC-IDRI-0179_block1_987643210','LIDC-IDRI-0309_block2_1',
            'LIDC-IDRI-0392_block2_210']: continue
    # Resume point: every block with idx <= 445 is skipped.
    if idx <= 445:continue # 152, 177, 189, 229, 230, 365, 445
    
    
    # Get the inpainted and original image and the mask
    # (a block is skipped when any of its four files is missing)
    try:
        last = np.load(f'{path_last}{i}.npy')
        last = np.squeeze(last)
        orig = np.load(f'{path_orig}{i}.npy')
        orig = np.squeeze(orig)
        mask = np.load(f'{path_mask}{i}.npz')
        mask = mask.f.arr_0
        mask_lungs = np.load(f'{path_mask_lungs}{i}.npz')
        mask_lungs = mask_lungs.f.arr_0
    except FileNotFoundError: continue
            
    df = pd.read_csv(f'{path_chars}{i}.csv')
    n_nodules = np.unique(df['cluster_id'].values)
    for n_ndl in n_nodules: # for each nodule in the DF

        # rows of this nodule; its centre is the (truncated) mean of the coordinate columns
        df_one_nodule = df.loc[df['cluster_id'] == n_ndl]
        coords_Z = int(np.mean(df_one_nodule['small_coordsZ_resampled'].values))
        coords_X = int(np.mean(df_one_nodule['small_coordsX'].values))
        coords_Y = int(np.mean(df_one_nodule['small_coordsY'].values))
        # if the DF and mask coords match then use the min and max of the latter 
        # (compare_labeled_and_df_coords returns None when no mask component is close enough)
        coords_limit = compare_labeled_and_df_coords(mask, coords_Z, coords_X, coords_Y)
        if coords_limit == None:
            nodules_with_coords_errors.append(f'{i}_{n_ndl}')
            continue
            
        # Get a cube around the nodule and the mask
        z_min_f, z_max_f, x_min_f, x_max_f, y_min_f, y_max_f = box_with_masks_search(coords_Z, coords_X, coords_Y, mask_lungs, coords_limit)
#         print(z_min_f, z_max_f, x_min_f, x_max_f, y_min_f, y_max_f)
        # z_min_f == -1 is the "no box found" sentinel of box_with_masks_search
        if z_min_f == -1:
            nodules_with_coords_errors.append(f'{i}_{n_ndl}')
            continue
        orig_small = orig[z_min_f: z_max_f, x_min_f:x_max_f, y_min_f:y_max_f]
        last_small = last[z_min_f: z_max_f, x_min_f:x_max_f, y_min_f:y_max_f]
        mask_small = mask[z_min_f: z_max_f, x_min_f:x_max_f, y_min_f:y_max_f]
        mask_lungs_small = mask_lungs[z_min_f: z_max_f, x_min_f:x_max_f, y_min_f:y_max_f]
        # Using the coords of the cube, get the coords of the nodule inside the cube
        # NOTE(review): nodules_with_shape_errors is not initialised in any visible cell;
        # this branch would raise NameError unless it is defined elsewhere -- confirm.
        if np.shape(orig_small) != (64,64,64):
            nodules_with_shape_errors.append(f'{i}_{n_ndl}')
            continue
        
        # insert the inpainted voxels into the original image
        # (inpainted values where mask_small is 1, original values where it is 0;
        #  assumes mask_small is binary -- TODO confirm)
        inpainted_inserted = (last_small * mask_small + (-mask_small+1)*orig_small)
        
        # Save figures and targets 3D version
        np.save(f'{path_dest}original/{i}_{n_ndl}.npy',orig_small)
        np.save(f'{path_dest}inpainted/{i}_{n_ndl}.npy',last_small)
        np.save(f'{path_dest}inpainted inserted/{i}_{n_ndl}.npy',inpainted_inserted)
        np.savez_compressed(f'{path_dest}mask/{i}_{n_ndl}',mask_small)
        np.savez_compressed(f'{path_dest}mask lungs/{i}_{n_ndl}',mask_lungs_small)
        names_to_save.append(f'{i}_{n_ndl}')
        
        # transform the 1-5 scores and accumulate them: two classes for every feature
        # except texture, which uses three classes
        malignancies, malignancies_mode_3_agree, malignancy_names_to_save_3_agree, malignancies_original = make_df_original_and_df_3_agree(df_one_nodule.malignancy.values, malignancies, malignancies_mode_3_agree, malignancy_names_to_save_3_agree, malignancies_original)
        lobulations, lobulations_mode_3_agree, lobulation_names_to_save_3_agree, lobulations_original = make_df_original_and_df_3_agree(df_one_nodule.lobulation.values, lobulations, lobulations_mode_3_agree, lobulation_names_to_save_3_agree, lobulations_original)
        sphericitys, sphericitys_mode_3_agree, sphericity_names_to_save_3_agree, sphericitys_original = make_df_original_and_df_3_agree(df_one_nodule.sphericity.values, sphericitys, sphericitys_mode_3_agree, sphericity_names_to_save_3_agree, sphericitys_original)
        spiculations, spiculations_mode_3_agree, spiculation_names_to_save_3_agree, spiculations_original = make_df_original_and_df_3_agree(df_one_nodule.spiculation.values, spiculations, spiculations_mode_3_agree, spiculation_names_to_save_3_agree, spiculations_original)
        subtletys, subtletys_mode_3_agree, subtlety_names_to_save_3_agree, subtletys_original = make_df_original_and_df_3_agree(df_one_nodule.subtlety.values, subtletys, subtletys_mode_3_agree, subtlety_names_to_save_3_agree, subtletys_original)
        textures, textures_mode_3_agree, texture_names_to_save_3_agree, textures_original = make_df_original_and_df_3_agree_3_classes(df_one_nodule.texture.values, textures, textures_mode_3_agree, texture_names_to_save_3_agree, textures_original)
        
        # get 2D version of nodules
        # We use the coordsZ from the df (from pylidc) to get the slice where the nodule is located, instead of
        # using np.where(mask==1), because the latter could find incorrect nodules in blocks with more than one nodule
        coordsZ_for_2D = int(np.median(df_one_nodule['small_coordsZ_resampled'].values))
        coordsX_for_2D = int(np.median(df_one_nodule['small_coordsX'].values))
        coordsY_for_2D = int(np.median(df_one_nodule['small_coordsY'].values))
        coordsX_for_df_2d_version = coordsX_for_2D - x_min_f
        coordsY_for_df_2d_version = coordsY_for_2D - y_min_f
        # NOTE(review): only an index >= 64 raises IndexError; a negative index
        # (coordsZ_for_2D < z_min_f) would silently select a slice counted from the end.
        # When the slice is skipped, the 3D files and the scores of this nodule have
        # already been saved/appended above.
        try:
            mask_small_2D = mask_small[coordsZ_for_2D- z_min_f]
            orig_small_2D = orig_small[coordsZ_for_2D- z_min_f]
            last_small_2D = last_small[coordsZ_for_2D- z_min_f]
            mask_lungs_small_2D = mask_lungs_small[coordsZ_for_2D- z_min_f]
            inpainted_inserted_2D = inpainted_inserted[coordsZ_for_2D- z_min_f]
        except IndexError:
            print(f'skipped {i}')
            continue
        
        # Save figures and targets 2D version
        np.save(f'{path_dest}versions2D/original/{i}_{n_ndl}.npy',orig_small_2D)
        np.save(f'{path_dest}versions2D/inpainted/{i}_{n_ndl}.npy',last_small_2D)
        np.save(f'{path_dest}versions2D/inpainted inserted/{i}_{n_ndl}.npy',inpainted_inserted_2D)
        np.savez_compressed(f'{path_dest}versions2D/mask/{i}_{n_ndl}',mask_small_2D)
        np.savez_compressed(f'{path_dest}versions2D/mask lungs/{i}_{n_ndl}',mask_lungs_small_2D)
            
        # progress line: block index, nodule name, original texture scores, 3-class texture scores
        print(f'{idx} {i}_{n_ndl} {df_one_nodule.texture.values}, {textures[-1]}')
            
        # These coords can be used to 'plot_block_and_cube'
#         coords_Z_small = coords_Z - z_min_f 
#         coords_X_small = coords_X - x_min_f 
#         coords_Y_small = coords_Y - y_min_f 
#         plot_block_and_cube(orig, last, mask, coords_Z, orig_small, coords_Z_small, last_small, mask_small)

# save the dataframes (three csv files per feature)
save_df_and_df_3_agree(path_dest, names_to_save, textures, texture_names_to_save_3_agree, textures_mode_3_agree, 'texture', textures_original)
save_df_and_df_3_agree(path_dest, names_to_save, malignancies, malignancy_names_to_save_3_agree, malignancies_mode_3_agree, 'malignancy', malignancies_original)
save_df_and_df_3_agree(path_dest, names_to_save, lobulations, lobulation_names_to_save_3_agree, lobulations_mode_3_agree, 'lobulation', lobulations_original)
save_df_and_df_3_agree(path_dest, names_to_save, sphericitys, sphericity_names_to_save_3_agree, sphericitys_mode_3_agree, 'sphericity', sphericitys_original)
save_df_and_df_3_agree(path_dest, names_to_save, spiculations, spiculation_names_to_save_3_agree, spiculations_mode_3_agree, 'spiculation', spiculations_original)
save_df_and_df_3_agree(path_dest, names_to_save, subtletys, subtlety_names_to_save_3_agree, subtletys_mode_3_agree, 'subtlety', subtletys_original)

# save the 2d versions

HBox(children=(IntProgress(value=0, max=1095), HTML(value='')))

446 LIDC-IDRI-0393_block2_0_0 [4 5], [1, 2]
447 LIDC-IDRI-0394_block1_6543210_0 [4 3], [1, 1]
447 LIDC-IDRI-0394_block1_6543210_1 [5 5 3], [2, 2, 1]
447 LIDC-IDRI-0394_block1_6543210_2 [4 5 1], [1, 2, 0]
447 LIDC-IDRI-0394_block1_6543210_3 [1 1 2 1], [0, 0, 0, 0]
447 LIDC-IDRI-0394_block1_6543210_4 [2 2 5], [0, 0, 2]
447 LIDC-IDRI-0394_block1_6543210_5 [1 1 5 1], [0, 0, 2, 0]
447 LIDC-IDRI-0394_block1_6543210_6 [5], [2]
448 LIDC-IDRI-0394_block2_0_7 [1], [0]
449 LIDC-IDRI-0395_block1_10_1 [5], [2]
449 LIDC-IDRI-0395_block1_10_2 [3 4], [1, 1]
450 LIDC-IDRI-0395_block2_0_3 [5 4 5], [2, 1, 2]
451 LIDC-IDRI-0396_block1_0_0 [5], [2]
452 LIDC-IDRI-0396_block2_0_1 [1], [0]
453 LIDC-IDRI-0398_block1_43210_1 [5 5 4 5], [2, 2, 1, 2]
453 LIDC-IDRI-0398_block1_43210_2 [5], [2]
453 LIDC-IDRI-0398_block1_43210_4 [5], [2]
453 LIDC-IDRI-0398_block1_43210_5 [5], [2]
453 LIDC-IDRI-0398_block1_43210_6 [5 5 5], [2, 2, 2]
453 LIDC-IDRI-0398_block1_43210_7 [5], [2]
skipped LIDC-IDRI-0398_block1_43210
454 LI

554 LIDC-IDRI-0481_block2_3210_6 [5 5 5], [2, 2, 2]
555 LIDC-IDRI-0483_block1_0_0 [5 4], [2, 1]
556 LIDC-IDRI-0484_block1_10_0 [5 5 5 4], [2, 2, 2, 1]
556 LIDC-IDRI-0484_block1_10_1 [5], [2]
557 LIDC-IDRI-0485_block1_10_1 [5 5 5 5], [2, 2, 2, 2]
557 LIDC-IDRI-0485_block1_10_3 [5 5 4], [2, 2, 1]
557 LIDC-IDRI-0485_block1_10_4 [5 5 5 5], [2, 2, 2, 2]
558 LIDC-IDRI-0485_block2_210_0 [3 1], [1, 0]
558 LIDC-IDRI-0485_block2_210_2 [1], [0]
558 LIDC-IDRI-0485_block2_210_3 [5 5 4], [2, 2, 1]
558 LIDC-IDRI-0485_block2_210_4 [5 5 5 5], [2, 2, 2, 2]
559 LIDC-IDRI-0486_block1_10_0 [3 5 5 5], [1, 2, 2, 2]
559 LIDC-IDRI-0486_block1_10_1 [5 5 5], [2, 2, 2]
560 LIDC-IDRI-0487_block1_10_0 [5 5 4 5], [2, 2, 1, 2]
560 LIDC-IDRI-0487_block1_10_2 [5], [2]
561 LIDC-IDRI-0487_block1_32_3 [5], [2]
561 LIDC-IDRI-0487_block1_32_5 [5 5 5], [2, 2, 2]
562 LIDC-IDRI-0487_block1_654_6 [5], [2]
562 LIDC-IDRI-0487_block1_654_7 [5], [2]
562 LIDC-IDRI-0487_block1_654_8 [5], [2]
563 LIDC-IDRI-0487_block2_0_1 [5], [2]
563

651 LIDC-IDRI-0583_block2_4210_11 [5], [2]
652 LIDC-IDRI-0583_block2_87653_9 [5 5 4 5], [2, 2, 1, 2]
652 LIDC-IDRI-0583_block2_87653_11 [5], [2]
652 LIDC-IDRI-0583_block2_87653_15 [5], [2]
652 LIDC-IDRI-0583_block2_87653_16 [5], [2]
652 LIDC-IDRI-0583_block2_87653_19 [5 4 5 5], [2, 1, 2, 2]
652 LIDC-IDRI-0583_block2_87653_21 [5], [2]
653 LIDC-IDRI-0583_block2_9_15 [5], [2]
653 LIDC-IDRI-0583_block2_9_16 [5], [2]
653 LIDC-IDRI-0583_block2_9_19 [5 4 5 5], [2, 1, 2, 2]
653 LIDC-IDRI-0583_block2_9_21 [5], [2]
653 LIDC-IDRI-0583_block2_9_22 [5], [2]
654 LIDC-IDRI-0584_block1_0_0 [4], [1]
655 LIDC-IDRI-0586_block1_10_0 [5 1 5 5], [2, 0, 2, 2]
655 LIDC-IDRI-0586_block1_10_3 [3], [1]
656 LIDC-IDRI-0586_block2_10_1 [5 3 5 5], [2, 1, 2, 2]
656 LIDC-IDRI-0586_block2_10_2 [5], [2]
657 LIDC-IDRI-0587_block1_410_1 [2 3], [0, 1]
657 LIDC-IDRI-0587_block1_410_2 [5], [2]
657 LIDC-IDRI-0587_block1_410_7 [2 1], [0, 0]
658 LIDC-IDRI-0587_block2_0_0 [2 5 1 2], [0, 2, 0, 0]
skipped LIDC-IDRI-0587_block2_0
6

744 LIDC-IDRI-0662_block1_3210_2 [5], [2]
744 LIDC-IDRI-0662_block1_3210_3 [4 5], [1, 2]
744 LIDC-IDRI-0662_block1_3210_4 [5], [2]
745 LIDC-IDRI-0662_block2_0_0 [5], [2]
747 LIDC-IDRI-0666_block1_210_1 [1], [0]
747 LIDC-IDRI-0666_block1_210_2 [5 5], [2, 2]
747 LIDC-IDRI-0666_block1_210_4 [5], [2]
748 LIDC-IDRI-0666_block2_10_0 [1], [0]
748 LIDC-IDRI-0666_block2_10_3 [1], [0]
749 LIDC-IDRI-0670_block1_0_0 [1], [0]
750 LIDC-IDRI-0671_block1_43210_0 [4], [1]
750 LIDC-IDRI-0671_block1_43210_1 [4], [1]
750 LIDC-IDRI-0671_block1_43210_2 [5 5 4 5], [2, 2, 1, 2]
750 LIDC-IDRI-0671_block1_43210_3 [5 5 5 5], [2, 2, 2, 2]
750 LIDC-IDRI-0671_block1_43210_4 [4], [1]
750 LIDC-IDRI-0671_block1_43210_6 [5 5 4 5], [2, 2, 1, 2]
751 LIDC-IDRI-0671_block2_10_5 [5 4 5], [2, 1, 2]
751 LIDC-IDRI-0671_block2_10_6 [5 5 4 5], [2, 2, 1, 2]
755 LIDC-IDRI-0674_block1_0_3 [5 5 5 5], [2, 2, 2, 2]
757 LIDC-IDRI-0674_block2_1_1 [5], [2]
758 LIDC-IDRI-0674_block2_20_0 [4], [1]
758 LIDC-IDRI-0674_block2_20_2 [3], [1]
sk

838 LIDC-IDRI-0770_block2_9876543_9 [5 5 5 5], [2, 2, 2, 2]
838 LIDC-IDRI-0770_block2_9876543_10 [5 5], [2, 2]
838 LIDC-IDRI-0770_block2_9876543_11 [5], [2]
839 LIDC-IDRI-0771_block1_0_0 [4 5 5 5], [1, 2, 2, 2]
840 LIDC-IDRI-0772_block1_0_1 [4 5 5 1], [1, 2, 2, 0]
841 LIDC-IDRI-0772_block2_0_0 [5 5 5], [2, 2, 2]
842 LIDC-IDRI-0773_block1_3210_0 [5 5], [2, 2]
842 LIDC-IDRI-0773_block1_3210_2 [5], [2]
842 LIDC-IDRI-0773_block1_3210_4 [5 5 2 5], [2, 2, 0, 2]
842 LIDC-IDRI-0773_block1_3210_5 [5 4 5], [2, 1, 2]
843 LIDC-IDRI-0773_block2_10_1 [5 5], [2, 2]
843 LIDC-IDRI-0773_block2_10_3 [5], [2]
844 LIDC-IDRI-0775_block1_210_0 [5 5 4], [2, 2, 1]
844 LIDC-IDRI-0775_block1_210_2 [5 5 4 5], [2, 2, 1, 2]
844 LIDC-IDRI-0775_block1_210_3 [5 5 5 5], [2, 2, 2, 2]
skipped LIDC-IDRI-0775_block1_210
845 LIDC-IDRI-0775_block1_76543_4 [5 3 5], [2, 1, 2]
845 LIDC-IDRI-0775_block1_76543_5 [3], [1]
845 LIDC-IDRI-0775_block1_76543_6 [5 5 2 1], [2, 2, 0, 0]
845 LIDC-IDRI-0775_block1_76543_7 [5 2], [2, 0]
845 

948 LIDC-IDRI-0869_block1_3210_2 [1 1], [0, 0]
948 LIDC-IDRI-0869_block1_3210_3 [5 5], [2, 2]
949 LIDC-IDRI-0870_block1_6543210_4 [5 3], [2, 1]
949 LIDC-IDRI-0870_block1_6543210_5 [5 4], [2, 1]
949 LIDC-IDRI-0870_block1_6543210_6 [5 5 4], [2, 2, 1]
949 LIDC-IDRI-0870_block1_6543210_7 [5 5 4], [2, 2, 1]
949 LIDC-IDRI-0870_block1_6543210_8 [5 5 4], [2, 2, 1]
949 LIDC-IDRI-0870_block1_6543210_9 [5 4], [2, 1]
949 LIDC-IDRI-0870_block1_6543210_10 [5 4 3], [2, 1, 1]
950 LIDC-IDRI-0870_block2_210_0 [4 5], [1, 2]
950 LIDC-IDRI-0870_block2_210_1 [3 5], [1, 2]
950 LIDC-IDRI-0870_block2_210_2 [5 5 4], [2, 2, 1]
951 LIDC-IDRI-0870_block2_3_2 [5 5 4], [2, 2, 1]
951 LIDC-IDRI-0870_block2_3_3 [5 5 3], [2, 2, 1]
952 LIDC-IDRI-0871_block1_210_5 [4 5 5], [1, 2, 2]
952 LIDC-IDRI-0871_block1_210_6 [4 5 5 5], [1, 2, 2, 2]
952 LIDC-IDRI-0871_block1_210_8 [5 5], [2, 2]
953 LIDC-IDRI-0871_block2_543210_0 [3], [1]
953 LIDC-IDRI-0871_block2_543210_1 [1], [0]
953 LIDC-IDRI-0871_block2_543210_2 [4 5 5 5], [1, 2, 

1046 LIDC-IDRI-0973_block1_1_2 [5 5 4], [2, 2, 1]
1046 LIDC-IDRI-0973_block1_1_3 [4], [1]
1047 LIDC-IDRI-0973_block1_320_0 [5 5 4], [2, 2, 1]
1047 LIDC-IDRI-0973_block1_320_2 [5 5 4], [2, 2, 1]
1047 LIDC-IDRI-0973_block1_320_3 [4], [1]
1048 LIDC-IDRI-0974_block2_10_0 [5 5 5 5], [2, 2, 2, 2]
1048 LIDC-IDRI-0974_block2_10_1 [5 5 5], [2, 2, 2]
1049 LIDC-IDRI-0976_block1_10_1 [5 5 5 5], [2, 2, 2, 2]
1049 LIDC-IDRI-0976_block1_10_2 [5], [2]
1050 LIDC-IDRI-0976_block2_0_0 [5], [2]
1051 LIDC-IDRI-0977_block2_0_0 [5], [2]
1052 LIDC-IDRI-0978_block1_1_0 [5], [2]
1052 LIDC-IDRI-0978_block1_1_2 [5], [2]
1052 LIDC-IDRI-0978_block1_1_3 [5], [2]
1053 LIDC-IDRI-0978_block1_20_0 [5], [2]
1053 LIDC-IDRI-0978_block1_20_2 [5], [2]
1054 LIDC-IDRI-0978_block2_0_1 [5 4 5 5], [2, 1, 2, 2]
1055 LIDC-IDRI-0980_block2_210_0 [5 5 5 5], [2, 2, 2, 2]
1055 LIDC-IDRI-0980_block2_210_1 [1], [0]
1055 LIDC-IDRI-0980_block2_210_2 [3 5 4], [1, 2, 1]
1056 LIDC-IDRI-0984_block2_0_0 [2 1 5], [0, 0, 2]
1057 LIDC-IDRI-0985_bl

In [37]:
# Display the current values of the loop variables (index and block name).
idx, i

(445, 'LIDC-IDRI-0392_block2_210')

In [None]:
# Visual check of the saved 2D versions: for the first 41 files (sorted by name) show the
# "inpainted inserted" slice, its mask and the original slice side by side.
# NOTE: this loop overwrites the notebook globals `idx` and `i` used by the main loop.
path_ttt = '/data/OMM/Datasets/LIDC_other_formats/LIDC_inpainted_multiple_classification_v6_ndl_not_centered/versions2D/'
ff = os.listdir(f'{path_ttt}inpainted inserted/')
ff = np.sort(ff)
for idx, i in enumerate(ff):
    if idx >40:break
    #if '0012_block2_76543210' not in i: continue
        
    a = np.load(f'{path_ttt}inpainted inserted/{i}')
    o = np.load(f'{path_ttt}original/{i}')
    # masks are stored as .npz: replace the last character of the file name ('y' of '.npy') with 'z'
    m = np.load(f'{path_ttt}mask/{i[:-1]}z')
    m = m.f.arr_0
    #labeled, numpatches = ndimage.label(m, structure=struct)
    #print(numpatches, i)
    fig, ax = plt.subplots(1,3,figsize=(14,5))
    ax[0].imshow(a)
    ax[1].imshow(m)
    ax[2].imshow(o)

In [None]:
# Show the median z-slice of the voxels where mask_small == 1:
# mask, inpainted-inserted cube and original cube.
z,x,y = np.where(mask_small==1)
zz = int(np.median(z))
fig, ax = plt.subplots(1,3,figsize=(14,5))
ax[0].imshow(mask_small[zz])
ax[1].imshow(inpainted_inserted[zz])
ax[2].imshow(orig_small[zz])

In [None]:
# Label the connected components of mask_small using a full 3x3x3 structuring element
# (voxels touching by face, edge or corner belong to the same component).
# `bool` replaces the "bool8" dtype alias, which is deprecated in NumPy.
struct = np.ones((3, 3, 3), dtype=bool)
labeled, numpatches = ndimage.label(mask_small, structure=struct)
numpatches

In [None]:
# Sum of mask_small inside each labeled component (labels 1..numpatches).
# NOTE(review): sizes.max()/sizes.min() fail when numpatches is 0 -- confirm a mask is present.
sizes = ndimage.sum(mask_small,labeled,range(1,numpatches+1)) 
# To get the indices of all the min/max patches.
# (+1 converts positions in `sizes` back to label values, which start at 1)
maxp = np.where(sizes==sizes.max())[0] + 1 
minp = np.where(sizes==sizes.min())[0] + 1

In [None]:
# Lookup table indexed by label value: 1 for the largest component(s), 0 otherwise
# (entry 0 is the background label).
max_index = np.zeros(numpatches + 1, np.uint8)
max_index[maxp] = 1
# Map every voxel's label through the table to keep only the largest component(s).
max_feature = max_index[labeled]

In [None]:
# Display the shape of the label lookup table (numpatches + 1 entries).
np.shape(max_index)

In [None]:
# Show slice zz of the largest-component volume.
plt.imshow(max_feature[zz])

In [None]:
# Display the nodule centre currently held in the notebook (set by the main loop).
coords_Z, coords_X, coords_Y

In [None]:
# Re-run the mask/DF coordinate match for the current nodule; displays the
# [minZ, maxZ, minX, maxX, minY, maxY] limits, or nothing when no component matches.
compare_labeled_and_df_coords(mask, coords_Z, coords_X, coords_Y, diff_thresh = 12)

In [None]:
# Load and display the saved dataframe with the original texture scores.
# NOTE: this rebinds `df`, the name the main loop uses for the current block's characteristics.
df = pd.read_csv(f'{path_dest}dataframes for classification/df_original_classify_inpainted_texture.csv')
df

In [None]:
# Display the characteristics rows of the nodule last processed by the main loop.
df_one_nodule

In [None]:
# Show the median z-slice of the voxels where the full-block mask == 1: mask and original.
z,x,y = np.where(mask==1)
zz = int(np.median(z))
fig, ax = plt.subplots(1,2)
ax[0].imshow(mask[zz])
ax[1].imshow(orig[zz])

In [None]:
# Show the mask slice that the main loop selects for the 2D version of the nodule.
plt.imshow(mask_small[coordsZ_for_2D - z_min_f])

In [None]:
# Plot each of the 2D arrays of the current nodule in its own figure.
# NOTE: the loop variable rebinds the notebook global `i` (the block name in the main loop).
for i in [mask_small_2D, orig_small_2D, last_small_2D, mask_lungs_small_2D, inpainted_inserted_2D]:
    plt.figure()
    plt.imshow(i)

In [None]:
names_to_save = []
nodules_with_coords_errors = []
malignancy_names_to_save_3_agree, malignancies, malignancies_mode_3_agree, malignancies_original = [], [], [], []
lobulation_names_to_save_3_agree, lobulations, lobulations_mode_3_agree, lobulations_original = [], [], [], []
sphericity_names_to_save_3_agree, sphericitys, sphericitys_mode_3_agree, sphericitys_original = [], [], [], []
spiculation_names_to_save_3_agree, spiculations, spiculations_mode_3_agree, spiculations_original = [], [], [], []
subtlety_names_to_save_3_agree, subtletys, subtletys_mode_3_agree, subtletys_original = [], [], [], []
texture_names_to_save_3_agree, textures, textures_mode_3_agree, textures_original = [], [], [], []



for idx, i in tqdm_notebook(enumerate(files_common), total=len(files_common)):
    if idx == 10:break
    
    # Get the inpainted and original image and the mask
    try:
        last = np.load(f'{path_last}{i}.npy')
        last = np.squeeze(last)
        orig = np.load(f'{path_orig}{i}.npy')
        orig = np.squeeze(orig)
        mask = np.load(f'{path_mask}{i}.npz')
        mask = mask.f.arr_0
        mask_lungs = np.load(f'{path_mask_lungs}{i}.npz')
        mask_lungs = mask_lungs.f.arr_0
    except FileNotFoundError: continue
            
    df = pd.read_csv(f'{path_chars}{i}.csv')
    n_nodules = np.unique(df['cluster_id'].values)
    for n_ndl in n_nodules: # for each nodule in the DF

        df_one_nodule = df.loc[df['cluster_id'] == n_ndl]
        coords_Z = int(np.mean(df_one_nodule['small_coordsZ_resampled'].values))
        coords_X = int(np.mean(df_one_nodule['small_coordsX'].values))
        coords_Y = int(np.mean(df_one_nodule['small_coordsY'].values))
        # if the DF and mask coords match then use the min and max of the latter 
        coords_limit = compare_labeled_and_df_coords(mask, coords_Z, coords_X, coords_Y)
        if coords_limit == None:
            nodules_with_coords_errors.append(f'{i}_{n_ndl}')
            continue
            
        # Get a cube around the nodule and the mask
        z_min_f, z_max_f, x_min_f, x_max_f, y_min_f, y_max_f = box_with_masks_search(coords_Z, coords_X, coords_Y, mask_lungs, coords_limit)
#         print(z_min_f, z_max_f, x_min_f, x_max_f, y_min_f, y_max_f)
        if z_min_f == -1:
            nodules_with_coords_errors.append(f'{i}_{n_ndl}')
            continue
        orig_small = orig[z_min_f: z_max_f, x_min_f:x_max_f, y_min_f:y_max_f]
        last_small = last[z_min_f: z_max_f, x_min_f:x_max_f, y_min_f:y_max_f]
        mask_small = mask[z_min_f: z_max_f, x_min_f:x_max_f, y_min_f:y_max_f]
        mask_lungs_small = mask_lungs[z_min_f: z_max_f, x_min_f:x_max_f, y_min_f:y_max_f]
        # Using the coords of the cube, get the coords of the nodule inside the cube
        if np.shape(orig_small) != (64,64,64):
            nodules_with_shape_errors.append(f'{i}_{n_ndl}')
            continue
        
        # insert the inpainted voxels into the original image
        inpainted_inserted = (last_small * mask_small + (-mask_small+1)*orig_small)
        
        # Save figures and targets
        np.save(f'{path_dest}original/{i}_{n_ndl}.npy',orig_small)
        np.save(f'{path_dest}inpainted/{i}_{n_ndl}.npy',last_small)
        np.save(f'{path_dest}inpainted inserted/{i}_{n_ndl}.npy',inpainted_inserted)
        np.savez_compressed(f'{path_dest}mask/{i}_{n_ndl}',mask_small)
        np.savez_compressed(f'{path_dest}mask lungs/{i}_{n_ndl}',mask_lungs_small)
        names_to_save.append(f'{i}_{n_ndl}')
        
        malignancies, malignancies_mode_3_agree, malignancy_names_to_save_3_agree, malignancies_original = make_df_original_and_df_3_agree(df_one_nodule.malignancy.values, malignancies, malignancies_mode_3_agree, malignancy_names_to_save_3_agree, malignancies_original)
        lobulations, lobulations_mode_3_agree, lobulation_names_to_save_3_agree, lobulations_original = make_df_original_and_df_3_agree(df_one_nodule.lobulation.values, lobulations, lobulations_mode_3_agree, lobulation_names_to_save_3_agree, lobulations_original)
        sphericitys, sphericitys_mode_3_agree, sphericity_names_to_save_3_agree, sphericitys_original = make_df_original_and_df_3_agree(df_one_nodule.sphericity.values, sphericitys, sphericitys_mode_3_agree, sphericity_names_to_save_3_agree, sphericitys_original)
        spiculations, spiculations_mode_3_agree, spiculation_names_to_save_3_agree, spiculations_original = make_df_original_and_df_3_agree(df_one_nodule.spiculation.values, spiculations, spiculations_mode_3_agree, spiculation_names_to_save_3_agree, spiculations_original)
        subtletys, subtletys_mode_3_agree, subtlety_names_to_save_3_agree, subtletys_original = make_df_original_and_df_3_agree(df_one_nodule.subtlety.values, subtletys, subtletys_mode_3_agree, subtlety_names_to_save_3_agree, subtletys_original)
        textures, textures_mode_3_agree, texture_names_to_save_3_agree, textures_original = make_df_original_and_df_3_agree(df_one_nodule.texture.values, textures, textures_mode_3_agree, texture_names_to_save_3_agree, textures_original)
        
        # get 2D version of nodules
        coordsZ_for_2D = int(np.median(df_one_nodule['small_coordsZ_resampled'].values))
        coordsX_for_2D = int(np.median(df_one_nodule['small_coordsX'].values))
        coordsY_for_2D = int(np.median(df_one_nodule['small_coordsY'].values))
        coordsX_for_df_2d_version = coordsX_for_2D - x_min_f
        coordsY_for_df_2d_version = coordsY_for_2D - y_min_f
        mask_small_2D = mask_small[coordsZ_for_2D+1- z_min_f]
        orig_small_2D = orig_small[coordsZ_for_2D+1- z_min_f]
        last_small_2D = last_small[coordsZ_for_2D+1- z_min_f]
        mask_lungs_small_2D = mask_lungs_small[coordsZ_for_2D+1- z_min_f]
        inpainted_inserted_2D = inpainted_inserted[coordsZ_for_2D+1- z_min_f]
            
        print(f'texture {i}_{n_ndl} {df_one_nodule.texture.values}, {textures[-1]}')
            
        # These coords can be used to 'plot_block_and_cube'
#         coords_Z_small = coords_Z - z_min_f 
#         coords_X_small = coords_X - x_min_f 
#         coords_Y_small = coords_Y - y_min_f 
#         plot_block_and_cube(orig, last, mask, coords_Z, orig_small, coords_Z_small, last_small, mask_small)

# Hand the collected ratings of every characteristic to save_df_and_df_3_agree,
# one call per characteristic (same order as before: texture first, subtlety last).
_characteristics_to_save = (
    ('texture', textures, texture_names_to_save_3_agree, textures_mode_3_agree, textures_original),
    ('malignancy', malignancies, malignancy_names_to_save_3_agree, malignancies_mode_3_agree, malignancies_original),
    ('lobulation', lobulations, lobulation_names_to_save_3_agree, lobulations_mode_3_agree, lobulations_original),
    ('sphericity', sphericitys, sphericity_names_to_save_3_agree, sphericitys_mode_3_agree, sphericitys_original),
    ('spiculation', spiculations, spiculation_names_to_save_3_agree, spiculations_mode_3_agree, spiculations_original),
    ('subtlety', subtletys, subtlety_names_to_save_3_agree, subtletys_mode_3_agree, subtletys_original),
)
for _label, _ratings, _names_3_agree, _modes_3_agree, _ratings_original in _characteristics_to_save:
    save_df_and_df_3_agree(names_to_save, _ratings, _names_3_agree, _modes_3_agree, _label, _ratings_original)

# df_to_classify = pd.DataFrame.from_dict({'names': names_to_save, 'malignancy': malignancies})
# df_to_classify_3_agree = pd.DataFrame.from_dict({'names': malignancy_names_to_save_3_agree, 'malignancy': malignancies_mode_3_agree})
# df_to_classify.to_csv(f'{path_dest}df_classify_inpainted_malignancy.csv', index=False)
# df_to_classify_3_agree.to_csv(f'{path_dest}df_3_agree_classify_inpainted_malignancy.csv', index=False)

In [None]:
# Cut a 64x64x64 cube around every annotated nodule, save the cubes and gather the
# reviewers' malignancy ratings (mapped through from_five_to_three_classes).
names_to_save, names_to_save_3_agree = [], []
malignancies_original = []
malignancies = []
nodules_with_coords_errors = []
# appended to below, so it has to exist even when this cell is run on its own
nodules_with_shape_errors = []
malignancies_mode, malignancies_mode_3_agree = [], []

for idx, i in tqdm_notebook(enumerate(files_common), total=len(files_common)):
    # NOTE(review): only the first 10 scans are processed - this looks like a debugging limit
    if idx == 10:break

    # Get the inpainted and original image and the masks; scans with a missing file are skipped
    try:
        last = np.squeeze(np.load(f'{path_last}{i}.npy'))
        orig = np.squeeze(np.load(f'{path_orig}{i}.npy'))
        # the context manager closes the .npz archive once the array has been read
        with np.load(f'{path_mask}{i}.npz') as npz_mask:
            mask = npz_mask['arr_0']
        with np.load(f'{path_mask_lungs}{i}.npz') as npz_mask_lungs:
            mask_lungs = npz_mask_lungs['arr_0']
    except FileNotFoundError: continue

    df = pd.read_csv(f'{path_chars}{i}.csv')
    n_nodules = np.unique(df['cluster_id'].values)
    for n_ndl in n_nodules: # for each nodule in the DF
        nodule_name = f'{i}_{n_ndl}'

        # mean annotated position of the nodule (one row per annotation in the DF)
        df_one_nodule = df.loc[df['cluster_id'] == n_ndl]
        coords_Z = int(np.mean(df_one_nodule['small_coordsZ_resampled'].values))
        coords_X = int(np.mean(df_one_nodule['small_coordsX'].values))
        coords_Y = int(np.mean(df_one_nodule['small_coordsY'].values))
        # if the DF and mask coords match then use the min and max of the latter
        coords_limit = compare_labeled_and_df_coords(mask, coords_Z, coords_X, coords_Y)
        if coords_limit is None:
            nodules_with_coords_errors.append(nodule_name)
            continue

        # Get a cube around the nodule and the mask
        z_min_f, z_max_f, x_min_f, x_max_f, y_min_f, y_max_f = box_with_masks_search(coords_Z, coords_X, coords_Y, mask_lungs, coords_limit)
        # a z_min of -1 is treated as "no usable box was found"
        if z_min_f == -1:
            nodules_with_coords_errors.append(nodule_name)
            continue
        orig_small = orig[z_min_f: z_max_f, x_min_f:x_max_f, y_min_f:y_max_f]
        last_small = last[z_min_f: z_max_f, x_min_f:x_max_f, y_min_f:y_max_f]
        mask_small = mask[z_min_f: z_max_f, x_min_f:x_max_f, y_min_f:y_max_f]
        mask_lungs_small = mask_lungs[z_min_f: z_max_f, x_min_f:x_max_f, y_min_f:y_max_f]
        # cubes that were truncated by the volume border are not saved
        if np.shape(orig_small) != (64,64,64):
            nodules_with_shape_errors.append(nodule_name)
            continue

        # Save figures and targets
        # inpainted voxels where mask_small is 1, original voxels elsewhere (assumes a 0/1 mask)
        inpainted_inserted = (last_small * mask_small + (-mask_small+1)*orig_small)
        np.save(f'{path_dest}original/{i}_{n_ndl}.npy',orig_small)
        np.save(f'{path_dest}inpainted/{i}_{n_ndl}.npy',last_small)
        np.save(f'{path_dest}inpainted inserted/{i}_{n_ndl}.npy',inpainted_inserted)
        np.savez_compressed(f'{path_dest}mask/{i}_{n_ndl}',mask_small)
        np.savez_compressed(f'{path_dest}mask lungs/{i}_{n_ndl}',mask_lungs_small)
        names_to_save.append(nodule_name)

        # Get the malignancy score (ratings that map to a falsy value are dropped)
        malignancy = list(map(from_five_to_three_classes, df_one_nodule.malignancy.values))
        malignancy = list(filter(None, malignancy))
        malignancies.append(malignancy)

        # statistics.mode raises StatisticsError when no mode can be determined
        try:
            malignancy_mode = mode(malignancy)
        except StatisticsError:
            continue
        malignancies_mode.append(malignancy_mode)
        # keep the nodule in the "3 agree" table if at least 3 reviewers gave the modal rating
        agree_with_mode = malignancy.count(malignancy_mode)
        if agree_with_mode >= 3:
            malignancies_mode_3_agree.append(malignancy_mode)
            names_to_save_3_agree.append(nodule_name)

        # These coords can be used to 'plot_block_and_cube'
#         coords_Z_small = coords_Z - z_min_f 
#         coords_X_small = coords_X - x_min_f 
#         coords_Y_small = coords_Y - y_min_f 
#         plot_block_and_cube(orig, last, mask, coords_Z, orig_small, coords_Z_small, last_small, mask_small)

df_to_classify = pd.DataFrame.from_dict({'names': names_to_save, 'malignancy': malignancies})
df_to_classify_3_agree = pd.DataFrame.from_dict({'names': names_to_save_3_agree, 'malignancy': malignancies_mode_3_agree})
df_to_classify.to_csv(f'{path_dest}df_classify_inpainted_malignancy.csv', index=False)
df_to_classify_3_agree.to_csv(f'{path_dest}df_3_agree_classify_inpainted_malignancy.csv', index=False)

In [None]:
# Preview the first rows of df_to_classify
df_to_classify.head()

In [None]:
# List the column names of the characteristics table currently held in df
df.columns

In [None]:
# Show only the six rating columns of df
df[['lobulation', 'malignancy','sphericity', 'spiculation','subtlety', 'texture']]

In [None]:
# Sorted names of the files in the 'inpainted' destination folder
ff = np.sort(os.listdir(f'{path_dest}inpainted'))

In [None]:
# For every saved nodule show the mask, the 'inpainted inserted' cube and the original
# cube side by side, all at the median z index of the mask voxels.
for idx, i in enumerate(ff):
    ndl = np.load(f'{path_dest}inpainted inserted/{i}')
    orig = np.load(f'{path_dest}original/{i}')
    # masks are stored as .npz: the last character of the name is swapped for 'z'
    # (assumes the names in ff end in '.npy')
    with np.load(f'{path_dest}mask/{i[:-1]}z') as npz_mask:
        mask = npz_mask['arr_0']
    z,y,x = np.where(mask==1)
    zz = int(np.median(z))
    # plt.subplots creates its own figure; a preceding plt.figure() only left an
    # extra empty figure behind for every nodule
    fig, ax = plt.subplots(1,3, figsize=(14,5))
    ax[0].imshow(mask[zz])
    ax[1].imshow(ndl[zz])
    ax[2].imshow(orig[zz])
    for axx in ax.ravel(): axx.axis('off')

In [None]:
# Load the characteristics CSV of the first entry of files_common and display it
df = pd.read_csv(f'{path_chars}{files_common[0]}.csv')
df

In [None]:
# List the column names of df
df.columns

## old code

In [None]:
# Old version of the extraction loop: the volumes are stored per block
# ('{scan}_block{n}'), so they are loaded once per nodule instead of once per scan.
malignancies = []
malignancies_original = []
nodules_with_coords_errors = []
nodules_with_shape_errors = []
malignancies_mode, malignancies_mode_3_agree = [], []
names_to_save, names_to_save_3_agree = [], []

TOTAL_NODULES = 0
out=0
# index of the single scan that is processed (debugging aid)
idx_test = 0
for idx, i in tqdm_notebook(enumerate(files_common), total=len(files_common)):
    if idx < idx_test: continue
    if idx == idx_test +1 :break

    # Read the pylidc chars for each patient
    df=pd.read_csv(f'{path_chars}{i}.csv')
    # For each nodule get the mean coordinates and get in which block is the nodule 
    n_nodules = np.unique(df['cluster_id'].values)
    for n_ndl in n_nodules:
        nodule_name = f'{i}_{n_ndl}'

        df_one_nodule = df.loc[df['cluster_id'] == n_ndl]
        coords_Z = int(np.mean(df_one_nodule['small_coordsZ_resampled'].values))
        coords_X = int(np.mean(df_one_nodule['small_coordsX'].values))
        coords_Y = int(np.mean(df_one_nodule['small_coordsY'].values))
        df_one_nodule_block = df_one_nodule.nodule_in_block.values[0]
        # counted before any filtering, so this is the number of nodules seen, not saved
        TOTAL_NODULES += 1

        # Get the inpainted and original image and the mask; nodules with a missing file are skipped
        try:
            last = np.squeeze(np.load(f'{path_last}{i}_block{df_one_nodule_block}.npy'))
            orig = np.squeeze(np.load(f'{path_orig}{i}_block{df_one_nodule_block}.npy'))
            # the context manager closes the .npz archive once the array has been read
            with np.load(f'{path_mask}{i}_block{df_one_nodule_block}.npz') as npz_mask:
                mask = npz_mask['arr_0']
            with np.load(f'{path_mask_lungs}{i}_block{df_one_nodule_block}.npz') as npz_mask_lungs:
                mask_lungs = npz_mask_lungs['arr_0']
        except FileNotFoundError: continue

        coords_limit = compare_labeled_and_df_coords(mask, coords_Z, coords_X, coords_Y)
        if coords_limit is None:
            nodules_with_coords_errors.append(nodule_name)
            continue
        # Get a cube around the nodule and the mask
        z_min_f, z_max_f, x_min_f, x_max_f, y_min_f, y_max_f = box_with_masks_search(coords_Z, coords_X, coords_Y, mask_lungs, coords_limit)
        # a z_min of -1 is treated as "no usable box was found"
        if z_min_f == -1:
            nodules_with_coords_errors.append(nodule_name)
            continue
        orig_small = orig[z_min_f: z_max_f, x_min_f:x_max_f, y_min_f:y_max_f]
        last_small = last[z_min_f: z_max_f, x_min_f:x_max_f, y_min_f:y_max_f]
        mask_small = mask[z_min_f: z_max_f, x_min_f:x_max_f, y_min_f:y_max_f]
        mask_lungs_small = mask_lungs[z_min_f: z_max_f, x_min_f:x_max_f, y_min_f:y_max_f]
        # cubes that were truncated by the volume border are rejected
        if np.shape(orig_small) != (64,64,64):
            nodules_with_shape_errors.append(nodule_name)
            continue

        # Using the coords of the cube, get the coords of the nodule inside the cube
        coords_Z_small = coords_Z - z_min_f 
        coords_X_small = coords_X - x_min_f 
        coords_Y_small = coords_Y - y_min_f 

        # Get the malignancy score
        # NOTE(review): transform_malignancy is not defined in this part of the notebook;
        # the newer cells call from_five_to_three_classes instead - confirm it still exists
        malignancy1 = df_one_nodule.malignancy.values
        malignancy = list(map(transform_malignancy, malignancy1))
        malignancy = list(filter(None, malignancy))
        # nodules without a determinable mode are dropped from every list below
        try:
            malignancy_mode = mode(malignancy)
        except StatisticsError: continue

        malignancies_original.append(malignancy1)
        malignancies.append(malignancy)
        malignancies_mode.append(malignancy_mode)
        # keep the nodule in the "3 agree" table if at least 3 reviewers gave the modal rating
        agree_with_mode = malignancy.count(malignancy_mode)
        if agree_with_mode >= 3:
            malignancies_mode_3_agree.append(malignancy_mode)
            names_to_save_3_agree.append(nodule_name)

        # Save figures and targets
#         np.save(f'{path_dest}original/{i}_{n_ndl}.npy',orig_small)
#         np.save(f'{path_dest}inpainted/{i}_{n_ndl}.npy',last_small)
#         np.savez_compressed(f'{path_dest}mask/{i}_{n_ndl}',mask_small)
        names_to_save.append(nodule_name)

        #plot_block_and_cube(orig, last, mask, coords_Z, orig_small, coords_Z_small, last_small, mask_small)

df_to_classify = pd.DataFrame.from_dict({'names': names_to_save, 'malignancy': malignancies_mode})
df_to_classify_3_agree = pd.DataFrame.from_dict({'names': names_to_save_3_agree, 'malignancy': malignancies_mode_3_agree})
df_to_classify.to_csv(f'{path_dest}df_classify_inpainted_malignancy.csv', index=False)
df_to_classify_3_agree.to_csv(f'{path_dest}df_3_agree_classify_inpainted_malignancy.csv', index=False)

In [None]:
# Number of nodules skipped because of a wrong cube shape / because of coordinate errors
len(nodules_with_shape_errors), len(nodules_with_coords_errors)

## Add the nodules from v17v2

In [None]:
# Input folders of the v17v2 run, all located below path_data
path_data = '/data/OMM/Datasets/LIDC_other_formats/LIDC_preprocessed_3D v5 - save pylidc chars only/v17v2/'
path_chars =  f'{path_data}pylidc_characteristics/'  # one characteristics CSV per scan
path_last = f'{path_data}arrays/last/'  # inpainted volumes (loaded into `last`)
path_orig = f'{path_data}arrays/orig/'  # original volumes (loaded into `orig`)
path_mask = f'{path_data}arrays/masks nodules/'
path_mask_lungs = f'{path_data}arrays/masks lungs/'
# Destination folder for the extracted cubes and the CSV tables
path_dest = '/data/OMM/Datasets/LIDC_other_formats/LIDC_inpainted_malignancy_classification v2/'

In [None]:
# files DIP reconstruction
files_last = os.listdir(path_last)
files_last = np.sort(files_last)
# files pylidc characteristics
files_chars = os.listdir(path_chars)
files_chars = np.sort(files_chars)
# files nodules masks
files_mask = os.listdir(path_mask)
files_mask = np.sort(files_mask)

In [None]:
# Get the files that are common to the DIP reconstruction and the pylidc characteristics
files_last_cropped = [i.split('.npy')[0] for i in files_last]
files_chars_cropped = [i.split('.csv')[0] for i in files_chars]
files_last_cropped = list(np.unique(files_last_cropped))

files_common = list(set(files_last_cropped).intersection(set(files_chars_cropped)))
files_common = np.sort(files_common)

In [None]:
# Number of common scans and the first ten names
len(files_common), files_common[:10]

In [None]:
# Print the name of the first common scan and the cluster id of each of its nodules
i = files_common[0]
print(i)
df=pd.read_csv(f'{path_chars}{i}.csv')
# For each nodule get the mean coordinates and get in which block is the nodule 
n_nodules = np.unique(df['cluster_id'].values)
for n_ndl in n_nodules:
    print(n_ndl)

In [None]:
# Show the slice of mask at index coords_Z_small (both come from the last executed extraction cell)
plt.imshow(mask[coords_Z_small])

In [None]:
# Show the mask slice at the median first-axis index of the voxels equal to 1
z,x,y = np.where(mask==1)
zz = int(np.median(z))
print(zz)
plt.imshow(mask[zz]);

In [None]:
# Number of nodules visited by the old extraction loop (counted before any nodule is skipped)
TOTAL_NODULES

In [None]:
# Keep only the nodules whose ratings are unanimous (a single distinct value)
malignancies_final = []
for i in malignancies:
    distinct_ratings = np.unique(i)
    if len(distinct_ratings) != 1:
        continue
    malignancies_final.append(list(distinct_ratings))
# collapse the list of one-element lists into a flat array
malignancies_final = np.squeeze(malignancies_final)

In [None]:
# Number of unanimous nodules, and how many of them carry the value 1 and the value 2
len(malignancies_final), np.sum(malignancies_final==1), np.sum(malignancies_final==2)

In [None]:
# Histogram of the unanimous ratings
plt.hist(malignancies_final);

In [None]:
# Slice coords_Z of the original volume, the inpainted volume and the mask, side by side
fig, ax = plt.subplots(1, 3, figsize=(6.5,5))
for _panel, _volume in zip(ax, (orig, last, mask)):
    _panel.imshow(_volume[coords_Z])
    _panel.axis('off')

In [None]:
# box_with_masks_search takes the nodule limits (min_coords) as a required fifth argument;
# without it the call raises a TypeError. coords_limit is the value computed by the
# extraction loop for the same nodule as coords_Z / coords_X / coords_Y.
z_min_f, z_max_f, x_min_f, x_max_f, y_min_f, y_max_f = box_with_masks_search(coords_Z, coords_X, coords_Y, mask_lungs, coords_limit)
z_min_f, z_max_f, x_min_f, x_max_f, y_min_f, y_max_f

In [None]:
# Shape of the lungs mask that is currently loaded
np.shape(mask_lungs)

In [None]:
def box_coords_contain_masks_right_size_search(coord_max_sideX, coord_min_sideX, side, slice_middle, xmed_1, ymed_1, xmed_2, ymed_2, mask_lungs_small, dist1 = 96, dist2 = 160, dist3 = 96):
    '''Find the dist1 x dist2 x dist3 box that contains the most lung-mask voxels.

    Candidate boxes are placed on a fixed grid of offsets around
    (slice_middle, xmed, ymed), where (xmed, ymed) is (xmed_1, ymed_1) for
    side 1 and (xmed_2, ymed_2) for side 2. A negative lower limit is clamped
    to 0 and candidates that extend past mask_lungs_small are discarded.

    If coord_max_sideX is not empty, a candidate must also strictly enclose the
    nodule limits (minima taken from coord_min_sideX, maxima from
    coord_max_sideX) and must contain at least one lung voxel; among equally
    good candidates the first one wins. If it is empty, any in-bounds candidate
    qualifies and among equally good candidates the last one wins.

    Args:
        coord_max_sideX: sequence of (z, x, y) coordinates giving the upper nodule limits.
        coord_min_sideX: sequence of (z, x, y) coordinates giving the lower nodule limits.
        side: 1 or 2, selects which (xmed, ymed) centre is used.
        slice_middle: centre of the search along the first axis.
        xmed_1, ymed_1, xmed_2, ymed_2: centres of the search along the other two axes.
        mask_lungs_small: 3D array whose sum inside a box measures its "info voxels".
        dist1, dist2, dist3: box size along the three axes.

    Returns:
        (z_min, z_max, x_min, x_max, y_min, y_max) of the best box, or None
        when no candidate qualified.

    Raises:
        ValueError: if side is neither 1 nor 2.
    '''
    has_nodule = len(coord_max_sideX) > 0
    if has_nodule:
        # limits of the nodules masks
        z_min_sideX, x_min_sideX, y_min_sideX = (np.min([c[axis] for c in coord_min_sideX]) for axis in range(3))
        z_max_sideX, x_max_sideX, y_max_sideX = (np.max([c[axis] for c in coord_max_sideX]) for axis in range(3))

    # centre of the lung the search is performed in
    if side == 1:
        xmed_X, ymed_X = xmed_1, ymed_1
    elif side == 2:
        xmed_X, ymed_X = xmed_2, ymed_2
    else:
        # previously this surfaced later as an UnboundLocalError on xmed_X
        raise ValueError(f'side must be 1 or 2, got {side!r}')

    size_z, size_x, size_y = np.shape(mask_lungs_small)[:3]

    # find where the vol_cut get more info voxels
    best_box = None
    max_sum = 0
    for i in range(30):
        # limits of the current box along z (lower limit clamped to 0)
        zmin = max(int(slice_middle - (dist1 // 2) + (i * 4 - 58)), 0)
        zmax = int(zmin + dist1)
        if zmax > size_z:
            continue
        if has_nodule and not (zmin < z_min_sideX and zmax > z_max_sideX):
            continue
        for j in range(19):
            xmin = max(int(xmed_X - (dist2 // 2) + (j * 3 - 27)), 0)
            xmax = int(xmin + dist2)
            if xmax > size_x:
                continue
            if has_nodule and not (xmin < x_min_sideX and xmax > x_max_sideX):
                continue
            for k in range(19):
                ymin = max(int(ymed_X - (dist3 // 2) + (k * 4 - 36)), 0)
                ymax = int(ymin + dist3)
                if ymax > size_y:
                    continue
                if has_nodule and not (ymin < y_min_sideX and ymax > y_max_sideX):
                    continue

                # the box contains as many info voxels as possible
                this_sum = np.sum(mask_lungs_small[zmin:zmax, xmin:xmax, ymin:ymax])
                # with a nodule only a strictly better box replaces the current one;
                # without a nodule ties are accepted too, so the last best box is kept
                if this_sum > max_sum or (not has_nodule and this_sum == max_sum):
                    max_sum = this_sum
                    best_box = (zmin, zmax, xmin, xmax, ymin, ymax)
    return best_box

In [None]:
# Display the rows of the nodule that was selected last
df_one_nodule

In [None]:
# Compare the nodule positions listed by pylidc with the connected components of the
# nodule mask, and plot a mask slice next to the inpainted image (first file only).
for idx, i in enumerate(files_last):
    if idx==1:break
    if i[:14] in files_common:

        # Get the pylidc characteristics
        file_name=i.split('_block')[0]
        df=pd.read_csv(f'{path_chars}{file_name}.csv')
        # For each nodule get the mean coordinates
        n_nodules = np.unique(df['cluster_id'].values)
        nodules_coords_pylidc = []
        for n_ndl in n_nodules:
            df_one_nodule = df.loc[df['cluster_id'] == n_ndl]
            coords_Z = int(np.mean(df_one_nodule['small_coordsZ'].values))
            coords_X = int(np.mean(df_one_nodule['small_coordsX'].values))
            coords_Y = int(np.mean(df_one_nodule['small_coordsY'].values))
            nodules_coords_pylidc.append([coords_Z, coords_X, coords_Y])

        # the mask is stored as .npz: the last character of the name is swapped for 'z'
        # NOTE(review): path_ndl is not defined in this part of the notebook - confirm it exists
        with np.load(f'{path_ndl}{i[:-1]}z') as npz_ndl:
            ndl = npz_ndl['arr_0']

        # Get the coords of the nodule(s) according to the MASK
        nodules_coords_mask = []
        labeled, nr_objects = ndimage.label(ndl)
        for m in np.arange(1,nr_objects + 1):
            print(i)
            z,x,y=np.where(labeled==m)
            zz = int(np.median(z))
            xx = int(np.median(x))
            yy = int(np.median(y))
            nodules_coords_mask.append([zz, xx, yy])
        if nr_objects == 0:
            # nothing was labelled: zz/xx/yy would be undefined (or left over from an
            # earlier cell), so there is no slice to plot
            continue

        # Get the inpainted image
        img = np.load(f'{path_last}{i}')
        img = np.squeeze(img)

        # Print figure (the legend shows the coords of the LAST labelled component)
        legend=f'{i}\n\
coords_mask = {zz, xx, yy}\ncoords_pylidc={len(nodules_coords_pylidc)}' 
        fig, ax = plt.subplots(1,3, figsize=(12,5))
        ax[0].imshow(ndl[zz])
        ax[1].imshow(img[zz])
        ax[0].text(5, 35, legend, color='#FFFFFF')

In [None]:
# Distinct cluster ids (one per nodule) found in df
nodules = np.unique(df.cluster_id.values)
nodules

In [None]:
# Current values of the loop variables left behind by the last executed loop
idx, i

In [None]:
# Display the characteristics table currently held in df
df

In [None]:
# Distinct cluster ids of df, iterated over in the next cell
n_nodules = np.unique(df['cluster_id'].values)

In [None]:
# Print the (truncated) mean Z, X, Y coordinates of every nodule in df
for i in n_nodules:
    df_one_nodule = df.loc[df['cluster_id'] == i]
    coords_Z, coords_X, coords_Y = (
        int(np.mean(df_one_nodule[column].values))
        for column in ('small_coordsZ', 'small_coordsX', 'small_coordsY')
    )
    print(coords_Z, coords_X, coords_Y)

In [None]:
# Rows of df that belong to cluster id 1
df.loc[df['cluster_id'] == 1]

In [None]:
# Label the connected components of the nodule mask and show how many were found
labeled, nr_objects = ndimage.label(ndl)
nr_objects

In [None]:
# For every labelled component: print its label and show the mask slice at its median z
for i in np.arange(1,nr_objects + 1):
    print(i)
    z,x,y=np.where(labeled==i)
    zz, xx, yy = (int(np.median(axis_indices)) for axis_indices in (z, x, y))
    plt.imshow(ndl[zz])

In [None]:
# Median position of component 1 and the mask slice at its median z
z,x,y=np.where(labeled==1)
zz, xx, yy = (int(np.median(axis_indices)) for axis_indices in (z, x, y))
plt.imshow(ndl[zz])

In [None]:
# Median position of component 2 and the mask slice at its median z
z,x,y=np.where(labeled==2)
zz, xx, yy = (int(np.median(axis_indices)) for axis_indices in (z, x, y))
plt.imshow(ndl[zz])