In [2]:
import tensorflow as tf
import scipy
import sklearn
from sklearn.model_selection import train_test_split
from nilearn.image import concat_imgs, mean_img, resample_img

import tensorflow as tflow
from tensorflow.keras.layers import Flatten, Dense, Dropout
from keras.layers.core import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

import os

import nibabel as nib
import nilearn

import json 

import numpy as np
import math
from random import sample

from sklearn.utils.class_weight import compute_class_weight

#import tensorflow_addons as tfa

In [3]:
!ls './preprocessed-data/images/'

case_00000.nii.gz case_00075.nii.gz case_00150.nii.gz case_00225.nii.gz
case_00001.nii.gz case_00076.nii.gz case_00151.nii.gz case_00226.nii.gz
case_00002.nii.gz case_00077.nii.gz case_00152.nii.gz case_00227.nii.gz
case_00003.nii.gz case_00078.nii.gz case_00153.nii.gz case_00228.nii.gz
case_00004.nii.gz case_00079.nii.gz case_00154.nii.gz case_00229.nii.gz
case_00005.nii.gz case_00080.nii.gz case_00155.nii.gz case_00230.nii.gz
case_00006.nii.gz case_00081.nii.gz case_00156.nii.gz case_00231.nii.gz
case_00007.nii.gz case_00082.nii.gz case_00157.nii.gz case_00232.nii.gz
case_00008.nii.gz case_00083.nii.gz case_00158.nii.gz case_00233.nii.gz
case_00009.nii.gz case_00084.nii.gz case_00159.nii.gz case_00234.nii.gz
case_00010.nii.gz case_00085.nii.gz case_00160.nii.gz case_00235.nii.gz
case_00011.nii.gz case_00086.nii.gz case_00161.nii.gz case_00236.nii.gz
case_00012.nii.gz case_00087.nii.gz case_00162.nii.gz case_00237.nii.gz
case_00013.nii.gz case_00088.nii.gz case_00163.nii.

In [4]:
# rootdir = './preprocessed-data/images'
# image_paths_list = []
# annotation_paths_list = []

# for file in os.listdir(rootdir):
#     d = os.path.join(rootdir, file)
#     image_paths_list.append(d)
    
    
# image_dict = {}

# for img_path in image_paths_list:
#     case_id = img_path[27:37]
#     ct_nii = nib.load(img_path).get_fdata()
#     print(case_id, ct_nii.shape)
#     image_dict[case_id] = ct_nii

# with open('../kits21/kits21/data/kits.json') as user_file:
#     file_contents = user_file.read()

# meta_list = json.loads(file_contents)

# labels_dict = {}
# for case in meta_list:
#     c_id = case['case_id']
#     labels_dict[c_id] = case['malignant']

# for key, value in labels_dict.items():
#     print(key, value)
    
    
# img_list, id_list, label_list = [], [], []

# common_keys = labels_dict.keys() & image_case_dict.keys()

# for k in common_keys:
#     id_list.append(k)
#     img_list.append(image_case_dict[k])
#     label_list.append(labels_dict[k])
    
# print(len(img_list), len(id_list), len(label_list))

In [5]:
def malignant_labels_to_dict(json_file_path):
    """Takes path to json file and stores malignant label for each 
    patient in a dictionary"""
    
    with open(json_file_path) as user_file:
          file_contents = user_file.read()

    meta_list = json.loads(file_contents)

    labels_dict = {}
    for case in meta_list:
        c_id = case['case_id']
        labels_dict[c_id] = case['malignant']
    
    return labels_dict

In [6]:
def generate_subsample_id_list(n_samples, pct_of_total_neg, dict_of_labels):
    subsample_id_list = []
    total_neg = len(dict_of_labels) - sum(dict_of_labels.values())
    n_neg = math.ceil(total_neg*(pct_of_total_neg/100))
    n_pos = n_samples - n_neg
    
    true_list = [k for k,v in dict_of_labels.items() if v == True]
    false_list = [k for k,v in dict_of_labels.items() if v == False]
    
    subsample_id_list.extend(false_list[:n_neg])
    subsample_id_list.extend(sample(true_list,n_pos))
                             
    print('Generated list of {0} case IDs of {1} positive and {2} negative labels'.format(
        len(subsample_id_list), n_neg, n_pos))
    
    return subsample_id_list

In [7]:
def processed_image_paths(rootdir, subfolder):
    """ Creates list of paths to images in each subfolder and corresponsing patient ID """
    
    folder_path = rootdir + subfolder
    paths_list = []
    for file in os.listdir(folder_path):
        paths_list.append(folder_path + '/' + file)
    
    #for file in os.listdir(rootdir_str):
    #    d = os.path.join(rootdir_str, file)
    #    if os.path.isdir(d):
    #        paths_list.append(d + general_filename)
            
    return paths_list

In [8]:
def load_nifti_img_and_mask_as_numpy(paths_list, subsample_list):
    """ Loads nifti images corresponding to paths lists in a dictionary 
    of case_id as keys and 3D numpy image arrays as values. Filters list by  """

    image_dict = {}

    for img_path in paths_list:
        case_id = img_path[-17:-7]
        if case_id in subsample_list:
            print('match on: ', case_id)
            ct_nii = nib.load(img_path).get_fdata()
            image_dict[case_id] = ct_nii
    
    return image_dict

In [9]:
#------------- Generate dictionary of case_id and binary malignant label --------
labels_dict = malignant_labels_to_dict('../kits21/kits21/data/kits.json')
print('Number of labels in labels_dict: ', len(labels_dict))

#------------- Sample n instances from labels_dict with all possible negative cases --------
subsample_list = generate_subsample_id_list(100, 100, labels_dict)

#------------- Generate list of paths to preprocessed images --------
rootdir = './preprocessed-data/'

image_paths_list = processed_image_paths(rootdir, 'images')
mask_paths_list = processed_image_paths(rootdir, 'masks')

print('Is image and mask path list the same lenght?: ', len(image_paths_list) == len(mask_paths_list))

#------------- Load images and masks filtered by subsample --------
image_dict = load_nifti_img_and_mask_as_numpy(image_paths_list, subsample_list)
mask_dict = load_nifti_img_and_mask_as_numpy(mask_paths_list, subsample_list)

print('Is image and mask dictionaries the same lenght?:', len(image_dict) == len(mask_dict))


Number of labels in labels_dict:  300
Generated list of 100 case IDs of 25 positive and 75 negative labels
Is image and mask path list the same lenght?:  True
match on:  case_00201
match on:  case_00246
match on:  case_00258
match on:  case_00225
match on:  case_00134
match on:  case_00034
match on:  case_00173
match on:  case_00010
match on:  case_00161
match on:  case_00102
match on:  case_00002
match on:  case_00147
match on:  case_00155
match on:  case_00055
match on:  case_00163
match on:  case_00063
match on:  case_00112
match on:  case_00071
match on:  case_00211
match on:  case_00235
match on:  case_00082
match on:  case_00190
match on:  case_00244
match on:  case_00116
match on:  case_00104
match on:  case_00167
match on:  case_00032
match on:  case_00051
match on:  case_00020
match on:  case_00120
match on:  case_00289
match on:  case_00043
match on:  case_00252
match on:  case_00215
match on:  case_00276
match on:  case_00184
match on:  case_00088
match on:  case_00188
match

In [10]:
cases = subsample_list

labels =  [labels_dict[x] for x in cases]

print("Are cases and labels lists the same length?: ", len(cases) == len(labels))

Are cases and labels lists the same length?:  True


In [11]:
id_train, id_test_temp, labels_train, labels_test_temp = train_test_split(
    cases, labels, test_size=0.30, shuffle=True, stratify=labels)

id_val, id_test, labels_val, labels_test = train_test_split(
    id_test_temp, labels_test_temp, test_size=0.50, shuffle=True, random_state=42, stratify=labels_test_temp)

In [12]:
def fill_set(x_set, y_set, image_dictionary, label_dictionary):
    x_list, y_list = [], []
    
    for idx, (x, y) in enumerate(zip(x_set, y_set)):    
        for img in image_dictionary[x]:
            x_list.append(img)
            y_list.append(label_dictionary[x])
    
    x_array, y_array = np.array(x_list), np.array(y_list).astype(int)
    #x_array = np.array(tf.expand_dims(x_array, -1))
    x_array = np.repeat(x_array[..., np.newaxis], 3, -1)

    y_array = np.asarray(y_array).astype('float32').reshape((-1,1))
    
    return x_array, y_array

x_train, y_train = fill_set(id_train, labels_train, image_dict, labels_dict)
x_val, y_val = fill_set(id_val, labels_val, image_dict, labels_dict)
#x_test, y_test = fill_set(id_test, labels_test, images_dict_preprocessed, labels_dict)


print(x_train[0].shape, y_train[0])
print(len(x_train) == len(y_train))

(224, 224, 3) [0.]
True


In [13]:
x_train.shape

(15980, 224, 224, 3)

## upsampling 

In [14]:
# from imblearn.over_sampling import SMOTE

# x_train_reshape = x_train.reshape(x_train.shape[0], 224 * 224 * 3)
# sm = SMOTE(random_state=42)
# x_smote, y_smote = sm.fit_resample(x_train_reshape, y_train)
# x_smote = x_smote.reshape(x_smote.shape[0], 224, 224, 3)


## Developing baseline model

In [25]:
# draft model
resnet = tf.keras.applications.ResNet50(include_top=False, weights='imagenet', input_shape=(224, 224, 3), classes=1)

for layer in resnet.layers[:143]:
    layer.trainable = False




metrics_list = [tf.keras.metrics.AUC(name = 'auc'),
                tf.keras.metrics.BinaryAccuracy(name = 'accuracy')]



#calculate class weights
class_weights = {0 : 30, 1 : 1}

optimizer_fn = tf.keras.optimizers.experimental.RMSprop(learning_rate=0.00002, jit_compile = False)


model = Sequential()
model.add(resnet)
model.add(Flatten())
model.add(Dropout(0.3)) 
model.add(Dense(8,activation='relu'))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer = optimizer_fn, loss='binary_crossentropy', metrics= metrics_list)
model.summary()
model.fit(x_train, y_train, validation_data = (x_val, y_val), class_weight = class_weights, epochs=10, batch_size = 200)


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 resnet50 (Functional)       (None, 7, 7, 2048)        23587712  
                                                                 
 flatten_6 (Flatten)         (None, 100352)            0         
                                                                 
 dropout_3 (Dropout)         (None, 100352)            0         
                                                                 
 dense_6 (Dense)             (None, 256)               25690368  
                                                                 
 dense_7 (Dense)             (None, 128)               32896     
                                                                 
 dense_8 (Dense)             (None, 64)                8256      
                                                                 
 dense_9 (Dense)             (None, 1)                

2023-03-23 18:49:41.138620: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2023-03-23 18:52:28.280616: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0xe91d60cd0>

In [26]:
results = model.evaluate(x_val, y_val)
results



[12.752480506896973, 0.6540908217430115, 0.7218852043151855]

In [None]:
# [0.687402069568634, 0.7713485956192017] auc / accuracy if the model 143 layers are not trainable / 10 poch 


In [17]:
# change metrics tresholds and etc

# add more layers

# run models on random 100 images 

#tf.keras.metrics.SpecificityAtSensitivity(0.1, name = 'specificity'),

In [18]:
# https://openreview.net/pdf?id=XXtHQy0d8Y paper on segmentation