In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import cv2
from tensorflow import keras
import os
from sklearn import model_selection
from sklearn import utils
from sklearn.metrics import confusion_matrix, roc_curve, RocCurveDisplay, roc_auc_score
from scipy import ndimage
from gc import collect
import matplotlib.pyplot as plt
from collections import Counter
from seaborn import heatmap


caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [None]:
def get_labels(label_file):
    """Read the patient table and map every patient ID to its class flags.

    Parameters
    ----------
    label_file : str or file-like
        Anything ``pandas.read_csv`` accepts. The table must contain the
        columns ``PatientID``, ``Normal``, ``Actionable``, ``Benign`` and
        ``Cancer``.

    Returns
    -------
    dict
        ``{patient_id: [normal, actionable, benign, cancer]}`` with each flag
        cast to ``int``. When a patient ID occurs more than once, the last row
        wins.
    """
    table = pd.read_csv(label_file)
    class_columns = ('Normal', 'Actionable', 'Benign', 'Cancer')

    patient_ids = table['PatientID'].to_numpy()
    # Walk the four class columns in lock-step, one tuple of flags per row.
    flag_rows = zip(*(table[column].to_numpy() for column in class_columns))

    return {patient: [int(flag) for flag in flags]
            for patient, flags in zip(patient_ids, flag_rows)}

In [None]:
# Load the one-hot label of every patient and count how many patients fall in each class.
labels = get_labels('/kaggle/input/labelsssss/labels.csv')
numbers_per_class = [0] * 4
for one_hot in labels.values():
    # The class index is the position of the largest flag in the one-hot list.
    numbers_per_class[np.argmax(one_hot)] += 1

# Share of each class in the whole label table, rounded to two decimals.
total_patients = sum(numbers_per_class)
proportion_per_class = [round(class_count / total_patients, 2) for class_count in numbers_per_class]
proportion_per_class

[0.35, 0.25, 0.23, 0.17]

In [None]:
path = '/kaggle/input/192x256xdepth/'
# List the directory exactly once and sort it. os.listdir returns entries in
# arbitrary order, so separate calls are not guaranteed to agree with each
# other, and an unsorted listing makes the seeded splits below irreproducible.
dirs = sorted(os.listdir(path))

# Full path of every stored volume, in the same order as `dirs`.
X = [path + file_name for file_name in dirs]
# Class index of every volume; characters [-14:-4] of the path are used as the
# key into `labels` (the 10 characters preceding a 4-character extension).
y = [np.argmax(labels[file_path[-14:-4]]) for file_path in X]

In [None]:
def balancing_batch(X, y, max_value, n_classes=4):
    """Randomly sub-sample a batch so that no class exceeds an equal share.

    Parameters
    ----------
    X : sequence
        Samples (e.g. 2-D image arrays), indexable by integer position.
    y : sequence
        One scalar, hashable class index per sample. Only labels equal to one
        of ``0 .. n_classes - 1`` are ever selected; samples with any other
        label are dropped.
    max_value : int
        Upper bound for the size of the returned batch. Each class may
        contribute at most ``int(max_value / n_classes)`` samples.
    n_classes : int, default 4
        Number of classes to balance over.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The selected samples as ``float16`` and their labels as ``uint8``, in
        random order. When the batch contains a single distinct label, only
        the first sample and its label are returned (a one-class batch cannot
        be balanced).
    """
    if len(Counter(y)) == 1:
        # Return only the first image because the batch has only one class.
        return np.array(X[0:1], dtype='float16'), np.array(y[0:1], dtype='uint8')

    max_per_class = int(max_value / n_classes)
    valid_labels = range(n_classes)
    taken_per_class = Counter()
    new_X, new_y = [], []

    # Visit the samples in random order (global NumPy RNG) so that the subset
    # kept for an over-represented class changes from call to call.
    for position in np.random.permutation(len(y)):
        label = y[position]
        if label in valid_labels and taken_per_class[label] < max_per_class:
            new_X.append(X[position])
            new_y.append(label)
            taken_per_class[label] += 1

    return np.array(new_X, dtype='float16'), np.array(new_y, dtype='uint8')

In [None]:
def separate_slices(img):
    """Split a volume into its 2-D slices along axis 2, plus their mean.

    The original code read the slice count from axis ``-2`` but indexed the
    volume on axis ``2``; the two only coincide for 4-D input. Axis 2 is now
    used throughout, so 4-D input behaves exactly as before and 3-D
    ``(H, W, D)`` input is handled consistently as well.

    Parameters
    ----------
    img : numpy.ndarray
        Array with at least three dimensions whose axis 2 enumerates slices,
        e.g. ``(H, W, D)`` or ``(H, W, D, C)``.

    Returns
    -------
    list of numpy.ndarray
        ``img.shape[2]`` copies of the individual slices followed by one extra
        element: the mean of ``img`` over axis 2.
    """
    depth_axis = 2
    slices = [np.array(img[:, :, index]) for index in range(img.shape[depth_axis])]

    slices.append(np.mean(img, axis=depth_axis))  # including mean of slices

    return slices

In [None]:
# Hold out 20% of the volumes as the test split, stratified by class index so
# both parts keep the class proportions; the seed fixes the partition.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
    stratify=y,
)

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that loads stored volumes and serves their 2-D slices.

    Every volume is read with ``np.load`` and split by ``separate_slices``.
    In training mode each slice is additionally mirrored, and the resulting
    batch is balanced with ``balancing_batch``. In validation mode the slices
    are served unchanged.

    Parameters
    ----------
    list_IDs : list of str
        Paths of the stored volumes. Characters ``[-14:-4]`` of each path are
        used as the key into the label table.
    labels_dir : str
        Path of the CSV label table (read through ``get_labels``).
    batch_size : int
        Number of volumes loaded per batch.
    sub_batch_size : int
        Maximum number of slices returned per training batch (ignored when
        ``training`` is false).
    dim_img : tuple
        Stored on the instance but not read anywhere inside this class.
    training : bool
        True for the training generator, False for validation.
    shuffle : bool, default True
        Whether the volume order is reshuffled at the end of every epoch.
    objective : sequence of int, optional
        When given, labels are binarised: 1 if the volume's one-hot label
        equals ``objective``, else 0. When omitted, the raw one-hot list is
        emitted, as before.
        NOTE(review): semantics inferred from the ``objective=[0, 0, 0, 1]``
        call further down in this notebook - confirm against the original
        experiment.

    NOTE(review): in training mode the labels go through ``balancing_batch``,
    which hashes them and compares them with integer class indices. Raw
    one-hot lists (``objective=None``) do not satisfy that, so training needs
    scalar labels.
    """

    def __init__(self, list_IDs, labels_dir, batch_size, sub_batch_size, dim_img, training, shuffle=True, objective=None):
        super().__init__()
        self.list_IDs = list_IDs  # paths of the original volumes
        self.labels = self.__get_labels(labels_dir)  # dict: patient key -> one-hot label list
        self.batch_size = batch_size  # volumes per batch
        self.sub_batch_size = sub_batch_size  # max slices kept per training batch
        self.dim_img = dim_img  # e.g. (192, 256); kept for callers, unused here
        self.training = training  # True: augment + balance, False: plain slices
        self.shuffle = shuffle  # reshuffle volume order after every epoch
        self.objective = None if objective is None else [int(flag) for flag in objective]
        self.on_epoch_end()  # builds the initial index order

    def __get_labels(self, label_file):
        'Load the label table; delegates to get_labels so the parsing lives in one place.'
        return get_labels(label_file)

    def __target_for(self, ID):
        'Label emitted for every slice of the volume stored at path ``ID``.'
        one_hot = self.labels[ID[-14:-4]]  # [-14:-4] is the part of the path used as patient key
        if self.objective is None:
            return one_hot
        return int(list(one_hot) == self.objective)

    def __data_augmentation(self, x):
        'Return the slice and its horizontal mirror (both float16) in random order.'
        original = x.astype('float16')
        # Reversing axis 1 mirrors the columns while keeping the input shape,
        # so both variants can be stacked into a single batch array.
        mirrored = np.flip(original, axis=1)
        return utils.shuffle([original, mirrored])

    def __len__(self):
        'Denotes the number of batches per epoch'
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        'Generate one batch of data'
        # Positions (into list_IDs) of the volumes that make up this batch.
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        X, y = self.__data_generation(list_IDs_temp)

        if self.training:
            X, y = balancing_batch(X, y, self.sub_batch_size)
            return np.array(X[0:self.sub_batch_size], dtype='float16'), np.array(y[0:self.sub_batch_size], dtype='uint8')

        return np.array(X), np.array(y)

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Load every volume of the batch and return its (shuffled) slices and labels.'
        X = []
        y = []

        for ID in list_IDs_temp:
            target = self.__target_for(ID)
            slices = separate_slices(np.load(ID))

            if self.training:
                samples = []
                for img in slices:
                    samples += self.__data_augmentation(img)
            else:
                samples = [np.asarray(img, dtype='float16') for img in slices]

            # Accumulate across volumes. The validation branch used to replace
            # X on every iteration, which dropped all but the last volume and
            # desynchronised the labels whenever batch_size > 1.
            X += samples
            y += [target] * len(samples)

        X, y = utils.shuffle(X, y)
        return X, y

In [None]:
class SE_ResidualUnit(keras.layers.Layer):
    """Bottleneck convolution block with a squeeze-and-excitation gate.

    Three branches are built:

    * ``block_layers``: 1x1 conv (``filters // 4``) -> BN -> activation ->
      (3, 4) conv with ``strides`` -> BN -> activation -> 1x1 conv
      (``filters``) -> BN.
    * ``SE``: global average pooling -> flatten -> dense (``filters // 4``,
      relu) -> dense (``filters``, sigmoid) -> reshape to ``[1, 1, filters]``.
      It is applied to the block output and multiplied back onto it.
    * ``skip_layers``: empty when ``strides`` is 1, otherwise a strided 1x1
      conv (``filters``) followed by BN applied to the unit's input.

    The gated block output and the shortcut are concatenated along the last
    axis (not added), so the output has the channels of both.
    """

    def __init__(self, filters, strides=1, activation='relu', **kwargs):
        super().__init__(**kwargs)
        self.activation = keras.activations.get(activation)
        bottleneck_filters = filters // 4

        # Squeeze-and-excitation gate: one sigmoid weight per output channel.
        self.SE = [
            keras.layers.GlobalAvgPool2D(),
            keras.layers.Flatten(),
            keras.layers.Dense(bottleneck_filters, activation='relu'),
            keras.layers.Dense(filters, activation='sigmoid'),
            keras.layers.Reshape([1, 1, filters]),
        ]

        # Main bottleneck path; only the middle convolution carries the stride.
        self.block_layers = [
            keras.layers.Conv2D(bottleneck_filters, kernel_size=(1, 1), strides=1, padding='same', use_bias=False),
            keras.layers.BatchNormalization(),
            self.activation,
            keras.layers.Conv2D(bottleneck_filters, kernel_size=(3, 4), strides=strides, padding='same', use_bias=False),
            keras.layers.BatchNormalization(),
            self.activation,
            keras.layers.Conv2D(filters, kernel_size=(1, 1), strides=1, padding='same', use_bias=False),
            keras.layers.BatchNormalization(),
        ]

        # The shortcut is only projected when the block changes resolution.
        if strides > 1:
            self.skip_layers = [
                keras.layers.Conv2D(filters, kernel_size=(1, 1), strides=strides, padding='same', use_bias=False),
                keras.layers.BatchNormalization(),
            ]
        else:
            self.skip_layers = []

    def call(self, x):
        """Run the three branches on ``x`` and concatenate gated output and shortcut."""
        block_output = x
        for layer in self.block_layers:
            block_output = layer(block_output)

        channel_gate = block_output
        for layer in self.SE:
            channel_gate = layer(channel_gate)

        shortcut = x
        for layer in self.skip_layers:
            shortcut = layer(shortcut)

        recalibrated = block_output * channel_gate
        return tf.concat([recalibrated, shortcut], axis=-1)

def get_resnet18():
    """Build the SE-ResNet-18 style binary classifier used in this notebook.

    Layout: a strided (7, 9) convolution stem with batch normalisation, ReLU
    and max pooling; eight ``SE_ResidualUnit`` blocks with 64, 64, 128, 128,
    256, 256, 512 and 512 filters (stride 2 whenever the filter count changes,
    stride 1 otherwise); global average pooling; and one sigmoid output unit.
    The input shape is fixed to ``(192, 256, 1)``.

    Returns
    -------
    keras.models.Sequential
        The uncompiled model.
    """
    model = keras.models.Sequential()

    # Stem
    model.add(keras.layers.Conv2D(filters=64, kernel_size=(7,9), strides=2, padding='same', use_bias=False, input_shape=(192, 256, 1)))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation(keras.activations.relu))
    model.add(keras.layers.MaxPool2D(pool_size=(3,4), strides=2, padding='same'))

    # Residual stages: downsample only at the first unit of a new filter width.
    stage_filters = [64, 64, 128, 128, 256, 256, 512, 512]
    incoming_filters = [64] + stage_filters[:-1]
    for previous, filters in zip(incoming_filters, stage_filters):
        strides = 1 if filters == previous else 2
        model.add(SE_ResidualUnit(filters, strides))

    # Classification head
    model.add(keras.layers.GlobalAvgPool2D())
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(units=1, activation='sigmoid'))

    return model

**NORMAL** — binary classification: Normal vs. all other classes

In [None]:
def make_checkpoint_callback(fold_number):
    """Create the weight checkpoint for one cross-validation fold.

    Weights are written to ``./models/normal_auc<fold_number>/`` whenever the
    monitored ``val_AUC_<fold_number>`` metric reaches a new maximum. The
    monitored name must match the name given to the AUC metric when the model
    of that fold is compiled.
    """
    return tf.keras.callbacks.ModelCheckpoint(filepath=f'./models/normal_auc{fold_number}/',
                                              save_weights_only=True,
                                              monitor=f'val_AUC_{fold_number}',
                                              mode='max',
                                              save_best_only=True)


# One checkpoint callback per fold (folds are numbered from 1); position k-1
# of the list belongs to fold k.
callbacks_list = [make_checkpoint_callback(fold_number) for fold_number in range(1, 6)]

# Keep the individual names available for code that refers to them directly.
(checkpoint_callback_auc1,
 checkpoint_callback_auc2,
 checkpoint_callback_auc3,
 checkpoint_callback_auc4,
 checkpoint_callback_auc5) = callbacks_list


def lr_scheduler(epoch, lr):
    """Return the learning rate for the coming epoch: 90% of the current one.

    ``epoch`` is part of the signature expected by ``LearningRateScheduler``
    and is not used.
    """
    return lr*0.9


lr_decay_function = keras.callbacks.LearningRateScheduler(lr_scheduler, verbose=0)

In [None]:
histories = []
n_splits = 5
skf = model_selection.StratifiedShuffleSplit(n_splits=n_splits, random_state=314, train_size=0.85)
for number_of_split, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    print(f'SPLIT {number_of_split+1}/{n_splits}:')
    # The indices produced by split() are positions inside X_train, so they
    # must be resolved against X_train. Indexing the full directory listing
    # instead would pull held-out test volumes into training/validation and
    # break the stratification.
    train = [X_train[i] for i in train_idx]
    val = [X_train[j] for j in val_idx]

    # Generators
    # NOTE(review): the generators emit the 4-element label lists stored in
    # the label table, while the model below has a single sigmoid output -
    # confirm how the labels are meant to be binarised for "Normal vs others".
    training_generator = DataGenerator(list_IDs=train,
                                       labels_dir='/kaggle/input/labelsssss/labels.csv',
                                       dim_img=(192, 256),
                                       batch_size=5,
                                       sub_batch_size=400,
                                       shuffle=True,
                                       training=True)

    validation_generator = DataGenerator(list_IDs=val,
                                         labels_dir='/kaggle/input/labelsssss/labels.csv',
                                         dim_img=(192, 256),
                                         batch_size=1,
                                         sub_batch_size='IGNORED', #this argument will be ignored because training is false.
                                         shuffle=True,
                                         training=False)

    # A fresh model per fold; the AUC metric carries the fold number so that
    # the matching checkpoint callback can monitor `val_AUC_<fold>`.
    resnet = get_resnet18()

    resnet.compile(loss='binary_crossentropy',
                     optimizer=keras.optimizers.Adam(learning_rate=1e-4),
                     metrics=[keras.metrics.AUC(name=f'AUC_{number_of_split+1}'),
                              keras.metrics.BinaryAccuracy(name='accuracy'),
                              keras.metrics.Precision(name='precision'),
                              keras.metrics.Recall(name='recall')])


    # Train model on dataset
    histories.append(resnet.fit(training_generator,
                                   validation_data=validation_generator,
                                   epochs=35,
                                   use_multiprocessing=True,
                                   workers=1,
                                   callbacks=[callbacks_list[number_of_split], lr_decay_function]))
    print('\n')
    collect()  # release memory held by the previous fold before the next one

In [None]:
def plot_training_curves(histories):
    """Plot the per-epoch loss, accuracy, recall and AUC of every fold.

    One figure with a 4 x 2 grid is drawn: one row per metric, training
    curves in the left column and validation curves in the right column, one
    line per fold. The x-axis length and the legend are derived from the
    histories themselves, so the plot stays correct if the number of epochs or
    folds changes.

    Parameters
    ----------
    histories : list
        Objects with a ``history`` dict (as returned by ``model.fit``). The
        AUC of fold ``k`` (1-based) is expected under ``AUC_k`` /
        ``val_AUC_k``; the other metrics under ``loss``, ``accuracy`` and
        ``recall`` with a ``val_`` prefix for validation.
    """
    # (history key template, name used in titles and axis labels)
    panels = [('loss', 'Loss'),
              ('accuracy', 'Accuracy'),
              ('recall', 'Recall'),
              ('AUC_{fold}', 'AUC')]
    # (history key prefix, name used in titles)
    subsets = [('', 'Training'), ('val_', 'Validations')]
    fold_labels = [f'fold-{fold}' for fold in range(1, len(histories) + 1)]

    plt.figure(figsize=(25, 50))
    position = 1
    for key_template, metric_name in panels:
        for prefix, subset_name in subsets:
            plt.subplot(len(panels), len(subsets), position)
            for fold, h in enumerate(histories, start=1):
                values = h.history[prefix + key_template.format(fold=fold)]
                plt.plot(list(range(1, len(values) + 1)), values, marker='o')
            plt.title(f'SE ResNET-18 {metric_name} evolution - {subset_name}: Normal vs others classes')
            plt.xlabel('epoch number')
            plt.ylabel(f'{metric_name} on epoch')
            plt.legend(fold_labels)
            position += 1


plot_training_curves(histories)

In [None]:
# Metrics reported per fold, in print order.
summary_metrics = ('AUC', 'Accuracy', 'Precision', 'Recall')


def best_metric(history, metric_name, fold_number, validation):
    """Highest per-epoch value of one metric in a fit history.

    The AUC is stored under ``AUC_<fold_number>``; every other metric under
    its lower-cased name. Validation values carry a ``val_`` prefix.
    Note that each metric is maximised independently, so the reported bests
    may come from different epochs.
    """
    key = f'AUC_{fold_number}' if metric_name == 'AUC' else metric_name.lower()
    if validation:
        key = f'val_{key}'
    return np.max(history.history[key])


print('Best results in validations for any k-fold: ')
for i, h in enumerate(histories):
    print(f'K-FOLD {i+1}:')
    print("TRAINING RESULTS:")
    for metric_name in summary_metrics:
        k = best_metric(h, metric_name, i + 1, validation=False)
        print(f'Best {metric_name} in train: {k}')

    print("\nVALIDATION RESULTS:")
    for metric_name in summary_metrics:
        k = best_metric(h, metric_name, i + 1, validation=True)
        print(f'Best {metric_name} in validation: {k}')
    print()
    print(f'{50*"="}')
    print()

# One row per metric, one column per fold. The array is sized from the actual
# number of folds so that no uninitialised cell can leak into the statistics.
results = np.empty((len(summary_metrics), len(histories)))
for i, h in enumerate(histories):
    for row, metric_name in enumerate(summary_metrics):
        results[row][i] = best_metric(h, metric_name, i + 1, validation=True)

print(f"Average best AUC: {np.mean(results[0])}")
print(f"standard deviation AUC: {np.std(results[0])}\n")
print(f"Average best Accuracy: {np.mean(results[1])}")
print(f"Standard Deviation Accuracy: {np.std(results[1])}\n")
print(f"Average best Precision: {np.mean(results[2])}")
print(f"Standard Deviation Precision: {np.std(results[2])}\n")
print(f"Average best Recall: {np.mean(results[3])}")
print(f"Standard Deviation Recall: {np.std(results[3])}\n")


**TESTS** — fine-tuning on the full training split and evaluation on the held-out test split

In [None]:
# Generator over the complete training split (no validation part is held out here).
# NOTE(review): this call requires DataGenerator.__init__ to accept an
# `objective` keyword - verify against the class definition.
training_generator = DataGenerator(list_IDs=X_train,
                                   labels_dir='/kaggle/input/labelsssss/labels.csv',
                                   objective=[0, 0, 0, 1],
                                   batch_size=5,
                                   sub_batch_size=400,
                                   dim_img=(192, 256),
                                   training=True,
                                   shuffle=True)

resnet = get_resnet18()

# Same loss and metric set as in the cross-validation runs, with a much
# smaller learning rate.
finetune_metrics = [keras.metrics.AUC(name='AUC'),
                    keras.metrics.BinaryAccuracy(name='accuracy'),
                    keras.metrics.Precision(name='precision'),
                    keras.metrics.Recall(name='recall')]
resnet.compile(optimizer=keras.optimizers.Adam(8e-7),
               loss='binary_crossentropy',
               metrics=finetune_metrics)

# Continue from a stored checkpoint (picked by the author as the best
# validation AUC) and train on the dataset.
# NOTE(review): the checkpoint callbacks in this notebook write to
# ./models/normal_auc<k>/; confirm that this `cancer_auc4` path exists.
resnet.load_weights('/kaggle/working/models/cancer_auc4/')
history = resnet.fit(training_generator, epochs=10)

In [None]:
# Predict one class per test volume.
# The model scores every slice separately, so `pred` has one row per slice.
# Taking np.argmax of that whole array returns a flat position (which slice
# scored highest), not a class. The slice scores are therefore averaged first
# and the class is derived from the averaged output:
#   * single sigmoid output -> 1 if the mean probability is >= 0.5, else 0
#   * several outputs       -> index of the largest mean score
res = []
for volume_path in X_test:
    slices = np.array(separate_slices(np.load(volume_path)))
    pred = np.asarray(resnet.predict(slices, verbose=0))
    mean_pred = pred.reshape(len(pred), -1).mean(axis=0)
    if mean_pred.size == 1:
        res.append(int(mean_pred[0] >= 0.5))
    else:
        res.append(int(np.argmax(mean_pred)))
print(res)

In [None]:
# 4 x 4 confusion matrix of the test predictions.
# Rows are indexed by the predicted class and columns by the true class.
matrix_confusion = np.zeros((4, 4), dtype=int)

for predicted_class, true_class in zip(res, y_test):
    matrix_confusion[predicted_class, true_class] += 1

print(matrix_confusion)