# README

Questo notebook esegue training e validazione sui modelli di Keras Applications per capire quale 
combinazione di sequenza e modello sia la più performante.
I modelli sono istanziati senza pesi pre-addestrati e modificati per diventare classificatori binari.
Le sequenze sono considerate singolarmente.

In [2]:
from gliomi import *

# Define Model

In [3]:
import tensorflow as tf

# Factories rather than instances: each call builds a new optimizer object,
# so every model compiled in this notebook gets its own one.
# `learning_rate` is used everywhere; `lr` is only a deprecated alias for it.
optimizers = {
    'rmsprop': lambda: tf.keras.optimizers.RMSprop(learning_rate=0.0001),
    'sgd': lambda: tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9, decay=0.001/10, nesterov=False),
    'adam': lambda: tf.keras.optimizers.Adam(learning_rate=0.0001),
}

# sgd = optimizers.SGD(lr=0.01, clipvalue=0.5)

In [4]:
# import tensorflow as tf

# from tensorflow.python.keras.applications import *
from tensorflow.keras.applications import *

# from tensorflow.python.keras.models import Model
from tensorflow.keras.models import Model

# from tensorflow.python.keras.layers import Dense, GlobalAveragePooling2D, Dropout, Flatten, GlobalMaxPooling2D
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, Flatten

# from keras.callbacks import EarlyStopping, ModelCheckpoint
# from tensorflow.python.keras.optimizers import SGD, Adam
# from keras.optimizers import SGD, Adam

# import tensorflow.python.keras.backend as K
import keras.backend as K

Using TensorFlow backend.


In [5]:
from tensorflow.python.keras.losses import BinaryCrossentropy
from tensorflow.python.keras.metrics import BinaryAccuracy

def get_model(model_name, optimizer, include_dropout=False):
    """Build and compile a binary classifier on top of a Keras Application.

    Args:
        model_name: Name of a constructor exposed by
            ``tensorflow.keras.applications`` (e.g. ``"MobileNetV2"``).
            Dotted names such as ``"mobilenet_v2.MobileNetV2"`` are resolved
            attribute by attribute.
        optimizer: Optimizer instance passed to ``model.compile``.
        include_dropout: Accepted for backward compatibility.
            NOTE(review): this flag currently has no effect on the model that
            is built; decide whether a dropout layer should be reinstated.

    Returns:
        A compiled ``Model`` that takes 224x224 single-channel images and
        emits one sigmoid activation per sample.

    Raises:
        AttributeError: If ``model_name`` does not name anything in
            ``tensorflow.keras.applications``.
    """
    # Imported locally so the function only depends on TensorFlow's own Keras:
    # the model below is built with tf.keras, so the session/backend calls
    # must target the same implementation rather than the standalone `keras`.
    from functools import reduce
    from tensorflow.keras import applications
    from tensorflow.keras import backend as tf_backend

    tf_backend.clear_session()
    tf_backend.set_image_data_format('channels_last')

    img_shape = (224, 224, 1)

    # Look the constructor up by attribute instead of eval(): only names that
    # live under tensorflow.keras.applications can be reached.
    constructor = reduce(getattr, model_name.split('.'), applications)

    # Randomly initialised network. The application's own head is kept
    # (include_top=True), so the sigmoid unit below is stacked on its output.
    base_model = constructor(weights=None, include_top=True, input_shape=img_shape)

    prediction_layer = Dense(1, activation='sigmoid', kernel_initializer='glorot_uniform')(base_model.output)

    model = Model(inputs=base_model.input, outputs=prediction_layer)

    # The output layer already applies a sigmoid, so the loss must treat the
    # predictions as probabilities; from_logits=True would squash them twice.
    model.compile(optimizer=optimizer,
                  loss=BinaryCrossentropy(from_logits=False),
                  metrics=[BinaryAccuracy()])

    return model

In [6]:
from tensorflow.python.keras.preprocessing.image import ImageDataGenerator
    
def train_and_test(model_name, model, X_train, y_train, X_test, y_test, random_state=42, epochs=100, batch_size=16):
    """Fit ``model`` on augmented training batches and validate on the test set.

    Args:
        model_name: Not used by the training itself; kept so existing call
            sites stay valid.
        model: Compiled Keras model to train (it is modified in place).
        X_train, y_train: Training images and labels fed through the augmenter.
        X_test, y_test: Data passed to ``fit`` as ``validation_data``
            (not augmented).
        random_state: Seed handed to the augmenting iterator so that the batch
            order is repeatable between runs.
        epochs: Number of training epochs.
        batch_size: Size of the batches produced by the augmenter.

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    # Augmentation: random rotations up to 90 degrees plus both flips.
    aug = ImageDataGenerator(
        rotation_range=90,
        horizontal_flip=True,
        vertical_flip=True,
    )

    # `random_state` used to be accepted but ignored; seeding the iterator
    # makes the shuffling reproducible.
    train_batches = aug.flow(X_train, y_train, batch_size=batch_size, seed=random_state)

    return model.fit(
        train_batches,
        epochs=epochs,
        validation_data=(X_test, y_test),
        shuffle=False)

# Plotting

In [7]:
import os

def make_name(slice_dataset, dataset, model_name, optimizer_name, sequence, percentile):
    """Build the dash-separated identifier of one experiment.

    Args:
        slice_dataset: Path of the slice dataset directory; only its last
            component is used, with every ``"datasets-"`` occurrence removed.
        dataset, model_name, optimizer_name, sequence, percentile: Remaining
            name parts. Each is converted with ``str`` so non-string values
            (e.g. an integer percentile) are accepted too.

    Returns:
        The parts joined with ``"-"``.
    """
    slice_label = os.path.basename(slice_dataset).replace("datasets-", "")
    parts = (slice_label, dataset, model_name, optimizer_name, sequence, percentile)
    return "-".join(str(part) for part in parts)

# Analysis

In [8]:
from sklearn.model_selection import train_test_split

import pickle
import numpy as np
import pandas as pd

class DatasetLoader():
    """Load per-subject slices and labels and split them into train/test sets.

    ``self.X`` and ``self.y`` come from ``get_dataset_for_classification``;
    the code below uses them as mappings from a subject to, respectively,
    an array of slices and a label.
    """

    def __init__(self, dataset_path, classification_path, single=False):
        """Read the raw dataset.

        Args:
            dataset_path: Forwarded to ``get_dataset_for_classification``.
            classification_path: Forwarded to ``get_dataset_for_classification``.
            single: When true, ``load`` keeps only the first slice of each
                subject; otherwise every slice is kept.
        """
        self.single = single

        self.X, self.y = get_dataset_for_classification(dataset_path, classification_path)

    def load(self):
        """Flatten the per-subject data into aligned arrays.

        Sets ``self.slices``, ``self.labels`` and ``self.subjects`` (one entry
        per kept slice) plus ``self.categorical_labels``.
        """
        subjects = np.array(list(self.X.keys()))

        if self.single:
            # One slice (the first) per subject.
            self.slices = np.concatenate([[self.X[subject][0]] for subject in subjects])
            self.labels = np.concatenate([np.repeat((self.y)[subject], 1) for subject in subjects])
            self.subjects = np.array(subjects)
        else:
            # All slices; label and subject are repeated once per slice.
            self.slices = np.concatenate([self.X[subject] for subject in subjects])
            self.labels = np.concatenate([np.repeat((self.y)[subject], (self.X)[subject].shape[0]) for subject in subjects])
            self.subjects = np.concatenate([np.repeat(subject, (self.X)[subject].shape[0]) for subject in subjects])

        # Two-column encoding: row i of `dictionary` encodes integer label i,
        # i.e. 0 -> [0, 1] and 1 -> [1, 0].
        # NOTE(review): assumes labels are 0/1 — confirm against the CSV.
        dictionary = np.array([[0, 1], [1, 0]])
        int_labels = self.labels.astype(int)
        self.categorical_labels = dictionary[int_labels]

    def get_subjects(self):
        """Return the subject of every kept slice (requires ``load``)."""
        return self.subjects

    def _subject_masks(self, test_size, random_state):
        """Return boolean train/test masks over the slices, split by subject."""
        slice_subjects = np.asarray(self.get_subjects())

        # Split over *unique* subjects (first-seen order). Splitting over the
        # per-slice entries would put slices of one subject on both sides.
        unique_subjects = np.array(list(dict.fromkeys(slice_subjects.tolist())))

        train_index, test_index = train_test_split(
            list(range(len(unique_subjects))),
            test_size=test_size,
            random_state=random_state)

        train_mask = np.isin(self.subjects, unique_subjects[train_index])
        test_mask = np.isin(self.subjects, unique_subjects[test_index])
        return train_mask, test_mask

    def get_split(self, test_size=0.2, random_state=42):
        """Split slices and scalar labels by subject.

        Returns:
            ``(X_train, X_test, y_train, y_test)``.
        """
        train_mask, test_mask = self._subject_masks(test_size, random_state)
        return (self.slices[train_mask], self.slices[test_mask],
                self.labels[train_mask], self.labels[test_mask])

    def get_split_categorical(self, test_size=0.2, random_state=42):
        """Same split as ``get_split`` but with the two-column labels.

        Returns:
            ``(X_train, X_test, y_train, y_test)``.
        """
        train_mask, test_mask = self._subject_masks(test_size, random_state)
        return (self.slices[train_mask], self.slices[test_mask],
                self.categorical_labels[train_mask], self.categorical_labels[test_mask])


In [9]:
# Loader for the T1 tumour-crop slices with the survival labels, keeping a
# single slice per subject.
# NOTE(review): the recorded run failed with FileNotFoundError for this pickle,
# and the sweep further down reads from a `2-datasets-tumor-crop` directory —
# confirm which location is current.
dataset_loader = DatasetLoader(
    dataset_path="/data/RMN/dataset-gliomi-cnn/datasets-tumor-crop/t1-224-100-perc.pickle",
    classification_path="/data/RMN/dataset-gliomi-cnn/dataset-survivor.csv",
    single=True,
)

FileNotFoundError: [Errno 2] No such file or directory: '/data/RMN/dataset-gliomi-cnn/datasets-tumor-crop/t1-224-100-perc.pickle'

In [None]:
# Build the flat slices / labels / subjects arrays on the loader.
dataset_loader.load()

In [None]:
# Shape of the stacked slice array produced by load().
dataset_loader.slices.shape

In [None]:
# Single-configuration smoke test: MobileNetV2 trained with Adam.
model = get_model(
    "MobileNetV2",
    optimizer=optimizers["adam"](),
    include_dropout=True,
)

X_train, X_test, y_train, y_test = dataset_loader.get_split(test_size=0.2, random_state=42)

# Shapes of the four arrays, in the order train X, train y, test X, test y.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

In [None]:
# Print the layer-by-layer description of the compiled model.
model.summary()

In [None]:
# (min, max, mean, std) over every value of the training images.
tuple(stat(X_train) for stat in (np.min, np.max, np.mean, np.std))

In [None]:
# (min, max, mean, std) over every value of the test images.
tuple(stat(X_test) for stat in (np.min, np.max, np.mean, np.std))

In [None]:
# True when at least one value of the test images is NaN.
np.isnan(X_test).any()

In [None]:
batch_size = 16

from sklearn.preprocessing import StandardScaler

# StandardScaler only accepts 2-D input, so every image is flattened to one
# row before scaling and restored to its original shape afterwards.
scaler = StandardScaler()

# Fit on the training images only: refitting on the test set would discard
# the training statistics and leak test information into the preprocessing.
scaler.fit(X_train.reshape(len(X_train), -1))

# The scaled arrays do not change between iterations, so compute them once.
X_train_scaled = scaler.transform(X_train.reshape(len(X_train), -1)).reshape(X_train.shape)
X_test_scaled = scaler.transform(X_test.reshape(len(X_test), -1)).reshape(X_test.shape)

for i in range(100):

    # Inspect the predictions on the same (scaled) inputs the model is fitted on.
    print(model.predict(X_train_scaled))

    fit = model.fit(
            X_train_scaled, y_train,
            epochs=1,
            validation_data=(X_test_scaled, y_test),
            batch_size=batch_size,
            shuffle=False)

In [None]:

# Long augmented training run of the model built above, then its scores on
# the (un-augmented) training and test arrays.
history = train_and_test(
    "MobileNetV2", model,
    X_train, y_train,
    X_test, y_test,
    random_state=42, epochs=1000, batch_size=32,
)

train_score = model.evaluate(X_train, y_train)
test_score = model.evaluate(X_test, y_test)


In [None]:
# Sanity check of the BinaryAccuracy metric on hand-written values:
# first argument is the ground truth, second the predicted probabilities.
# With the metric's default 0.5 threshold, 3 of the 4 predictions match,
# so the result should be 0.75.
m = tf.keras.metrics.BinaryAccuracy()
m.update_state([[1], [1], [0], [0]], [[0.98], [1], [0], [0.6]])
m.result()

In [None]:
# Name of the second entry of the model's metric list.
model.metrics[1].name

In [None]:
# Order follows model.compile in get_model: [binary cross-entropy loss, binary accuracy].
train_score

In [None]:
# Same layout as train_score, computed on the test arrays.
test_score

In [None]:
# Ground-truth test labels next to the model outputs for the same samples.
y_test, model.predict(X_test)

In [None]:
# NOTE(review): `normalize` is not defined in this notebook; presumably it is
# provided by `from gliomi import *` — confirm its semantics for `axis=(1, 2)`.
X_test_normalized =  normalize(X_test, max_value=1., axis=(1, 2))

In [None]:
# (mean, std, min, max) of the normalised test images.
tuple(stat(X_test_normalized) for stat in (np.mean, np.std, np.min, np.max))

In [None]:
# (mean, std, min, max) of the raw test images, for comparison.
tuple(stat(X_test) for stat in (np.mean, np.std, np.min, np.max))

In [None]:
# np.min(X_test, axis=0), np.max(X_test, axis=0)

In [None]:
# (mean, std, min, max) of the raw training images.
tuple(stat(X_train) for stat in (np.mean, np.std, np.min, np.max))

In [None]:
# Raw model outputs for the test images.
model.predict(X_test)

In [None]:
import itertools
import sys
import traceback

import pandas as pd

# ---------------------------------------------------------------------------
# Sweep configuration
# ---------------------------------------------------------------------------
slice_datasets = [
    # "/data/RMN/dataset-gliomi-cnn/datasets-full-brain",
    "/data/RMN/dataset-gliomi-cnn/2-datasets-tumor-crop"
]

augmentation = "aug-100-"

datasets = [
    "survivor",
    "idh",
    "ki67",
    "egfr",
    "mgmt"
]

percentiles = [
    100
#    70
]

# Each entry needs its own comma: adjacent string literals are concatenated,
# which previously turned the last two sequences into "mprageadc".
sequences = [
    "t1",
    "t2",
    "flair",
    "rcbv",
    "mprage",
    "adc"
]

keras_models = [
    "MobileNetV2",
    "NASNetMobile",
    "VGG19",
    "ResNet50",
    "ResNet101",
    "DenseNet169",
]

optimizer_names = [
    'rmsprop',
    'sgd',
    'adam'
]

result_file = "aug-new-results-2.2.csv"

side = 224

epochs = 500

# DatasetLoader needs a classification CSV as its second argument.
# NOTE(review): this pattern is inferred from the survivor CSV path used
# earlier in the notebook — confirm that the other datasets follow it.
classification_path_template = "/data/RMN/dataset-gliomi-cnn/dataset-{dataset}.csv"

columns = ["slice-dataset", "dataset", "model", "optimizer", "sequence", "percentile", "accuracy", "loss", "val. accuracy", "val. loss"]
rows_list = []
recover = False

if recover:
    # Resume from a previous run: drop the index column written by to_csv.
    df = pd.read_csv(result_file)
    rows_list = df.iloc[:, 1:]
    rows_list = np.array(rows_list).tolist()

# Every combination writes exactly one row, so the number of recovered rows
# is the number of leading combinations that can be skipped.
rows_to_skip = len(rows_list)

# Same nesting order as the original loops: models and optimizers vary fastest.
combinations = itertools.product(
    slice_datasets, datasets, sequences, percentiles, keras_models, optimizer_names)

# Key of the data currently held in X_train/X_test; None means nothing loaded.
loaded_key = None

for row_number, combination in enumerate(combinations, start=1):
    slice_dataset, dataset, sequence, percentile, model_name, optimizer_name = combination

    if row_number <= rows_to_skip:
        print("Skip row:", row_number)
        continue

    try:
        # Load lazily so a resumed run starting in the middle of a dataset
        # still has its data, and reuse the split across models/optimizers.
        data_key = (slice_dataset, dataset, sequence, percentile)
        if data_key != loaded_key:
            loaded_key = None
            dataset_loader = DatasetLoader(
                f"{slice_dataset}/{augmentation}dataset-{dataset}-{sequence}-{side}-{percentile}-perc.pickle",
                classification_path_template.format(dataset=dataset))
            dataset_loader.load()
            # Scalar 0/1 labels: get_model ends in a single sigmoid unit, so
            # the two-column labels of get_split_categorical do not fit it.
            X_train, X_test, y_train, y_test = dataset_loader.get_split(test_size=0.2, random_state=42)
            loaded_key = data_key

        K.clear_session()

        plot_file_name = make_name(slice_dataset, dataset, model_name, optimizer_name, sequence, str(percentile))

        print("Loading", model_name, "[", plot_file_name, "]")

        model = get_model(model_name, optimizer=optimizers[optimizer_name]())

        print("Training", model_name)

        print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

        history = train_and_test(model_name, model, X_train, y_train, X_test, y_test, random_state=42, epochs=epochs, batch_size=16)

        save_plot(history, f"new-2.2-{plot_file_name}")

        train_score = model.evaluate(X_train, y_train)

        test_score = model.evaluate(X_test, y_test)

        # evaluate() returns [loss, accuracy]; the CSV stores accuracy first.
        outcome = [
            str(train_score[1]),
            str(train_score[0]),
            str(test_score[1]),
            str(test_score[0])
        ]

    except Exception:
        # Keep the sweep going, but show the full traceback and let
        # KeyboardInterrupt/SystemExit stop the run.
        traceback.print_exc()
        outcome = ["Error", "Error", "Error", "Error"]

    rows_list.append([
        slice_dataset,
        dataset,
        model_name,
        optimizer_name,
        sequence,
        percentile,
    ] + outcome)

    # Rewrite the CSV after every combination so an interrupted run can resume.
    df = pd.DataFrame(rows_list, columns=columns)
    df.to_csv(result_file)


# Cleanup Memory

In [None]:
# Select GPU 0 through numba and close its CUDA context.
# NOTE(review): presumably meant to release the GPU memory held by this
# kernel (see the "Cleanup Memory" heading); the kernel likely needs a
# restart before TensorFlow can use the GPU again — confirm.
from numba import cuda
cuda.select_device(0)
cuda.close()