# README

This notebook load dataset prepared by `1 - Load Data.ipynb` and run a series of test on pretrained Keras Application.

In particular the procedure is:
* Download a pretrained Keras Application
* Split Subject dataset in train and test
* Use train slices to in Keras Appliation to classify the slices
* Use classification prediction as SVM features to be trained with the corresponding label
* Test the classification of test slices against the trained SVM.

# Analyze

In [1]:
from gliomi import *

In [2]:
import tensorflow as tf
from keras import backend as K
from sklearn import svm
import numpy as np
from tensorflow.keras.applications import *
from sklearn.model_selection import train_test_split
from sklearn.base import clone
import sklearn
import pandas as pd
import numpy as np

Using TensorFlow backend.


In [3]:
keras_models = [
    "MobileNetV2",
    "NASNetMobile",
    "VGG19",
    "ResNet50",
    "ResNet101",
    "DenseNet169",
]

In [4]:
IMG_SHAPE = (224, 224, 3)

In [5]:
def make_3_channels(images):
    return np.concatenate([
        images,
        images,
        images
    ], axis=3)

In [9]:
def evaluate_model(model, dataset, times):
    
    scores = []
    
    for random_state in range(times):
        
        X_train, X_test, y_train, y_test = dataset.get_split(test_size=0.2, random_state=random_state)
            
        features_train = model.predict([make_3_channels(X_train)])

        features_test = model.predict([make_3_channels(X_test)])
            
        clf = svm.SVC()

        clf.fit(features_train, y_train)

        score = clf.score(features_test, y_test)

        print(random_state, ":", score)

        scores.append(score)

    return np.array(scores)

In [7]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
import pickle
import numpy as np
import pandas as pd

class DatasetLoader():
    
    def __init__(self, dataset_path, classification_path, single=False):
        
        self.single=single
        
        with open(dataset_path, "rb") as file:
            X = pickle.load(file)

        df = pd.read_csv(classification_path)
        subjects = np.array(df.iloc[:,1])
        labels = np.array(df.iloc[:,2])

        X_new = {}
        y_new = {}

        for i, subject in enumerate(subjects):
            if subject in X:
                X_new[subject] = X[subject]
                y_new[subject] = labels[i]

        self.X = X_new
        self.y = y_new
            
    def load(self):
        subjects = np.array(list(self.X.keys()))
        
        if self.single:
            self.slices = np.concatenate([[self.X[subject][0]] for subject in subjects])
            self.labels = np.concatenate([np.repeat((self.y)[subject], 1) for subject in subjects])
            self.subjects = np.array(subjects)
        else:
            self.slices = np.concatenate([self.X[subject] for subject in subjects])
            self.labels = np.concatenate([np.repeat((self.y)[subject], (self.X)[subject].shape[0]) for subject in subjects])
            self.subjects = np.concatenate([np.repeat(subject, (self.X)[subject].shape[0]) for subject in subjects])
        
        # Categorical
        dictionary = np.array([[0, 1], [1, 0]])
        int_labels = self.labels.astype(int)
        self.categorical_labels = dictionary[int_labels]

    def get_subjects(self):
        return list(self.X.keys())

    def get_split(self, test_size=0.2, random_state=42):
        
        subjects = np.array(self.get_subjects())
        
        indexes = list(range(len(subjects)))
        
        train_index, test_index, _, _ = train_test_split(
            indexes, 
            self.labels,
            # stratify=self.labels,
            test_size=test_size, 
            random_state=random_state)
        
        train_subjects = subjects[train_index]
        test_subjects = subjects[test_index]
        
        X_train = self.slices[np.isin(self.subjects, train_subjects)]
        X_test = self.slices[np.isin(self.subjects, test_subjects)]
        y_train = self.labels[np.isin(self.subjects, train_subjects)]
        y_test = self.labels[np.isin(self.subjects, test_subjects)]
        
        return X_train, X_test, y_train, y_test
    
    def get_split_categorical(self, test_size=0.2, random_state=42):

        subjects = np.array(self.get_subjects())

        indexes = list(range(len(subjects)))

        train_index, test_index, _, _ = train_test_split(
            indexes,
            self.categorical_labels, 
            # stratify=self.labels,
            test_size=test_size, 
            random_state=random_state)
        
        train_subjects = subjects[train_index]
        test_subjects = subjects[test_index]
        
        X_train = self.slices[np.isin(self.subjects, train_subjects)]
        X_test = self.slices[np.isin(self.subjects, test_subjects)]
        y_train = self.categorical_labels[np.isin(self.subjects, train_subjects)]
        y_test = self.categorical_labels[np.isin(self.subjects, test_subjects)]
        
        return X_train, X_test, y_train, y_test

In [12]:
import pandas as pd

columns = ["slice-dataset", "dataset", "model", "sequence", "percentile", "scores", "avg. scores", "std. scores"]

times = 10

rows_list = []

side = 224

recover_count = 0

recover = False

output_file = "06072020-results-2.1.csv"

if recover:
    df = pd.read_csv(output_file)
    rows_list = df.iloc[:, 1:]
    rows_list = np.array(rows_list).tolist()

for dataset in ["survivor", "idh", "ki67", "egfr", "mgmt"]:
    for slice_dataset in ["/data/RMN/dataset-gliomi-cnn/datasets-full-brain", "/data/RMN/dataset-gliomi-cnn/datasets-tumor-crop"]:
        for sequence in ["t1", "t2", "flair", "rcbv", "mprage", "adc"]:
            for percentile in [100]:
                
                if recover and recover_count < len(rows_list):
                    pass
                else:
                    dataset_loader = DatasetLoader(
                        f"{slice_dataset}/{sequence}-{side}-{percentile}-perc.pickle",
                        f"/data/RMN/dataset-gliomi-cnn/dataset-{dataset}.csv",
                        single=True)
                    dataset_loader.load()
    
                for model_name in keras_models:

                    if recover and recover_count < len(rows_list):
                        recover_count = recover_count + 1
                        print("Skip row:", recover_count)
                        continue
                
                    recover = False

#                    try:

                    K.clear_session()

                    print("Loading", model_name)

                    base_model = eval(model_name)(weights='imagenet', include_top=True, input_shape=IMG_SHAPE)

                    print("Evaluating", model_name)

                    scores = evaluate_model(base_model, dataset_loader, times)

                    scores_str = ", ".join(str(x) for x in (scores * 100))

                    rows_list.append([
                        slice_dataset,
                        dataset,
                        model_name,
                        sequence,
                        percentile,
                        scores_str,
                        np.average(scores * 100),
                        np.std(scores * 100)
                    ])

                    df = pd.DataFrame(rows_list, columns=columns)
                    df.to_csv(output_file)



Loading MobileNetV2
Evaluating MobileNetV2
0 : 0.48148148148148145
1 : 0.48148148148148145
2 : 0.4074074074074074
3 : 0.4444444444444444
4 : 0.4074074074074074
5 : 0.2962962962962963
6 : 0.4444444444444444
7 : 0.37037037037037035
8 : 0.3333333333333333
9 : 0.48148148148148145
Loading NASNetMobile
Evaluating NASNetMobile
0 : 0.48148148148148145
1 : 0.5925925925925926
2 : 0.4074074074074074
3 : 0.5185185185185185
4 : 0.4444444444444444
5 : 0.4444444444444444
6 : 0.48148148148148145
7 : 0.4074074074074074
8 : 0.4444444444444444
9 : 0.4444444444444444
Loading VGG19
Evaluating VGG19
0 : 0.5555555555555556
1 : 0.48148148148148145
2 : 0.4074074074074074
3 : 0.4074074074074074
4 : 0.6296296296296297
5 : 0.4444444444444444
6 : 0.4444444444444444
7 : 0.6666666666666666
8 : 0.4444444444444444
9 : 0.4444444444444444
Loading ResNet50
Evaluating ResNet50
0 : 0.5185185185185185
1 : 0.48148148148148145
2 : 0.4074074074074074
3 : 0.4444444444444444
4 : 0.5555555555555556
5 : 0.4444444444444444
6 : 0.44

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [16]:
from gliomi import *

In [34]:
a = np.array([1, 2, 3, 40, 5, 6, 7])

In [35]:
a[ordered_index_percentile_of_sizes(np.array(a), 50)]

array([40,  7,  6,  5])

In [39]:
percentile = 50

roi_sizes = np.array(a)

non_empty_sizes = roi_sizes[np.where(roi_sizes > 0)]

percentile_val = np.percentile(non_empty_sizes, percentile)

b = np.where(roi_sizes >= percentile_val)[0]


In [40]:
a[b]

array([40,  5,  6,  7])

In [15]:
a[-3:]

[5, 6, 7]

In [None]:
                    except:
                        
                        rows_list.append([
                            slice_dataset,
                            dataset,
                            model_name,
                            sequence,
                            percentile,
                            "Error",
                            0,
                            0
                        ])

                        df = pd.DataFrame(rows_list, columns=columns)
                        df.to_csv(output_file)
                        
                        pass

# Results Analysis

In [None]:
df = pd.read_csv("results-2.1.csv")

In [None]:
df

In [None]:
import matplotlib.pyplot as plt
plt.close('all')

In [None]:
df.columns

In [None]:
df1 = df[df.iloc[:,7] > 85]
df1

In [None]:
df2 = df1[df1.iloc[:, 4] == 't2']
df2

In [None]:
df.iloc[:,]

In [None]:
df.plot.bar(x='Unnamed: 0', y='avg. scores')

In [None]:
import numpy as np

def max_score(df):
    return df[df['avg. scores'] == np.max(df['avg. scores'])]

def the_winner_is(df, dataset):
    return max_score(df[(df['dataset'] == dataset)])

In [None]:
import os

winners = pd.concat([
    the_winner_is(df, 'survivor'),
    the_winner_is(df, 'idh'),
    the_winner_is(df, 'mgmt'),
    the_winner_is(df, 'ki67'),
    the_winner_is(df, 'egfr'),    
])

winners['slice-dataset'] = winners['slice-dataset'].transform(lambda x: os.path.basename(x).replace("datasets-", ""))

winners = winners[['dataset', 'slice-dataset' , 'model', 'sequence', 'percentile', 'avg. scores', 'std. scores']]

winners

In [None]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor
import numpy as np

def boruta(X, y):
    
    ###initialize Boruta
    forest = RandomForestRegressor(
       n_jobs = -1, 
       max_depth = 5
    )
    
    boruta = BorutaPy(
       estimator = forest, 
       n_estimators = 'auto',
       max_iter = 100 # number of trials to perform
    )
    
    ### fit Boruta (it accepts np.array, not pd.DataFrame)
    boruta.fit(X, y)
    
    return boruta.support_, boruta.support_weak_

In [None]:
def apply_boruta(X, columns):
    flat_list = [item for sublist in columns for item in sublist]
    return X[:,flat_list]    

In [13]:
from numba import cuda
cuda.select_device(0)
cuda.close()