# Deep Learning Model 

In [1]:
# Libraries

import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
import random

#from PIL import Image
# !pip install mat73
import mat73
import helper

#import cv2

import tensorflow as tf

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Conv2D, MaxPool2D, UpSampling2D, Conv2DTranspose
from tensorflow.keras.layers import InputLayer
from tensorflow.keras.layers import BatchNormalization, Dropout, LeakyReLU
from tensorflow.keras.layers import Dense, Flatten, Concatenate
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from sklearn.model_selection import train_test_split

### Parameters

In [2]:
# Scale factor handed to helper.imx_preproc (as `resize`) during preprocessing;
# it is also multiplied by 512 further down to derive IMG_H and IMG_W.
resize = 0.2

## Image loading and preprocessing

In [3]:
### Load each file independently (to avoid memory overflows) and retrieve the
### PID, label, image shape and image, keyed by file number
patient_ids_dict = {}
label_ids_dict = {}
shape_x_dict = {}
shape_y_dict = {}
image_dict = {}

for file_number in range(1, 3047):
    file = f'../data/brain-tumor-data-public/{file_number}.mat'
    data_dict = mat73.loadmat(file)
    # Every field read below sits under the top-level 'cjdata' entry
    cjdata = data_dict['cjdata']

    patient_ids_dict[file_number] = cjdata['PID']
    label_ids_dict[file_number] = int(cjdata['label'])

    scan = cjdata['image']
    shape_x_dict[file_number] = scan.shape[0]
    shape_y_dict[file_number] = scan.shape[1]
    image_dict[file_number] = scan

In [4]:
# Turn each per-file dictionary into a Series indexed by file number
patient_ids, label_ids, shape_x, shape_y, image = (
    pd.Series(per_file_values)
    for per_file_values in (
        patient_ids_dict,
        label_ids_dict,
        shape_x_dict,
        shape_y_dict,
        image_dict,
    )
)

# One row per file: patient id, label, the two image dimensions and the image itself
patients = pd.DataFrame(
    {
        'pid': patient_ids,
        'label': label_ids,
        'x': shape_x,
        'y': shape_y,
        'image': image,
    },
    index=range(1, 3047),
)

In [5]:
def _prepare_image(raw_image):
    """Run helper.imx_preproc on one image and cast the result to float32.

    The exact effect of `zero_up_to_one` and `resize` is defined by
    helper.imx_preproc (not shown here); presumably intensity scaling and
    spatial resizing -- confirm against the helper module.
    """
    processed = helper.imx_preproc(raw_image, zero_up_to_one=True, resize=resize)
    return processed.astype('float32')


# Replace every stored image with its preprocessed float32 version
patients['image'] = patients['image'].apply(_prepare_image)

## Train and Validation Sets

In [6]:
# Single seed shared by every random step in this cell
SEED = 1234
random.seed(SEED)

val_size = 0.2
test_size = 0.2

# train_test_split draws from NumPy's random state, not from Python's `random`
# module, so random.seed() alone does not make the split repeatable.
# Passing random_state fixes both splits across kernel restarts.

# Training set has 2436 images, test set has 610 images
train_set, test_set = train_test_split(
    patients,
    stratify=patients['label'],
    test_size=test_size,
    random_state=SEED,
)
# Training set has 1948 images, validation set has 488 images
train_set, val_set = train_test_split(
    train_set,
    stratify=train_set['label'],
    test_size=val_size,
    random_state=SEED,
)

n_total_train = train_set.shape[0]

# Stack the per-row image arrays into one array per split; labels become 1-D arrays
train_set_x = np.stack(train_set['image'].values)
train_set_y = np.array(train_set['label'])

val_set_x = np.stack(val_set['image'].values)
val_set_y = np.array(val_set['label'])

Apply one-hot encoding to the target variable y.

In [7]:
# One-hot encode the training labels with depth 4 (all other tf.one_hot
# arguments stay at their defaults), then drop column 0 so three columns remain.
train_one_hot = tf.one_hot(train_set_y, depth=4)
new_train_set_y = np.array(train_one_hot)[:, 1:]

2022-04-29 18:40:19.009265: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-04-29 18:40:19.009615: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-29 18:40:19.010877: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [8]:
# One-hot encode the validation labels with depth 4 (all other tf.one_hot
# arguments stay at their defaults), then drop column 0 so three columns remain.
val_one_hot = tf.one_hot(val_set_y, depth=4)
new_val_set_y = np.array(val_one_hot)[:, 1:]

## CNN Implementation

In [None]:
# Mini-batch size; used below to compute the number of steps per epoch
batch_size = 64

# Input height and width: 512 scaled by `resize`, rounded to the nearest integer.
# NOTE(review): 512 is presumably the side length of the raw images -- confirm.
IMG_H = IMG_W = int(round(512 * resize))

In [None]:
# Feature extractor: four identical stages of Conv2D(64 filters, 3x3, ReLU)
# followed by a default MaxPool2D
feature_layers = [
    layer
    for _stage in range(4)
    for layer in (Conv2D(64, 3, activation='relu'), MaxPool2D())
]

# Single-channel input of size IMG_H x IMG_W, the feature extractor, then a
# 512-unit dense layer and a 3-way softmax output
CNN = Sequential([
    InputLayer(input_shape=(IMG_H, IMG_W, 1)),
    *feature_layers,
    Flatten(),
    Dense(512, activation='relu'),
    Dense(3, activation='softmax'),
])

CNN.summary()

In [None]:
# Compile: set the optimizer, the loss (categorical cross-entropy, which takes
# one-hot targets) and the metric reported during training
CNN.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train model on the one-hot targets
EPOCHS = 10
# Steps per epoch at `batch_size`. Only the generator-based call sketched below
# needs it: for in-memory arrays Keras derives the step count on its own.
BATCHES = n_total_train // batch_size

### Following line overflows memory
# batch_size is now passed explicitly. Without it Keras falls back to a batch
# size of 32, while steps_per_epoch=BATCHES was computed for `batch_size`, so
# each epoch only covered part of the training set. steps_per_epoch is dropped
# so that every epoch is one full pass over train_set_x.
CNN.fit(
    train_set_x,
    new_train_set_y,
    batch_size=batch_size,
    epochs=EPOCHS,
    validation_data=(val_set_x, new_val_set_y),
)

### Following line can still not be run because we do not have images as jpg in specific folders
# CNN.fit(train_generator, steps_per_epoch=BATCHES, epochs=EPOCHS, validation_data=val_generator)

### Following line was thought to be a solution to the memory running out, but it wasn't
# CNN.fit(train_dataset, epochs=10)

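Alternative: with the `sparse_categorical_crossentropy` loss the integer class labels are used directly, so no one-hot encoding (OHE) is required.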

In [None]:
# Compile: same optimizer and metric as before, but with the sparse variant of
# categorical cross-entropy, which takes integer class labels
CNN.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train model on integer class labels (sparse loss, no one-hot encoding)
EPOCHS = 10
# Steps per epoch at `batch_size`. Only the generator-based call sketched below
# needs it: for in-memory arrays Keras derives the step count on its own.
BATCHES = n_total_train // batch_size

# sparse_categorical_crossentropy expects integer class indices in
# [0, n_classes), not the one-hot arrays. The labels are assumed to be 1-based
# (1..3), which is what the one-hot cells imply by using depth 4 and dropping
# column 0 -- TODO confirm. They are shifted to 0..2 for the 3-unit softmax.
train_labels_zero_based = train_set_y - 1
val_labels_zero_based = val_set_y - 1

### Following line overflows memory
# batch_size is passed explicitly (the Keras default is 32) and steps_per_epoch
# is dropped, so every epoch is one full pass over train_set_x.
CNN.fit(
    train_set_x,
    train_labels_zero_based,
    batch_size=batch_size,
    epochs=EPOCHS,
    validation_data=(val_set_x, val_labels_zero_based),
)

### Following line can still not be run because we do not have images as jpg in specific folders
# CNN.fit(train_generator, steps_per_epoch=BATCHES, epochs=EPOCHS, validation_data=val_generator)

### Following line was thought to be a solution to the memory running out, but it wasn't
# CNN.fit(train_dataset, epochs=10)

This variant lets Keras compute the validation set from the training set (via `validation_split`).

In [9]:
# Train model, letting Keras hold out part of the training data for validation
EPOCHS = 10
# Steps per epoch at `batch_size` over the full training set. Only the
# generator-based call sketched below needs it; it is not passed to fit() here
# because validation_split shrinks the number of samples actually trained on.
BATCHES = n_total_train // batch_size

# The sparse loss expects integer class indices in [0, n_classes). The labels
# are assumed to be 1-based (1..3), which is what the one-hot cells imply by
# using depth 4 and dropping column 0 -- TODO confirm. They are shifted to 0..2
# for the 3-unit softmax.
train_labels_zero_based = train_set_y - 1

### Following line overflows memory
# batch_size is passed explicitly (the Keras default is 32) and steps_per_epoch
# is dropped, so Keras derives the step count from the 90% of samples that
# remain after validation_split.
CNN.fit(
    train_set_x,
    train_labels_zero_based,
    batch_size=batch_size,
    epochs=EPOCHS,
    validation_split=0.1,
)

### Following line can still not be run because we do not have images as jpg in specific folders
# CNN.fit(train_generator, steps_per_epoch=BATCHES, epochs=EPOCHS, validation_data=val_generator)

### Following line was thought to be a solution to the memory running out, but it wasn't
# CNN.fit(train_dataset, epochs=10)

NameError: name 'batch_size' is not defined