## Installing Dependencies

In [None]:
# Install the exact TensorFlow build this notebook was written against.
!pip install tensorflow-gpu==2.0.0-beta1
# Load the TensorBoard notebook extension
%load_ext tensorboard

## Importing Dependencies

In [None]:
import os
import math
import shutil
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras as krs
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:
from PIL import Image
from PIL import ImageDraw
from glob import glob
from tqdm import tqdm
from skimage.io import imread
from IPython.display import SVG

In [None]:
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc

In [None]:
# Show which TensorFlow version the kernel actually imported.
tf.__version__

In [None]:
# Install livelossplot (no version pin) and import its Keras callback.
# NOTE(review): PlotLossesCallback is not referenced by any cell visible in
# this notebook — confirm it is still needed.
!pip install livelossplot
from livelossplot.keras import PlotLossesCallback

## Data Preparation

In [None]:
# Output artefact names.
# NOTE(review): none of these five file-name constants is referenced by the
# cells visible in this notebook — confirm before relying on them.
TRAINING_LOGS_FILE = "training_logs.csv"
MODEL_SUMMARY_FILE = "model_summary.txt"
MODEL_FILE = "histopathologic_cancer_detector.h5"

TRAINING_PLOT_FILE = "training.png"
VALIDATION_PLOT_FILE = "validation.png"
#ROC_PLOT_FILE = "roc.png"

# Root folder holding the `train/` images and `train_labels.csv`.
INPUT_DIR = '../input/'
# Number of images sampled from EACH label (0 and 1) by the loading cell.
SAMPLE_COUNT = 60000
# NOTE(review): not referenced by the cells visible here.
TESTING_BATCH_SIZE = 5000

In [None]:
# Build a class-balanced table of training images: one row per sampled .tif.
training_dir = INPUT_DIR + 'train/'

df = pd.DataFrame({'path': glob(os.path.join(training_dir, '*.tif'))})
# Take the id from the file name itself instead of a fixed position in the
# path, so this keeps working if INPUT_DIR is nested at a different depth.
df['id'] = df['path'].map(lambda p: os.path.splitext(os.path.basename(p))[0])

labels = pd.read_csv(INPUT_DIR + 'train_labels.csv')

# Inner join: only images that have a label row survive.
df = df.merge(labels, on='id')
# Fixed seed so a rerun draws the same images; .sample raises if a class has
# fewer than SAMPLE_COUNT rows.
negative_values = df[df.label == 0].sample(SAMPLE_COUNT, random_state=42)
positive_values = df[df.label == 1].sample(SAMPLE_COUNT, random_state=42)

# drop=True discards the pre-sampling index instead of keeping it as a column.
df = pd.concat([negative_values, positive_values]).reset_index(drop=True)
df = df[['path', 'id', 'label']]
# NOTE(review): this reads every sampled image into memory; no later cell
# visible in this notebook uses the 'image' column — confirm it is needed.
df['image'] = df['path'].map(imread)

In [None]:
# Materialise the train/validation split on disk as <split>/<label>/<id>.tif,
# the per-class folder layout consumed by flow_from_directory.
train_path = '../training'
val_path = '../validation'

for directory in [train_path, val_path]:
    for sub_directory in ['0', '1']:
        os.makedirs(os.path.join(directory, sub_directory), exist_ok=True)

# Stratified 80/20 split; the seed makes reruns produce the same partition.
train, val = train_test_split(df, train_size=0.8, stratify=df['label'],
                              random_state=42)

# NOTE: files copied by an earlier run are never removed. If the sample or the
# split changes between runs, an image can end up in both trees — clear the
# two folders first in that case.
for images, path in [(train, train_path), (val, val_path)]:
    # Every row already carries its own source path and label, so df is left
    # untouched (no in-place re-indexing) and the cell can be re-run safely.
    for source, image, label in zip(images['path'], images['id'], images['label']):
        destination = os.path.join(path, str(label), image + '.tif')
        if not os.path.exists(destination):
            shutil.copyfile(source, destination)

## Data Generators and Data Augmentation

**Note:** Take note of the input shape of the images produced by the data generators.

In [None]:
# Options shared by both directory iterators: 96x96 images, batches of 10,
# one binary label per image.
flow_options = dict(target_size=(96, 96), batch_size=10, class_mode='binary')

# Training pipeline: scale pixel values by 1/255 and apply random flips,
# rotations, zooms, shifts, shear and channel shifts.
train_datagen = ImageDataGenerator(
    rescale=1/255,
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=30,
    zoom_range=0.2,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.05,
    channel_shift_range=0.1,
)
train_generator = train_datagen.flow_from_directory(train_path, **flow_options)

# Validation pipeline: same scaling, no augmentation.
validation_datagen = ImageDataGenerator(rescale=1/255)
validation_generator = validation_datagen.flow_from_directory(val_path, **flow_options)

## Pre-Trained Network (Transfer Learning) / Creating the Model

**Note:** Take note of the input shape of the pre-trained network.

In [None]:
# VGG19 convolutional base without its classifier head (include_top=False),
# initialised with ImageNet weights. The 96x96x3 input shape matches the
# target_size=(96, 96) used by the data generators.
base_model = krs.applications.VGG19(include_top=False,
                                    input_shape = (96,96,3),
                                    weights = 'imagenet')

In [None]:
# Checking the Layers of the Model: list every layer of the VGG19 base
# together with its trainable flag.
# Optional freezing, kept for reference only (not executed):
#     for layer in base_model.layers[:-15]:
#         layer.trainable = False
for layer in base_model.layers:
    print(layer, layer.trainable)

## Creating the Model

In [None]:
def create_model(base_model, summary=False):
    """Stack a small fully connected classifier on top of a convolutional base.

    Args:
        base_model: Keras model/layer used as the feature extractor; it becomes
            the first layer of the returned Sequential model.
        summary: When truthy, print ``model.summary()`` before returning.

    Returns:
        An uncompiled ``Sequential`` model: base -> Flatten -> Dense(1024, relu)
        -> Dropout(0.5) -> Dense(1, sigmoid).
    """
    # NOTE(review): the model name contains spaces and '+'; confirm the TF
    # version in use accepts such names wherever the model name is used.
    model = krs.Sequential(
        [
            base_model,
            krs.layers.Flatten(),
            krs.layers.Dense(1024, activation='relu'),
            krs.layers.Dropout(0.5),
            # Single sigmoid unit: one probability per image for the binary label.
            krs.layers.Dense(1, activation='sigmoid'),
        ],
        name='VGG19 + FC',
    )
    if summary:
        model.summary()
    return model

# tf.keras.layers.Add
# tf.keras.layers.Input

In [None]:
# Build the full network on top of the VGG19 base and print its summary.
model = create_model(base_model, summary=True)

## Checking the Entire Model

In [None]:
# Checking the Layers of the Model: re-list the VGG19 base layers and their
# trainable flags now that the base has been wrapped in the full model.
# Optional freezing, kept for reference only (not executed):
#     for layer in base_model.layers[:-15]:
#         layer.trainable = False
for layer in base_model.layers:
    print(layer, layer.trainable)

In [None]:
# Checking the Layers of the Model: list the top-level layers of the full
# model (the VGG19 base counts as a single layer here) with their trainable flags.
# Optional freezing, kept for reference only (not executed). Note that the
# Sequential built by create_model has five top-level layers, so a `[:-15]`
# slice would select nothing:
#     for layer in model.layers[:-15]:
#         layer.trainable = False
for layer in model.layers:
    print(layer, layer.trainable)

## Callbacks

**Note:** Although the code runs, TensorBoard does not display in Kaggle's kernel.

In [None]:
# Load the TensorBoard notebook extension.
# %reload_ext re-initialises the extension in case it was already loaded
# earlier in this kernel session.
%load_ext tensorboard
%reload_ext tensorboard

In [None]:
# Take the callbacks from tf.keras, the same Keras implementation the model is
# built with, rather than from the standalone `keras` package.
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger

# One sub-folder per run inside logs/fit — the directory the %tensorboard cell
# points at. (Plain string concatenation produced "logs/fit<timestamp>", a
# sibling of logs/fit rather than a child of it.)
log_dir = os.path.join("logs", "fit",
                       datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

# Keep only the best model seen so far, judged by the lowest validation loss.
# The deprecated `period=1` argument is omitted: checking once per epoch is
# already the default.
checkpoint = ModelCheckpoint(
    './base.model',
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
    save_weights_only=False,
)

# Per-epoch metrics written as comma-separated rows; the file is overwritten
# on each run (append=False).
csvlogger = CSVLogger(
    filename="training_csv.log",
    separator=",",
    append=False,
)

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [None]:
# Clear any logs from previous runs. Pure-Python equivalent of `rm -rf ./logs/`:
# works on any OS and, like `-f`, does not fail if the folder does not exist.
shutil.rmtree('./logs/', ignore_errors=True)

In [None]:
# Only the TensorBoard callback is passed to training.
# NOTE(review): `checkpoint` and `csvlogger` are created in an earlier cell but
# are not included here — confirm whether that is intentional.
callbacks = [tensorboard_callback]

## Tuning, Compiling and Training the Model

In [None]:
# Hyperparameters
total_data = 9600
lr = 1e-4
n_epochs = 50
# Step counts must be whole numbers: floor division keeps this an int (2400),
# whereas `/` produced the float 2400.0.
steps_epoch = total_data // 4
verbosity = 1

# Compile the Model, Loss and Optimizer
# `learning_rate` is the optimizer's actual argument name; `lr` is only a
# deprecated alias.
model.compile(loss='binary_crossentropy',
              optimizer=krs.optimizers.Adam(learning_rate=lr),
              metrics=['accuracy'])

In [None]:
# Train from the augmented generator for n_epochs epochs. The same step count
# (steps_epoch) is used for the training steps per epoch and for the number of
# validation batches evaluated after each epoch. The object returned by
# fit_generator is kept in `training`.
training = model.fit_generator(train_generator,
                               #batch_size=steps_epoch,
                               steps_per_epoch=steps_epoch,
                               epochs=n_epochs,
                               validation_data=validation_generator,
                               validation_steps=steps_epoch,
                               verbose=verbosity,
                               callbacks=callbacks)

## Showing the Results

**Note:** Although the code runs, TensorBoard does not display in Kaggle's kernel.

In [None]:
#----Custom function to visualize the training of the model------#
def show_final_history(score):
    """Plot loss and accuracy curves for training and validation side by side.

    Args:
        score: Object exposing ``epoch`` (x values) and ``history`` (dict mapping
            metric names to per-epoch lists), such as the value returned by
            ``model.fit_generator``. ``history`` must contain ``loss`` and
            ``val_loss`` plus either ``accuracy``/``val_accuracy`` or
            ``acc``/``val_acc``.

    Raises:
        KeyError: If an expected metric is missing from ``score.history``.
    """
    history = score.history
    # The accuracy metric is recorded as "accuracy" or "acc" depending on the
    # Keras/TensorFlow version; accept whichever one is present.
    acc_key = "accuracy" if "accuracy" in history else "acc"

    fig, ax = plt.subplots(1, 2, figsize=(16, 5))

    ax[0].plot(score.epoch, history["loss"], label="Train loss")
    ax[0].plot(score.epoch, history["val_loss"], label="Validation loss")
    # Axes objects expose set_ylabel/set_xlabel; plain ylabel/xlabel exist only
    # on the pyplot module and raise AttributeError on an Axes.
    ax[0].set_ylabel('Loss')
    ax[0].set_xlabel('# of Epochs')
    ax[0].grid(which="major", alpha=0.30)

    ax[1].plot(score.epoch, history[acc_key], label="Train acc")
    ax[1].plot(score.epoch, history["val_" + acc_key], label="Validation acc")
    ax[1].set_ylabel('Accuracy')
    ax[1].set_xlabel('# of Epochs')
    ax[1].grid(which="major", alpha=0.30)

    ax[0].legend()
    ax[1].legend()

In [None]:
# The fit result is stored in `training`; no variable named `history` is ever
# defined in this notebook.
show_final_history(training)
# Validation accuracy is recorded as "val_accuracy" or "val_acc" depending on
# the Keras/TensorFlow version; use whichever key exists.
val_acc_key = "val_accuracy" if "val_accuracy" in training.history else "val_acc"
print("Validation Accuracy: " + str(training.history[val_acc_key][-1:]))

In [None]:
# Tensorboard Not Working
# Starting Tensorboard
# Launch TensorBoard inline, reading event files found under logs/fit.
%tensorboard --logdir logs/fit

## Saving and Loading the Results

In [None]:
## Saving and Loading the WEIGHTS
model.save_weights('my_model_weights.h5')
model.load_weights('my_model_weights.h5')

In [None]:
# Saving and Loading the whole model (ARCHITECTURE + WEIGHTS + OPTIMIZER STATE)
model.save('my_model.h5')  # creates a HDF5 file 'my_model.h5'
del model  # deletes the existing model

# returns a compiled model
# identical to the previous one
model = load_model('my_model.h5')

## ROC & AUC

- **ROC Curve**
    - Curve of probability
    - In a ROC curve, the true positive rate (sensitivity) is plotted as a function of the false positive rate (100 − specificity) for different cut-off points of a parameter. Each point on the ROC curve represents a sensitivity/specificity pair corresponding to a particular decision threshold.


 - **AUC, or Area Under the Curve**
    - A measure of how well a parameter can distinguish between two diagnostic groups (diseased/normal).

## Acknowledgement
- https://towardsdatascience.com/understanding-auc-roc-curve-68b2303cc9c5
- https://www.medcalc.org/manual/roc-curves.php