## Overview

  - Import Dependencies
  - Output Files / Parameters of the Model
  - Data Preparation
  - Data Generator / Data Augmentation
  - Data Exploration
  - Callbacks
  - Model Architecture
  - Evaluation
  - Saving and Loading the Model
  - Testing the Model
  - Reference

### **1. Import Dependencies**

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline

#High Level Operations on Files and Collection of Files
import shutil

In [None]:
from glob import glob   # Finds all the pathnames matching a specified pattern
from skimage.io import imread   #Loading an image from a file

In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split

In [None]:
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.nasnet import NASNetMobile
from keras.applications.xception import Xception

from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D, Input, Concatenate, GlobalMaxPooling2D, BatchNormalization
from keras.models import Model
from keras.optimizers import Adam

In [None]:
!pip install livelossplot
from livelossplot import PlotLossesKeras

### **2. Output Files / Parameters of the Model**

In [None]:
# Root directory that the train/test data and the label CSV are read from.
INPUT_DIRECTORY = '../input/'

# Model artifacts.
MODEL_FILE = 'histopathologic_cancer_detector.h5'
MODEL_SUMMARY_FILE = 'model.summary.txt'
TRAINING_LOGS_FILE = 'training_logs.csv'

# Figures written to disk.
ROC_PLOT_FILE = 'roc.png'
VALIDATION_PLOT_FILE = 'validation.png'
TRAINING_PLOT_FILE = 'training.png'

# Predictions file for the competition.
KAGGLE_SUBMISSION_FILE = 'kaggle_submission.csv'

In [None]:
# Flip to True for a short run that trains for only a handful of epochs.
test_run = False

SAMPLE_COUNT = 85000         # rows sampled from each label class
TRAINING_RATIO = 0.9         # fraction of the sampled rows used for training
VERBOSITY = 1
TESTING_BATCH_SIZE = 5000    # number of test files processed per chunk

# Use the truth value directly instead of comparing against True.
NUM_EPOCHS = 5 if test_run else 100

IMG_HEIGHT = 96
IMG_WIDTH = 96
IMG_CHANNEL = 3
IMG_SIZE = (IMG_HEIGHT, IMG_WIDTH)
BATCH_SIZE = 216

### **3. Data Preparation**

In [None]:
# Directory holding the labelled training images.
training_dir = INPUT_DIRECTORY + 'train/'

# One row per training image: the file path plus the id taken from its name.
data_frame = pd.DataFrame({'path': glob(os.path.join(training_dir, '*.tif'))})
# Derive the id from the file's base name rather than from a fixed position in
# the '/'-split path, so it no longer depends on how deep INPUT_DIRECTORY is
# or on the platform's path separator.
data_frame['id'] = data_frame['path'].map(
    lambda x: os.path.basename(x).split('.')[0])

# Join the labels onto the image rows by id (pandas' default inner join keeps
# only ids present in both frames).
labels = pd.read_csv(INPUT_DIRECTORY + 'train_labels.csv')
data_frame = data_frame.merge(labels, on='id')

In [None]:
# Draw SAMPLE_COUNT random rows from each label class.
negatives = data_frame.loc[data_frame['label'] == 0].sample(n=SAMPLE_COUNT)
positives = data_frame.loc[data_frame['label'] == 1].sample(n=SAMPLE_COUNT)

# Stack negatives on top of positives, renumber the rows from 0 and keep only
# the three columns used downstream.
data_frame = (
    pd.concat([negatives, positives])
    .reset_index(drop=True)
    .loc[:, ['path', 'id', 'label']]
)

In [None]:
# Store the decoded pixels of every row as a real 'image' column.  Bracket
# assignment is needed here: assigning to a name that is not yet a column via
# attribute access (data_frame.image = ...) only sets a Python attribute on
# the DataFrame object and does not add a column.
# NOTE(review): no later cell visible in this notebook reads this column for
# the training frame, and decoding every sampled image is costly — consider
# removing this step.
data_frame['image'] = data_frame['path'].map(imread)

In [None]:
# Root folders for the two splits; each one gets a sub-folder per class label.
training_path = '../training'
validation_path = '../validation'

for split_root in (training_path, validation_path):
    for class_name in ('0', '1'):
        # exist_ok=True lets the cell be re-run when the folders already exist.
        os.makedirs(os.path.join(split_root, class_name), exist_ok=True)

In [None]:
# Split the sampled rows: TRAINING_RATIO of them go to `training`, the rest to
# `validation`, stratified on the label column.
training, validation = train_test_split(
    data_frame,
    stratify=data_frame.label,
    train_size=TRAINING_RATIO,
)

In [None]:
# Make the image id the index of data_frame, modifying the frame in place.
# `training` and `validation` were split off earlier, so they are unaffected
# and still carry 'id' as a regular column.
data_frame.set_index('id', inplace=True)

In [None]:
# Copy every sampled image into <split root>/<label>/<id>.tif, i.e. one
# sub-folder per class under each split root.
source_dir = INPUT_DIRECTORY + 'train'
for images, path in [(training, training_path), (validation, validation_path)]:
    # Each split frame has its own 'label' column, so the label is read from
    # the same row as the id instead of being looked up in another frame for
    # every single image.
    for image, label in zip(images['id'].values, images['label'].values):
        file_name = image + '.tif'
        destination = os.path.join(path, str(label), file_name)

        # Files copied by a previous run are left alone.
        if not os.path.exists(destination):
            source = os.path.join(source_dir, file_name)
            shutil.copyfile(source, destination)

### **4. Data Generator / Data Augmentation**

In [None]:
### Data Augmentation / Data Generation

# Settings shared by the training and validation directory iterators.
flow_options = dict(target_size=IMG_SIZE,
                    batch_size=BATCH_SIZE,
                    class_mode='binary')

# Training images are multiplied by 1/255 and randomly flipped, rotated,
# zoomed, shifted, sheared and channel-shifted.
training_data_generator = ImageDataGenerator(
    rescale=1./255,
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=90,
    zoom_range=0.2,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.05,
    channel_shift_range=0.1,
)
training_generator = training_data_generator.flow_from_directory(
    training_path, **flow_options)

# Validation images only get the 1/255 rescaling, no augmentation.
validation_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
    validation_path, **flow_options)

### **5. Callbacks**

In [None]:
### CALLBACKS
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, CSVLogger, TerminateOnNaN, TensorBoard

# Writes the model to MODEL_FILE, keeping only the best one as judged by the
# maximum of the monitored 'val_acc' value.
model_checkpoint = ModelCheckpoint(
    MODEL_FILE,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=VERBOSITY,
)

# Logs the training metrics to a ';'-separated file, starting a fresh file
# each run (append=False).
csv_logger = CSVLogger(TRAINING_LOGS_FILE, separator=';', append=False)

# Callbacks handed to training: live loss plot, CSV log, checkpointing.
callback = [PlotLossesKeras(), csv_logger, model_checkpoint]

### **6. Model Architecture**

In [None]:
# Shared input tensor: height x width x channels of one image.
input_shape = (IMG_HEIGHT, IMG_WIDTH, IMG_CHANNEL)
inputs = Input(shape=input_shape)

# Two ImageNet-pretrained backbones, each without its classification top.
xception_model = Xception(weights='imagenet', include_top=False)
nasnet_model = NASNetMobile(weights='imagenet', include_top=False)

In [None]:
# Freeze every Xception layer except the last 10, which stay trainable.
for layer in xception_model.layers[:-10]:
    layer.trainable = False

# Uncomment to list each layer together with its trainable flag.  (Kept as
# real comments: a bare triple-quoted string is an expression statement that
# a notebook echoes as the cell's output.)
# for layer in xception_model.layers:
#     print(layer, layer.trainable)

In [None]:
# Freeze every NASNetMobile layer except the last 44, which stay trainable.
for layer in nasnet_model.layers[:-44]:
    layer.trainable = False

# Uncomment to list each layer together with its trainable flag.  (Kept as
# real comments: a bare triple-quoted string is an expression statement that
# a notebook echoes as the cell's output.)
# for layer in nasnet_model.layers:
#     print(layer, layer.trainable)

In [None]:
# Run the shared input through both backbones and average-pool each feature
# map down to one vector per image.
xception_features = GlobalAveragePooling2D()(xception_model(inputs))
nasnet_features = GlobalAveragePooling2D()(nasnet_model(inputs))

# Join the two vectors along the last axis, apply dropout, and finish with a
# single sigmoid unit.
merged_features = Concatenate(axis=-1)([xception_features, nasnet_features])
regularised_features = Dropout(0.55)(merged_features)
outputs = Dense(1, activation='sigmoid')(regularised_features)

In [None]:
# Wrap the input tensor and the sigmoid head into one trainable model.
final_model = Model(inputs=inputs, outputs=outputs)

# Binary cross-entropy loss, Adam optimizer, accuracy tracked as the metric.
final_model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=Adam(lr=0.0001, decay=0.00001),
)

#final_model.summary()

In [None]:
# Disabled model-visualisation snippet.  It is held in a string literal, so
# none of the code below is executed.
'''from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
SVG(model_to_dot(final_model).create(prog='dot', format='svg'))

from keras.utils.vis_utils import plot_model
plot_model(final_model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)'''

In [None]:
# Train from the directory iterators; one epoch is one full pass over each
# iterator (len(...) batches).
fit_options = dict(
    steps_per_epoch=len(training_generator),
    validation_data=validation_generator,
    validation_steps=len(validation_generator),
    epochs=NUM_EPOCHS,
    verbose=VERBOSITY,
    callbacks=callback,
)
final_model_history = final_model.fit_generator(training_generator, **fit_options)

# Load the weights stored in MODEL_FILE back into the model.
final_model.load_weights(MODEL_FILE)

### **7. Evaluation / Showing the Result**

In [None]:
# Per-epoch metrics recorded during training.  The training result is stored
# in `final_model_history`; the bare name `history` is never defined, so using
# it raises NameError.
metrics = final_model_history.history
epochs = list(range(1, len(metrics['loss']) + 1))

# Accept either accuracy key: 'acc'/'val_acc', or 'accuracy'/'val_accuracy' as
# reported by newer Keras releases for metrics=['accuracy'].
acc_key = 'acc' if 'acc' in metrics else 'accuracy'

# Loss curves, saved to TRAINING_PLOT_FILE.
plt.plot(epochs, metrics['loss'], color='blue', label="training_loss")
plt.plot(epochs, metrics['val_loss'], color='red', label="validation_loss")
plt.legend(loc='best')
plt.title('training')
plt.xlabel('epoch')
plt.savefig(TRAINING_PLOT_FILE, bbox_inches='tight')
plt.show()

# Accuracy curves, saved to VALIDATION_PLOT_FILE.
plt.plot(epochs, metrics[acc_key], color='blue', label="training_accuracy")
plt.plot(epochs, metrics['val_' + acc_key], color='red', label="validation_accuracy")
plt.legend(loc='best')
plt.title('validation')
plt.xlabel('epoch')
plt.savefig(VALIDATION_PLOT_FILE, bbox_inches='tight')
plt.show()

### **8. Submission**

In [None]:
testing_files = glob(os.path.join(INPUT_DIRECTORY + 'test/', '*.tif'))

# Predict in chunks of TESTING_BATCH_SIZE files so only one chunk of decoded
# images is held in memory at a time.  The per-chunk results are collected in
# a list and concatenated once at the end instead of re-copying the growing
# frame on every iteration.
submission_parts = []
for index in range(0, len(testing_files), TESTING_BATCH_SIZE):
    # A separate name is used so the training `data_frame` is not overwritten.
    batch_frame = pd.DataFrame({'path': testing_files[index:index + TESTING_BATCH_SIZE]})
    # Id = file name up to its first dot, independent of directory depth and
    # of the platform's path separator.
    batch_frame['id'] = batch_frame['path'].map(
        lambda x: os.path.basename(x).split('.')[0])
    images = np.stack([imread(path) for path in batch_frame['path']], axis=0)

    # One batched predict call per chunk rather than one call per image; the
    # 1/255 scaling is unchanged.  The model ends in a single unit, so column
    # 0 holds the prediction for each image.
    predictions = final_model.predict(images / 255.0, batch_size=BATCH_SIZE)
    batch_frame['label'] = predictions[:, 0]
    submission_parts.append(batch_frame[['id', 'label']])

if submission_parts:
    submission = pd.concat(submission_parts)
else:
    # No test files found: still write a well-formed, header-only CSV.
    submission = pd.DataFrame(columns=['id', 'label'])
submission.to_csv(KAGGLE_SUBMISSION_FILE, index=False, header=True)