In [20]:
import cv2
import numpy as np
import pandas as pd
import os
import sys
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
import keras_tuner as kt
from sklearn.metrics import classification_report
import shutil
from os import path
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing import image_dataset_from_directory
from keras.preprocessing.image import ImageDataGenerator

# Project root: the data and model directories used below are all built with
# os.path.join(root, ...), i.e. relative to the kernel's working directory.
root = os.getcwd()

### Load dataframe

In [21]:
# CSV tables for the main and the extra dataset, read via relative paths
# (presumably the per-image label tables, judging by the file names — TODO confirm).
main_data = pd.read_csv("data_labels_mainData.csv")
extra_data = pd.read_csv("data_labels_extraData.csv")

### Get data from the split folders

In [22]:
# Load every image of the requested classes from the un-split "multiclass_task" folder
def celltype_classify_data_all(class_list):
    """
        Load all images of the given classes from <root>/multiclass_task/<class>.

        Parameters:
            class_list: sequence of class folder names; the position of a name
                        in the list becomes the integer label of its images.

        Returns:
            (images, labels): two parallel lists. `images` holds each image as
            decoded by cv2.imread with its values divided by 255, `labels` holds
            the matching class index.

        Files that cv2 cannot decode are skipped (with a printed notice)
        instead of crashing the whole load.
    """
    images = list()
    labels = list()

    for i, label in enumerate(class_list):
        # get image directory
        img_dir = os.path.join(root, "multiclass_task", f"{label}")

        # os.listdir order is arbitrary; sorting keeps the sample order reproducible
        for file_name in sorted(os.listdir(img_dir)):
            file_path = os.path.join(img_dir, file_name)
            img = cv2.imread(file_path)
            # cv2.imread returns None (it does not raise) for non-image or
            # unreadable files such as hidden OS metadata files
            if img is None:
                print(f"Skipping unreadable file: {file_path}")
                continue
            # scale pixel values by 1/255 (no resizing happens here)
            images.append(img / 255)
            labels.append(i)

    return (images, labels)

# The four cell-type classes; the loaders use a name's list position as its integer label.
all_class = ["epithelial", "fibroblast", "inflammatory", "others"]

# Load the images of one split ("train" / "val" / "test") from the "split3-multi-task" folder
def celltype_classify_data(class_list, mode):
    """
        Load all images of the given classes from
        <root>/split3-multi-task/<mode>/<class>.

        Parameters:
            class_list: sequence of class folder names; the position of a name
                        in the list becomes the integer label of its images.
            mode:       name of the split sub-folder to read from.

        Returns:
            (images, labels): two parallel lists. `images` holds each image as
            decoded by cv2.imread with its values divided by 255, `labels` holds
            the matching class index.

        Files that cv2 cannot decode are skipped (with a printed notice)
        instead of crashing the whole load.
    """
    images = list()
    labels = list()

    for i, label in enumerate(class_list):
        # get image directory
        img_dir = os.path.join(root, "split3-multi-task", f"{mode}", f"{label}")

        # os.listdir order is arbitrary; sorting keeps the sample order reproducible
        for file_name in sorted(os.listdir(img_dir)):
            file_path = os.path.join(img_dir, file_name)
            img = cv2.imread(file_path)
            # cv2.imread returns None (it does not raise) for non-image or
            # unreadable files such as hidden OS metadata files
            if img is None:
                print(f"Skipping unreadable file: {file_path}")
                continue
            # scale pixel values by 1/255 (no resizing happens here)
            images.append(img / 255)
            labels.append(i)

    return (images, labels)

### Classification report method

In [23]:
def multiclass_classification_report(y_test, prediction, print_out=True):
    """
        Build an sklearn classification report from per-class score vectors.

        Every entry of `prediction` is reduced to the index of its largest
        value and compared against `y_test`. The text report is printed when
        `print_out` is true; the dict form of the report is always returned.
    """
    # index of the highest score in each prediction = predicted class
    predicted_classes = np.array([np.argmax(scores) for scores in prediction])

    if print_out:
        print(classification_report(y_test, predicted_classes))

    return classification_report(y_test, predicted_classes, output_dict=True)

### Structure of sub-models

In [24]:
def get_multiclass_model():
    """
        Build and compile the CNN that classifies 4 classes.

        Two Conv2D(64, 3x3) + MaxPool2D(2x2, stride 1) stages on a 27x27x3
        input, then Dense 256 -> 128 -> 32 and a 4-way softmax output.
        Compiled with Adam, sparse categorical cross-entropy and accuracy.
    """
    layers = tf.keras.layers

    model = tf.keras.Sequential([
        # first convolution / pooling stage
        layers.Conv2D(64, (3, 3), strides=1, activation="relu", input_shape=[27, 27, 3]),
        layers.MaxPool2D(pool_size=(2, 2), strides=1),
        # second convolution / pooling stage
        layers.Conv2D(64, (3, 3), strides=1, activation="relu"),
        layers.MaxPool2D(pool_size=(2, 2), strides=1),
        # fully connected head
        layers.Flatten(),
        layers.Dense(256, activation="relu"),
        layers.Dense(128, activation="relu"),
        layers.Dense(32, activation="relu"),
        # one probability per class
        layers.Dense(4, activation="softmax"),
    ])

    model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model

def get_subclass_model():
    """
        Build and compile the CNN that classifies 3 classes.

        One Conv2D(64, 3x3) + MaxPool2D(2x2) stage on a 27x27x3 input, then
        Dense 256 -> 128 -> 32 and a 3-way softmax output.
        Compiled with Adam, sparse categorical cross-entropy and accuracy.
    """
    layers = tf.keras.layers

    model = tf.keras.Sequential([
        # single convolution / pooling stage
        layers.Conv2D(64, (3, 3), activation="relu", input_shape=[27, 27, 3]),
        layers.MaxPool2D(pool_size=(2, 2)),
        # fully connected head
        layers.Flatten(),
        layers.Dense(256, activation="relu"),
        layers.Dense(128, activation="relu"),
        layers.Dense(32, activation="relu"),
        # one probability per class
        layers.Dense(3, activation="softmax"),
    ])

    model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model

### Methods to train and save submodels

In [25]:
# Directory <root>/multiclass_submodels. NOTE(review): not referenced by any of the
# visible cells (they use `extramodel_path`) — confirm whether it is still needed.
submodel_path = os.path.join(root, "multiclass_submodels")   
def three_class_submodel(file_path, aug_train = None, aug_val = None):
    """
        Return the 3-class sub-model, loading it from disk or training it.

        If <file_path>/subclass.h5 exists it is loaded and returned. Otherwise
        a fresh model from get_subclass_model() is trained, saved to that path
        and returned.

        Parameters:
            file_path: directory that holds (or will receive) subclass.h5;
                       created if it does not exist yet.
            aug_train: training data passed straight to `fit` (optional).
            aug_val:   validation data passed straight to `fit` (optional).

        When both `aug_train` and `aug_val` are given the model is trained on
        them for 50 epochs; otherwise it is trained for 40 epochs on the
        notebook-level arrays subx_train / suby_train, validated on
        subx_test / suby_test.
    """
    # path to model trained with 3 classes
    subclass_path = path.join(file_path, "subclass.h5")

    # load_model raises on a broken file rather than returning None, so the
    # file's existence is the only condition to check; this also avoids
    # loading the model twice
    if os.path.isfile(subclass_path):
        subclass_model = keras.models.load_model(subclass_path)
        print("Model trained with 3 classes loaded")
        return subclass_model

    print("---Sub model training---")
    # train subclass that fit with 3 types of images
    subclass_model = get_subclass_model()

    if aug_train is not None and aug_val is not None:
        print("Train with augmented data")
        subclass_model.fit(aug_train, epochs=50, validation_data=aug_val)
    else:
        print("Train with non-augmented data")
        subclass_model.fit(subx_train, suby_train, epochs=40, validation_data=(subx_test, suby_test))

    # saving an .h5 file fails if the target directory is missing
    if file_path:
        os.makedirs(file_path, exist_ok=True)
    subclass_model.save(subclass_path)
    return subclass_model
    
def all_class_submodel(file_path, aug_train = None, aug_val = None):
    """
        Return the 4-class sub-model, loading it from disk or training it.

        If <file_path>/allclass.h5 exists it is loaded and returned. Otherwise
        a fresh model from get_multiclass_model() is trained, saved to that
        path and returned.

        Parameters:
            file_path: directory that holds (or will receive) allclass.h5;
                       created if it does not exist yet.
            aug_train: training data passed straight to `fit` (optional).
            aug_val:   validation data passed straight to `fit` (optional).

        When both `aug_train` and `aug_val` are given the model is trained on
        them for 50 epochs; otherwise it is trained for 15 epochs on the
        notebook-level arrays x_train / y_train, validated on x_val / y_val.
    """
    # path to model trained with 4 classes
    allclass_path = path.join(file_path, "allclass.h5")

    # load_model raises on a broken file rather than returning None, so the
    # file's existence is the only condition to check; this also avoids
    # loading the model twice
    if os.path.isfile(allclass_path):
        allclass_model = keras.models.load_model(allclass_path)
        print("Model trained with 4 classes loaded")
        return allclass_model

    print("---Sub model training---")
    # train subclass that fit with 4 types of images
    allclass_model = get_multiclass_model()

    if aug_train is not None and aug_val is not None:
        print("Train with augmented data")
        allclass_model.fit(aug_train, epochs=50, validation_data=aug_val)
    else:
        print("Train with non-augmented data")
        allclass_model.fit(x_train, y_train, epochs=15, validation_data=(x_val, y_val))

    # saving an .h5 file fails if the target directory is missing
    if file_path:
        os.makedirs(file_path, exist_ok=True)
    allclass_model.save(allclass_path)
    return allclass_model

### Meta-learner

In [26]:
def get_transfer_model():
    """
        Build and compile the meta-learner: a dense network ending in a
        4-way softmax, compiled with Adam, sparse categorical cross-entropy
        and accuracy.

        NOTE(review): the 128-unit hidden layer uses softmax and the 3-unit
        layer uses sigmoid, which is unusual for hidden layers — confirm this
        is intentional before changing it.
    """
    layers = tf.keras.layers

    model = tf.keras.Sequential([
        # flatten whatever shape the stacked sub-model outputs arrive in
        layers.Flatten(),
        # hidden layers
        layers.Dense(256, activation="relu"),
        layers.Dense(128, activation="softmax"),
        layers.Dense(32, activation="relu"),
        layers.Dense(3, activation="sigmoid"),
        # one probability per class
        layers.Dense(4, activation="softmax"),
    ])

    model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model

### Get augmented data for training

In [47]:
# ---- Un-split image folders: every sample of each class ----
all_class = ["epithelial", "fibroblast", "inflammatory", "others"]
cell_img, cell_label = (np.array(part) for part in celltype_classify_data_all(all_class))

sub_multiclass = ["epithelial", "fibroblast", "inflammatory"]
sub_img, sub_label = celltype_classify_data_all(sub_multiclass)

# An earlier version drew the train/val/test partitions at random with
# train_test_split; the partitions now come from the pre-split folders below.

# ---- Pre-split folders, 4 classes ----
x_train, y_train = (np.array(part) for part in celltype_classify_data(all_class, "train"))
x_val, y_val = (np.array(part) for part in celltype_classify_data(all_class, "val"))
x_test, y_test = (np.array(part) for part in celltype_classify_data(all_class, "test"))

# ---- Pre-split folders, 3 classes (note: subx_test / suby_test hold the "val" split) ----
subx_train, suby_train = (np.array(part) for part in celltype_classify_data(sub_multiclass, "train"))
subx_test, suby_test = (np.array(part) for part in celltype_classify_data(sub_multiclass, "val"))

# ---- Augmenters ----
# combined rotation + flips, used for training and for single-pass prediction
extra_datagen = ImageDataGenerator(rotation_range=60, horizontal_flip=True, vertical_flip=True)
# fit augmentation with all images from all 4 classes
extra_datagen.fit(cell_img)

# one augmenter per transformation family, used when averaging predictions
rotation_aug = ImageDataGenerator(rotation_range=60)
flip_aug = ImageDataGenerator(horizontal_flip=True, vertical_flip=True)
aug_list = [rotation_aug, flip_aug]

# fit every augmenter in aug_list with the same images
for aug in aug_list:
    aug.fit(cell_img)

### Helper function 

In [100]:
def get_extra_stacked(x, submodels, datagen):
    """
        Generate the stacked sub-model outputs for the meta learner.

        `x` is wrapped in `datagen.flow(x, shuffle=False)` so the samples keep
        their order, every sub-model predicts on that flow, and the
        predictions are concatenated along axis 1.

        Parameters:
            x:         samples to predict on.
            submodels: iterable of models exposing `predict`.
            datagen:   object exposing `flow(x, shuffle=False)`.

        Returns:
            The concatenated predictions, or None when `submodels` is empty.
    """
    # augment x before predicting with submodels
    pred_gen = datagen.flow(x, shuffle=False)

    # Model.predict accepts generators directly; predict_generator is
    # deprecated and no longer available in recent Keras releases
    outputs = [submodel.predict(pred_gen) for submodel in submodels]
    if not outputs:
        return None
    return np.concatenate(outputs, axis=1)

def get_meta_prediction(x, sub_models, meta_learner, datagen=None):
    """
        Get the meta model prediction for `x`.

        Parameters:
            x:            samples to predict on.
            sub_models:   sub-models whose stacked outputs feed the meta learner.
            meta_learner: model exposing `predict`, applied to the stacked outputs.
            datagen:      augmenter handed to get_extra_stacked; defaults to the
                          notebook-level `extra_datagen`.

        Returns:
            Whatever `meta_learner.predict` returns for the stacked input.
    """
    if datagen is None:
        datagen = extra_datagen
    # use the sub-models that were passed in, not a notebook-level list
    stacked_x = get_extra_stacked(x, sub_models, datagen)
    pred = meta_learner.predict(stacked_x)
    return pred

def avg_pred(x, aug_list, sub_models, meta_learner):
    """
        Average the meta model prediction over several augmentations of x.

        For every augmenter in `aug_list` the sub-model outputs are stacked
        with get_extra_stacked and passed to `meta_learner.predict`; the
        predictions are summed and divided by the number of augmenters.
    """
    running_total = None

    for augmenter in aug_list:
        # stacked sub-model output for this augmentation, then meta prediction
        meta_input = get_extra_stacked(x, sub_models, augmenter)
        current = meta_learner.predict(meta_input)

        # first prediction seeds the sum, later ones are added to it
        running_total = current if running_total is None else np.add(running_total, current)

    # sum -> mean
    return np.true_divide(running_total, len(aug_list))

### Train meta model with augmented data

### 

In [98]:
# folder where the sub-models trained on augmented data are stored
extramodel_path = os.path.join(root, "extra_submodels")

# augmented streams for the 3-class sub-model
sub_train = extra_datagen.flow(subx_train, suby_train)
sub_val = extra_datagen.flow(subx_test, suby_test)

# augmented streams for the 4-class sub-model
all_train = extra_datagen.flow(x_train, y_train)
all_val = extra_datagen.flow(x_val, y_val)

# load (or train) both sub-models; list order fixes the column order of the stacked output
extra_submodels = [
    three_class_submodel(extramodel_path, aug_train=sub_train, aug_val=sub_val),
    all_class_submodel(extramodel_path, aug_train=all_train, aug_val=all_val),
]
subclass_extra, allclass_extra = extra_submodels

# stacked sub-model outputs are the meta-learner's inputs (train split first, then val)
stacked_x_train, stacked_x_val = (
    get_extra_stacked(split, extra_submodels, extra_datagen)
    for split in (x_train, x_val)
)

# fit the meta-learner on the stacked outputs
extra_model = get_transfer_model()
extra_model.fit(
    stacked_x_train,
    y_train,
    epochs=30,
    validation_data=(stacked_x_val, y_val),
)

Model trained with 3 classes loaded
Model trained with 4 classes loaded


2022-05-14 10:24:13.030737: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-14 10:24:13.984174: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 1/30


2022-05-14 10:24:15.654059: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-05-14 10:24:17.433570: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x2bb8f1e80>

### Average prediction of multiple augmentations vs single output

In [102]:
# averaged over the augmenters in aug_list
avg = avg_pred(x_test, aug_list, extra_submodels, extra_model)
# single pass through the meta learner
meta_pred = get_meta_prediction(x_test, extra_submodels, extra_model)

# print one titled report per prediction, in the order computed above
for title, pred in (
    ("--- Stats for augmentations average prediction ---", avg),
    ("--- Stats for one output prediction ---", meta_pred),
):
    print(title)
    multiclass_classification_report(y_test, pred)

--- Stats for augmentations average prediction ---
              precision    recall  f1-score   support

           0       0.91      0.90      0.90       817
           1       0.75      0.78      0.76       379
           2       0.73      0.83      0.78       510
           3       0.70      0.50      0.59       278

    accuracy                           0.80      1984
   macro avg       0.77      0.75      0.76      1984
weighted avg       0.80      0.80      0.80      1984

--- Stats for one output prediction ---
              precision    recall  f1-score   support

           0       0.90      0.91      0.90       817
           1       0.71      0.76      0.74       379
           2       0.71      0.81      0.76       510
           3       0.64      0.41      0.50       278

    accuracy                           0.78      1984
   macro avg       0.74      0.72      0.73      1984
weighted avg       0.78      0.78      0.78      1984



### Get extra dataset

In [103]:
def load_extra_epi():
    """
        Load the cancerous/epithelial images of the extra dataset from
        <root>/extra/1.

        Returns:
            (images, labels): two parallel lists. `images` holds each image as
            decoded by cv2.imread with its values divided by 255; every label
            is 0.

        Files that cv2 cannot decode are skipped (with a printed notice)
        instead of crashing the whole load.
    """
    images = list()
    labels = list()

    # get image directory
    img_dir = os.path.join(root, "extra", "1")

    # os.listdir order is arbitrary; sorting keeps the sample order reproducible
    for file_name in sorted(os.listdir(img_dir)):
        file_path = os.path.join(img_dir, file_name)
        img = cv2.imread(file_path)
        # cv2.imread returns None (it does not raise) for non-image or
        # unreadable files such as hidden OS metadata files
        if img is None:
            print(f"Skipping unreadable file: {file_path}")
            continue
        # scale pixel values by 1/255 (no resizing happens here)
        images.append(img / 255)
        labels.append(0)

    return (images, labels)

def load_unlabeled():
    """
        Load the unlabeled images of the extra dataset from <root>/extra/0.

        Returns:
            A list holding each image as decoded by cv2.imread with its
            values divided by 255.

        Files that cv2 cannot decode are skipped (with a printed notice)
        instead of crashing the whole load.
    """
    images = list()

    # get image directory
    img_dir = os.path.join(root, "extra", "0")

    # os.listdir order is arbitrary; sorting keeps the sample order reproducible
    for file_name in sorted(os.listdir(img_dir)):
        file_path = os.path.join(img_dir, file_name)
        img = cv2.imread(file_path)
        # cv2.imread returns None (it does not raise) for non-image or
        # unreadable files such as hidden OS metadata files
        if img is None:
            print(f"Skipping unreadable file: {file_path}")
            continue
        # scale pixel values by 1/255 (no resizing happens here)
        images.append(img / 255)

    return images

### Load extra data

In [104]:
# labelled epithelial images of the extra dataset, as arrays
epi_train, epi_label = (np.array(part) for part in load_extra_epi())

# unlabeled images of the extra dataset, as one array
unlabeled = np.array(load_unlabeled())

# checkpoint the current meta-learner; later cells reload it from this file
extra_model.save(path.join(root, "extra_model.h5"))

# sanity check of the loaded sizes
print(epi_label.shape)
print(unlabeled.shape)

(2990,)
(7394, 27, 27, 3)


### Generate additional train data from extra dataset

In [105]:
def encode_labels(pred_labels):
    """
        Turn per-class score vectors into integer class labels.

        Parameters:
            pred_labels: iterable of score vectors, one per sample.

        Returns:
            np.ndarray with, for every sample, the index of its largest score.
    """
    # read from the argument itself so the result never depends on a
    # notebook-level variable that happens to share the data
    return np.array([np.argmax(pred) for pred in pred_labels])

# Pseudo-label the unlabeled images in two steps:
# 1) meta-learner predictions averaged over the augmenters in aug_list,
# 2) those predictions collapsed to integer class ids by encode_labels.
pseudo_labels = avg_pred(unlabeled, aug_list, extra_submodels, extra_model)
pseudo_labels = encode_labels(pseudo_labels)

2022-05-14 10:31:34.525718: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


### Filter out epithelial to improve class imbalance

In [106]:
# positions of the samples pseudo-labelled as class 0 (epithelial)
epi_indices = np.flatnonzero(pseudo_labels == 0)

# keep everything that is not class 0; the mask is built before either array is overwritten
non_epi = pseudo_labels != 0
unlabeled = unlabeled[non_epi]
pseudo_labels = pseudo_labels[non_epi]

# sanity check: both arrays must have the same length
print(pseudo_labels.shape)
print(unlabeled.shape)

(4566,)
(4566, 27, 27, 3)


### Initial performance

In [107]:
# Baseline before any pseudo-label training: accuracy of the single-pass
# meta prediction on the test split (y_test / meta_pred), not on training data.
og_performance = multiclass_classification_report(y_test,meta_pred, print_out=False)["accuracy"]
print(f"Initial test accuracy {og_performance}")

Initial training accuracy 0.7842741935483871


### Train extra data in batches

In [114]:
import warnings
# mute warning
warnings.filterwarnings('ignore')

batches = 5
# a batch is only kept when it raises accuracy by more than this margin
min_improvement = 0.001
# single checkpoint file used for both saving and rolling back
checkpoint_path = path.join(root, "extra_model.h5")

extra_x_batch = np.array_split(unlabeled, batches)
extra_y_batch = np.array_split(pseudo_labels, batches)

# load extra model from last save point
extra_model = keras.models.load_model(checkpoint_path)

for i in range(batches):
    # Accuracy of the current checkpoint. The keep/rollback decision is made on
    # the validation split: deciding on the test split would leak test
    # information into the model and bias the final test metrics.
    old_pred = get_meta_prediction(x_val, extra_submodels, extra_model)
    old_accuracy = multiclass_classification_report(y_val, old_pred, print_out=False)["accuracy"]

    # generate stacked x for train
    stacked_batched = get_extra_stacked(extra_x_batch[i], extra_submodels, extra_datagen)
    # fit with augmented batch
    extra_model.fit(stacked_batched, extra_y_batch[i], epochs=10, validation_data=(stacked_x_val, y_val), verbose=0)

    # accuracy after training on this batch, on the same validation split
    new_pred = get_meta_prediction(x_val, extra_submodels, extra_model)
    new_accuracy = multiclass_classification_report(y_val, new_pred, print_out=False)["accuracy"]

    # save model if the newly trained batch increases accuracy by more than min_improvement
    if (new_accuracy - old_accuracy) > min_improvement:
        print("Update model with newly trained data")
        extra_model.save(checkpoint_path)
    else:
        print("Reload from most recent save")
        # roll back to the last checkpoint if the batch did not help
        extra_model = keras.models.load_model(checkpoint_path)

2022-05-14 10:42:34.485116: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-14 10:42:35.008060: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-14 10:42:35.703853: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Reload from most recent save


2022-05-14 10:42:40.636863: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-14 10:42:41.145164: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-14 10:42:41.796393: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Reload from most recent save


2022-05-14 10:42:46.623279: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-14 10:42:47.117992: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-14 10:42:47.795638: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Reload from most recent save


2022-05-14 10:42:52.708497: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-14 10:42:53.208499: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-14 10:42:53.886026: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Reload from most recent save


2022-05-14 10:42:58.857608: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-14 10:43:02.818326: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-05-14 10:43:03.621991: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Reload from most recent save


In [116]:
# predictions of the model after the extra training: single pass first,
# then the average over the augmenters in aug_list
updated_pred = get_meta_prediction(x_test, extra_submodels, extra_model)
updated_avg_pred = avg_pred(x_test, aug_list, extra_submodels, extra_model)

# each report call prints the text table and returns the dict the accuracy is read from
print("--- Stats for one output prediction ---")
updated_acc = multiclass_classification_report(y_test, updated_pred)["accuracy"]
print(f"Accuracy post extra training {updated_acc}")

print("--- Stats for augmentations average prediction ---")
updated_avg_acc = multiclass_classification_report(y_test, updated_avg_pred)["accuracy"]
print(f"Accuracy post extra training {updated_avg_acc}")

--- Stats for one output prediction ---
              precision    recall  f1-score   support

           0       0.90      0.89      0.90       817
           1       0.73      0.79      0.76       379
           2       0.72      0.82      0.76       510
           3       0.69      0.46      0.56       278

    accuracy                           0.79      1984
   macro avg       0.76      0.74      0.74      1984
weighted avg       0.79      0.79      0.79      1984

Accuracy post extra training 0.7938508064516129
--- Stats for augmentations average prediction ---
              precision    recall  f1-score   support

           0       0.91      0.89      0.90       817
           1       0.74      0.79      0.76       379
           2       0.73      0.83      0.78       510
           3       0.68      0.48      0.56       278

    accuracy                           0.80      1984
   macro avg       0.76      0.75      0.75      1984
weighted avg       0.80      0.80      0.80   