# Medical Image Analysis, Nicholas Petr, Assignment 3, MSCA 3200091

Resources: 

https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator

https://pyimagesearch.com/2018/12/03/deep-learning-and-medical-image-analysis-with-keras/

https://studymachinelearning.com/keras-imagedatagenerator-with-flow_from_directory/

https://ecode.dev/cnn-for-medical-imaging-using-tensorflow-2/

https://developers.google.com/codelabs/tensorflow-6-largecnns#5

## Load packages and dataset

In [184]:
from tensorflow.keras.applications import VGG16
import os
import config
from imutils import paths
import random
import shutil
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout
import keras_tuner as kt
from tensorflow import keras
from keras_tuner import HyperModel
import tensorflow as tf
from tensorflow.keras import models, layers
from tensorflow.keras.optimizers import Adam
from tensorflow.data import Dataset
from tensorflow.keras.regularizers import l2

## Initialize data pathways

In [2]:
# Root folder of the malaria project; every other location below hangs off it.
base_path = r"C:\Users\Nick's Laptop\Desktop\Health Analyics\medical_image_analysis\malaria"

# Raw images as downloaded.
orig_input_dataset = base_path + r"\cell_images"

# Output folders for the three splits built further down.
train_path = base_path + r"\training"
val_path = base_path + r"\validation"
test_path = base_path + r"\testing"

## Split into train/test datasets, with 10% of training dataset going to validation

In [34]:
# Collect every image path under the raw dataset folder.
# Sort before shuffling: the order in which a directory tree is listed is not
# guaranteed to be stable across machines/filesystems, so seeding the shuffle
# alone does not make the split reproducible.
# NOTE: if split folders from an earlier run already exist on disk, remove them
# before re-running, otherwise files from the old split may remain alongside the
# new one.
imagePaths = sorted(paths.list_images(orig_input_dataset))
random.seed(42)
random.shuffle(imagePaths)

# Fraction of all images used for training (the rest is the test set) ...
train_split = 0.8
# ... and fraction of the training images held out for validation.
val_split = 0.1

In [35]:
# First cut: leading `train_split` share of the shuffled paths is training,
# everything after it is the test set.
cut = int(len(imagePaths) * train_split)
trainPaths, testPaths = imagePaths[:cut], imagePaths[cut:]

# Second cut: the leading `val_split` share of the training paths becomes the
# validation set and is removed from training.
cut = int(len(trainPaths) * val_split)
valPaths, trainPaths = trainPaths[:cut], trainPaths[cut:]

# Report the size of each split: train, validation, test.
print(len(trainPaths), len(valPaths), len(testPaths))

19842 2204 5512


In [40]:
# (split name, image paths belonging to the split, output folder for the split)
datasets = [
    ("training", trainPaths, train_path),
    ("validation", valPaths, val_path),
    ("testing", testPaths, test_path),
]

# Copy every image into <split folder>/<class label>/<filename>.
# The loop variable is deliberately NOT named `imagePaths`: reusing that name
# would overwrite the notebook-level list of all images, so re-running the
# split cell afterwards would silently operate on the last split only.
for (dType, splitPaths, baseOutput) in datasets:

    # show which data split we are creating
    print("[INFO] building '{}' split".format(dType))

    # if the base output directory does not exist, create it
    if not os.path.exists(baseOutput):
        print("[INFO] 'creating {}' directory".format(baseOutput))
        os.makedirs(baseOutput, exist_ok=True)

    # loop over the input image paths of this split
    for inputPath in splitPaths:
        # the filename is the last path component, the class label is the
        # name of the folder that directly contains the image
        labelDir, filename = os.path.split(inputPath)
        label = os.path.basename(labelDir)

        # build the path to the label directory
        labelPath = os.path.join(baseOutput, label)

        # if the label output directory does not exist, create it
        if not os.path.exists(labelPath):
            print("[INFO] 'creating {}' directory".format(labelPath))
            os.makedirs(labelPath, exist_ok=True)

        # construct the path to the destination image and then copy
        # the image itself (copy2 also copies file metadata)
        p = os.path.join(labelPath, filename)
        shutil.copy2(inputPath, p)

[INFO] building 'training' split
[INFO] 'creating C:\Users\Nick's Laptop\Desktop\Health Analyics\medical_image_analysis\malaria\training\Parasitized' directory
[INFO] 'creating C:\Users\Nick's Laptop\Desktop\Health Analyics\medical_image_analysis\malaria\training\Uninfected' directory
[INFO] building 'validation' split
[INFO] 'creating C:\Users\Nick's Laptop\Desktop\Health Analyics\medical_image_analysis\malaria\validation\Parasitized' directory
[INFO] 'creating C:\Users\Nick's Laptop\Desktop\Health Analyics\medical_image_analysis\malaria\validation\Uninfected' directory
[INFO] building 'testing' split
[INFO] 'creating C:\Users\Nick's Laptop\Desktop\Health Analyics\medical_image_analysis\malaria\testing\Parasitized' directory
[INFO] 'creating C:\Users\Nick's Laptop\Desktop\Health Analyics\medical_image_analysis\malaria\testing\Uninfected' directory


## Design your CNN using keras and use pretrained network VGG16 or VGG19

In [27]:
# Pretrained VGG16 with ImageNet weights, loaded without its top (classifier)
# layers so it can be used purely as a feature extractor.
conv_base = VGG16(include_top=False, weights="imagenet")

# Print the layer-by-layer architecture of the convolutional base.
conv_base.summary()

Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None, None, 3)]   0         
                                                                 
 block1_conv1 (Conv2D)       (None, None, None, 64)    1792      
                                                                 
 block1_conv2 (Conv2D)       (None, None, None, 64)    36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, None, None, 64)    0         
                                                                 
 block2_conv1 (Conv2D)       (None, None, None, 128)   73856     
                                                                 
 block2_conv2 (Conv2D)       (None, None, None, 128)   147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, None, None, 128)   0     

## Use Keras ImageDataGenerator to rescale and augment the images

In [24]:
# Random geometric augmentations applied only by the training generator.
augmentation_options = dict(
    rotation_range=20,
    zoom_range=0.05,
    width_shift_range=0.05,
    height_shift_range=0.05,
    shear_range=0.05,
    horizontal_flip=True,
)

# Training generator: rescale pixel values by 1/255 and apply the augmentations.
# NOTE(review): the ImageNet VGG16 weights are normally paired with the
# `preprocess_input` normalisation rather than 1/255 scaling - confirm that
# plain rescaling is intended here.
datagen = ImageDataGenerator(rescale=1 / 255.0, **augmentation_options)

# Validation/test generator: rescaling only, no augmentation.
valGen = ImageDataGenerator(rescale=1 / 255.0)

## Run the processing steps on the images in the train/validation/test directories. The training and validation generators are shuffled; the testing generator is not. The training generator uses the more comprehensive (augmenting) ImageDataGenerator, while the validation and testing generators use the rescale-only generator provided in the assignment

In [25]:
# Settings shared by all three directory iterators: 64x64 RGB images,
# one-hot ("categorical") labels, 32 images per batch.
flow_options = dict(
    target_size=(64, 64),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
)

# Training images go through the augmenting generator and are shuffled.
train_gen = datagen.flow_from_directory(train_path, shuffle=True, **flow_options)

# Validation images are only rescaled; they are shuffled as well.
val_gen = valGen.flow_from_directory(val_path, shuffle=True, **flow_options)

# Test images are only rescaled and kept in directory order (no shuffling).
test_gen = valGen.flow_from_directory(test_path, shuffle=False, **flow_options)

Found 19842 images belonging to 2 classes.
Found 2204 images belonging to 2 classes.
Found 5512 images belonging to 2 classes.


## Extract train, validation, and test features and labels for modeling

In [28]:
# Push every training image through the VGG16 base once and cache the
# resulting feature maps and one-hot labels on disk.
# NOTE(review): because each image is seen exactly once, the random
# augmentation of `datagen` yields a single transformed copy per image.

# Start from the first batch even if the generator was partially consumed before.
train_gen.reset()

# Take the sizes from the generator itself instead of hard-coding them, so the
# cell stays correct if the dataset or the batch size changes.
batch_size = train_gen.batch_size
sample_count = train_gen.samples

feature_batches = []
label_batches = []
i = 0
for inputs_batch, labels_batch in train_gen:
    feature_batches.append(conv_base.predict(inputs_batch))
    label_batches.append(labels_batch)
    i += 1
    if ((i * batch_size % 1000) == 0):
        print("processed size =", i * batch_size)
    # The loop is ended explicitly once one full pass over the data is done.
    if (i * batch_size >= sample_count):
        break

# Stack the per-batch arrays along the sample axis.
features = np.concatenate(feature_batches)
labels = np.concatenate(label_batches)

np.save('train_features', features)
np.save('train_labels', labels)

processed size = 4000
processed size = 8000
processed size = 12000
processed size = 16000


In [29]:
# Push every validation image through the VGG16 base once and cache the
# resulting feature maps and one-hot labels on disk.

# Start from the first batch even if the generator was partially consumed before.
val_gen.reset()

# Take the sizes from the generator itself instead of hard-coding them, so the
# cell stays correct if the dataset or the batch size changes.
batch_size = val_gen.batch_size
sample_count = val_gen.samples

feature_batches = []
label_batches = []
i = 0
for inputs_batch, labels_batch in val_gen:
    feature_batches.append(conv_base.predict(inputs_batch))
    label_batches.append(labels_batch)
    i += 1
    if ((i * batch_size % 1000) == 0):
        print("processed size =", i * batch_size)
    # The loop is ended explicitly once one full pass over the data is done.
    if (i * batch_size >= sample_count):
        break

# Stack the per-batch arrays along the sample axis.
features = np.concatenate(feature_batches)
labels = np.concatenate(label_batches)

np.save('val_features', features)
np.save('val_labels', labels)

In [30]:
# Push every test image through the VGG16 base once and cache the resulting
# feature maps and one-hot labels on disk.

# Start from the first batch even if the generator was partially consumed
# before; test_gen is not shuffled, so this keeps the saved rows in the
# generator's file order.
test_gen.reset()

# Take the sizes from the generator itself instead of hard-coding them, so the
# cell stays correct if the dataset or the batch size changes.
batch_size = test_gen.batch_size
sample_count = test_gen.samples

feature_batches = []
label_batches = []
i = 0
for inputs_batch, labels_batch in test_gen:
    feature_batches.append(conv_base.predict(inputs_batch))
    label_batches.append(labels_batch)
    i += 1
    if ((i * batch_size % 1000) == 0):
        print("processed size =", i * batch_size)
    # The loop is ended explicitly once one full pass over the data is done.
    if (i * batch_size >= sample_count):
        break

# Stack the per-batch arrays along the sample axis.
features = np.concatenate(feature_batches)
labels = np.concatenate(label_batches)

np.save('test_features', features)
np.save('test_labels', labels)

processed size = 4000


## Load the saved features and labels for later use, and check their shapes to ensure accuracy

In [31]:
# Read the cached feature/label arrays back from disk, one pair per split.
train_features, train_labels = np.load('train_features.npy'), np.load('train_labels.npy')
val_features, val_labels = np.load('val_features.npy'), np.load('val_labels.npy')
test_features, test_labels = np.load('test_features.npy'), np.load('test_labels.npy')

In [70]:
# Show the shape of every loaded array so the sample counts can be compared
# against the number of images found in each split.
loaded_arrays = (
    ("train_features", train_features),
    ("train_labels", train_labels),
    ("val_features", val_features),
    ("val_labels", val_labels),
    ("test_features", test_features),
    ("test_labels", test_labels),
)
for array_name, array in loaded_arrays:
    print(array_name + ": ", array.shape)

train_features:  (19842, 2, 2, 512)
train_labels:  (19842, 2)
val_features:  (2204, 2, 2, 512)
val_labels:  (2204, 2)
test_features:  (5512, 2, 2, 512)
test_labels:  (5512, 2)


## Model Tuning: other Keras layers and options were tried (e.g., Dropout, BatchNormalization, GlorotNormal initialization), but these parameters seemed to yield the optimal results

In [195]:
# Establish the model: a small classification head trained on the cached
# VGG16 features. Each input sample is one (2, 2, 512) feature map and the
# output is a single sigmoid probability.
model = tf.keras.models.Sequential([
    # 1x1 convolution reduces the 512 feature channels to 8.
    tf.keras.layers.Conv2D(8, kernel_size=(1, 1), activation='sigmoid', input_shape=(2, 2, 512)),
    # 2x2 pooling collapses the 2x2 spatial grid to a single position.
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation='relu'),
    # The input shape is declared once, on the first layer; repeating it on the
    # output layer had no effect and wrongly suggested this layer receives the
    # raw (2, 2, 512) features.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# Compile the model, indicating loss type, optimizer, and metrics of interest
model.compile(
    loss="binary_crossentropy",
    optimizer=Adam(learning_rate=1e-3),
    metrics=['binary_accuracy'],
)

## Model fitting (`model.fit`) tuning: 15 epochs appears to consistently produce the highest accuracy and lowest test loss

In [196]:
# The labels are one-hot with two columns; column 0 alone is used as the
# binary target for the single sigmoid output.
# NOTE(review): column 0 presumably corresponds to the alphabetically first
# class folder - confirm via train_gen.class_indices.
train_targets = train_labels[:, 0]
val_targets = val_labels[:, 0]

# Train for 15 epochs, reporting validation metrics after each epoch.
history = model.fit(
    train_features,
    train_targets,
    batch_size=32,
    epochs=15,
    validation_data=(val_features, val_targets),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


## Check test accuracy 

In [197]:
# Evaluate on the held-out test features; `score` holds the loss followed by
# the compiled metric (binary accuracy).
score = model.evaluate(test_features, test_labels[:, 0])
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Test loss: 0.1531982272863388
Test accuracy: 0.9433962106704712
