## Importing NIH Dataset (ZIP Format)
Here we download the NIH malaria cell-image dataset (as a zip file) from an FTP server hosted by the NIH National Library of Medicine (NLM) and extract it. 

In [None]:
# Import relevant packages
import numpy as np
import os
from shutil import copyfile
from zipfile import ZipFile

# Download NIH dataset zip file
!wget -nc ftp://lhcftp.nlm.nih.gov/Open-Access-Datasets/Malaria/cell_images.zip

# Extract images if not already extracted
ROOT_DIR = os.path.join("/", "content")
if not os.path.isdir("cell_images"):
    print("Extracting images...")
    with ZipFile(os.path.join("cell_images.zip"), "r") as zipObj:
        zipObj.extractall()
    print("Done!")

## Unzip Images, Resize, and Store in NumPy Arrays
We load 10000 images (5000 per class) from the two extracted class folders. There are 13779 images in each class, plus a "Thumbs.db" file in each folder, which we remove from the file lists. We only use 5000 images from each class and resize each individual image to 128x128 pixels while keeping all 3 color channels (note that `cv2.imread` returns them in BGR order, not RGB), and store them in the NumPy arrays ```Parasitized``` and ```Uninfected```. 

In [None]:
# Install and import relevant packages
import numpy as np
import os
!pip install opencv-python
!apt update && apt install -y libsm6 libxext6 libxrender1
import cv2
from PIL import Image

# Number of images kept per class and the side length (pixels) they are resized to
IMAGES_PER_CLASS = 5000
IMAGE_SIZE = 128

# Create new folders to save rescaled images.
# makedirs(exist_ok=True) creates missing parents and is a no-op when the
# folder already exists, so no separate isdir() checks are needed.
for RescaledFolder in ("RescaledSet/Parasitized", "RescaledSet/Uninfected"):
    os.makedirs(RescaledFolder, exist_ok=True)

# Generate lists of parasitized and uninfected file names
ParasitizedFiles = os.listdir("cell_images/Parasitized/")
UninfectedFiles = os.listdir("cell_images/Uninfected/")

# Remove Thumbs.db files (they are not images)
ParasitizedFiles = [name for name in ParasitizedFiles if name != 'Thumbs.db']
UninfectedFiles = [name for name in UninfectedFiles if name != 'Thumbs.db']


def _load_resized_images(folder, file_names, count, size):
    """Read the first ``count`` files of ``file_names`` from ``folder`` and
    return them resized to ``size`` x ``size`` in one array.

    Parameters:
        folder: directory that contains the image files.
        file_names: list of file names inside ``folder``.
        count: number of images to load.
        size: width and height of every resized image.

    Returns:
        Array of shape (count, size, size, 3) using NumPy's default float dtype.

    Raises:
        ValueError: if ``file_names`` holds fewer than ``count`` entries.
        IOError: if a file cannot be decoded as an image.
    """
    if len(file_names) < count:
        raise ValueError(
            "Expected at least %d images in %s but found %d"
            % (count, folder, len(file_names))
        )

    # Pre-allocate memory space for images
    images = np.empty([count, size, size, 3])
    for position, file_name in enumerate(file_names[:count]):
        path = folder + file_name
        image = cv2.imread(path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cv2.resize.
        if image is None:
            raise IOError("Could not read image file: " + path)
        images[position, :, :, :] = cv2.resize(image, dsize=(size, size))
    return images


# Resize and load parasitized images
Parasitized = _load_resized_images(
    'cell_images/Parasitized/', ParasitizedFiles, IMAGES_PER_CLASS, IMAGE_SIZE
)

# Resize and load uninfected images
Uninfected = _load_resized_images(
    'cell_images/Uninfected/', UninfectedFiles, IMAGES_PER_CLASS, IMAGE_SIZE
)

print('Uninfected Dataset size is:',np.shape(Uninfected))
print('Parasitized Dataset size is:',np.shape(Parasitized))

## Generate Cross-Validation Indices for Training and Testing Sets
Here we randomly generate five cross-validation group indices to access the images in the dataset.

In [None]:
# One-hot labels, 5000 rows per class: parasitized -> [0, 1], uninfected -> [1, 0]
ParasitizedLabels = np.tile([0, 1], (5000, 1))
UninfectedLabels = np.tile([1, 0], (5000, 1))
Labels = np.vstack((ParasitizedLabels, UninfectedLabels))

# Stack the image arrays in the same class order as the labels
Dataset = np.vstack((Parasitized, Uninfected))

# Shuffle all sample positions once, then cut the shuffled positions into the
# five cross-validation groups: four slices of 2000 and the remainder from 8000 on
CVIndices = np.random.permutation(len(Dataset))
Index1, Index2, Index3, Index4 = (
    CVIndices[start:start + 2000] for start in (0, 2000, 4000, 6000)
)
Index5 = CVIndices[8000:]

## Create List of Classification Layer Hyperparameters
Here we use the list variable ```BatchSize``` to specify the training batch sizes we wish to test.

In [None]:
# Import relevant neural network architecture packages 
from keras.applications.vgg16 import VGG16

# Batch sizes to evaluate: the powers of two from 2**2 (4) through 2**8 (256)
BatchSize = [2 ** exponent for exponent in range(2, 9)]

## Train Model and Save Results as CSV Files
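Now we train and evaluate the model with 5-fold cross-validation once for every batch size in ```BatchSize``` (both dense layers are fixed at 1024 nodes), and save the per-fold loss, accuracy, and ROC data as CSV files.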

In [None]:
# Import relevant packages for neural network training
import sys
import csv
if 'tensorflow' in sys.modules == False:
    %tensorflow_version 2.x
    import tensorflow as tf
import keras
from keras import applications
from keras.preprocessing.image import ImageDataGenerator
from keras import optimizers
from keras.models import Sequential, Model
from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D, BatchNormalization
from keras import backend as k 
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping

!pip install scikit-learn
import sklearn
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

def _save_rows_as_csv(path, rows):
    """Write every sequence in ``rows`` as one line of the CSV file ``path``."""
    # newline="" is what the csv module documents for files passed to
    # csv.writer; without it, extra blank rows appear on platforms that
    # translate "\n" on write.
    with open(path, "w", newline="") as f:
        csv.writer(f).writerows(rows)


# Cross-validation groups in a fixed order. Run i holds out
# Folds[NumFolds - 1 - i] for testing (run 0 -> Index5, ..., run 4 -> Index1)
# and trains on the remaining groups, kept in their original order.
Folds = [Index1, Index2, Index3, Index4, Index5]
NumFolds = len(Folds)

# The loop variable is deliberately not named `k`: that name is the alias of
# the Keras backend imported above and must not be overwritten.
for batch_size in BatchSize:
    # Create empty lists to store results (one entry per cross-validation run)
    TrainLoss = []
    TrainAcc = []
    TestLoss = []
    TestAcc = []
    All_FPR = []
    All_TPR = []
    All_thresholds = []
    All_AUC = []

    for i in range(NumFolds):

        # Create the appropriate training and testing sets
        held_out = NumFolds - 1 - i
        train_idx = np.concatenate(
            [fold for position, fold in enumerate(Folds) if position != held_out]
        )
        test_idx = Folds[held_out]
        TrainImages, TrainLabels = Dataset[train_idx], Labels[train_idx]
        TestImages, TestLabels = Dataset[test_idx], Labels[test_idx]

        # A fresh model is built for every run; drop the global Keras state
        # left by the previous one so memory does not grow across the
        # batch-size x fold loop.
        keras.backend.clear_session()

        # VGG16 convolutional base with ImageNet weights, without its
        # original classifier head
        base_model = VGG16(weights = "imagenet", include_top=False, input_shape = (128,128,3))

        # Classification head: two 1024-unit ReLU layers, each followed by
        # 50% dropout, then a 2-way softmax matching the one-hot labels
        x = base_model.output
        x = Flatten()(x)
        x = Dense(1024, activation='relu')(x)
        x = Dropout(0.5)(x)
        x = Dense(1024, activation='relu')(x)
        x = Dropout(0.5)(x)
        predictions = Dense(2, activation="softmax")(x)
        # `inputs`/`outputs` and `learning_rate` are the current keyword
        # names; the legacy `input`/`output`/`lr` spellings are deprecated.
        model = Model(inputs = base_model.input, outputs = predictions)
        adam = optimizers.Adam(learning_rate=0.000001, beta_1=0.9, beta_2=0.999, amsgrad=False)
        model.compile(loss = "categorical_crossentropy", optimizer = adam, metrics=["accuracy"])

        # Train model and evaluate performance on the held-out group after every epoch
        print('We are now training cross-validation set #',i+1)
        Results = model.fit(TrainImages, TrainLabels, epochs=40, batch_size=batch_size, validation_data=(TestImages,TestLabels), validation_freq=1)

        # Display and store performance results (rounded to 4 decimals)
        for metric in ('loss', 'accuracy', 'val_loss', 'val_accuracy'):
            Results.history[metric] = [round(value, 4) for value in Results.history[metric]]

        print('Training Loss:',Results.history['loss'])
        print('Training Accuracy:',Results.history['accuracy'])
        print('Validation Loss:',Results.history['val_loss'])
        print('Validation Accuracy:',Results.history['val_accuracy'])

        TrainLoss.append(Results.history['loss'])
        TrainAcc.append(Results.history['accuracy'])
        TestLoss.append(Results.history['val_loss'])
        TestAcc.append(Results.history['val_accuracy'])
        print('')


        # Predict values for test set
        Probabilities = model.predict(TestImages)

        # Calculate data for ROC curve, using label column 1 (set to 1 for
        # parasitized samples) as the positive class
        FPR, TPR, thresholds = roc_curve(TestLabels[:,1], Probabilities[:,1])
        All_FPR.append(FPR)
        All_TPR.append(TPR)
        All_thresholds.append(thresholds)
        # Area under this run's ROC curve (previously All_AUC was never filled)
        All_AUC.append(float(auc(FPR, TPR)))

    # Save and export as CSV files, prefixed with the batch size;
    # each file has one row per cross-validation run
    _save_rows_as_csv(str(batch_size)+"_TrainLoss.csv", TrainLoss)
    _save_rows_as_csv(str(batch_size)+"_TrainAcc.csv", TrainAcc)
    _save_rows_as_csv(str(batch_size)+"_TestLoss.csv", TestLoss)
    _save_rows_as_csv(str(batch_size)+"_TestAcc.csv", TestAcc)
    _save_rows_as_csv(str(batch_size)+"_FPR.csv", All_FPR)
    _save_rows_as_csv(str(batch_size)+"_TPR.csv", All_TPR)
    _save_rows_as_csv(str(batch_size)+"_Thresholds.csv", All_thresholds)
    # Single row holding the AUC of every run, in run order
    _save_rows_as_csv(str(batch_size)+"_AUC.csv", [All_AUC])

## Examine RAM Requirements

In [None]:
# Examine RAM Usage
import sys

# Names that IPython places in the namespace, plus this list itself; they are
# excluded from the report. (This list was previously never defined, so the
# expression below raised NameError.)
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# (name, size in bytes as reported by sys.getsizeof) for every public global
# that is neither an imported module nor an IPython name, largest first.
# Left as a bare expression so the notebook displays the result.
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)