## Importing NIH Dataset (ZIP Format)

In [2]:
# Import relevant packages
import numpy as np
import os
from shutil import copyfile
from zipfile import ZipFile

# Download NIH dataset zip file
!wget -nc ftp://lhcftp.nlm.nih.gov/Open-Access-Datasets/Malaria/cell_images.zip

# Extract images if not already extracted
ROOT_DIR = os.path.join("/", "content")
if not os.path.isdir("cell_images"):
    print("Extracting images...")
    with ZipFile(os.path.join("cell_images.zip"), "r") as zipObj:
        zipObj.extractall()
    print("Done!")

File ‘cell_images.zip’ already there; not retrieving.


## Load Extracted Images, Resize, and Store in NumPy Arrays

In [4]:
# Install and import relevant packages
import numpy as np
import os
!pip install opencv-python
!apt update && apt install -y libsm6 libxext6 libxrender1
import cv2
from PIL import Image

# Edge length (pixels) of the square images stored in the arrays below
IMAGE_SIZE = 128

# Create folders for rescaled images; makedirs also creates the parent folder
# and does nothing when the folder already exists
for class_name in ("Parasitized", "Uninfected"):
    os.makedirs(os.path.join("RescaledSet", class_name), exist_ok=True)

# Generate lists of parasitized and uninfected file names, skipping any
# Thumbs.db entry (not an image file)
ParasitizedFiles = [name for name in os.listdir("cell_images/Parasitized/") if name != 'Thumbs.db']
UninfectedFiles = [name for name in os.listdir("cell_images/Uninfected/") if name != 'Thumbs.db']


def load_resized_images(folder, file_names, size=IMAGE_SIZE):
    """Read, resize and stack the listed image files of one class.

    Parameters
    ----------
    folder : str
        Directory that contains the image files.
    file_names : list of str
        File names (relative to `folder`) to load, in output order.
    size : int
        Edge length of the square output images.

    Returns
    -------
    np.ndarray
        Array of shape (len(file_names), size, size, 3). float32 is used
        instead of NumPy's float64 default: it halves the memory footprint
        and represents the 0-255 pixel values exactly.

    Raises
    ------
    ValueError
        If a file cannot be decoded by cv2.imread (which returns None
        rather than raising).
    """
    # Size the buffer from the actual file count instead of a hard-coded number
    images = np.empty([len(file_names), size, size, 3], dtype=np.float32)
    for row, file_name in enumerate(file_names):
        path = os.path.join(folder, file_name)
        image = cv2.imread(path)
        if image is None:
            raise ValueError("Could not read image file: " + path)
        images[row] = cv2.resize(image, dsize=(size, size))
    return images


# Resize and load parasitized and uninfected images
Parasitized = load_resized_images("cell_images/Parasitized/", ParasitizedFiles)
Uninfected = load_resized_images("cell_images/Uninfected/", UninfectedFiles)

print('Uninfected Dataset size is:',np.shape(Uninfected))
print('Parasitized Dataset size is:',np.shape(Parasitized))

Hit:1 http://security.ubuntu.com/ubuntu bionic-security InRelease
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:7 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:9 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Reading package lists... Done[0m                 [33m[33m[33m[33m
Building dependency tree       
Reading state information... Done
28 packages can be upgraded. Run 'apt list --upgradable' to see them.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
libsm6 is already the newest version (2:1.2.2-1).
libxext

## Generate Cross-Validation Indices for Training and Testing Sets

In [5]:
# Number of cross-validation folds (the unpacking below expects exactly 5)
# and a fixed seed so the same split is produced on every run
N_FOLDS = 5
CV_SEED = 0

# Generate one-hot dataset labels: [0,1] for parasitized, [1,0] for uninfected.
# Row counts come from the image arrays so labels and images always line up.
ParasitizedLabels = np.repeat([[0,1]], Parasitized.shape[0], axis=0)
UninfectedLabels = np.repeat([[1,0]], Uninfected.shape[0], axis=0)
Labels = np.concatenate((ParasitizedLabels,UninfectedLabels), axis=0)

# Generate image dataset (same class order as Labels)
Dataset = np.concatenate((Parasitized, Uninfected), axis=0)
assert Dataset.shape[0] == Labels.shape[0], "images and labels are misaligned"

# Generate 5-fold cross-validation groups from a seeded shuffle of the sample indices.
# Fold size is ceil(n / N_FOLDS); the last fold takes whatever remains
# (27558 samples -> 5512, 5512, 5512, 5512, 5510).
CVIndices = np.random.RandomState(CV_SEED).permutation(Dataset.shape[0])
FoldSize = -(-Dataset.shape[0] // N_FOLDS)
Index1, Index2, Index3, Index4, Index5 = [
    CVIndices[fold * FoldSize:(fold + 1) * FoldSize] for fold in range(N_FOLDS)
]

## Train Model and Save Results as CSV Files

In [None]:
# Import relevant neural network architecture packages 
from keras.applications.vgg16 import VGG16

# Import relevant packages for neural network training
import sys
import csv
if 'tensorflow' in sys.modules == False:
    %tensorflow_version 2.x
    import tensorflow as tf
import keras
from keras import applications
from keras.preprocessing.image import ImageDataGenerator
from keras import optimizers
from keras.models import Sequential, Model
from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D, BatchNormalization
from keras import backend as k 
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping

!pip install scikit-learn
import sklearn
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# Specify layers to freeze: for each value j, base_model.layers[:j] are frozen
# during the first training stage (19 freezes every VGG16 base layer, 0 freezes none)
FreezeLayers = [19,15,11,7,4,0]

# Create empty lists to store results (one entry is appended per trained fold)
TrainLoss = []
TrainAcc = []
TestLoss = []
TestAcc = []
All_FPR = []
All_TPR = []
All_thresholds = []
All_AUC = []

# Cross-validation index groups in the order Index1..Index5
CVFolds = [Index1, Index2, Index3, Index4, Index5]

# History entries that are rounded and stored for every training stage
HistoryKeys = ['loss', 'accuracy', 'val_loss', 'val_accuracy']


def split_fold(fold_number):
    """Return (TrainImages, TrainLabels, TestImages, TestLabels) for one fold.

    Fold 0 holds out Index5, fold 1 holds out Index4, ..., fold 4 holds out
    Index1. The four remaining index groups, in ascending order, form the
    training set.
    """
    held_out = len(CVFolds) - 1 - fold_number
    train_idx = np.concatenate([idx for pos, idx in enumerate(CVFolds) if pos != held_out])
    test_idx = CVFolds[held_out]
    return Dataset[train_idx], Labels[train_idx], Dataset[test_idx], Labels[test_idx]


def build_model(n_frozen):
    """Build VGG16 (ImageNet weights, no top) with a two-class dense head.

    The first `n_frozen` layers of the VGG16 base are marked non-trainable and
    the trainable flag of every base layer is printed.
    """
    base_model = VGG16(weights = "imagenet", include_top=False, input_shape = (128,128,3))

    for layer in base_model.layers[:n_frozen]:
        layer.trainable=False
    # A dedicated index name avoids shadowing the keras backend alias `k`
    for layer_number,layer in enumerate(base_model.layers):
        print(layer_number,layer.name,layer.trainable)

    x = base_model.output
    x = Flatten()(x)
    x = Dense(1024, activation="relu")(x)
    x = Dropout(0.5)(x)
    x = Dense(1024, activation="relu")(x)
    x = Dropout(0.5)(x)
    predictions = Dense(2, activation="softmax")(x)
    # `inputs`/`outputs` are the Keras 2 keyword names (`input`/`output` are legacy)
    return Model(inputs = base_model.input, outputs = predictions)


def compile_model(model):
    """(Re)compile `model` with a fresh Adam optimizer; needed after changing trainable flags."""
    adam = optimizers.Adam(learning_rate=0.000001, beta_1=0.9, beta_2=0.999, amsgrad=False)
    model.compile(loss = "categorical_crossentropy", optimizer = adam, metrics=["accuracy"])


def round_history(results):
    """Round the loss/accuracy curves of a fit() result to 4 decimals, in place."""
    for key in HistoryKeys:
        results.history[key] = [round(value, 4) for value in results.history[key]]


def write_csv(path, rows):
    """Write `rows` to `path` as CSV, replacing any existing file."""
    # newline="" is what the csv module requires to avoid doubled line endings
    with open(path, "w", newline="") as f:
        csv.writer(f).writerows(rows)


for j in FreezeLayers:

    for i in range(5):

        # Release the graph/memory of the previous fold's model before building a new one
        keras.backend.clear_session()

        # Create the appropriate training and testing sets
        TrainImages, TrainLabels, TestImages, TestLabels = split_fold(i)

        # Stage 1: train with the first j base layers frozen
        model = build_model(j)
        compile_model(model)

        # Train model and evaluate performance
        print('We are now training cross-validation set #',i+1)
        ResultsPre = model.fit(TrainImages, TrainLabels, epochs=50, batch_size=64, validation_data=(TestImages,TestLabels), validation_freq=1)
        round_history(ResultsPre)

        # Stage 2: unfreeze the frozen layers and continue training the whole network.
        # The layer index must not reuse `i`, otherwise the fold number printed below is wrong.
        for layer in model.layers[:j]:
            layer.trainable=True
        for layer_number,layer in enumerate(model.layers):
            print(layer_number,layer.name,layer.trainable)

        compile_model(model)

        # Train model and evaluate performance
        print('We are now training cross-validation set #',i+1)
        ResultsPost = model.fit(TrainImages, TrainLabels, epochs=50, batch_size=64, validation_data=(TestImages,TestLabels), validation_freq=1)
        round_history(ResultsPost)

        # Display and store performance results: stage-1 curve followed by the stage-2 curve
        FoldTrainLoss = ResultsPre.history['loss']+ResultsPost.history['loss']
        FoldTrainAcc = ResultsPre.history['accuracy']+ResultsPost.history['accuracy']
        FoldTestLoss = ResultsPre.history['val_loss']+ResultsPost.history['val_loss']
        FoldTestAcc = ResultsPre.history['val_accuracy']+ResultsPost.history['val_accuracy']

        TrainLoss.append(FoldTrainLoss)
        TrainAcc.append(FoldTrainAcc)
        TestLoss.append(FoldTestLoss)
        TestAcc.append(FoldTestAcc)
        print('')

        print('Training Loss:',FoldTrainLoss)
        print('Training Accuracy:',FoldTrainAcc)
        print('Validation Loss:',FoldTestLoss)
        print('Validation Accuracy:',FoldTestAcc)

        # Predict values for test set
        Probabilities = model.predict(TestImages)

        # Calculate data for ROC curve (column 1 is the parasitized class)
        FPR, TPR, thresholds = roc_curve(TestLabels[:,1], Probabilities[:,1])
        All_FPR.append(FPR)
        All_TPR.append(TPR)
        All_thresholds.append(thresholds)
        All_AUC.append(auc(FPR, TPR))

        # Save and export as CSV files (rewritten after every fold).
        # NOTE(review): the result lists are never reset between freeze settings,
        # so the files of later j values also contain the rows of earlier settings
        # - confirm this is intended.
        write_csv(str(j)+"Unfreeze_TrainLoss.csv", TrainLoss)
        write_csv(str(j)+"Unfreeze_TrainAcc.csv", TrainAcc)
        write_csv(str(j)+"Unfreeze__TestLoss.csv", TestLoss)
        write_csv(str(j)+"Unfreeze__TestAcc.csv", TestAcc)
        write_csv(str(j)+"Unfreeze__FPR.csv", All_FPR)
        write_csv(str(j)+"Unfreeze__TPR.csv", All_TPR)
        write_csv(str(j)+"Unfreeze__Thresholds.csv", All_thresholds)

0 input_2 False
1 block1_conv1 False
2 block1_conv2 False
3 block1_pool False
4 block2_conv1 False
5 block2_conv2 False
6 block2_pool False
7 block3_conv1 False
8 block3_conv2 False
9 block3_conv3 False
10 block3_pool False
11 block4_conv1 False
12 block4_conv2 False
13 block4_conv3 False
14 block4_pool False
15 block5_conv1 False
16 block5_conv2 False
17 block5_conv3 False
18 block5_pool False




We are now training cross-validation set # 1
Train on 22048 samples, validate on 5510 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50

## Examine RAM Requirements

In [None]:
# Examine RAM Usage
import sys

# Names that IPython itself puts into the namespace, plus the helper names
# created by this cell; they are left out of the report
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars', 'ram_usage']

# (name, size in bytes) for every remaining global that is not private and not
# an imported module, largest first. sys.getsizeof reports only the object's
# own size; it does not follow references to other objects.
ram_usage = sorted(
    [(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars],
    key=lambda x: x[1],
    reverse=True,
)
ram_usage