In [48]:
# With only a train lot of 16K images and a batch size of 64, 20 epochs took 8 hours on a CPU with 16 GB
# A GPU run took around 10 - 15 mins per epoch

# Import key libraries
import sys
import os
from os import makedirs, listdir
import shutil
from matplotlib import pyplot
from matplotlib.image import imread
import pandas as pd
import numpy as np
import scipy
from shutil import copyfile
from random import random, seed

# Load the necessary tensorflow and keras libraries
import tensorflow as tf
from tensorflow import keras
import keras.utils as image
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array, save_img
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications.vgg16 import VGG16
from keras.callbacks import EarlyStopping

# Following code to optimise GPU
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
  tf.config.experimental.set_memory_growth(gpu, True)
    
# Check to see if TF has GPU support and print the GPU device name
if tf.test.gpu_device_name():
   print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
   print("Please install GPU version of TF")

print("Test is built with CUDA = " + str(tf.test.is_built_with_cuda()))


Please install GPU version of TF
Test is built with CUDA = False


In [27]:
# UTILS FUNCTIONS

# Preprocess images (renames them to 'other<n>' and 'xray<n>' and then resizes them to 224, 224 with a gray-scale)
def OrganizeDataFromSource():
    # A - Base folder creation and file copies
    # If the proceessed folder is not there, create it
    os.mkdir('data/tmp_processed')
    os.mkdir('data/tmp_processed/others')
    os.mkdir('data/tmp_processed/xrays')
    
    # getting all the files in the source directory
    files = os.listdir('data/original_dataset/others')
    for file in files:
        shutil.copy(os.path.join('data/original_dataset/others', file), 'data/tmp_processed/others')
    
    # getting all the files in the source directory
    files = os.listdir('data/original_dataset/xrays')
    for index, file in enumerate(files):
        shutil.copy(os.path.join('data/original_dataset/xrays', file), 'data/tmp_processed/xrays')
    
    print("File copies from source dataset to destination folders complete")
    
    # B - Renames, grayscales and resizes
    others_dest_path = "data/tmp_processed/others"
    files = os.listdir(others_dest_path)

    # With the sub-dirs made, copy image from source to dest
    for index, file in enumerate(files):
        if os.path.isfile(os.path.join(others_dest_path, ''.join(['other_',str(index+1), '.jpg']))):
           break; 
        os.rename(os.path.join(others_dest_path, file), os.path.join(others_dest_path, ''.join(['other_',str(index+1), '.jpg']))) 
    
    print("Renaming for 'others' complete")
    
    for index, file in enumerate(files):
        # Now resize the image and conver to grayscale
        full_path = os.path.join(others_dest_path, ''.join(['other_',str(index+1), '.jpg']))
        im_resized = image.load_img(full_path, target_size = (224,224), color_mode = 'grayscale')
        img_array = image.img_to_array(im_resized)
        image.save_img(full_path, img_array)
    
    print("Size and grayscale for 'others' complete")
    
    xray_dest_path = "data/tmp_processed/xrays"
    files = os.listdir(xray_dest_path)

    # With the sub-dirs made, copy image from source to dest
    for index, file in enumerate(files):
        if os.path.isfile(os.path.join(xray_dest_path, ''.join(['xray_',str(index+1), '.jpg']))):
            break;
        os.rename(os.path.join(xray_dest_path, file), os.path.join(xray_dest_path, ''.join(['xray_',str(index+1), '.jpg'])))   
    
    print("Renaming for 'xrays' complete")
    
    for index, file in enumerate(files):
        # Now resize the image and conver to grayscale
        full_path = os.path.join(xray_dest_path, ''.join(['xray_',str(index+1), '.jpg']))
        im_resized = image.load_img(full_path, target_size = (224,224), color_mode = 'grayscale')
        img_array = image.img_to_array(im_resized)
        image.save_img(full_path, img_array)

    print("Size and grayscale for 'xrays' complete")

    # C - Copy files across both the classes to a common directory for training as well as deriving mean, std purpose
    if os.path.isdir('data/all_train'):
        shutil.rmtree('data/all_train')
        print("Removed folder all_train")
    
    os.mkdir('data/all_train')
    
    if os.path.isdir('data/dataset_others_vs_xrays'):
        shutil.rmtree('data/dataset_others_vs_xrays')
        print("Removed folder dataset_others_vs_xrays")
    
    os.mkdir('data/dataset_others_vs_xrays')

    # Create sub-directories under the above newly created directory
    dataset_home = 'data/dataset_others_vs_xrays/'
    subdirs = ['train/', 'test/']

    for subdir in subdirs:
        # create label subdirectories
        labeldirs = ['others/', 'xrays/']
        for labldir in labeldirs:
            newdir = dataset_home + subdir + labldir
            makedirs(newdir, exist_ok=True)
    
    # Copy both the xray and other content into the all_train folder
    files = os.listdir('data/tmp_processed/others')
    for file in files:
        shutil.copy(os.path.join('data/tmp_processed/others', file), 'data/all_train')
    
    # getting all the files in the source directory
    files = os.listdir('data/tmp_processed/xrays')
    for index, file in enumerate(files):
        shutil.copy(os.path.join('data/tmp_processed/xrays', file), 'data/all_train')

    print("Copy of all processed files into all_train folder complete")

    # D - Create train / test folders in the ratio of 75:25 for both classes from the processed common folder created earlier
    seed(1)

    # define ratio of pictures to use for validation
    val_ratio = 0.25

    # copy training dataset images into subdirectories
    src_directory = 'data/all_train/'

    for file in listdir(src_directory):
        src = src_directory + '/' + file
        
        dst_dir = 'train/'
        if random() < val_ratio:
            dst_dir = 'test/'
            
        if file.startswith('other'):
            dst = dataset_home + dst_dir + 'others/'  + file
            copyfile(src, dst)
        elif file.startswith('xray'):
            dst = dataset_home + dst_dir + 'xrays/'  + file
            copyfile(src, dst)

    # Finally, remove the temp folder
    if os.path.isdir('data/tmp_processed'):
        shutil.rmtree('data/tmp_processed')
        print("Removed temp folder")
        
    print("Creation of final dataset folders completed")

# Gets the mean across a sample of 100 images for all the 3 channels (RGB) and std deviation. This is useful to apply normalization prior
# to training the model and before making a prediction on a test image
def get_mean_std_per_batch(df, H, W):
    sample_data = []
    IMAGE_DIR = 'data/all_train/'
    
    for idx, img in enumerate(df.sample(100)["Image"].values):
        path = IMAGE_DIR + img
        sample_data.append(np.array(image.load_img(path, target_size=(H, W))))

    mean1 = np.mean(sample_data[0])
    mean2 = np.mean(sample_data[1])
    mean3 = np.mean(sample_data[2])
    std = np.std(sample_data[0])
    
    return mean1, mean2, mean3, std    


In [28]:
OrganizeDataFromSource()

File copies from source dataset to destination folders complete
Renaming for 'others' complete
Size and grayscale for 'others' complete
Renaming for 'xrays' complete
Size and grayscale for 'xrays' complete
Copy of all processed files into all_train folder complete
Removed temp folder
Creation of final dataset folders completed


In [31]:
dest_path = "data/dataset_others_vs_xrays/train/xrays"
path = os.path.join(dest_path, 'xray_1.jpg')
img = load_img(path)
print(type(img))
print(img.format)
print(img.mode)
print(img.size)
img.show()

<class 'PIL.Image.Image'>
None
RGB
(224, 224)


In [32]:
dest_path = "data/dataset_others_vs_xrays/train/others"
path = os.path.join(dest_path, 'other_2.jpg')
img = load_img(path)
print(type(img))
print(img.format)
print(img.mode)
print(img.size)
img.show()

<class 'PIL.Image.Image'>
None
RGB
(224, 224)


In [6]:
# Call the ProcessOriginalDataset function first if not done so
CreateTrainTestFoldersFromProcessedDataset()

Removed dataset_others_vs_xrays folder
Creation of final dataset folders completed


In [44]:
def Define_Model_Transfer_VGG_16():
    # We can use the feature extraction part of the model and add a new classifier part of the model
    # that is tailored to the dogs and cats dataset. Specifically, we can hold the weights of all of the 
    # convolutional layers fixed during training, and only train new fully connected layers that will learn 
    # to interpret the features extracted from the model and make a binary classification    
    
    # This can be achieved by loading the VGG-16 model, removing the fully connected layers from the
    # output-end of the model, then adding the new fully connected layers to interpret the model output 
    # and make a prediction. The classifier part of the model can be removed automatically by
    # setting the “include_top” argument to “False“, which also requires that the shape of the input also 
    # be specified for the model, in this case (224, 224, 3). This means that the loaded model ends
    # at the last max pooling layer, after which we can manually add a Flatten layer
    # and the new clasifier layers.
    
    # Load the VGG model
    model = VGG16(include_top = False, input_shape=(224, 224, 3), weights='data/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5')
    
    # mark loaded layers as not trainable
    for layer in model.layers:
        layer.trainable = False
    flat1 = Flatten()(model.layers[-1].output)
    
    # Create the new classifier layers
    class1 = Dense(128, activation='relu', kernel_initializer='he_uniform')(flat1)
    
    # Create the new output layer
    output = Dense(1, activation='sigmoid')(class1)
    
    # Define the complete  model now with the VGG inputs and the new classifier layers
    model = Model(inputs=model.inputs, outputs=output)
    
    # Compile model
    opt = SGD(learning_rate=0.001, momentum=0.9)
    
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
    
    return model    
    

In [45]:
# plot diagnostic learning curves
def summarize_diagnostics(history):
    # Plot loss
    pyplot.subplot(211)
    pyplot.title('Cross Entropy Loss')
    pyplot.plot(history.history['loss'], color='blue', label='train')
    pyplot.plot(history.history['val_loss'], color='orange', label='test')
    
    # Plot accuracy
    pyplot.subplot(212)
    pyplot.title('Classification Accuracy')
    pyplot.plot(history.history['accuracy'], color='blue', label='train')
    pyplot.plot(history.history['val_accuracy'], color='orange', label='test')
    
    # Save plot to file
    filename = sys.argv[0].split('/')[-1]
    pyplot.savefig(filename + '_plot.png')
    pyplot.close()

In [35]:
# Get the mean and std for sample (100 records from entire proceessed batch
df = pd.read_csv('data/train-common.csv')
mean1, mean2, mean3, std = get_mean_std_per_batch(df, 224, 224)

print(mean1)
print (mean2)
print (mean3)
print(std)

131.34044164540816
170.01317362882654
146.68295599489795
70.01424932635871


In [111]:
# run the test harness for evaluating a model
def run_test_harness_vgg_transfer():
    
    # Define model
    model = Define_Model_Transfer_VGG_16()
    
    # We can load the images progressively using the Keras ImageDataGenerator class and flow_from_directory() API
    datagen = ImageDataGenerator(featurewise_center=True)
    
    # DO NOT USE A CUSTOM MEAN AND STD - SEE DESCRIPTION BELOW
    # The VGG16 model was trained on a specific ImageNet challenge dataset. As such, it is configured to expected input images 
    # to have the shape 224×224 pixels. We will use this as the target size when loading photos from our dataset.
    # The model also expects images to be centered. That is, to have the mean pixel values from each channel (red, green, and blue) as 
    # calculated on the ImageNet training dataset subtracted from the input. Keras provides a function to perform this preparation for 
    # individual photos via the preprocess_input() function. Nevertheless, we can achieve the same effect with the ImageDataGenerator 
    # by setting the “featurewise_center” argument to “True” and manually specifying the mean pixel values to use when centering as 
    # the mean values from the ImageNet training dataset: [123.68, 116.779, 103.939].
    
    # Get the mean and std from a sample batch
    #df = pd.read_csv('data/train-common.csv')
    #mean1, mean2, mean3, std = get_mean_std_per_batch(df, 224, 224)
    #train_datagen.mean = [mean1, mean2, mean3]
    #train_datagen.std = std

    datagen.mean = [123.68, 116.779, 103.939]
    
    # Prepare iterators
    train_it = datagen.flow_from_directory('data/dataset_others_vs_xrays/train',
        class_mode='binary', batch_size=32, target_size=(224, 224))
    
    # The following is commented as we are now running the final lot with all the files together
    test_it = datagen.flow_from_directory('data/dataset_others_vs_xrays/test',
        class_mode='binary', batch_size=32, target_size=(224, 224))
    
    # Fit model
    # Define a basic early stopping
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
    
    # FOR UAR
    # Commented the below lines as the final run does not require validation
    history = model.fit(train_it, steps_per_epoch=len(train_it),
        validation_data = test_it, validation_steps = len(test_it), epochs=10, verbose=1, callbacks=[es])

    # FOR PROD
    #history = model.fit(train_it, steps_per_epoch=len(train_it), epochs=10, verbose=1, callbacks=[es])

    # FOR UAT
    # Evaluate model - Comment it out in the final run
    _, acc = model.evaluate(test_it, steps=len(test_it), verbose = 1)
    
    # Print the accuracy after the fit is complete. Comment it out in the final run
    #print('> %.3f' % (acc * 100.0))
    
    # Save the model
    model.save('saved_model/final_xray_classifier_model.h5')

    # learning curves
    summarize_diagnostics(history)


In [49]:
#run_test_harness()
run_test_harness_vgg_transfer()


Found 4475 images belonging to 2 classes.
Found 1489 images belonging to 2 classes.
Epoch 1/10
Epoch 2/10
Epoch 2: early stopping




> 100.000


In [122]:
# Prediction section

# Load and prepare the image
def load_image(filename):
    # Load the image with the same size as loaded in the training set
    img = image.load_img(filename, target_size=(224, 224), grayscale=True)
    
    # Convert to array
    img = img_to_array(img)

    # Reshape into a single sample with 3 channels
    
    #img = img.reshape(1, 224, 224, 3)
    img = np.expand_dims(img, axis=0)
    
    # Center pixel data
    #df = pd.read_csv('data/train-common.csv')
    #mean1, mean2, mean3, std = get_mean_std_per_batch(df, 224, 224)
    #img = img - [mean1, mean2, mean3]    
    
    img = img.astype('float32')
    img = img - [123.68, 116.779, 103.939]
    
    return img

# load an image and predict the class
def run_example(path):
    # load the image
    img = load_image(path)
    
    # load model
    model = load_model('saved_model/final_xray_classifier_model.h5')
    
    # predict the class
    result = model.predict(img)
    
    return(result[0].astype('int'))
    

In [145]:
# Entry point, run the example
#path = 'data/dataset_others_vs_xrays/test/others/other_1.jpg'
#path = 'data/dataset_others_vs_xrays/test/others/other_105.jpg'
#path = 'data/dataset_others_vs_xrays/test/others/other_1033.jpg'
#path = 'data/dataset_others_vs_xrays/test/others/other_1531.jpg'
#path = 'data/dataset_others_vs_xrays/test/others/other_1602.jpg'
#path = 'data/dataset_others_vs_xrays/test/others/other_1746.jpg'
#path = 'data/dataset_others_vs_xrays/test/others/other_2677.jpg'


#path = 'data/test/DSC_.jpg'
#path = 'data/test/Farida_Rupawalla_PAN_Card.jpg'
#path = 'data/test/mccaffe subscription.jpg'
#path = 'data/test/Naem_Profile_Photo_2.jpg'
path = 'data/test/DSC_0899 copy.jpg'
#path = 'data/test/DSC_0901 copy.jpg'
#path = 'data/test/Screenshot 2023-12-12 161432.png'

#path = 'data/dataset_others_vs_xrays/test/xrays/xray_15.jpg'
#path = 'data/dataset_others_vs_xrays/test/xrays/xray_19.jpg'
#path = 'data/dataset_others_vs_xrays/test/xrays/xray_114.jpg'
#path = 'data/dataset_others_vs_xrays/test/xrays/xray_115.jpg'
#path = 'data/dataset_others_vs_xrays/test/xrays/xray_198.jpg'
#path = 'data/dataset_others_vs_xrays/test/xrays/xray_201.jpg'
#path = 'data/dataset_others_vs_xrays/test/xrays/xray_2966.jpg'
#path = 'data/dataset_others_vs_xrays/test/xrays/xray_2977.jpg'

result = run_example(path)

# If result is less than or equal to 0, then the image is not an xray image else it is
print(result)

if result <= 0:
    print("The image selected is not an x-ray")
else:
    print("The image selected is an x-ray")
    

[0]
The image selected is not an x-ray
