In [40]:
#IMPORTING PACKAGES
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import cv2
from sklearn.model_selection import train_test_split
from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.layers import AveragePooling2D, Dense, Flatten, Dropout
from keras.models import Model
import random

In [3]:
# Root folder of the COVID-19 Radiography dataset; change this single value
# if the dataset lives somewhere else.
DATASET_ROOT = "../input/covid19-radiography-database/COVID-19_Radiography_Dataset"

# One image folder per class. Paths are joined with '/' on purpose: later
# cells split file paths on '/' to recover the class folder name.
normal_path = f"{DATASET_ROOT}/Normal/images"
viral_path = f"{DATASET_ROOT}/Viral Pneumonia/images"
covid_path = f"{DATASET_ROOT}/COVID/images"

In [4]:
# os.listdir returns the names of the entries in a directory; gather them for
# the three class folders in the order normal -> viral pneumonia -> covid.
filenames = [
    entry
    for folder in (normal_path, viral_path, covid_path)
    for entry in os.listdir(folder)
]

In [5]:
# Every image gets a string label: 'normal', 'viral pneumonia' or 'covid'.
# NOTE: when these strings are fed to Keras' flow_from_dataframe, class indices
# are assigned in alphanumeric order of the label strings, i.e.
#   covid -> 0, normal -> 1, viral pneumonia -> 2

# The label is taken from the folder a file was listed from rather than from a
# substring of its file name, so a file can never be labelled twice or be
# paired with the wrong folder.
class_dirs = (
    ('normal', normal_path),
    ('viral pneumonia', viral_path),
    ('covid', covid_path),
)

categories = []
paths = []
for label, directory in class_dirs:
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore the seeded train/validation/test splits) reproducible.
    for filename in sorted(os.listdir(directory)):
        categories.append(label)
        paths.append(directory + '/' + filename)

In [6]:
# Build a two-column dataframe: 'File Path' holds the path of each X-ray image
# and 'Category' holds the label collected for that image.
xray_columns = {'File Path': paths, 'Category': categories}
df = pd.DataFrame(data=xray_columns)

In [7]:
# Quick check that the collected paths are readable: load one image with
# OpenCV and display it with matplotlib.
sample_path = df['File Path'][2000]
img = cv2.imread(sample_path)
if img is None:
    # cv2.imread does not raise on a missing/unreadable file; it returns None.
    raise FileNotFoundError(f"Could not read image: {sample_path}")

# OpenCV loads colour images in BGR channel order while matplotlib expects RGB.
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

In [8]:
# Hold out a quarter of the rows; that hold-out is split again further down
# into validation and test sets.
#   test_size=0.25  -> 25% of the rows go to the hold-out set
#   random_state=42 -> fixes the shuffle so the split is repeatable
#   stratify        -> keeps the per-category proportions equal in both parts
holdout_split = train_test_split(
    df,
    stratify=df['Category'],
    test_size=0.25,
    shuffle=True,
    random_state=42,
)
train_data, test_valid_data = holdout_split

In [9]:
# reset_index(drop=True) replaces the shuffled row labels left by the split
# with a fresh 0..n-1 index and discards the old labels.
train_data, test_valid_data = (
    frame.reset_index(drop=True) for frame in (train_data, test_valid_data)
)

In [10]:
# Split the hold-out set in half, stratified by category, into a test set and
# a validation set, then renumber both from 0.
eval_split = train_test_split(
    test_valid_data,
    stratify=test_valid_data['Category'],
    test_size=0.5,
    shuffle=True,
    random_state=42,
)
test_data, valid_data = (part.reset_index(drop=True) for part in eval_split)

In [14]:
# ImageDataGenerator -> Generates batches of tensor image data with real-time data augmentation
# rescale -> divide by 255 to normalise images

In [15]:
# Augmentation settings applied on the fly to training images; pixel values
# are additionally scaled by 1/255.
augmentation_options = dict(
    rescale=1./255,
    rotation_range=15,
    shear_range=0.1,
    zoom_range=0.2,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
)
train_data_gen = tf.keras.preprocessing.image.ImageDataGenerator(**augmentation_options)

In [16]:
# flow_from_dataframe method -> Takes the dataframe as input and generates batches of normalised and augmented data
# class_mode -> 'categorical' makes the generator yield one-hot encoded labels built from the classes in y_col
# x_col -> column in dataframe that contains file paths of images
# target_size -> size to which all images are resized

In [17]:
# Batches of 32 augmented training images, resized to 224x224, with labels
# taken from the 'Category' column.
train_gen = train_data_gen.flow_from_dataframe(
    dataframe=train_data,
    x_col='File Path',
    y_col='Category',
    class_mode='categorical',
    target_size=(224, 224),
    batch_size=32,
)

In [18]:
# Validation images are only rescaled by 1/255 - no augmentation is applied.
valid_data_gen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)

# Same image size, label column and batch size as the training generator.
valid_gen = valid_data_gen.flow_from_dataframe(
    dataframe=valid_data,
    x_col='File Path',
    y_col='Category',
    class_mode='categorical',
    target_size=(224, 224),
    batch_size=32,
)


In [21]:
# instantiating a VGG16 model whose weights have been pretrained using imagenet data
# include_top=False -> drops VGG16's fully connected classifier layers so that our own 3-class head can be attached

In [19]:
# VGG16 convolutional base with ImageNet weights, taking 224x224 RGB input and
# built without its top (classifier) layers.
base_model = VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
)

In [22]:
# Customisations to VGG16:
# Adding Pooling and Flatten layers to the model
# Adding 1 hidden layer where we chose the activation function as Rectified Linear Unit
# Adding a Dropout layer to prevent overfitting of model
# Finally, using a softmax activation in the output layer

In [23]:
# Freeze every layer of the VGG16 base so only the new head is trained.
for layer in base_model.layers:
    layer.trainable = False

# New classification head, applied in order on top of the base's output:
# pooling -> flatten -> 128-unit ReLU layer -> dropout -> 3-way softmax.
head_layers = [
    AveragePooling2D(),
    Flatten(),
    Dense(128, activation="relu"),
    Dropout(0.2),
    Dense(3, activation='softmax'),
]
head_model = base_model.output
for head_layer in head_layers:
    head_model = head_layer(head_model)

# Full network: VGG16 input through to the new softmax output.
model = Model(inputs=base_model.input, outputs=head_model)

In [24]:
# Print a summary of the assembled network (frozen VGG16 base + new head).
model.summary()

In [25]:
# Adam optimiser with categorical cross-entropy loss; accuracy is tracked as
# the reported metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [26]:
# Train the head for a fixed number of epochs, validating after each epoch.
# Model.fit accepts generators directly; fit_generator is deprecated and has
# been removed from newer Keras releases.
epochs = 10
history = model.fit(
    train_gen,
    epochs=epochs,
    validation_data=valid_gen,
    verbose=1,
)

In [28]:
# Persist the trained model to an HDF5 file in the working directory.
model.save(filepath='Covid.h5')

# MODEL ANALYSIS

In [29]:
# Training curves: loss (top) and accuracy (bottom), one point per epoch.
# The x axis is built from the recorded history so epochs are numbered from 1
# and the ticks line up with the plotted points.
epochs_run = len(history.history['loss'])
epoch_axis = np.arange(1, epochs_run + 1)

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))

ax1.plot(epoch_axis, history.history['loss'], color='b', label="Training loss")
ax1.plot(epoch_axis, history.history['val_loss'], color='r', label="validation loss")
ax1.set_xticks(epoch_axis)
ax1.set_yticks(np.arange(0, 1, 0.1))
ax1.set_xlabel("Epoch")
ax1.set_ylabel("Loss")
# Each axes needs its own legend; plt.legend would only annotate the last one.
ax1.legend(loc='best', shadow=True)

ax2.plot(epoch_axis, history.history['accuracy'], color='b', label="Training accuracy")
ax2.plot(epoch_axis, history.history['val_accuracy'], color='r', label="Validation accuracy")
ax2.set_xticks(epoch_axis)
ax2.set_xlabel("Epoch")
ax2.set_ylabel("Accuracy")
legend = ax2.legend(loc='best', shadow=True)

plt.tight_layout()
plt.show()

# MODEL TESTING

In [41]:
# Pick one random test image, print its true class and the model's prediction.
sample = random.choice(test_data['File Path'])

# The class folder sits two levels above the file: <class>/images/<file>.
category = sample.split('/')[-3]
true = category if category in ('COVID', 'Viral Pneumonia') else 'Normal'

print(f'True value is : {true}')

image = load_img(sample, target_size=(224, 224))
# Apply the same 1/255 rescaling that the training and validation generators
# use; feeding raw 0-255 pixel values would not match what the model was
# trained on.
img = img_to_array(image) / 255.0
img = img.reshape((1, 224, 224, 3))

result = model.predict(img)
result = np.argmax(result, axis=-1)

# Translate the predicted index back to a label using the mapping the training
# generator actually assigned (Keras orders the label strings alphanumerically,
# so index 0 is 'covid', not 'normal').
index_to_label = {index: label for label, index in train_gen.class_indices.items()}
display_names = {
    'normal': "Normal",
    'viral pneumonia': "Viral Pneumonia",
    'covid': "COVID +",
}
predicted_label = index_to_label[int(result[0])]

print('Prediction is:')
print(display_names.get(predicted_label, predicted_label))

plt.imshow(image)