# Import Libraries

In [None]:
# Sequential is used to build the neural network as a linear stack of layers
from tensorflow.keras.models import Sequential

# Conv2D helps the model learn specific features and sharpens detections in the image
# MaxPooling2D reduces the size of the data without losing important features in the image
# Dropout helps prevent overfitting by randomly dropping connections between neurons during training
# Flatten transforms the 2D feature maps into a vector that can be fed to the fully connected (Dense) layer
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense

# Adam (Adaptive Moment Estimation) optimizer
from tensorflow.keras.optimizers import Adam

# ImageDataGenerator is used for image augmentation to increase the sample size. Making slight modifications such as rotation
# so the model can learn more effectively from many variations
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import numpy as np
import matplotlib.pyplot as plt


# TASK 2 : Clone & Explore dataset

In [None]:
# Clone the dataset repository from GitHub (IPython shell magic, runs `git` in the notebook's shell).
# The path-setup cell that follows expects the result at /content/datasets.
! git clone https://github.com/education454/datasets.git

In [None]:
import os

# Root folder of the cloned dataset.
main_dir = "/content/datasets/Data"

# One sub-folder per split directly under the root.
train_dir, test_dir = (os.path.join(main_dir, split) for split in ("train", "test"))

# Inside each split there is one sub-folder per class: COVID19 and NORMAL.
train_covid_dir, train_normal_dir = (
  os.path.join(train_dir, label) for label in ("COVID19", "NORMAL")
)
test_covid_dir, test_normal_dir = (
  os.path.join(test_dir, label) for label in ("COVID19", "NORMAL")
)

In [None]:
# Collect the image filenames of every class folder and preview the first ten.
# os.listdir returns names in arbitrary order, so each listing is sorted to keep
# the previews (and anything built from these lists) reproducible between runs.
train_covid_names = sorted(os.listdir(train_covid_dir))
print("The type for the train_covid_names is: ", type(train_covid_names))
print(train_covid_names[:10])


train_normal_names = sorted(os.listdir(train_normal_dir))
print(train_normal_names[:10])

test_covid_names = sorted(os.listdir(test_covid_dir))
print(test_covid_names[:10])

test_normal_names = sorted(os.listdir(test_normal_dir))
print(test_normal_names[:10])

In [None]:
# Report how many images each class folder holds, then the per-split totals.
for caption, count in (
  ("Total COVID19 images in the training set: ", len(train_covid_names)),
  ("Total NORMAL images in the training set: ", len(train_normal_names)),
  ("Total COVID19 images in the testing set: ", len(test_covid_names)),
  ("Total Normal images in the testing set: ", len(test_normal_names)),
  # Split totals are the sum of the two class counts.
  ("Total images present in the training set :", len(train_covid_names) + len(train_normal_names)),
  ("Total images present in the testing set :", len(test_covid_names) + len(test_normal_names)),
):
  print(caption, count)

# TASK 3 : Data Visualization

In [None]:
# Show a 4 x 4 grid of training images: the first 8 COVID19 files followed by the first 8 NORMAL files.
import matplotlib.image as mpimg

# grid dimensions
rows, cols = 4, 4

# resize the current figure (gcf = "get current figure") to 12 x 12 inches
fig = plt.gcf()
fig.set_size_inches(12,12)

# full paths of the first 8 images of each training class
covid_pic = [os.path.join(train_covid_dir, name) for name in train_covid_names[:8]]
normal_pic = [os.path.join(train_normal_dir, name) for name in train_normal_names[:8]]
print(covid_pic)
print(normal_pic)

# COVID19 images fill the first two rows, NORMAL images the last two
merged_list = [*covid_pic, *normal_pic]
for i, img_path in enumerate(merged_list):
  # subplot positions are 1-based
  sp = plt.subplot(rows, cols, i + 1)
  sp.axis('off')
  # title each cell with the bare filename: the text after the last "/" of the path
  data = img_path.rsplit('/', 1)[-1]
  sp.set_title(data, fontsize=10)
  img = mpimg.imread(img_path)
  plt.imshow(img, cmap = 'gray')

plt.show()

# TASK 4 : Data Preprocessing & Augmentation

In [None]:
# Build the training, validation and testing batch generators.
# rescale multiplies every pixel value by 1/255
# validation_split reserves that fraction of the images in train_dir for the 'validation' subset;
#   the remainder forms the 'training' subset
# zoom_range randomly zooms the images (augmentation)
# horizontal_flip randomly flips the images horizontally (augmentation)
dgen_train = ImageDataGenerator(rescale=1./255,
                                validation_split = 0.2,
                                zoom_range = 0.2,
                                horizontal_flip = True)
# Validation images must not be augmented, otherwise the validation metrics are measured on
# randomly distorted inputs. This generator therefore only rescales, and carries the SAME
# validation_split as dgen_train so that its 'validation' subset is the complement of the
# 'training' subset taken from dgen_train.
dgen_validation = ImageDataGenerator(rescale = 1./255,
                                     validation_split = 0.2)
dgen_test = ImageDataGenerator(rescale= 1./255)

# flow_from_directory loads the images from the directory, one sub-folder per class
# target_size resizes every image to 150 by 150 pixels
# subset selects the 'training' or 'validation' part defined by validation_split
# batch_size specifies how many images are yielded per batch
# class_mode 'binary' yields a single 0/1 label per image (use 'categorical' for more than two classes)
train_generator = dgen_train.flow_from_directory(train_dir,
                                                 target_size = (150, 150),
                                                 subset = 'training',
                                                 batch_size = 32,
                                                 class_mode = 'binary')

# Same directory as the training generator, but read through the non-augmenting generator.
validation_generator = dgen_validation.flow_from_directory(train_dir,
                                                           target_size = (150, 150),
                                                           subset = 'validation',
                                                           batch_size = 32,
                                                           class_mode = 'binary')

test_generator = dgen_test.flow_from_directory(test_dir,
                                               target_size = (150, 150),
                                               batch_size = 32,
                                               class_mode = 'binary')

In [None]:
# Display the class-name -> integer-label mapping the training generator derived from
# the sub-folders of train_dir (shown as the cell output because it is the last expression).
# The labels printed here define what the model's 0/1 output means.
train_generator.class_indices

In [None]:
# Display the shape of a single image as produced by the training generator.
# NOTE(review): presumably (150, 150, 3), matching the input_shape given to the first Conv2D layer - confirm from the output.
train_generator.image_shape

# TASK 5 : Build Convolutional Neural Network Model

In [None]:
# Convolutional network for binary classification of 150 x 150 RGB images.
# The layers are listed in the order the data flows through them.
model = Sequential([
  # First convolution: 32 filters of size 5 x 5. padding 'SAME' pads the borders so the
  # output keeps the spatial size of the input. Being the first layer, it declares the
  # input shape: 150 x 150 pixels with 3 (RGB) channels.
  Conv2D(32, (5,5), padding = 'SAME', activation = 'relu', input_shape = (150, 150, 3)),
  # Pooling halves each spatial dimension, which cuts the amount of computation downstream.
  MaxPooling2D(pool_size = (2,2)),
  # Dropout randomly zeroes half of its inputs at each training update to limit overfitting.
  Dropout(0.5),
  # Second convolution: 64 filters of size 5 x 5, again followed by pooling and dropout.
  Conv2D(64, (5,5), padding = 'SAME', activation = 'relu'),
  MaxPooling2D(pool_size = (2,2)),
  Dropout(0.5),
  # Flatten the feature maps into a single vector for the fully connected layers.
  Flatten(),
  # Fully connected layer with 256 units.
  Dense(256, activation = 'relu'),
  Dropout(0.5),
  # Binary classification: a single sigmoid unit producing a value between 0 and 1.
  Dense(1, activation = 'sigmoid'),
])
model.summary()

# TASK 6 : Compile & Train the Model

In [None]:
# Compile the model.
# Adam optimizer with a learning rate of 0.001; binary cross-entropy loss because there are 2 classes.
# The optimizer argument is named `learning_rate`: the old `lr` alias is deprecated and
# rejected by current Keras releases.
model.compile(optimizer = Adam(learning_rate = 0.001), loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
# Fit the model for 30 epochs on the training batches, evaluating on the validation
# batches after every epoch. The returned History object keeps the per-epoch metrics.
history = model.fit(
  x = train_generator,
  validation_data = validation_generator,
  epochs = 30,
)

# TASK 7 : Performance Evaluation

In [None]:
# Display the names of the metrics recorded in the History object returned by model.fit.
# The plotting cells read 'loss', 'val_loss', 'accuracy' and 'val_accuracy' from this dict.
history.history.keys()

In [None]:
# Training vs. validation loss, one point per epoch.
plt.plot(history.history['loss'], label = 'Training')
plt.plot(history.history['val_loss'], label = 'Validation')
plt.title('Training and Validation Losses')
# the legend entries come from the label of each curve
plt.legend()
plt.xlabel('epoch')

In [None]:
# Training vs. validation accuracy, one point per epoch.
plt.plot(history.history['accuracy'], label = 'Training')
plt.plot(history.history['val_accuracy'], label = 'Validation')
plt.title('Training and Validation Accuracy')
# the legend entries come from the label of each curve
plt.legend()
plt.xlabel('epoch')

In [None]:
# Evaluate on the held-out test batches; with metrics=['accuracy'] evaluate returns
# the loss followed by the accuracy.
test_loss, test_acc = model.evaluate(test_generator)
print(f'Test Loss: {test_loss} Test Accuracy: {test_acc}')

# TASK 8 : Prediction On New Data

In [None]:
# Upload one or more X-ray images from the local machine and classify each of them.
from google.colab import files
# use the same Keras namespace as the rest of the notebook
from tensorflow.keras.preprocessing import image

uploaded = files.upload()
for filename in uploaded:
  img_path = '/content/'+filename
  # resize to the 150 x 150 input size the network was built with
  img = image.load_img(img_path, target_size = (150,150))
  # Apply the same 1/255 rescaling the generators applied during training;
  # feeding raw 0-255 pixel values would not match what the model was trained on.
  images = image.img_to_array(img) / 255.0
  # the model expects a batch dimension: (1, 150, 150, 3)
  images = np.expand_dims(images, axis = 0)
  prediction = model.predict(images)
  print(filename)

  # The sigmoid output is a probability in [0, 1], practically never exactly 0,
  # so it is thresholded at 0.5 instead of being compared with 0.
  # flow_from_directory labels the class folders in alphanumeric order
  # (COVID19 -> 0, NORMAL -> 1), so a low score means COVID19.
  normal_probability = float(prediction[0][0])
  if normal_probability < 0.5:
    print('COVID19 Detected')
  else:
    print("The X-ray report is normal")