### Imports

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import shutil
import time
import cv2 as cv2
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Activation,Dropout,Conv2D, MaxPooling2D,BatchNormalization, Flatten
from tensorflow.keras.optimizers import Adam, Adamax
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model, load_model, Sequential
import os
import seaborn as sns
sns.set_style('darkgrid')
from PIL import Image
from sklearn.metrics import confusion_matrix, classification_report
from IPython.core.display import display, HTML

from tensorflow.keras.preprocessing import image
from sklearn.decomposition import PCA
from math import ceil

from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from tensorflow.keras.preprocessing import image

### Globals

In [None]:

filelist = "../input/covidx-cxr2/train.txt"
image_path = '../input/covidx-cxr2/train'

test_file_list = "../input/covidx-cxr2/test.txt"
test_image_path = '../input/covidx-cxr2/test'

### Exploratory analysis on the dataset

In [None]:
# Exploratory analysis on the datasets:
# We want to look at the following things:
# 1) Mean covid and mean non covid image
# 2) Variance between images in each class
# 3) Eigen images for each class (Eigenfaces)

# Function that returns the image file paths and their labels in a dataframe.
def imDataframe(filelist, image_path):
    """Build a DataFrame of image file paths and labels from a listing file.

    Every non-blank line of *filelist* is split on single spaces. Lines with
    more than four fields take the file name from field 2 and the label from
    field 3; all other lines take them from fields 1 and 2.

    Args:
        filelist: Path of the text file that lists the images.
        image_path: Directory joined in front of every file name.

    Returns:
        A DataFrame with the columns ``filenames`` and ``labels``, one row
        per parsed line.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    filenames = []
    labels = []
    # The context manager closes the listing even if parsing fails part-way.
    with open(filelist, "r") as listing:
        for line in listing:
            # Drop the line terminator so it cannot end up inside the label
            # when the label happens to be the last field on the line.
            line = line.rstrip("\r\n")
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            fields = line.split(' ')
            if len(fields) > 4:
                name, label = fields[2], fields[3]
            else:
                name, label = fields[1], fields[2]
            filenames.append(os.path.join(image_path, name))
            labels.append(label)

    return pd.DataFrame({'filenames': filenames, 'labels': labels})

In [None]:
# Load the file path and label of every training image into a dataframe.
im_df = imDataframe(filelist, image_path)

# Calculate and show the mean covid and mean non-covid image.

# Split the listing by class label. reset_index() renumbers the rows from 0
# and keeps the previous index as an extra 'index' column.
covid_pos = im_df.loc[im_df.labels == 'positive'].reset_index()
covid_neg = im_df.loc[im_df.labels == 'negative'].reset_index()

# Shape every mean image is reshaped to: height, width, channels.
size = [64, 64, 1]


def _mean_image(frame, size):
    """Return the pixel-wise mean of the grayscale images listed in *frame*.

    Args:
        frame: DataFrame with a ``filenames`` column of image paths.
        size: ``[height, width, channels]``; the first two entries are the
            resize target and the whole list is the shape of the result.

    Returns:
        Array of shape *size* holding the mean pixel values.

    Raises:
        ValueError: If *frame* has no rows (a mean would be undefined).
    """
    count = len(frame)
    if count == 0:
        raise ValueError("cannot average an empty set of images")
    total = np.zeros(int(np.prod(size)), dtype=np.float64)
    for path in frame['filenames']:
        current_image = image.load_img(path, target_size=tuple(size[:2]),
                                       color_mode='grayscale')
        # Convert the image to a flat pixel vector and accumulate it.
        total += image.img_to_array(current_image).ravel()
    # Divide by the number of images. The previous code divided by an
    # undefined name `i` (NameError) and used floor division, which would
    # have truncated the mean to whole gray levels.
    return (total / count).reshape(size)


pos_mean_img = _mean_image(covid_pos, size)
plt.imshow(pos_mean_img, vmin=0, vmax=255, cmap='Greys_r')
plt.title('Mean covid positive')
plt.axis('off')
plt.show()

neg_mean_img = _mean_image(covid_neg, size)
plt.imshow(neg_mean_img, vmin=0, vmax=255, cmap='Greys_r')
plt.title('Mean normal lung')
plt.axis('off')
plt.show()

# Calculate and draw a contrast plot between the mean covid and mean non-covid image.
diff = neg_mean_img - pos_mean_img
plt.imshow(diff, cmap='bwr')
plt.title('Difference Between Normal & Covid Average')
plt.axis('off')
plt.show()

In [None]:
# Load the images of a class into a matrix (one flattened image per row)
def img2np(list_of_filename, size = (64, 64)):
    """Load images as grayscale and stack them into a 2-D matrix.

    Args:
        list_of_filename: Iterable of image file paths.
        size: Target size handed to ``image.load_img`` for every image.

    Returns:
        A NumPy array with one row per file, each row being the flattened
        pixel array of the resized image. An empty input yields an array of
        shape ``(0, size[0] * size[1])`` instead of raising.
    """
    rows = []
    for fn in list_of_filename:
        current_image = image.load_img(fn, target_size=size,
                                       color_mode='grayscale')
        # Flatten the image to a 1-D pixel vector.
        rows.append(image.img_to_array(current_image).ravel())

    if not rows:
        # float32 is assumed to match img_to_array's default dtype -- TODO confirm.
        return np.empty((0, int(np.prod(size))), dtype='float32')

    # Stack once at the end: concatenating inside the loop copied the whole
    # matrix for every image, and a single-file input came back as a plain
    # list rather than an ndarray.
    return np.stack(rows)

# One matrix per class: every row is a flattened grayscale image.
covid_pos_im_matrix = img2np(covid_pos['filenames'].values)
covid_neg_im_matrix = img2np(covid_neg['filenames'].values)

In [None]:
# Calculate and plot how the images vary in each class
def imVariance(input_mat, title, size = (64, 64)):
    """Plot and return the per-pixel spread across a stack of images.

    Despite the name and the plot title, the quantity computed is the
    standard deviation (``np.std``) taken along axis 0.

    Args:
        input_mat: Matrix whose axis 0 indexes the images.
        title: Text appended to ``'Variance '`` in the plot title.
        size: Shape the per-pixel result is reshaped to before plotting.

    Returns:
        The standard-deviation image, reshaped to *size*.
    """
    pixel_std = np.std(input_mat, axis=0).reshape(size)

    # Fixed 0-255 gray scale, no axes, then render immediately.
    plt.imshow(pixel_std, cmap='Greys_r', vmin=0, vmax=255)
    plt.title(f'Variance {title}')
    plt.axis('off')
    plt.show()

    return pixel_std

# Per-pixel spread of each class, plotted and kept for later use.
norm_variance = imVariance(input_mat=covid_neg_im_matrix, title='NORMAL')
pneu_variance = imVariance(input_mat=covid_pos_im_matrix, title='COVID')

In [None]:
# Eigen images of the different classes in the dataset.
# Calculate and plot PCA on the dataset

def eigenimages(input_mat, title, size = (64, 64), num_comp = 0.7):
    """Fit a whitened PCA on *input_mat* and return the fitted estimator.

    Args:
        input_mat: Data matrix passed to ``PCA.fit``.
        title: Unused; kept so existing calls keep working.
        size: Unused; kept so existing calls keep working.
        num_comp: Value forwarded as ``n_components`` to ``PCA``.

    Returns:
        The fitted ``PCA`` object.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    return PCA(n_components=num_comp, whiten=True).fit(input_mat)
  
def print_pca(input_pca, size = (64, 64)):
    """Draw every component of a fitted PCA as an image in one figure.

    Args:
        input_pca: Fitted PCA exposing ``n_components_`` and ``components_``.
        size: Shape each component vector is reshaped to before plotting.
    """
    count = input_pca.n_components_
    figure = plt.figure(figsize=(8, 8))

    # Roughly square grid: floor(sqrt(count)) rows, enough columns to fit all.
    grid_rows = int(count ** 0.5)
    grid_cols = ceil(count / grid_rows)

    for slot in range(1, count + 1):
        panel = figure.add_subplot(grid_rows, grid_cols, slot,
                                   xticks=[], yticks=[])
        eigen = input_pca.components_[slot - 1]
        panel.imshow(eigen.reshape(size), cmap='Greys_r')

    plt.axis('off')
    plt.show()
    
# Fit a PCA per class and show its eigenimages.
print_pca(eigenimages(input_mat=covid_pos_im_matrix, title='COVID'))
print_pca(eigenimages(input_mat=covid_neg_im_matrix, title='NORMAL'))