In [1]:
import keras
import os
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPool2D, Flatten, Dropout
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.vgg16 import VGG16
from keras.optimizers import Adam, RMSprop
from keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix, classification_report

import pandas as pd
import numpy as np
import zipfile

In [2]:
# Uncomment if you need to download the data, it will say the location (folder is hidden)
#!darwin dataset pull v7-labs/covid-19-chest-x-ray-dataset:all-images

In [3]:
ziploc ="C:/Users/anahs/OneDrive - Universiteit Utrecht/SECOND PERIOD/Pattern Recognition/CXR_PROJECT/"
imgloc = "C:/Users/anahs/.darwin/datasets/v7-labs/covid-19-chest-x-ray-dataset/images"

test_zf = zipfile.ZipFile(ziploc+'test_data.zip') 
test_df = pd.read_csv(test_zf.open('val_data.csv'))

In [4]:
def label(df):
    df["label"] = np.nan
    df.loc[df["type"] == "No Pneumonia", 'label'] = 0
    df.loc[df["type"] == "Bacterial Pneumonia", 'label'] = 1
    df.loc[df["type"] == "Viral Pneumonia", 'label'] = 2
    df.loc[df["Covid"] == True, 'label'] = 3
    return df

test = label(test_df)
test_df = test[test['label'].notna()]

In [5]:
# rescale and reproduceability
seed = 4
datagenerator =  ImageDataGenerator(rescale= 1/255)

# Make test data
test_data = datagenerator.flow_from_dataframe(
        dataframe = test_df,
        directory = imgloc,
        x_col = "ogfilename",
        y_col = None,
        class_mode = None,
        batch_size = 1,
        seed = seed,
        shuffle = False,
        target_size = (224,224), # changed values for vgg16
        keep_aspect_ratio = True,
        validate_filenames= True)

Found 641 validated image filenames.


## VGG16_imagenet_free_model

In [6]:
model = tf.keras.models.load_model('VGG16_imagenet_free_model')

In [7]:
#Predict
y_prediction = model.predict(test_data) #for each observation the model returns the probability of belonging to each class 



In [9]:
y_prediction

array([[2.7843351e-02, 4.9268880e-01, 2.5068099e-02, 8.8289849e-02],
       [8.7550590e-03, 6.4834601e-01, 2.1960461e-03, 1.2821660e-02],
       [9.8913200e-02, 2.3882896e-01, 2.3685645e-03, 1.2537321e-02],
       ...,
       [2.2415612e-04, 7.3379558e-04, 9.9807978e-01, 8.2566345e-05],
       [2.5945667e-03, 1.7011426e-03, 9.9890202e-01, 1.3603761e-03],
       [6.9178072e-06, 3.2912949e-04, 9.9997342e-01, 9.5609561e-05]],
      dtype=float32)

In [10]:
y_prediction = np.argmax(y_prediction, axis=1) #choose the class with most probability

#Create confusion matrix and normalizes it over predicted (columns)
result = confusion_matrix(test_df.label, y_prediction, normalize='pred')
print(result)

[[0.67555556 0.01529052 0.05263158 0.        ]
 [0.13333333 0.73700306 0.11842105 0.15384615]
 [0.19111111 0.24770642 0.13157895 0.84615385]
 [0.         0.         0.69736842 0.        ]]


In [11]:
result = result.astype('float') / result.sum(axis=1)[:, np.newaxis]

#The diagonal entries are the accuracies of each class
acc_perclass = result.diagonal()

print("Accuracy por clase")
print(acc_perclass)
print("Healthy      Bacterial     Viral     Covid")

Accuracy por clase
[0.90864272 0.64502077 0.09288689 0.        ]
Healthy      Bacterial     Viral     Covid


In [12]:
print(classification_report(test_df.label, y_prediction))

              precision    recall  f1-score   support

         0.0       0.68      0.94      0.79       161
         1.0       0.74      0.85      0.79       282
         2.0       0.13      0.07      0.09       145
         3.0       0.00      0.00      0.00        53

    accuracy                           0.63       641
   macro avg       0.39      0.47      0.42       641
weighted avg       0.52      0.63      0.57       641



## VGG16_free_model

In [13]:
model2 = tf.keras.models.load_model('VGG16_free_model')

In [14]:
#Predict
y_prediction2 = model2.predict(test_data) #for each observation the model returns the probability of belonging to each class 



In [15]:
y_prediction2

array([[4.39927697e-01, 1.91484001e-02, 1.81896798e-02, 4.84982729e-02],
       [9.78682004e-03, 2.83165574e-01, 1.71474740e-02, 1.02220014e-01],
       [7.47225881e-02, 2.38302737e-01, 2.47117458e-03, 1.51563302e-01],
       ...,
       [1.19934864e-01, 1.78222952e-03, 9.93645668e-01, 7.11614266e-05],
       [3.45302396e-04, 1.04672397e-02, 9.96992767e-01, 4.46134545e-02],
       [1.67350993e-06, 8.75800652e-06, 9.99988437e-01, 1.50614064e-02]],
      dtype=float32)

In [16]:
y_prediction2 = np.argmax(y_prediction2, axis = 1) #choose the class with most probability

#Create confusion matrix and normalizes it over predicted (columns)
result2 = confusion_matrix(test_df.label,  y_prediction2, normalize='pred')
print(result2)

[[0.70935961 0.01162791 0.06185567 0.05325444]
 [0.13793103 0.86046512 0.26804124 0.47337278]
 [0.15270936 0.12790698 0.12371134 0.47337278]
 [0.         0.         0.54639175 0.        ]]


In [17]:
result2 = result2.astype('float') / result2.sum(axis=1)[:, np.newaxis]

#The diagonal entries are the accuracies of each class
acc_perclass2 = result2.diagonal()

print("Accuracy por clase")
print(acc_perclass2)
print("Healthy      Bacterial     Viral     Covid")

Accuracy por clase
[0.8484172  0.49457414 0.14094939 0.        ]
Healthy      Bacterial     Viral     Covid


In [18]:
print(classification_report(test_df.label, y_prediction2))

              precision    recall  f1-score   support

         0.0       0.71      0.89      0.79       161
         1.0       0.86      0.52      0.65       282
         2.0       0.12      0.08      0.10       145
         3.0       0.00      0.00      0.00        53

    accuracy                           0.47       641
   macro avg       0.42      0.38      0.39       641
weighted avg       0.58      0.47      0.51       641



## VGG16_imagenet_model

In [19]:
model3 = tf.keras.models.load_model('VGG16_imagenet_model')

In [20]:
#Predict
y_prediction3 = model3.predict(test_data) #for each observation the model returns the probability of belonging to each class 



In [21]:
y_prediction3

array([[1.3399896e-01, 6.8855561e-02, 3.1142685e-01, 5.1353570e-02],
       [4.2112532e-01, 1.0599787e-01, 3.3898681e-01, 9.7181043e-03],
       [2.6674975e-02, 3.3246219e-01, 9.7355209e-02, 6.3240066e-02],
       ...,
       [3.9061089e-03, 4.8470978e-05, 9.9950510e-01, 1.5719625e-04],
       [1.2133875e-03, 5.8876324e-02, 9.6298736e-01, 2.7357863e-02],
       [4.8025441e-04, 2.9228244e-02, 9.9175966e-01, 2.7654167e-02]],
      dtype=float32)

In [22]:
y_prediction3 = np.argmax(y_prediction3, axis=1) #choose the class with most probability

#Create confusion matrix and normalizes it over predicted (columns)
result3 = confusion_matrix(test_df.label, y_prediction3, normalize='pred')
result3

array([[0.59765625, 0.01129944, 0.03030303, 0.        ],
       [0.1953125 , 0.86440678, 0.39393939, 0.1       ],
       [0.20703125, 0.12429379, 0.30808081, 0.9       ],
       [0.        , 0.        , 0.26767677, 0.        ]])

In [23]:
result3 = result3.astype('float') / result3.sum(axis=1)[:, np.newaxis]

#The diagonal entries are the accuracies of each class
acc_perclass3 = result3.diagonal()

print("Accuracy por clase")
print(acc_perclass3)
print("Healthy      Bacterial     Viral     Covid")

Accuracy por clase
[0.93492077 0.55636852 0.20012969 0.        ]
Healthy      Bacterial     Viral     Covid


In [24]:
print(classification_report(test_df.label, y_prediction3))

              precision    recall  f1-score   support

         0.0       0.60      0.95      0.73       161
         1.0       0.86      0.54      0.67       282
         2.0       0.31      0.42      0.36       145
         3.0       0.00      0.00      0.00        53

    accuracy                           0.57       641
   macro avg       0.44      0.48      0.44       641
weighted avg       0.60      0.57      0.56       641

