In [1]:
import numpy as np
import pandas as pd
from PIL import Image, ImageOps
import matplotlib.pyplot as plt
import os
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras.applications.resnet50 import preprocess_input, ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout, GlobalAveragePooling2D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.initializers import Constant
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, accuracy_score, classification_report

In [2]:
def prepare_data(ds, shuffle=False, augment=False):
    ds = ds.map(lambda x, y: (preprocess_input(x), y))
    if shuffle:
        ds = ds.shuffle(1000)
    ds = ds.batch(batch_size)
    if augment:
        ds = ds.map(
            lambda x, y: (data_augmentation(x, training=True), y),
            num_parallel_calls=AUTOTUNE
        )
    return ds.prefetch(buffer_size=AUTOTUNE)

In [3]:
metadata = pd.read_csv('data/Chest_xray_Corona_Metadata.csv').drop('Unnamed: 0', axis=1)
metadata.head()

Unnamed: 0,X_ray_image_name,Label,Dataset_type,Label_2_Virus_category,Label_1_Virus_category
0,IM-0128-0001.jpeg,Normal,TRAIN,,
1,IM-0127-0001.jpeg,Normal,TRAIN,,
2,IM-0125-0001.jpeg,Normal,TRAIN,,
3,IM-0122-0001.jpeg,Normal,TRAIN,,
4,IM-0119-0001.jpeg,Normal,TRAIN,,


In [4]:
metadata['Label'].value_counts()

Pnemonia    4334
Normal      1576
Name: Label, dtype: int64

In [5]:
image_root_path = 'data/'

In [6]:
batch_size = 50
input_shape = (299, 299, 3)
AUTOTUNE = tf.data.AUTOTUNE

In [7]:
train_data = image_dataset_from_directory(
    'data/train/',
    labels='inferred',
    batch_size=50,
    image_size=input_shape[:2],
    seed=1,
    validation_split=0.2,
    subset='training'
)

Found 5286 files belonging to 2 classes.
Using 4229 files for training.


In [8]:
validation_data = image_dataset_from_directory(
    'data/train/',
    labels='inferred',
    batch_size=50,
    image_size=input_shape[:2],
    seed=1,
    validation_split=0.2,
    subset='validation'
)

Found 5286 files belonging to 2 classes.
Using 1057 files for validation.


In [9]:
test_data = image_dataset_from_directory(
    'data/test/',
    labels='inferred',
    image_size=input_shape[:2]
)

Found 624 files belonging to 2 classes.


In [10]:
iterator = iter(train_data)
y_train = np.array([])
for i in iterator:
    y_train = np.concatenate([y_train, i[1].numpy()])

y_train.mean()

0.7450934026956727

In [11]:
iterator = iter(validation_data)
y_val = np.array([])
for i in iterator:
    y_val = np.concatenate([y_val, i[1].numpy()])
    
y_val.mean()

0.750236518448439

In [12]:
iterator = iter(test_data)
y_test = np.array([])
for i in iterator:
    y_test = np.concatenate([y_test, i[1].numpy()])

y_test.mean()

0.625

In [13]:
data_augmentation = tf.keras.Sequential([
layers.experimental.preprocessing.RandomFlip("horizontal_and_vertical"),
layers.experimental.preprocessing.RandomRotation(0.2),
layers.experimental.preprocessing.RandomHeight(0.1),
layers.experimental.preprocessing.RandomWidth(0.1),
layers.experimental.preprocessing.RandomZoom(0.1),
])

In [14]:
# train_data = prepare_data(train_data, shuffle=True, augment=True)
# validation_data = prepare_data(validation_data)
# test_data = prepare_data(test_data)
train_data = train_data.map(lambda x, y: (preprocess_input(x), y))
validation_data = validation_data.map(lambda x, y: (preprocess_input(x), y))
test_data = test_data.map(lambda x, y: (preprocess_input(x), y))
train_data_pf = train_data.prefetch(buffer_size=AUTOTUNE)
validation_data_pf = validation_data.prefetch(buffer_size=AUTOTUNE)
test_data_pf = test_data.prefetch(buffer_size=AUTOTUNE)

In [15]:
# Calculate class weights to balance data
unique_classes = metadata.Label.unique()
all_rows = metadata.Label.to_numpy()
weights = compute_class_weight('balanced', classes=unique_classes, y=all_rows)

In [16]:
# set initial output bias for model to speed up convergence
initial_bias = Constant(weights[1] / weights[0])

In [17]:
base_model = ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=input_shape
)

base_model.trainable = False
base_model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "resnet50"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 299, 299, 3) 0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 305, 305, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1_conv (Conv2D)             (None, 150, 150, 64) 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
conv1_bn (BatchNormalization)   (None, 150, 150

In [29]:
def create_model():
    model = Sequential()
    model.add(data_augmentation)
    model.add(base_model)
    model.add(GlobalAveragePooling2D())
    model.add(Flatten())
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='tanh', bias_initializer=initial_bias))
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

In [30]:
model = create_model()

In [31]:
checkpoint_path = 'model checkpoints/inception_resnet_v2/'

In [32]:
checkpoint = ModelCheckpoint(
    checkpoint_path,
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True
)

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=1
)

In [None]:
history = model.fit(
    train_data,
    batch_size=50,
    callbacks=[checkpoint, early_stopping],
    epochs=10,
    validation_data=validation_data,
    verbose=1
)

Epoch 1/10

In [None]:
plt.figure(figsize=(8,5))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['training loss', 'validation loss'])
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('Training and validation loss')
plt.show()

In [24]:
model.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x21307b8ef10>

In [25]:
y_proba = model.predict(test_data)

In [26]:
y_pred = np.where(y_proba >= 0.5, 1, 0)

In [27]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.34      0.13      0.19       234
         1.0       0.62      0.85      0.72       390

    accuracy                           0.58       624
   macro avg       0.48      0.49      0.45       624
weighted avg       0.51      0.58      0.52       624



In [28]:
y_pred.mean()

0.8573717948717948