In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import zipfile
from keras_preprocessing.image import img_to_array, array_to_img, load_img, ImageDataGenerator
import matplotlib.pyplot as plt
from skimage.io import imread

In [2]:
# Image and data locs:
ziploc ="E:/School/UU/PATREC/CXR_project/"
imgloc = "E:/School/UU\PATREC/.darwin/datasets/v7-labs/covid-19-chest-x-ray-dataset/images/"
train_zf = zipfile.ZipFile(ziploc+'train_data.zip') 
val_zf = zipfile.ZipFile(ziploc+'val_data.zip') 
test_zf = zipfile.ZipFile(ziploc+'test_data.zip') 
train_df = pd.read_csv(train_zf.open('train_data.csv'))
val_df = pd.read_csv(val_zf.open('val_data.csv'))
test_df = pd.read_csv(test_zf.open('val_data.csv'))

In [3]:
def label(df):
    df["label"] = np.nan
    df.loc[df["type"] == "No Pneumonia", 'label'] = "A"
    df.loc[df["type"] == "Bacterial Pneumonia", 'label'] = "B"
#     df.loc[df["type"] == "Fungal Pneumonia", 'label'] = "C"
    df.loc[df["type"] == "Viral Pneumonia", 'label'] = "D"
    df.loc[df["Covid"] == True, 'label'] = "C"
#     df.loc[df["type"] == "Undefined Pneumonia", 'label'] = "F"
    return df

In [4]:
train = label(train_df)
val = label(val_df)
test = label(test_df)

In [5]:
# Drop the undefined data labels
train2 = train[train['label'].notna()]
val2 = val[val['label'].notna()]
test2 = test[test['label'].notna()]

In [7]:
test2

Unnamed: 0,filename,type,ogfilename,lung1,lung2,view,Covid,width,height,json_filename,label
0,00002438.jpeg,Bacterial Pneumonia,person543_bacteria_2279.jpeg,"{'path': [{'x': 338, 'y': 45}, {'x': 334, 'y':...","{'path': [{'x': 521, 'y': 32.0}, {'x': 521, 'y...",,False,920.0,736.0,person543_bacteria_2279.json,B
1,00004180.jpeg,Bacterial Pneumonia,person1333_bacteria_3386.jpeg,"{'path': [{'x': 610, 'y': 165}, {'x': 608, 'y'...","{'path': [{'x': 443.0, 'y': 150}, {'x': 442, '...",,False,1136.0,688.0,person1333_bacteria_3386.json,B
2,00002785.jpeg,Bacterial Pneumonia,person40_bacteria_202.jpeg,"{'path': [{'x': 438.0, 'y': 114}, {'x': 430, '...","{'path': [{'x': 612, 'y': 88}, {'x': 612, 'y':...",,False,1216.0,960.0,person40_bacteria_202.json,B
3,00006195.jpeg,Bacterial Pneumonia,person133_bacteria_633.jpeg,"{'path': [{'x': 395, 'y': 25.0}, {'x': 394, 'y...","{'path': [{'x': 575, 'y': 41}, {'x': 572, 'y':...",,False,976.0,632.0,person133_bacteria_633.json,B
4,00002224.jpeg,Bacterial Pneumonia,person71_bacteria_349.jpeg,"{'path': [{'x': 556, 'y': 83}, {'x': 556, 'y':...","{'path': [{'x': 325, 'y': 77}, {'x': 321, 'y':...",,False,928.0,656.0,person71_bacteria_349.json,B
...,...,...,...,...,...,...,...,...,...,...,...
641,00006906.jpg,Viral Pneumonia,covid-19-pneumonia-58-day-9.jpg,"{'path': [{'x': 853.0, 'y': 129}, {'x': 838.0,...","{'path': [{'x': 1213, 'y': 170}, {'x': 1213, '...",View/PA,True,2267.0,1974.0,covid-19-pneumonia-58-day-9.json,C
642,00006644.jpg,Viral Pneumonia,887db78f.jpg,"{'path': [{'x': 1440, 'y': 187}, {'x': 1440, '...","{'path': [{'x': 1029, 'y': 190.0}, {'x': 1023,...",View/AP_Supine,True,2000.0,2000.0,887db78f.json,C
643,00006757.jpeg,Viral Pneumonia,66298CBF-6F10-42D5-A688-741F6AC84A76.jpeg,,,View/Axial,True,1206.0,1263.0,66298CBF-6F10-42D5-A688-741F6AC84A76.json,C
644,00006774.jpg,Viral Pneumonia,a361d7b7.jpg,"{'path': [{'x': 806, 'y': 188}, {'x': 801.0, '...","{'path': [{'x': 1190.0, 'y': 155}, {'x': 1190....",View/PA,True,2000.0,2000.0,a361d7b7.json,C


In [6]:
train2.label.value_counts()

B    2252
A    1284
D    1159
C     416
Name: label, dtype: int64

In [7]:
def generate_augmented(df, filepath, batch_size, seed, shuffle):
#   Things that can be augmented
    datagenerator = ImageDataGenerator(rotation_range=10, # rotation
        width_shift_range=0.2, # horizontal shift
        height_shift_range=0.2, # vertical shift
        zoom_range=0.2, # zoom
        horizontal_flip=True, # horizontal flip
        brightness_range=[0.2,1.2]) # brightness)
    
#   generate images on the go from dataframe
    generator = datagenerator.flow_from_dataframe(
        dataframe = df, # what df to work with
        directory = filepath, # file location
        x_col = "ogfilename", # which file to get
        y_col = "label", # so it knows the label of new augmented image
        class_mode = "categorical", #converts abcde into categorical
        batch_size = batch_size,
        seed = seed,
        shuffle = shuffle,
        target_size = (224,224), # changed value for vgg16
        keep_aspect_ratio = True,
        validate_filenames= True)
    return generator

In [10]:
# New idea just grab len shortest from every class that way they all get generated the same.
covid_train = train2[train2['label']== "C"]
no_train = train2[train2['label']== "A"].sample(len(covid_train),random_state=1)
viral_train = train2[train2['label']== "D"].sample(len(covid_train),random_state=1)
bac_train = train2[train2['label']== "B"].sample(len(covid_train),random_state=1)

all_train = pd.concat([covid_train, no_train, viral_train, bac_train])
all_train_generated = generate_augmented(all_train, imgloc, batch_size = 20, seed= 4, shuffle = True)

Found 1664 validated image filenames belonging to 4 classes.


In [11]:
def generate_test_data(df, filepath, batch_size, seed, mode, ycols):
#     Just rescale here for test
    datagenerator =  ImageDataGenerator(rescale= 1/255)
    test_generator = datagenerator.flow_from_dataframe(
        dataframe = df,
        directory = filepath,
        x_col = "ogfilename",
        y_col = ycols,
        class_mode = mode,
        batch_size = batch_size,
        seed = seed,
        shuffle = False,
        target_size = (224,224), # changed values for vgg16
        keep_aspect_ratio = True,
        validate_filenames= True)
    return test_generator

In [12]:
test_generator = generate_test_data(test, imgloc, batch_size = 1, seed =4, mode =None, ycols=None)
val_generator = generate_test_data(val2, imgloc, batch_size = 20, seed =4, mode= "categorical", ycols = "label")

Found 646 validated image filenames.
Found 640 validated image filenames belonging to 4 classes.


In [13]:
cnn = tf.keras.models.Sequential()
cnn.add(tf.keras.layers.Conv2D(filters=48, kernel_size=3, activation='relu', input_shape=[256, 256, 3]))
cnn.add(tf.keras.layers.MaxPool2D(pool_size=2, strides=2))
cnn.add(tf.keras.layers.Conv2D(filters=48, kernel_size=3, activation='relu'))
cnn.add(tf.keras.layers.MaxPool2D(pool_size=2, strides=2))
cnn.add(tf.keras.layers.Conv2D(filters=32, kernel_size=3, activation='relu'))
cnn.add(tf.keras.layers.MaxPool2D(pool_size=2, strides=2))
cnn.add(tf.keras.layers.Flatten())
cnn.add(tf.keras.layers.Dense(128, activation='relu'))
cnn.add(tf.keras.layers.Dense(64, activation='relu'))
cnn.add(tf.keras.layers.Dense(4, activation='softmax'))
cnn.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [14]:
cnn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 254, 254, 48)      1344      
                                                                 
 max_pooling2d (MaxPooling2D  (None, 127, 127, 48)     0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 125, 125, 48)      20784     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 62, 62, 48)       0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 60, 60, 32)        13856     
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 30, 30, 32)       0

In [28]:
# Trying vgg16 following: https://towardsdatascience.com/step-by-step-vgg16-implementation-in-keras-for-beginners-a833c686ae6c

model =tf.keras.models.Sequential()
model.add(tf.keras.layers.Conv2D(input_shape=(224,224,3),filters=64,kernel_size=(3,3),padding="same", activation="relu"))
model.add(tf.keras.layers.Conv2D(filters=64,kernel_size=(3,3),padding="same", activation="relu"))
model.add(tf.keras.layers.MaxPool2D(pool_size=(2,2),strides=(2,2)))
model.add(tf.keras.layers.Conv2D(filters=128, kernel_size=(3,3), padding="same", activation="relu"))
model.add(tf.keras.layers.Conv2D(filters=128, kernel_size=(3,3), padding="same", activation="relu"))
model.add(tf.keras.layers.MaxPool2D(pool_size=(2,2),strides=(2,2)))
model.add(tf.keras.layers.Conv2D(filters=256, kernel_size=(3,3), padding="same", activation="relu"))
model.add(tf.keras.layers.Conv2D(filters=256, kernel_size=(3,3), padding="same", activation="relu"))
model.add(tf.keras.layers.Conv2D(filters=256, kernel_size=(3,3), padding="same", activation="relu"))
model.add(tf.keras.layers.MaxPool2D(pool_size=(2,2),strides=(2,2)))
model.add(tf.keras.layers.Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
model.add(tf.keras.layers.Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
model.add(tf.keras.layers.Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
model.add(tf.keras.layers.MaxPool2D(pool_size=(2,2),strides=(2,2)))
model.add(tf.keras.layers.Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
model.add(tf.keras.layers.Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
model.add(tf.keras.layers.Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu"))
model.add(tf.keras.layers.MaxPool2D(pool_size=(2,2),strides=(2,2)))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(units=4096,activation="relu"))
model.add(tf.keras.layers.Dense(units=4096,activation="relu"))
model.add(tf.keras.layers.Dense(units=4, activation="softmax"))
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [29]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_29 (Conv2D)          (None, 224, 224, 64)      1792      
                                                                 
 conv2d_30 (Conv2D)          (None, 224, 224, 64)      36928     
                                                                 
 max_pooling2d_13 (MaxPoolin  (None, 112, 112, 64)     0         
 g2D)                                                            
                                                                 
 conv2d_31 (Conv2D)          (None, 112, 112, 128)     73856     
                                                                 
 conv2d_32 (Conv2D)          (None, 112, 112, 128)     147584    
                                                                 
 max_pooling2d_14 (MaxPoolin  (None, 56, 56, 128)      0         
 g2D)                                                 

In [30]:
# from keras.callbacks import ModelCheckpoint, EarlyStopping
# checkpoint = ModelCheckpoint("vgg16_1.h5", monitor='val_acc', verbose=1, save_best_only=True, save_weights_only=False, mode='auto', save_freq=1)
# early = EarlyStopping(monitor='val_acc', min_delta=0, patience=20, verbose=1, mode='auto')
batch_size = 20
hist = model.fit_generator(
        steps_per_epoch=1600// batch_size, 
        generator=all_train_generated, 
        validation_data= val_generator, 
        validation_steps=600//batch_size, 
        epochs=10)

  hist = model.fit_generator(


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
