In [8]:
# Load general libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
from os import path, listdir
import zipfile
import warnings
from collections import Counter

warnings.filterwarnings('ignore')

sns.set_style('whitegrid')
%matplotlib inline

  import pandas.util.testing as tm


In [0]:
# load model libraries

from keras.applications import VGG16
from keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
import keras.optimizers
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Dropout
from keras.layers import Conv2D, MaxPooling2D, AveragePooling2D
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [0]:
# load the label data

labels_df = pd.read_csv('/content/drive/My Drive/Capstone/sample_labels.csv', low_memory=False)

In [0]:
# create Y/N field for conditions
# the image either does or doesn't have one present

def conditions(x):
  if 'No Finding' in x:
    return 'N'
  return 'Y'

labels_df['HasCondition'] = labels_df['Finding Labels'].map(conditions)

In [0]:
# one hot encode results

labels_df_cat = pd.concat([labels_df.drop(['HasCondition'], axis=1), pd.get_dummies(labels_df['HasCondition'], prefix='Condition')], axis=1)

In [0]:
# add one hot encoding for labels
# used to include multiple conditions

conditions = ['No Finding','Infiltration','Atelectasis','Effusion','Nodule','Pneumothorax','Mass','Consolidation','Pleural_Thickening','Cardiomegaly','Emphysema','Fibrosis','Edema','Pneumonia','Hernia']

for i in conditions :
    labels_df[i] = labels_df['Finding Labels'].apply(lambda x: 1 if i in x else 0)

In [0]:
labels_df.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImageWidth,OriginalImageHeight,OriginalImagePixelSpacing_x,OriginalImagePixelSpacing_y,HasCondition,No Finding,Infiltration,Atelectasis,Effusion,Nodule,Pneumothorax,Mass,Consolidation,Pleural_Thickening,Cardiomegaly,Emphysema,Fibrosis,Edema,Pneumonia,Hernia
0,00000013_005.png,Emphysema|Infiltration|Pleural_Thickening|Pneu...,5,13,060Y,M,AP,3056,2544,0.139,0.139,Y,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0
1,00000013_026.png,Cardiomegaly|Emphysema,26,13,057Y,M,AP,2500,2048,0.168,0.168,Y,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0
2,00000017_001.png,No Finding,1,17,077Y,M,AP,2500,2048,0.168,0.168,N,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,00000030_001.png,Atelectasis,1,30,079Y,M,PA,2992,2991,0.143,0.143,Y,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,00000032_001.png,Cardiomegaly|Edema|Effusion,1,32,055Y,F,AP,2500,2048,0.168,0.168,Y,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0


In [0]:
# load images from directory
# save them as a list

img_path = '/content/image_path/sample/images/'
image_shape = (128,128)
scans = list()
img_id = list()

for file in listdir(img_path):
  picture = load_img(img_path+file, target_size=image_shape)
  picture = img_to_array(picture)
  scans.append(picture)
  img_id.append(file)

In [0]:
# combine image IDs from download with their labels

img_id_df = pd.DataFrame(img_id, columns = ['IMG_ID'])
image_labels = labels_df_cat[['Image Index','Condition_N','Condition_Y']]

img_id_df = pd.merge(img_id_df, image_labels, how = 'left', left_on = ['IMG_ID'], right_on = ['Image Index'])

In [0]:
# create table with just labels for model input

labels = img_id_df[['Condition_N','Condition_Y']]

These blocks are the steps that stage the data for the model inputs

In [0]:
# convert scans and labels to arrays
# makes it easier to load into model

all_scans = np.asarray(scans)
all_labels = np.asarray(labels)

In [0]:
# set up data for the model

seed = 42

X = all_scans
y = all_labels

# this is for original data
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.2, random_state = seed)

#configure data using image data generator

train_data_gen = ImageDataGenerator(
    rescale = 1.0/255.0,
    shear_range = 0.2,
    zoom_range = 0.2,
    horizontal_flip = True)

test_data_gen = ImageDataGenerator(
    rescale = 1.0/255.0,
    shear_range = 0.2,
    zoom_range = 0.2,
    horizontal_flip = True)


In [0]:
# define data for model

train_samples = len(Xtrain)
test_samples = len(Xtest)

# final staging of data

train_data = train_data_gen.flow(np.array(Xtrain), ytrain, batch_size = batch_size)
test_data = test_data_gen.flow(np.array(Xtest), ytest, batch_size = batch_size)

In [0]:
# build the model

baseModel = VGG16(weights="imagenet", include_top=False, input_shape=(128, 128, 3))

model = Sequential()
model.add(baseModel)
model.add(Flatten())
model.add(Dense(64, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation = 'softmax'))

init_lr = 0.001
epochs = 25
bs = 75


# compile the model
opt = keras.optimizers.Adam(lr = init_lr, decay = init_lr / epochs)
model.compile(optimizer = opt, loss = 'binary_crossentropy', metrics = ['accuracy'])


# define early stopping
#stopping = EarlyStopping(monitor="loss", mode="min")

# calculate steps per epoch for training and validation
training_steps = train_samples // bs
test_steps = test_samples // bs




In [0]:
# define callbacks

stopping = EarlyStopping(monitor = 'acc', patience = 3)

reduceLR = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.1, patience = 2)

callback_list = [stopping, reduceLR]

In [33]:
# train the model
# this model uses balanced batch generator

model_train = model.fit_generator(train_data,
                                 steps_per_epoch = training_steps,
                                 epochs = 15,
                                 validation_data = test_data,
                                 validation_steps = test_steps,
                                 callbacks = callback_list)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [39]:
# plot model training results

print("[INFO] evaluating network...")
predIdxs = model.predict(Xtest, batch_size=bs)
    
    
# for each image in the testing set we need to find the index of the
# label with corresponding largest predicted probability
predIdxs = np.argmax(predIdxs, axis=1)
    
# show a nicely formatted classification report
print(classification_report(ytest.argmax(axis=1), predIdxs))
    

# compute the confusion matrix and and use it to derive the raw
# accuracy, sensitivity, and specificity
    
cm = confusion_matrix(ytest.argmax(axis=1), predIdxs)
total = sum(sum(cm))
acc = (cm[0, 0] + cm[1, 1]) / total
sensitivity = cm[0, 0] / (cm[0, 0] + cm[0, 1])
specificity = cm[1, 1] / (cm[1, 0] + cm[1, 1])
   
# show the confusion matrix, accuracy, sensitivity, and specificity
print(cm)
print("acc: {:.4f}".format(acc))
print("sensitivity: {:.4f}".format(sensitivity))
print("specificity: {:.4f}".format(specificity))
    


[INFO] evaluating network...
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       618
           1       0.45      1.00      0.62       504

    accuracy                           0.45      1122
   macro avg       0.22      0.50      0.31      1122
weighted avg       0.20      0.45      0.28      1122

[[  0 618]
 [  0 504]]
acc: 0.4492
sensitivity: 0.0000
specificity: 1.0000
