In [1]:
# Mount G Drive files

from google.colab import drive
drive.mount('/content/drive')
root_path = '/content/drive/My Drive/Capstone/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
# bring in Kaggle API
# need to pull this file from Kaggle based on your user ID for the site

from google.colab import files

api = files.upload() 

Saving kaggle.json to kaggle.json


In [0]:
# load Kaggle

!pip install -q Kaggle

In [5]:
# make a directory to save the .json file

! mkdir =p ~/.kaggle
! cp kaggle.json ~/.kaggle/
! ls ~/.kaggle
! chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘=p’: File exists
mkdir: cannot create directory ‘/root/.kaggle’: File exists
kaggle.json


In [14]:
# load Kaggle data

! kaggle datasets download -d nih-chest-xrays/sample

Downloading sample.zip to /content
100% 4.20G/4.20G [01:31<00:00, 31.5MB/s]
100% 4.20G/4.20G [01:31<00:00, 49.5MB/s]


In [0]:
# save files in .zip folder to G Drive

!unzip -q sample.zip -d image_path

In [6]:
# Load general libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
from os import path, listdir
import zipfile
import warnings
from collections import Counter

warnings.filterwarnings('ignore')

sns.set_style('whitegrid')
%matplotlib inline

  import pandas.util.testing as tm


In [7]:
# Load library models

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, multilabel_confusion_matrix, classification_report
from keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.layers.convolutional import Conv2D, MaxPooling2D
import keras.optimizers
from keras.utils import to_categorical
from keras.callbacks import Callback, EarlyStopping
from sklearn.preprocessing import StandardScaler
from imblearn.keras import balanced_batch_generator
from imblearn.over_sampling import SMOTE

Using TensorFlow backend.


In [0]:
# load the label data

labels_df = pd.read_csv('/content/drive/My Drive/Capstone/sample_labels.csv', low_memory=False)

In [0]:
# create Y/N field for conditions
# the image either does or doesn't have one present

def conditions(x):
  if 'No Finding' in x:
    return 'N'
  return 'Y'

labels_df['HasCondition'] = labels_df['Finding Labels'].map(conditions)

In [0]:
# one hot encode results

labels_df_cat = pd.concat([labels_df.drop(['HasCondition'], axis=1), pd.get_dummies(labels_df['HasCondition'], prefix='Condition')], axis=1)

In [0]:
# add one hot encoding for labels
# used to include multiple conditions

conditions = ['No Finding','Infiltration','Atelectasis','Effusion','Nodule','Pneumothorax','Mass','Consolidation','Pleural_Thickening','Cardiomegaly','Emphysema','Fibrosis','Edema','Pneumonia','Hernia']

for i in conditions :
    labels_df[i] = labels_df['Finding Labels'].apply(lambda x: 1 if i in x else 0)

In [12]:
labels_df.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImageWidth,OriginalImageHeight,OriginalImagePixelSpacing_x,OriginalImagePixelSpacing_y,HasCondition,No Finding,Infiltration,Atelectasis,Effusion,Nodule,Pneumothorax,Mass,Consolidation,Pleural_Thickening,Cardiomegaly,Emphysema,Fibrosis,Edema,Pneumonia,Hernia
0,00000013_005.png,Emphysema|Infiltration|Pleural_Thickening|Pneu...,5,13,060Y,M,AP,3056,2544,0.139,0.139,Y,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0
1,00000013_026.png,Cardiomegaly|Emphysema,26,13,057Y,M,AP,2500,2048,0.168,0.168,Y,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0
2,00000017_001.png,No Finding,1,17,077Y,M,AP,2500,2048,0.168,0.168,N,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,00000030_001.png,Atelectasis,1,30,079Y,M,PA,2992,2991,0.143,0.143,Y,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,00000032_001.png,Cardiomegaly|Edema|Effusion,1,32,055Y,F,AP,2500,2048,0.168,0.168,Y,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0


In [0]:
# load images from directory
# save them as a list

img_path = '/content/image_path/sample/images/'
image_shape = (128,128)
scans = list()
img_id = list()

for file in listdir(img_path):
  picture = load_img(img_path+file, target_size=image_shape)
  picture = img_to_array(picture)
  scans.append(picture)
  img_id.append(file)

In [0]:
# combine image IDs from download with their labels

img_id_df = pd.DataFrame(img_id, columns = ['IMG_ID'])
image_labels = labels_df_cat[['Image Index','Condition_N','Condition_Y']]

img_id_df = pd.merge(img_id_df, image_labels, how = 'left', left_on = ['IMG_ID'], right_on = ['Image Index'])

In [0]:
# create table with just labels for model input

labels = img_id_df[['Condition_N','Condition_Y']]

These blocks are the steps that stage the data for the model inputs

In [0]:
# convert scans and labels to arrays
# makes it easier to load into model

all_scans = np.asarray(scans)
all_labels = np.asarray(labels)

In [0]:
# set up data for the model

seed = 42

X = all_scans
y = all_labels

# this is for original data
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.2, random_state = seed)

#configure data using image data generator

train_data_gen = ImageDataGenerator(
    rescale = 1.0/255.0,
    shear_range = 0.2,
    zoom_range = 0.2,
    horizontal_flip = True)

test_data_gen = ImageDataGenerator(
    rescale = 1.0/255.0,
    shear_range = 0.2,
    zoom_range = 0.2,
    horizontal_flip = True)


In [0]:
# define data for model

train_samples = len(Xtrain)
test_samples = len(Xtest)
batch_size = 100

# final staging of data

train_data = train_data_gen.flow(np.array(Xtrain), ytrain, batch_size = batch_size)
test_data = test_data_gen.flow(np.array(Xtest), ytest, batch_size = batch_size)

In [0]:
# build the model

model = Sequential()
model.add(Conv2D(32, (3,3), activation = 'relu', input_shape = (128,128,3)))
model.add(Conv2D(32, (3,3), activation = 'relu'))
model.add(MaxPooling2D((2,2)))

model.add(Dropout(0.2))
model.add(Conv2D(64, (3,3), activation = 'relu'))
model.add(Conv2D(64, (3,3), activation = 'relu'))
model.add(MaxPooling2D((2,2)))

model.add(Dropout(0.2))
model.add(Conv2D(128, (3,3), activation = 'relu'))
model.add(Conv2D(128, (3,3), activation = 'relu'))
model.add(MaxPooling2D((2,2)))

model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(128, activation = 'relu'))
# final output layer
model.add(Dense(2, activation = 'sigmoid'))

# compile the model
opt = keras.optimizers.rmsprop(lr = 0.001)
model.compile(optimizer = opt, loss = 'binary_crossentropy', metrics = ['accuracy'])


# define early stopping
#stopping = EarlyStopping(monitor="loss", mode="min")

# calculate steps per epoch for training and validation
training_steps = train_samples // batch_size
test_steps = test_samples // batch_size




In [0]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 126, 126, 32)      896       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 124, 124, 32)      9248      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 62, 62, 32)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 62, 62, 32)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 60, 60, 64)        18496     
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 58, 58, 64)        36928     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 29, 29, 64)       

In [0]:
# train the model
# this model uses balanced batch generator

model_train = model.fit_generator(train_data,
                                 steps_per_epoch = training_steps,
                                 epochs = 15,
                                 validation_data = test_data,
                                 validation_steps = test_steps)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [0]:
y_pred = model.predict(Xtest)
#y_pred[y_pred >= 0.5] = 1
#y_pred[y_pred < 0.5] = 0

In [0]:
# ROC score

roc = roc_auc_score(ytest.astype(int), y_pred.astype(int))
print('ROC AUC: {:.3f}'.format(roc))

ROC AUC: 0.578
