In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import os
import tensorflow as tf

In [2]:
# Load the image metadata table (about 112K rows) and discard the
# 'Unnamed: 11' column (presumably an artifact of a trailing comma in the
# CSV header -- TODO confirm).
data = pd.read_csv('Data_Entry_2017.csv').drop(columns='Unnamed: 11')
data

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143
...,...,...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,Mass|Pneumonia,1,30801,39,M,PA,2048,2500,0.168,0.168
112116,00030802_000.png,No Finding,0,30802,29,M,PA,2048,2500,0.168,0.168
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,0.168
112118,00030804_000.png,No Finding,0,30804,30,F,PA,2048,2500,0.168,0.168


In [3]:
# Build a lookup from PNG file name to its on-disk location (searching
# images*/<subdir>/*.png), then attach that location to each metadata row.
images = dict(
    (os.path.basename(found), found)
    for found in glob(os.path.join('images*', '*', '*.png'))
)
print('Scans found:', len(images), ', Total Headers', len(data))
# dict.get returns None for any file name that was not found on disk.
data['path'] = data['Image Index'].map(images.get)
data.sample(3)

Scans found: 112120 , Total Headers 112120


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],path
11883,00003098_011.png,No Finding,11,3098,36,M,PA,2048,2500,0.168,0.168,images_002/images/00003098_011.png
92786,00023158_004.png,No Finding,4,23158,65,F,AP,3056,2544,0.139,0.139,images_010/images/00023158_004.png
23046,00006096_001.png,Atelectasis|Pneumothorax,1,6096,47,M,PA,2992,2991,0.143,0.143,images_003/images/00006096_001.png


In [4]:
# Binary target derived from the raw labels: 'Healthy' when the label is exactly
# 'No Finding', otherwise 'Sick'. np.where is evaluated once over the whole
# column instead of once per row through apply(), which allocated a 0-d array
# for every row.
data['Healthy'] = np.where(data['Finding Labels'] == 'No Finding', 'Healthy', 'Sick')

In [5]:
# Show the frame again to check the newly added 'path' and 'Healthy' columns.
data

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],path,Healthy
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,images_001/images/00000001_000.png,Sick
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,images_001/images/00000001_001.png,Sick
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,images_001/images/00000001_002.png,Sick
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,images_001/images/00000002_000.png,Healthy
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,images_001/images/00000003_000.png,Sick
...,...,...,...,...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,Mass|Pneumonia,1,30801,39,M,PA,2048,2500,0.168,0.168,images_012/images/00030801_001.png,Sick
112116,00030802_000.png,No Finding,0,30802,29,M,PA,2048,2500,0.168,0.168,images_012/images/00030802_000.png,Healthy
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,0.168,images_012/images/00030803_000.png,Healthy
112118,00030804_000.png,No Finding,0,30804,30,F,PA,2048,2500,0.168,0.168,images_012/images/00030804_000.png,Healthy


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the images for validation. The split is seeded so that a
# fresh Restart & Run All reproduces the same partition, and stratified on the
# binary label so both parts keep the same Healthy/Sick ratio.
# NOTE(review): rows are split per image, so images of the same 'Patient ID'
# can end up on both sides. Consider a group-wise split (e.g. GroupShuffleSplit
# on 'Patient ID') if patient-level leakage matters for the evaluation.
train_df, test_df = train_test_split(data,
                                     test_size=0.25,
                                     random_state=42,
                                     stratify=data['Healthy'])
print('train', train_df.shape[0], 'validation', test_df.shape[0])

train 84090 validation 28030


In [7]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

image_size = (128, 128)

# Generator that standardises each image individually and applies mild
# random augmentation.
img_gen = ImageDataGenerator(
    # per-image normalisation
    samplewise_center=True,
    samplewise_std_normalization=True,
    # flips: left/right only
    horizontal_flip=True,
    vertical_flip=False,
    # small geometric perturbations
    rotation_range=5,
    width_shift_range=0.1,
    height_shift_range=0.05,
    shear_range=0.1,
    zoom_range=0.15,
    # how points outside the transformed image boundaries are filled
    fill_mode='reflect',
)

In [8]:
all_labels = list(data.Healthy.unique())
image_size = (128, 128)

train_gen = img_gen.flow_from_dataframe(dataframe=train_df, directory=None, x_col = 'path',
y_col = 'Healthy', classmode = 'categorical',
classes = all_labels, targetsize = image_size, colormode = 'grayscale',
batch_size = 32)

test_gen = img_gen.flow_from_dataframe(dataframe=test_df, directory=None, x_col = 'path',
y_col = 'Healthy', classmode = 'categorical',
classes = all_labels, targetsize = image_size, colormode = 'grayscale',
batch_size = 256)

test_X, test_Y = next(img_gen.flow_from_dataframe(dataframe=test_df,
directory=None,
x_col = 'path', y_col = 'Healthy',
classmode = 'categorical', classes = all_labels,
targetsize = image_size,
colormode = 'grayscale', batchsize = 2048))


Found 84090 validated image filenames belonging to 2 classes.
Found 28030 validated image filenames belonging to 2 classes.
Found 28030 validated image filenames belonging to 2 classes.


In [9]:
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau
weight_path="{}_healthy_weights.best.hdf5".format('xray_class')

checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1, 
                             save_best_only=True, mode='min', save_weights_only = True)

early = EarlyStopping(monitor="val_loss", 
                      mode="min", 
                      patience=3)
callbacks_list = [checkpoint, early]

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense



# dimensions of our images.
img_width, img_height = 256, 256


model = Sequential()
model.add(Conv2D(128, (3, 3), input_shape=(256,256,3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(256, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(512, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense((2)))
model.add(Activation('softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['binary_accuracy', 'mae'])

In [None]:
# Train for up to 20 epochs; the callbacks checkpoint the best weights and stop
# early when the validation loss stalls.
# len(train_gen) is the number of batches in one pass over the training data,
# so steps_per_epoch follows the dataset size and batch size instead of a
# hard-coded count. The returned History object is kept for later inspection.
history = model.fit(train_gen,
                    steps_per_epoch=len(train_gen),
                    validation_data=(test_X, test_Y),
                    epochs=20,
                    callbacks=callbacks_list)