### Overview
We train a simple MobileNet-based model from scratch to see how well it can classify 14 different disease labels in the X-rays. The notebook mainly shows how to use the HDF5 output, to make getting started easier.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from glob import glob
%matplotlib inline
import matplotlib.pyplot as plt
import h5py
from keras.utils.io_utils import HDF5Matrix

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
h5_path = './chest_xray.h5'
# Per-disease indicator columns stored in the HDF5 file; their order here
# defines the column order of `disease_vec`.
disease_vec_labels = [
    'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion',
    'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule',
    'Pleural_Thickening', 'Pneumonia', 'Pneumothorax',
]
with h5py.File(h5_path, 'r') as h5_data:
    # List every dataset in the file with its shape and dtype
    all_fields = list(h5_data.keys())
    for c_key in all_fields:
        c_dataset = h5_data[c_key]
        print(c_key, c_dataset.shape, c_dataset.dtype)
    # Read each label column fully into memory ([:]) and put them side by
    # side, giving one row per image and one column per disease.
    disease_vec = np.stack(
        [h5_data[label][:] for label in disease_vec_labels],
        1,
    )
print('Disease Vec:', disease_vec.shape)

Atelectasis (112120,) float64
Cardiomegaly (112120,) float64
Consolidation (112120,) float64
Edema (112120,) float64
Effusion (112120,) float64
Emphysema (112120,) float64
Fibrosis (112120,) float64
Finding Labels (112120,) |S100
Follow-up # (112120,) int64
Height] (112120,) int64
Hernia (112120,) float64
Image Index (112120,) |S16
Infiltration (112120,) float64
Mass (112120,) float64
No Finding (112120,) float64
Nodule (112120,) float64
OriginalImagePixelSpacing[x (112120,) float64
OriginalImage[Width (112120,) int64
Patient Age (112120,) int64
Patient Gender (112120,) |S1
Patient ID (112120,) int64
Pleural_Thickening (112120,) float64
Pneumonia (112120,) float64
Pneumothorax (112120,) float64
Unnamed: 11 (112120,) float64
View Position (112120,) |S2
images (112120, 256, 256, 1) uint8
path (112120,) |S25
y] (112120,) float64
Disease Vec: (112120, 14)


In [3]:
# Unbounded view of the image dataset, used to read the total image count
img_ds = HDF5Matrix(h5_path, 'images')
split_idx = img_ds.shape[0] // 2

# First half of the images -> training, second half -> testing
train_ds, test_ds = (
    HDF5Matrix(h5_path, 'images', end=split_idx),
    HDF5Matrix(h5_path, 'images', start=split_idx),
)
# Slice the label matrix at the same index so rows stay aligned with images
train_dvec, test_dvec = disease_vec[:split_idx], disease_vec[split_idx:]
print('Train Shape', train_ds.shape, 'test shape', test_ds.shape)

Train Shape (56060, 256, 256, 1) test shape (56060, 256, 256, 1)


In [4]:
from keras.applications.mobilenet import MobileNet
from keras.models import Sequential
from keras.layers import Flatten, Dense, Dropout, BatchNormalization, AveragePooling2D

# MobileNet backbone without its classification head, trained from scratch
# (weights=None) on single-channel images. Spatial dimensions are left open
# here and are fixed by the layers placed in front of it below.
raw_model = MobileNet(input_shape=(None, None, 1), include_top = False, weights = None)

full_model = Sequential()
# Downsample by 2 in each spatial dimension before the backbone
full_model.add(AveragePooling2D((2,2), input_shape = img_ds.shape[1:]))
# Normalise the raw pixel values inside the model instead of preprocessing them
full_model.add(BatchNormalization())
full_model.add(raw_model)
full_model.add(Flatten())
full_model.add(Dropout(0.5))
# The hidden layer needs a non-linearity: with the default (linear) activation
# this layer and the output layer collapse into a single linear map, so the
# extra layer would add parameters without adding modelling capacity.
full_model.add(Dense(64, activation = 'relu'))
# One independent sigmoid output per disease label
full_model.add(Dense(disease_vec.shape[1], activation = 'sigmoid'))
# NOTE(review): with a multi-label sigmoid output, 'acc' is averaged over all
# labels and can look high if most labels are 0 - consider confirming results
# with a per-label metric. The name 'acc' is kept because the checkpoint and
# early-stopping callbacks monitor 'val_acc'.
full_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])
full_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
average_pooling2d_1 (Average (None, 128, 128, 1)       0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 128, 128, 1)       4         
_________________________________________________________________
mobilenet_1.00_None (Model)  multiple                  3228288   
_________________________________________________________________
flatten_1 (Flatten)          (None, 16384)             0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 16384)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                1048640   
_________________________________________________________________
dense_2 (Dense)              (None, 14)                910       
Total para

In [5]:
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau

# File that receives the best weights seen so far
file_path = "weights.best.hdf5"

# Write the weights only when validation accuracy reaches a new maximum
checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)
# Stop training once validation accuracy has gone 3 epochs without improving
early = EarlyStopping(monitor="val_acc", mode="max", patience=3)

callbacks_list = [checkpoint, early]

In [6]:
# Train on the first half of the data and validate on the second half.
# The call is the last expression of the cell, so the History object is displayed.
full_model.fit(
    train_ds,
    train_dvec,
    epochs=5,
    # 'batch' shuffles in batch-sized chunks, the option Keras provides for HDF5-backed data
    shuffle='batch',
    callbacks=callbacks_list,
    validation_data=(test_ds, test_dvec),
    verbose=True,
)

Train on 56060 samples, validate on 56060 samples
Epoch 1/5

Epoch 00001: val_acc improved from -inf to 0.94425, saving model to weights.best.hdf5
Epoch 2/5

Epoch 00002: val_acc did not improve from 0.94425
Epoch 3/5

Epoch 00003: val_acc did not improve from 0.94425
Epoch 4/5

Epoch 00004: val_acc did not improve from 0.94425


<keras.callbacks.History at 0x7f9393ed16d8>