In [29]:
from data_util import load_image, get_patch
from generators_binary import PatchGenerator, PatchSequence
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from networks_binary import create_initial_model, create_second_model, create_squeezenet3d_model
from os.path import join
from skimage.transform import rotate
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import KFold
from scipy.ndimage.interpolation import zoom
from PIL.ImageEnhance import Contrast
from skimage.exposure import equalize_adapthist

from PIL import Image
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
import keras.backend as K
import keras
import numpy as np
import pandas as pd

In [30]:
def _fit_spatial(img, target_hw):
    """Centre-crop and/or zero-pad the first two axes of ``img`` to ``target_hw``."""
    for axis, target in enumerate(target_hw):
        size = img.shape[axis]
        if size > target:
            start = (size - target) // 2
            index = [slice(None)] * img.ndim
            index[axis] = slice(start, start + target)
            img = img[tuple(index)]
        elif size < target:
            before = (target - size) // 2
            pad = [(0, 0)] * img.ndim
            pad[axis] = (before, target - size - before)
            img = np.pad(img, pad, mode='constant')
    return img


def augf(img, cut=True, crop_size=40):
    """Randomly augment one patch and optionally crop it in-plane.

    Each step below fires independently with probability 0.5, in this order:

    1. rotation by an angle drawn uniformly from [0, 90) degrees,
    2. left/right flip,
    3. up/down flip,
    4. adaptive histogram equalisation (``equalize_adapthist``),
    5. zoom of the first two axes, each by a factor of 0.9 or 1.1.

    Parameters
    ----------
    img : ndarray
        Patch whose last axis has length 1 (it is squeezed for the rotation
        and re-added afterwards). The notebook uses shape (70, 70, 40, 1).
    cut : bool, optional
        If True (default) return a ``crop_size`` x ``crop_size`` window over
        the first two axes: centred, or - with probability 0.5 - randomly
        shifted. If False, return the augmented patch with the same shape as
        the input (a zoomed patch is centre-cropped / zero-padded back).
    crop_size : int, optional
        Edge length of the in-plane crop used when ``cut`` is True.

    Returns
    -------
    ndarray
        The augmented patch. Its shape depends only on the input shape,
        ``cut`` and ``crop_size`` - never on which random steps fired - so
        the results can always be stacked into a batch.

    Raises
    ------
    ValueError
        If ``cut`` is True and the (possibly zoomed) patch is smaller than
        ``crop_size`` along one of the first two axes.
    """
    in_hw = img.shape[:2]

    if np.random.random() > .5:
        angle = np.random.uniform(0, 90)
        # rotate() gets the patch without its trailing singleton axis.
        rotated = rotate(np.squeeze(img, axis=-1), angle)
        img = np.expand_dims(rotated, -1)
    if np.random.random() > .5:
        img = np.fliplr(img)
    if np.random.random() > .5:
        img = np.flipud(img)
    if np.random.random() > .5:
        # The patch is flattened to 2-D for equalisation and restored to
        # whatever shape it had, instead of a hard-coded (70, 70, 40, 1).
        # A scalar kernel size is expanded to one entry per image axis; the
        # former 4-tuple did not match the 2-D array handed over here.
        # NOTE(review): equalisation rescales intensities, so equalised and
        # untouched samples may end up on different scales - confirm intended.
        shape = img.shape
        flat = equalize_adapthist(img.reshape(shape[0], -1), kernel_size=5)
        img = flat.reshape(shape)

    if np.random.random() > .5:
        factors = np.random.choice([0.9, 1.1], 2)
        # Only the first two axes are rescaled; the rest keep factor 1.
        img = zoom(img, tuple(factors) + (1,) * (img.ndim - 2))

    if not cut:
        # Zooming changes the in-plane size; undo that so every sample
        # leaves with the shape it came in with.
        return _fit_spatial(img, in_hw)

    # Room left around the crop window, measured on the actual (possibly
    # zoomed) patch so the window can never run past the edge.
    spare = [img.shape[axis] - crop_size for axis in range(2)]
    if min(spare) < 0:
        raise ValueError(
            'patch of shape {} is smaller than crop_size={}'.format(img.shape, crop_size))

    crop_from = [room // 2 for room in spare]
    if np.random.random() > .5:  # shifting
        crop_from = [np.random.randint(0, room) if room > 0 else 0 for room in spare]

    return img[crop_from[0]:crop_from[0] + crop_size,
               crop_from[1]:crop_from[1] + crop_size, :]

In [32]:
# Smoke test: push one random patch of the training size through augf and
# check that the default (cut=True) output is a 40 x 40 in-plane crop.
img = np.random.random((70, 70, 40, 1))

c = augf(img)
assert c.shape == (40, 40, 40, 1), c.shape

  .format(dtypeobj_in, dtypeobj_out))


In [3]:
# Directory that holds trainingSet.csv; the same path is handed to the patch
# generators as their input_dir.
data_dir = '/projects/0/ismi2018/FINALPROJECTS/BREAST_3D_ULTRASOUND/shareWithStudents'
batch_size = 30

# dtype=str keeps every column as text.
csv_path = join(data_dir, 'trainingSet.csv')
csv = pd.read_csv(csv_path, dtype=str)

In [4]:
# Folds are drawn over patient IDs (not CSV rows), so all rows of one patient
# land on the same side of a split. The fixed seed keeps the folds repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
unique_patients = csv['patientID'].unique()

In [5]:
# 5-fold cross-validation: train one model per fold, keep the checkpoint with
# the lowest validation loss, and score that checkpoint on the held-out fold.
accuracies = []
for i, (train_patients_idxs, val_patients_idxs) in enumerate(kf.split(unique_patients), start=1):
    print('Training fold {}'.format(i))

    # Drop the graph/session state left over from the previous fold so that
    # models do not pile up in memory across folds.
    K.clear_session()

    train_patients = unique_patients[train_patients_idxs]
    val_patients = unique_patients[val_patients_idxs]

    train_csv = csv.loc[csv['patientID'].isin(train_patients), :]
    val_csv = csv.loc[csv['patientID'].isin(val_patients), :]

    training_gen = PatchGenerator(
        input_dir=data_dir,
        dataframe=train_csv,
        batch_size=batch_size,
        augmentation_fn=augf
    )

    # Print label balance
    print(train_csv.groupby('histology').histology.count())

    valid_seq = PatchSequence(
        input_dir=data_dir,
        dataframe=val_csv,
        batch_size=batch_size
    )

    print(val_csv.groupby('histology').histology.count())

    net = create_initial_model(patch_size=40)
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    save_path = 'first_model_fold_{}.h5'.format(i)
    # Only the epoch with the best validation loss is kept on disk.
    cp = ModelCheckpoint(save_path, monitor='val_loss', save_best_only=True)
    # Halve the learning rate after 5 epochs without val_loss improvement.
    lr = ReduceLROnPlateau(monitor='val_loss', factor=.5, min_lr=1e-6, patience=5)
    # NOTE(review): use_multiprocessing=True with a plain generator can feed
    # duplicate batches unless PatchGenerator is process-safe - confirm.
    history = net.fit_generator(training_gen,
                                validation_data=valid_seq,
                                steps_per_epoch=len(training_gen)*5,
                                epochs=30,
                                use_multiprocessing=True,
                                workers=4,
                                callbacks=[cp, lr])

    # Plot training history
    hist = history.history
    # The accuracy metric is logged as 'acc' or 'accuracy' depending on the
    # Keras version; accept either.
    acc_key = 'acc' if 'acc' in hist else 'accuracy'
    plt.figure()
    plt.plot(hist[acc_key])
    plt.plot(hist['val_' + acc_key])
    plt.plot(hist['loss'])
    plt.plot(hist['val_loss'])
    plt.legend(['acc', 'val_acc', 'loss', 'val_loss'])
    plt.xlabel('epoch')
    plt.title('Fold {}: training history'.format(i))
    plt.show()

    # Plot confusion matrix
    # NOTE(review): the labels are remapped to three classes (2 -> 0,
    # 20 -> 1, 21 -> 2) while the model is trained with a binary loss -
    # confirm that get_all_labels() and the network output use the same
    # label encoding.
    # NOTE(review): assumes valid_seq yields samples in the same order as
    # get_all_labels() returns them - confirm.
    y_true = valid_seq.get_all_labels()
    y_true[y_true == 2] = 0
    y_true[y_true == 20] = 1
    y_true[y_true == 21] = 2
    best_model = keras.models.load_model(save_path)
    y_prob = best_model.predict_generator(valid_seq,
                                          use_multiprocessing=True,
                                          workers=4)
    if y_prob.ndim > 1 and y_prob.shape[-1] > 1:
        y_pred = np.argmax(y_prob, axis=1)
    else:
        # A single output unit: argmax would always give 0, so threshold
        # the score at 0.5 instead.
        y_pred = (np.ravel(y_prob) > .5).astype(int)
    cm = confusion_matrix(y_true, y_pred)
    plt.figure()
    # Rows are true labels, columns are predictions; fmt='d' prints the
    # counts as plain integers.
    sns.heatmap(cm, annot=True, fmt='d')
    plt.xlabel('predicted')
    plt.ylabel('actual')
    plt.title('Fold {}: confusion matrix'.format(i))
    plt.show()

    # Store fold accuracy
    accuracies.append(accuracy_score(y_true, y_pred))

print('Mean accuracy: {:.4f}'.format(np.mean(accuracies)))

Training fold 1
PatchGenerator detected: 239 patch samples.
histology
2     130
20     38
21     71
Name: histology, dtype: int64
PatchSequence detected: 59 patch samples.
histology
2     36
20     5
21    18
Name: histology, dtype: int64




Epoch 1/30


Traceback (most recent call last):
  File "/home/ruc0027/.local/lib/python3.5/site-packages/keras/utils/data_utils.py", line 677, in _data_generator_task
    generator_output = next(self._generator)
  File "/nfs/home1/ruc0027/ISMI-Project/generators_binary.py", line 33, in __next__
    return self.next()
  File "/nfs/home1/ruc0027/ISMI-Project/generators_binary.py", line 98, in next
    batch_x = np.stack(images).astype(K.floatx())
  File "/hpc/sw/python-3.5.2/lib/python3.5/site-packages/numpy/core/shape_base.py", line 347, in stack
    raise ValueError('all input arrays must have the same shape')
ValueError: all input arrays must have the same shape


ValueError: all input arrays must have the same shape

In [None]:
# Investigate incorrect predictions
#
# For every fold, reload the saved checkpoint, predict the held-out patients
# and show each misclassified sample (full slice and centre slice of the patch).

# Same splitter settings as used for training, so the folds are identical.
kf = KFold(5, shuffle=True, random_state=42)
for i, (train_patients_idxs, val_patients_idxs) in enumerate(kf.split(unique_patients), start=1):
    print('Investigating fold {}'.format(i))

    # Release the previously loaded model before loading the next one.
    K.clear_session()

    load_path = 'first_model_fold_{}.h5'.format(i)
    model = keras.models.load_model(load_path)

    val_patients = unique_patients[val_patients_idxs]
    val_csv = csv.loc[csv['patientID'].isin(val_patients), :]
    valid_seq = PatchSequence(
        input_dir=data_dir,
        dataframe=val_csv,
        batch_size=batch_size
    )

    # NOTE(review): three-class remapping next to a model trained with a
    # binary loss - confirm the label encoding matches the network output.
    y_true = valid_seq.get_all_labels()
    y_true[y_true == 2] = 0
    y_true[y_true == 20] = 1
    y_true[y_true == 21] = 2
    y_prob = model.predict_generator(valid_seq,
                                     use_multiprocessing=True,
                                     workers=4)
    if y_prob.ndim > 1 and y_prob.shape[-1] > 1:
        y_pred = np.argmax(y_prob, axis=1)
    else:
        # A single output unit: argmax would always give 0, so threshold
        # the score at 0.5 instead.
        y_pred = (np.ravel(y_prob) > .5).astype(int)

    incorrect_idxs = np.where(y_true != y_pred)[0]

    coord_cols = ['annotationLowResolution{}'.format(k) for k in range(1, 4)]
    # A separate name for the sample index keeps the fold counter `i` intact.
    # NOTE(review): indexing val_csv by prediction position assumes valid_seq
    # preserves the dataframe row order - confirm.
    for idx in incorrect_idxs:
        row = val_csv.iloc[idx, :]
        pID = row.patientID
        sID = row.studyID
        scN = row.scanName
        # int64 instead of uint8: uint8 cannot hold coordinates above 255.
        y, x, z = row[coord_cols].values.astype(np.int64)
        img = load_image(pID, sID, scN)
        patch = np.squeeze(get_patch(img, (y, x, z), 40))

        # Show full image
        plt.figure()
        plt.subplot(121)
        plt.imshow(img[..., z], cmap='gray')
        # NOTE(review): scatter takes (horizontal, vertical); if the image is
        # indexed img[y, x, z] the marker should be scatter(x, y) - confirm.
        plt.scatter(y, x, edgecolors='r', facecolors='none', s=1000)

        # Show patch
        plt.subplot(122)
        plt.imshow(patch[..., patch.shape[-1]//2], cmap='gray')
        print('Predicted: {}'.format(y_pred[idx]))
        print('Actual: {}'.format(y_true[idx]))

        plt.show()

    print('-----------------------------------------------------------------')