This notebook aims to be the baseline for loading and training models.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.insert(0, '/root/projects/pyphoon/')
from os.path import join
from os import listdir
import numpy as np
import h5py
import gc
import cv2
import pandas as pd

from pyphoon.app.utils import load_h5datachunks
from pyphoon.app.preprocess import MeanImagePreprocessor

from keras.layers import Input, Dense, Conv2D, MaxPooling2D, UpSampling2D, \
    Flatten, Activation, Reshape, Dropout, add
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras import backend as K
from keras.utils import np_utils
from keras.regularizers import l2
from keras.optimizers import SGD
from keras.callbacks import TensorBoard
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.
  return f(*args, **kwds)


## 1. Load data

We have decided to use the following split:
- Test: chunks 0-10
- Validation: chunks 10-20
- Training: chunks 20-end

Note that this split must be preserved since preprocessing parameters have been computed on the training chunks.

In [3]:
# Paths where data is stored
dataset_dir = '/root/fs9/lucas/data/datasets/task_2b/'
chunk_filenames = listdir(dataset_dir)
test_chunk_filenames = chunk_filenames[:10]
valid_chunk_filenames = chunk_filenames[10:20]
train_chunk_filenames = chunk_filenames[20:]

For this task, we will ignore classes 6 and 7 and only focus on Tropical Cyclones.

In [4]:
X_test, Y_test = load_h5datachunks(dataset_dir, 
                                     test_chunk_filenames, 
                                     features=['data', 'class'],
                                     ignore_classes=[6, 7],
                                     display=True
                                     )

X_valid, Y_valid = load_h5datachunks(dataset_dir, 
                                     valid_chunk_filenames, 
                                     features=['data', 'class'],
                                     ignore_classes=[6, 7],
                                     display=True
                                     )

X_train, Y_train = load_h5datachunks(dataset_dir, 
                                     train_chunk_filenames, 
                                     features=['data', 'class'],
                                     ignore_classes=[6, 7],
                                     display=True
                                     )

 file 0_chunk.h5 read
 file 1_chunk.h5 read
 file 2_chunk.h5 read
 file 3_chunk.h5 read
 file 4_chunk.h5 read
 file 5_chunk.h5 read
 file 6_chunk.h5 read
 file 7_chunk.h5 read
 file 8_chunk.h5 read
 file 9_chunk.h5 read
 file 10_chunk.h5 read
 file 11_chunk.h5 read
 file 12_chunk.h5 read
 file 13_chunk.h5 read
 file 14_chunk.h5 read
 file 15_chunk.h5 read
 file 16_chunk.h5 read
 file 17_chunk.h5 read
 file 18_chunk.h5 read
 file 19_chunk.h5 read
 file 20_chunk.h5 read
 file 21_chunk.h5 read
 file 22_chunk.h5 read
 file 23_chunk.h5 read
 file 24_chunk.h5 read
 file 25_chunk.h5 read
 file 26_chunk.h5 read
 file 27_chunk.h5 read
 file 28_chunk.h5 read
 file 29_chunk.h5 read
 file 30_chunk.h5 read
 file 31_chunk.h5 read
 file 32_chunk.h5 read
 file 33_chunk.h5 read
 file 34_chunk.h5 read
 file 35_chunk.h5 read
 file 36_chunk.h5 read
 file 37_chunk.h5 read
 file 38_chunk.h5 read
 file 39_chunk.h5 read
 file 40_chunk.h5 read
 file 41_chunk.h5 read
 file 42_chunk.h5 read
 file 43_chunk.h5 rea

## 2. Data preprocessing

For this task, we will use `MeanImagePreprocessor`, which uses the dataset mean image to centre the data and the minimum and maximum pixel values to normalise the images. To this end, we have previously computed these parameters and stored them under file `random.h5`.

In [2]:
# Define preprocessor
with h5py.File('../../tasks/multiclass/preprocessing_random.h5') as f:
    mean = f.get('mean_128').value
    scale_factor = f.get('max_value_128').value - f.get('min_value_128').value
preprocessor = MeanImagePreprocessor(mean, scale_factor, (128,128), 'keras')

In [7]:
# Preprocess
n = len(X_train)
X_train = [np.expand_dims(preprocessor.apply(X_train[i][:,:,:,0]), axis=3) for i in range(n)]
n = len(X_valid)
X_valid = [np.expand_dims(preprocessor.apply(X_valid[i][:,:,:,0]), axis=3) for i in range(n)]
n = len(X_test)
X_test = [np.expand_dims(preprocessor.apply(X_test[i][:,:,:,0]), axis=3) for i in range(n)]

## 3. Model

For this task we use standard conv nets with ReLU activation, Batch Norm, Max pooling and Dropout in first dense layer.

In [3]:
input_img = Input(shape=(128, 128, 1), name="in")

# Conv layers
x = Conv2D(64, (3, 3), strides=(1, 1), padding='same', name='conv1')(
    input_img)
x = Activation('relu', name='act1')(x)
x = BatchNormalization(name="bn1")(x)
x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool1')(x)

x = Conv2D(128, (3, 3), strides=(1, 1), padding='same', name='conv2')(x)
x = Activation('relu', name='act2')(x)
x = BatchNormalization(name="bn2")(x)
x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool2')(x)

x = Conv2D(128, (3, 3), strides=(1, 1), padding='same', name='conv3')(x)
x = Activation('relu', name='act3')(x)
x = BatchNormalization(name="bn3")(x)
x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool3')(x)

x = Conv2D(256, (3, 3), strides=(1, 1), padding='same', name='conv4')(x)
x = Activation('relu', name='act4')(x)
x = BatchNormalization(name="bn4")(x)
x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool4')(x)

x = Conv2D(256, (3, 3), strides=(1, 1), padding='same', name='conv5')(x)
x = Activation('relu', name='act5')(x)
x = BatchNormalization(name="bn5")(x)
x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool5')(x)

# Dense layers
x = Flatten()(x)

x = Dense(1024, use_bias=True, name="fc1")(x)
x = Activation('relu', name="fc_act1")(x)
x = BatchNormalization(name="fc_bn1")(x)
x = Dropout(0.2, name="drop1")(x)

x = Dense(256, use_bias=True, name="fc2")(x)
x = Activation('relu', name="fc_act2")(x)
x = BatchNormalization(name="fc_bn2")(x)

# Out
x = Dense(4, use_bias=True, name="fc4")(x)
x = Activation('softmax', name="out")(x)

# Model
model = Model(input_img, x)

In [4]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
in (InputLayer)              (None, 128, 128, 1)       0         
_________________________________________________________________
conv1 (Conv2D)               (None, 128, 128, 64)      640       
_________________________________________________________________
act1 (Activation)            (None, 128, 128, 64)      0         
_________________________________________________________________
bn1 (BatchNormalization)     (None, 128, 128, 64)      256       
_________________________________________________________________
pool1 (MaxPooling2D)         (None, 64, 64, 64)        0         
_________________________________________________________________
conv2 (Conv2D)               (None, 64, 64, 128)       73856     
_________________________________________________________________
act2 (Activation)            (None, 64, 64, 128)       0         
__________

In [14]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[
    'accuracy'])

## 3. Callbacks

We define two main callbacks:

- Tensorboard: To monitor loss/accuracy curves.
- Model storing: We store the model everytime validation loss is improved.

To access Tensorboard visualisations, while training, execute

$ tensorboard --logdir=<path/to/log-directory> --port <port>
```

seting `<path/to/log-directory>` to be the variable `tensorboard_path` defined below. Then navigate to `localhost:<port>`.


In [10]:
callbacks = []

# Tensorboard
use_tensorboard = True
tensorboard_path = '/tmp/multiclass_rnd2'
if use_tensorboard:
    callbacks.append(TensorBoard(log_dir=tensorboard_path, histogram_freq=0,
                                 write_graph=True, write_images=True))

# Save model
filepath = "models_rnd2/weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
m_cp = ModelCheckpoint(filepath, monitor='val_loss', verbose=0,
                       save_best_only=True, save_weights_only=False,
                       mode='auto', period=1)

callbacks.append(m_cp)

## 4. Data generators

We will train our model using data generators. In particular, we will feed our network with image batches from our lists.

In [11]:
def data_generator_from_chunklist(X, Y, batch_sz):
    """ Generates batches of data from samples **X** and labels **Y**.

    :param X: Sample data.
    :type X: list
    :param Y: Label data.
    :type Y: list
    :param batch_sz: Batch size.
    :type batch_sz: int
    :return:
    """
    n_chunks = len(X)
    indices = list(range(n_chunks))

    chunk_count = 0
    while True:
        # Randomise chunk order once all chunks have been seen
        if chunk_count % n_chunks == 0:
            np.random.shuffle(indices)

        # Get chunk for batch generation
        idx = indices[chunk_count % n_chunks]
        _X = X[idx]
        _Y = Y[idx]
        # Shuffle batch data
        n_samples = len(_Y)
        pos = np.arange(n_samples)
        np.random.shuffle(pos)
        _X = _X[pos]
        _Y = _Y[pos]
        _Y = np_utils.to_categorical(_Y - 2, num_classes=4)

        # Generate batches
        imax = int(n_samples / batch_sz)
        for i in range(imax):
            # Find list of IDs
            x = _X[i * batch_sz:(i + 1) * batch_sz]
            y = _Y[i * batch_sz:(i + 1) * batch_sz]
            yield x, y
        chunk_count += 1

## 5. Train

Time to train!

In [None]:
# Number train samples
n_train = sum([len(x) for x in Y_train])  # 50000
n_valid = sum([len(x) for x in Y_valid])

# Number of full epochs, i.e. #times algorithm sees all the data
epochs = 15
batch_size = 32

# Train
model.fit_generator(
    generator=data_generator_from_chunklist(X_train, Y_train,
                                            batch_sz=batch_size),
    steps_per_epoch=n_train // batch_size,
    validation_data=data_generator_from_chunklist(X_valid, Y_valid,
                                                  batch_sz=batch_size),
    validation_steps=n_valid // batch_size,
    epochs=epochs,
    callbacks=callbacks
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15

## 6. Results

### Confusion Matrix

Below we obtain the confusion matrix from the estimations on the validation set.

In [20]:
cm = None
for i in range(len(Y_valid)):
    y_true = Y_valid[i] - 2
    y_pred = model.predict(X_valid[i])
    y_pred = np.argmax(y_pred, axis=1)
    if cm is None:
        cm = pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True).as_matrix()
    else:
        cm += pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True).as_matrix()
    print(i) if i%3==0 else 0

0
1
2
3
4
5
6
7
8
9


#### Precision
*How many true "X" were estimated to be "X"*

In [24]:
for c in (cm.T/cm[:,-1]).T:
    print(c)

[  9.43786655e-01   5.07911918e-02   4.42624765e-03   9.95905721e-04
   1.00000000e+00]
[ 0.05625242  0.89965085  0.03750162  0.00659511  1.        ]
[ 0.01537905  0.07074362  0.86556902  0.0483083   1.        ]
[ 0.00148351  0.00958576  0.03571836  0.95321237  1.        ]
[ 0.29175789  0.25405666  0.17472634  0.27945911  1.        ]


#### Recall
*From the estimated "X", how many are actually true "X"*

In [22]:
cm/cm[-1,:]

array([[ 0.94118296,  0.05816753,  0.00737055,  0.00103687,  0.29095299],
       [ 0.04800265,  0.88163731,  0.05343652,  0.00587558,  0.24896974],
       [ 0.00937983,  0.04955012,  0.88151833,  0.03076037,  0.17794591],
       [ 0.00143456,  0.01064504,  0.05767459,  0.96232719,  0.28213136],
       [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ]])

## 7. Test

In [26]:
n_test = sum([len(x) for x in Y_test])

model.evaluate_generator(
    generator=data_generator_from_chunklist(X_test, Y_test,
                                            batch_sz=batch_size),
    steps=n_test // batch_size
)

[0.28513007185556299, 0.92177835051546386]