In [18]:
import numpy as np
import tensorflow as tf
from keras import layers
from keras.layers import Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Lambda
from keras.layers import Flatten, Conv2D, MaxPooling2D, AveragePooling2D, Reshape, LSTM, Embedding, TimeDistributed
from keras.models import Model, load_model
from keras.preprocessing import image
from keras.initializers import glorot_uniform
from keras.optimizers import Adam
import keras.backend as K

from matplotlib.pyplot import imshow
%matplotlib inline


First version of a Deep Captioning model using a CNN-LSTM architecture (without attention, for now...)

See Show and Tell reference architecture here: 
    https://www.cv-foundation.org/openaccess/content_cvpr_2015/papers/Vinyals_Show_and_Tell_2015_CVPR_paper.pdf
    
# 1 - Encoder: CNN Features extraction using a standard ResNet model

## Identity block
The skip connection "skips over" 3 hidden layers.

<img src="img/idblock.png">

This image was borrowed from Coursera's <b> Convolutional Neural Networks </b> class by <b> Andrew Ng </b> 

Here are the individual steps.

### First component of main path:
The first CONV2D has F1 filters of shape (1,1) and a stride of (1,1). Its padding is "valid" (none) and its name is conv_name_base + '2a'. 
The first BatchNorm is normalizing the 'channels' axis. Its name is bn_name_base + '2a'.
The ReLU activation function has no name and no hyperparameters.

### Second component of main path:
The second CONV2D has F2 filters of shape  (f,f)  and a stride of (1,1). Its padding is "same" (keeps the input shape unchanged by playing with the padding) and its name is conv_name_base + '2b'. 
The second BatchNorm is normalizing the 'channels' axis. Its name is bn_name_base + '2b'.
The ReLU activation function has no name and no hyperparameters.

### Third component of main path:
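The third CONV2D has F3 filters of shape (1,1) and a stride of (1,1). Its padding is "valid" and its name is conv_name_base + '2c'.
The third BatchNorm is normalizing the 'channels' axis. Its name is bn_name_base + '2c'. Note that there is no ReLU activation function in this component.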

### Final step:
The X_shortcut and the output from the 3rd layer X are added together, then the ReLU activation function is applied. Again, with no name and no hyperparameters.

In [19]:
def identity_block(X, f, filters, stage, block):
    """
    Residual "identity" block: three CONV2D + BatchNorm stages on the main path,
    with the unmodified input added back before the final ReLU.

    Arguments:
    X -- input tensor of shape (m, n_H_prev, n_W_prev, n_C_prev)
    f -- integer, side of the square window of the middle CONV2D of the main path
    filters -- sequence of three integers, the filter counts of the three CONV2D layers
               of the main path; the last one has to match the channel count of X
               so that the main path can be added to the shortcut
    stage -- integer, used together with block to build the layer names
    block -- string/character, used together with stage to build the layer names

    Returns:
    Output tensor of the block, i.e. ReLU(X + main_path(X))
    """

    # Layer-name prefixes shared by every layer of this block
    position = str(stage) + block
    conv_prefix = 'res' + position + '_branch'
    bn_prefix = 'bn' + position + '_branch'

    n_first, n_middle, n_last = filters

    def conv_bn(tensor, n_filters, kernel, padding, suffix):
        # Stride-1 CONV2D (Glorot-uniform kernel, seed 0) followed by BatchNorm on the channels axis
        conv = Conv2D(filters=n_filters, kernel_size=kernel, strides=(1, 1), padding=padding,
                      name=conv_prefix + suffix, kernel_initializer=glorot_uniform(seed=0))
        tensor = conv(tensor)
        return BatchNormalization(axis=3, name=bn_prefix + suffix)(tensor)

    # Main path, component 2a: 1x1 convolution
    main = conv_bn(X, n_first, (1, 1), 'valid', '2a')
    main = Activation('relu')(main)

    # Main path, component 2b: f x f convolution that keeps the spatial size
    main = conv_bn(main, n_middle, (f, f), 'same', '2b')
    main = Activation('relu')(main)

    # Main path, component 2c: 1x1 convolution, no activation before the merge
    main = conv_bn(main, n_last, (1, 1), 'valid', '2c')

    # Skip connection: the untouched input is added to the main path, then ReLU
    merged = Add()([X, main])
    return Activation('relu')(merged)

In [20]:
# Smoke test for identity_block: build the block on a small placeholder and print one output vector.
# Clear the default TF graph before building the block under test.
tf.reset_default_graph()

with tf.Session() as test:
    # Seeds only the NumPy input generated below; the conv kernels use their own seed inside identity_block.
    np.random.seed(1)
    A_prev = tf.placeholder("float", [3, 4, 4, 6])
    X = np.random.randn(3, 4, 4, 6)
    # The last filter count (6) equals the number of input channels, so the shortcut can be added to the main path.
    A = identity_block(A_prev, f = 2, filters = [2, 4, 6], stage = 1, block = 'a')
    test.run(tf.global_variables_initializer())
    # Learning phase 0 selects Keras' test (inference) mode for the run.
    out = test.run([A], feed_dict={A_prev: X, K.learning_phase(): 0})
    # out is a one-element list (single fetch); print the channel vector at example 1, row 1, column 0.
    print("out = " + str(out[0][1][1][0]))

out = [ 0.94823   -0.         1.1610144  2.747859  -0.         1.36677  ]


## Convolutional block
The ResNet "convolutional block" is used when the input and output dimensions don't match up. 
The difference with the identity block is that there is a CONV2D layer in the shortcut path:

<img src="img/convblock.png">

This image was borrowed from Coursera's <b> Convolutional Neural Networks </b> class by <b> Andrew Ng </b> 

The CONV2D layer in the shortcut path is used to resize the input $x$ to a different dimension, so that the dimensions match up in the final addition needed to add the shortcut value back to the main path. 

Here are the individual steps.

### First component of main path:

The first CONV2D has F1 filters of shape (1,1) and a stride of (s,s). Its padding is "valid" (none) and its name is conv_name_base + '2a'.
The first BatchNorm is normalizing the 'channels' axis. Its name is bn_name_base + '2a'.
The ReLU activation function has no name and no hyperparameters.

### Second component of main path:

The second CONV2D has F2 filters of shape (f,f) and a stride of (1,1). Its padding is "same" and its name is conv_name_base + '2b'.
The second BatchNorm is normalizing the 'channels' axis. Its name is bn_name_base + '2b'.
The ReLU activation function has no name and no hyperparameters.

### Third component of main path:

The third CONV2D has F3 filters of shape (1,1) and a stride of (1,1). Its padding is "valid" and its name is conv_name_base + '2c'.
The third BatchNorm is normalizing the 'channels' axis. Its name is bn_name_base + '2c'.

### Shortcut path:

The CONV2D has F3 filters of shape (1,1) and a stride of (s,s). Its padding is "valid" and its name is conv_name_base + '1'.
The BatchNorm is normalizing the 'channels' axis. Its name is bn_name_base + '1'.

### Final step:

The shortcut and the main path values are added together then a ReLU activation function is applied.

In [21]:
def convolutional_block(X, f, filters, stage, block, s = 2):
    """
    Residual "convolutional" block: three CONV2D + BatchNorm stages on the main path and
    a CONV2D + BatchNorm projection on the shortcut path, summed before the final ReLU.

    Arguments:
    X -- input tensor of shape (m, n_H_prev, n_W_prev, n_C_prev)
    f -- integer, side of the square window of the middle CONV2D of the main path
    filters -- sequence of three integers, the filter counts of the three CONV2D layers
               of the main path; the last one is also used for the shortcut CONV2D
    stage -- integer, used together with block to build the layer names
    block -- string/character, used together with stage to build the layer names
    s -- integer, stride of the first main-path CONV2D and of the shortcut CONV2D

    Returns:
    Output tensor of the block, i.e. ReLU(shortcut(X) + main_path(X))
    """

    # Layer-name prefixes shared by every layer of this block
    position = str(stage) + block
    conv_prefix = 'res' + position + '_branch'
    bn_prefix = 'bn' + position + '_branch'

    n_first, n_middle, n_last = filters

    def conv_bn(tensor, n_filters, kernel, stride, padding, suffix):
        # CONV2D (Glorot-uniform kernel, seed 0) followed by BatchNorm on the channels axis
        conv = Conv2D(n_filters, kernel_size=kernel, strides=stride, padding=padding,
                      name=conv_prefix + suffix, kernel_initializer=glorot_uniform(seed=0))
        tensor = conv(tensor)
        return BatchNormalization(axis=3, name=bn_prefix + suffix)(tensor)

    ##### MAIN PATH #####
    # Component 2a: strided 1x1 convolution
    main = conv_bn(X, n_first, (1, 1), (s, s), 'valid', '2a')
    main = Activation('relu')(main)

    # Component 2b: f x f convolution that keeps the spatial size
    main = conv_bn(main, n_middle, (f, f), (1, 1), 'same', '2b')
    main = Activation('relu')(main)

    # Component 2c: 1x1 convolution, no activation before the merge
    main = conv_bn(main, n_last, (1, 1), (1, 1), 'valid', '2c')

    ##### SHORTCUT PATH #####
    # Same stride and filter count as the main path so both tensors can be added
    shortcut = conv_bn(X, n_last, (1, 1), (s, s), 'valid', '1')

    # Merge both paths, then ReLU
    merged = Add()([shortcut, main])
    return Activation('relu')(merged)

In [22]:
# Smoke test for convolutional_block: build the block on a small placeholder and print one output vector.
# Clear the default TF graph before building the block under test.
tf.reset_default_graph()

with tf.Session() as test:
    # Seeds only the NumPy input generated below; the conv kernels use their own seed inside convolutional_block.
    np.random.seed(1)
    A_prev = tf.placeholder("float", [3, 4, 4, 6])
    X = np.random.randn(3, 4, 4, 6)
    # s is left at its default (2), so the strided convolutions reduce the 4x4 spatial grid.
    A = convolutional_block(A_prev, f = 2, filters = [2, 4, 6], stage = 1, block = 'a')
    test.run(tf.global_variables_initializer())
    # Learning phase 0 selects Keras' test (inference) mode for the run.
    out = test.run([A], feed_dict={A_prev: X, K.learning_phase(): 0})
    # out is a one-element list (single fetch); print the channel vector at example 1, row 1, column 0.
    print("out = " + str(out[0][1][1][0]))

out = [ 0.09018463  1.2348979   0.46822023  0.03671761 -0.          0.65516603]


### ResNet model (50 layers)

<img src="img/resnet.png">

This image was borrowed from Coursera's Convolutional Neural Networks class by Andrew Ng

Architecture of this ResNet-50 model:

Zero-padding pads the input with a pad of (3,3)
## Stage 1:
The 2D Convolution has 64 filters of shape (7,7) and uses a stride of (2,2). Its name is "conv1".
BatchNorm is applied to the 'channels' axis of the input.
MaxPooling uses a (3,3) window and a (2,2) stride.
## Stage 2:
The convolutional block uses three sets of filters of size [64,64,256], "f" is 3, "s" is 1 and the block is "a".
The 2 identity blocks use three sets of filters of size [64,64,256], "f" is 3 and the blocks are "b" and "c".
## Stage 3:
The convolutional block uses three sets of filters of size [128,128,512], "f" is 3, "s" is 2 and the block is "a".
The 3 identity blocks use three sets of filters of size [128,128,512], "f" is 3 and the blocks are "b", "c" and "d".
## Stage 4:
The convolutional block uses three sets of filters of size [256, 256, 1024], "f" is 3, "s" is 2 and the block is "a".
The 5 identity blocks use three sets of filters of size [256, 256, 1024], "f" is 3 and the blocks are "b", "c", "d", "e" and "f".
## Stage 5:
The convolutional block uses three sets of filters of size [512, 512, 2048], "f" is 3, "s" is 2 and the block is "a".
The 2 identity blocks use three sets of filters of size [512, 512, 2048], "f" is 3 and the blocks are "b" and "c".
The 2D Average Pooling uses a window of shape (2,2) and its name is "avg_pool".
The 'flatten' layer doesn't have any hyperparameters or name.
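The Fully Connected (Dense) layer reduces its input to the number of classes using a softmax activation. Its name is 'fc' + str(classes).
Note: the encoder built in section 3 of this notebook does not use this classification layer; after the 'flatten' layer it applies two ReLU Dense layers ('dense_img_features' and 'dense_img_final_features') whose output is fed to the LSTM decoder.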

# 2 - Decoder: LSTM based caption generator

Insert diagram and documentation here

# 3 - Model

In [23]:
def TrainShowAndTell(Tx, hidden_size, n_values, input_shape = (300, 300, 3)):
    """
    Build the training-time Show-and-Tell captioning model (CNN encoder + LSTM decoder).

    Encoder (ResNet50 body followed by two Dense layers):
    CONV2D -> BATCHNORM -> RELU -> MAXPOOL -> CONVBLOCK -> IDBLOCK*2 -> CONVBLOCK -> IDBLOCK*3
    -> CONVBLOCK -> IDBLOCK*5 -> CONVBLOCK -> IDBLOCK*2 -> AVGPOOL -> FLATTEN -> DENSE -> DENSE
    The last Dense layer has n_values units, so the image features have the same width as
    one caption token and can be fed to the LSTM as a one-step sequence.

    Decoder:
    A single LSTM layer is applied twice (shared weights): first to the image features,
    starting from a0/c0, then to the caption sequence, starting from the state reached
    after the image. A time-distributed softmax Dense layer with n_values units produces
    the output of every caption step.

    Arguments:
    Tx -- max number of tokens in a sentence
    hidden_size -- number of units of the LSTM (also the size of a0 and c0)
    n_values -- number of features for each token (embedding size)
    input_shape -- shape of the images of the dataset

    Returns:
    model -- a Keras Model() instance named 'TrainShowAndTell' taking
             [image, caption, a0, c0] and returning one n_values-wide softmax vector
             per caption step
    """

    ################
    # CNN ENCODER
    ################

    # Define the input as a tensor with shape input_shape
    X_input = Input(input_shape)

    # Zero-Padding
    X = ZeroPadding2D((3, 3))(X_input)

    # Stage 1
    X = Conv2D(64, (7, 7), strides = (2, 2), name = 'conv1', kernel_initializer = glorot_uniform(seed=0))(X)
    X = BatchNormalization(axis = 3, name = 'bn_conv1')(X)
    X = Activation('relu')(X)
    X = MaxPooling2D((3, 3), strides=(2, 2))(X)

    # Stages 2 to 5: one convolutional block ('a') followed by identity blocks.
    # Each entry: (stage number, filters of the three CONV layers,
    #              stride of the convolutional block, names of the identity blocks)
    resnet_stages = [
        (2, [64, 64, 256], 1, 'bc'),
        (3, [128, 128, 512], 2, 'bcd'),
        (4, [256, 256, 1024], 2, 'bcdef'),
        (5, [512, 512, 2048], 2, 'bc'),
    ]
    for stage, stage_filters, stride, id_blocks in resnet_stages:
        X = convolutional_block(X, f = 3, filters = stage_filters, stage = stage, block='a', s = stride)
        for block in id_blocks:
            X = identity_block(X, 3, stage_filters, stage=stage, block=block)

    # AVGPOOL
    X = AveragePooling2D(pool_size=(2, 2), name='avg_pool')(X)

    # output layer
    X = Flatten()(X)

    # Two FC layers capture the final features and resize the output for injection into the RNN decoder.
    # No input_shape is given: the layers are called on a tensor, so their input size comes from that tensor.
    X = Dense(8192, activation='relu', name = 'dense_img_features')(X)
    X = Dense(n_values, activation='relu', use_bias = False, name = 'dense_img_final_features')(X)

    ################
    # RNN DECODER
    ################

    # Add a time axis so the image features become a one-step sequence: (batch, 1, n_values)
    X = Lambda(lambda x : K.expand_dims(x, axis=1))(X)

    # Define the initial hidden state a0 and initial cell state c0
    a0 = Input(shape=(hidden_size,), name='a0')
    c0 = Input(shape=(hidden_size,), name='c0')

    # One LSTM layer, reused for the image step and for the caption steps
    LSTMLayer = LSTM(hidden_size, return_sequences = True, return_state = True, dropout=0.5, name = 'lstm')

    # Take image embedding as the first input to LSTM; only the resulting state (a, c) is used afterwards
    print(X.shape)
    X, a, c = LSTMLayer(X, initial_state=[a0, c0])

    # Caption input: expected to be already embedded, one n_values-wide vector per token
    # (no Embedding layer is applied here)
    caption = Input(shape=(Tx, n_values))
    print(caption.shape)

    # Run the caption through the same LSTM, starting from the state left by the image
    C, _, _ = LSTMLayer(caption, initial_state=[a, c])
    output = TimeDistributed(Dense(n_values, activation='softmax'), name = 'time_distributed_softmax')(C)
    print(output.shape)

    return Model(inputs=[X_input, caption, a0, c0], outputs=output, name='TrainShowAndTell')


In [25]:
# Build the captioning model: captions of up to 25 tokens, 60 LSTM units, 50 features per token
mymodel = TrainShowAndTell(Tx=25, hidden_size=60, n_values=50)
mymodel.summary()
mymodel.compile(
    optimizer=Adam(lr=0.01),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

(?, 1, 50)
(?, 25, 50)
(?, 25, 50)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 300, 300, 3)  0                                            
__________________________________________________________________________________________________
zero_padding2d_2 (ZeroPadding2D (None, 306, 306, 3)  0           input_3[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 150, 150, 64) 9472        zero_padding2d_2[0][0]           
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, 150, 150, 64) 256         conv1[0][0]                      
__________________________________________________________________________


