<a href="https://colab.research.google.com/github/mobarakol/tutorial_captioning/blob/main/captioning_coco.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
! pip install pycocotools --user

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [13]:
!wget -O coco_captioning.zip https://www.dropbox.com/s/dngqe90t6owmsov/coco_captioning.zip?dl=0
!unzip -q coco_captioning.zip -d ./

--2022-08-23 14:16:03--  https://www.dropbox.com/s/dngqe90t6owmsov/coco_captioning.zip?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.81.18, 2620:100:6030:18::a27d:5012
Connecting to www.dropbox.com (www.dropbox.com)|162.125.81.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/dngqe90t6owmsov/coco_captioning.zip [following]
--2022-08-23 14:16:03--  https://www.dropbox.com/s/raw/dngqe90t6owmsov/coco_captioning.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc45baac3e2dfbba90ace0073c9c.dl.dropboxusercontent.com/cd/0/inline/BrgSLEY56pliOA0ZRGfTgn1wSwur4Mzgxse7RXw80OQnflb74xlsjuqlFvq4lapOIlqPrU03lNfY5kJ09hLrCP-f8rY_x1gqL6pWqLbo8IsS9Xd9HdA3g4yBhaMqIr2s2j1Kg_7bEOuZXsq7MVSmQ_fB2ufcfGflKbMcD8EPfpTA5g/file# [following]
--2022-08-23 14:16:04--  https://uc45baac3e2dfbba90ace0073c9c.dl.dropboxusercontent.com/cd/0/inline/BrgSLEY56pliOA0ZRGfTgn1wSwur4Mzgxse7RXw

In [37]:
# This file defines different layers used for RNN and for image captioning.
import numpy as np


def sigmoid(x):
    """
    A numerically stable version of the logistic sigmoid function.
    There is no shape requirement for input x.
    """
    pos_mask = (x >= 0)
    neg_mask = (x < 0)
    z = np.zeros_like(x)
    z[pos_mask] = np.exp(-x[pos_mask])
    z[neg_mask] = np.exp(x[neg_mask])
    top = np.ones_like(x)
    top[neg_mask] = z[neg_mask]
    return top / (1 + z)


def rnn_step_forward(x, prev_h, Wx, Wh, b):
    """
    Run the forward pass for a single time stamp in Vanilla RNN with a tanh activation function:
    next_h = tanh(Wx * x + Wh * prev_h + b).
    Arguments:
        x: input data for current time stamp with shape (N, D)
        prev_h: hidden state from previous time stamp with shape (N, H)
        Wx: weight matrix for input data with shape (D, H)
        Wh: weight matrix for hidden states with shape (H, H)
        b: bias with shape (H,)
    Outputs:
        next_h: hidden state after the forward step with shape (N, H)
        cache: cache used for back-prop
    """
    next_h = np.tanh(np.dot(x, Wx) + np.dot(prev_h, Wh) + b)
    cache = x, prev_h, Wx, Wh, b, next_h
    return next_h, cache


def rnn_step_backward(dnext_h, cache):
    """
    Run the backward pass for a single time stamp in Vanilla RNN with a tanh activation function:
    dx = (1 - next_h^2) * Wx * dnext_h
    dprev_h = (1 - next_h^2) * Wh * dnext_h
    dWx = (1 - next_h^2) * x.T * dnext_h
    dWh = (1 - next_h^2) * h.T * dnext_h
    db = (1 - next_h^2) * dnext_h
    Arguments:
        dnext_h: gradient of hidden state with shape (N, H)
        cache: cache used for back-prop
    Outputs:
        dx: gradient of input data with shape (N, D)
        dprev_h: gradient of hidden state for previous time stamp with shape (N, H)
        dWx: gradient of weight matrix for input data with shape (D, H)
        dWh: gradient of weight matrix for hidden states with shape (H, H)
        db: gradient of bias with shape (H,)
    """
    x, prev_h, Wx, Wh, b, h = cache
    dtanh = (1 - h ** 2) * dnext_h
    dx = np.dot(dtanh, Wx.T)
    dprev_h = np.dot(dtanh, Wh.T)
    dWx = np.dot(x.T, dtanh)
    dWh = np.dot(h.T, dtanh)
    db = np.sum(dtanh, axis=0)
    return dx, dprev_h, dWx, dWh, db


def rnn_forward(x, h0, Wx, Wh, b):
    """
    Run a forward pass for vanilla RNN on an entire sequence of data.
    The input has N sequences, each of which is composed of T vectors, each of dimension D.
    The hidden state size for the RNN is H.
    Arguments:
        x: input data for with shape (N, T, D)
        h0: initial hidden state with shape (N, H)
        Wx: weight matrix for input data with shape (D, H)
        Wh: weight matrix for hidden states with shape (H, H)
        b: bias with shape (H,)
    Outputs:
        h: hidden states after the forward step with shape (N, T, H)
        cache: cache used for back-prop
    """
    N, T, D = x.shape
    _, H = h0.shape
    x = np.swapaxes(x, 0, 1)  # swap axes for easier loops
    h = np.zeros((T, N, H))  # initialize h
    prev_h = h0
    cache = []
    for t in range(T):
        next_h, cache_ = rnn_step_forward(x[t], prev_h, Wx, Wh, b)
        prev_h = next_h
        cache.append(cache_)
        h[t] = prev_h
    h = np.swapaxes(h, 0, 1)  # swap axes for correct format
    return h, cache


def rnn_backward(dh, cache):
    """
    Run a backward pass for vanilla RNN from the gradient of all hidden states dh.
    Arguments:
        dh: gradient of all hidden states with shape (N, T, H)
        cache: cache used for back-prop
    Outputs:
        dx: gradient of input data with shape (N, T, D)
        dh0: gradient of initial hidden state with shape (N, H)
        dWx: gradient of weight matrix for input data with shape (D, H)
        dWh: gradient of weight matrix for hidden states with shape (H, H)
        db: gradient of bias with shape (H,)
    """
    dh = dh.copy()  # this is very important!
    N, T, H = dh.shape
    D = cache[0][0].shape[-1]  # extract parameter D fro initialization
    dh = np.swapaxes(dh, 0, 1)  # swap axes for easier loops
    # initializations for derivatives
    dx, dh0, dWx, dWh, db = np.zeros((T, N, D)), np.zeros((N, H)), np.zeros((D, H)), np.zeros((H, H)), np.zeros((H,))
    for t in range(T):
        dx[t], dprev_h, dWx_, dWh_, db_ = rnn_step_backward(dh[t], cache[t])
        # update parameters
        dh[t] += dprev_h
        dWx += dWx_
        dWh += dWh_
        db += db_
    dh0 = dprev_h
    dx = np.swapaxes(dx, 0, 1)  # swap axes for correct format
    return dx, dh0, dWx, dWh, db


def lstm_step_forward(x, prev_h, prev_c, Wx, Wh, b):
    """
    Run the forward pass for a single time stamp in LSTM:
    a = Wx * x + Wh * h + b;
    a = [a_i, a_f, a_o, a_g];
    i, f, o, g = sigmoid(a_i), sigmoid(a_f), sigmoid(a_o), tanh(a_g);
    next_c = f ⊙ prev_c + i ⊙ g;
    next_h = o ⊙ tanh(next_c).
    The shapes are consistent with Vallina RNN.
    Arguments:
        x: input data for current time stamp with shape (N, D)
        prev_h: hidden state from previous time stamp with shape (N, H)
        prev_c: cell state from previous time stamp with shape (N, H)
        Wx: weight matrix for input data with shape (D, 4H)
        Wh: weight matrix for hidden states with shape (H, 4H)
        b: bias with shape (4H,)
    Outputs:
        next_h: hidden state after the forward step with shape (N, H)
        next_c: cell state after the forward step with shape (N, H)
        cache: cache used for back-prop

    """
    a = np.dot(x, Wx) + np.dot(prev_h, Wh) + b
    H = a.shape[1] // 4
    a_i, a_f, a_o, a_g = a[:, :H], a[:, H:2*H], a[:, 2*H:3*H], a[:, 3*H:]
    i, f, o, g = sigmoid(a_i), sigmoid(a_f), sigmoid(a_o), np.tanh(a_g)
    next_c = f * prev_c + i * g
    next_h = o * np.tanh(next_c)
    cache = x, prev_h, prev_c, Wx, Wh, b, i, f, o, g, next_h, next_c
    return next_h, next_c, cache


def lstm_step_backward(dnext_h, dnext_c, cache):
    """
    Run the backward pass for a single time stamp in LSTM:
    do = dnext_h ⊙ tanh(next_c);
    ddnext_c = dnext_h ⊙ o;
    df = dnext_c ⊙ prev_c;
    dprev_c = dnext_c ⊙ f;
    di = dnext_c ⊙ g;
    dg = dnext_c ⊙ i;
    da_i = di ⊙ i ⊙ (1 - i);
    da_f = df ⊙ f ⊙ (1 - f);
    da_o = do ⊙ o ⊙ (1 - o);
    da_g = dg ⊙ (1 - g^2);
    da = [da_i, da_f, da_o, da_g];
    dx = da * Wx;
    dh = da * Wh;
    dWx = da * x.T;
    dWh = da * h.T;
    db = da.
    Arguments:
        dnext_h: gradient of hidden state with shape (N, H)
        dnext_c: gradient of cell state with shape (N, H)
        cache: cache used for back-prop
    Outputs:
        dx: gradient of input data with shape (N, D)
        dprev_h: gradient of hidden state with shape (N, H)
        dprev_c: gradient of cell state with shape (N, H)
        dWx: gradient of weight matrix for input data with shape (D, 4H)
        dWh: gradient of weight matrix for hidden states with shape (H, 4H)
        db: gradient of bias with shape (4H,)
    """
    x, prev_h, prev_c, Wx, Wh, b, i, f, o, g, h, c = cache
    do = dnext_h * np.tanh(c)
    dnext_c += dnext_h * o * (1 - np.tanh(c) ** 2)
    dprev_c = dnext_c * f
    dg = dnext_c * i
    di = dnext_c * g
    df = dnext_c * prev_c
    da_i = di * i * (1 - i)
    da_f = df * f * (1 - f)
    da_o = do * o * (1 - o)
    da_g = dg * (1 - g ** 2)
    da = np.concatenate((da_i, da_f, da_o, da_g), axis=-1)
    dx = np.dot(da, Wx.T)
    dWx = np.dot(x.T, da)
    dprev_h = np.dot(da, Wh.T)
    dWh = np.dot(prev_h.T, da)
    db = np.sum(da, axis=0)
    return dx, dprev_h, dprev_c, dWx, dWh, db


def lstm_forward(x, h0, Wx, Wh, b):
    """
    Run a forward pass for LSTM on an entire sequence of data.
    The dimensions are consistent with Vallina RNN.
    Arguments:
        x: input data for with shape (N, T, D)
        h0: initial hidden state with shape (N, H)
        Wx: weight matrix for input data with shape (D, H)
        Wh: weight matrix for hidden states with shape (H, H)
        b: bias with shape (H,)
    Outputs:
        h: hidden states after the forward step with shape (N, T, H)
        cache: cache used for back-prop
    """
    N, T, D = x.shape
    _, H = h0.shape
    x = np.swapaxes(x, 0, 1)  # swap axes for easier loops
    h = np.zeros((T, N, H))
    prev_c = np.zeros((N, H))
    prev_h = h0
    cache = []
    for i in range(T):
        prev_h, prev_c, cache_ = lstm_step_forward(x[i], prev_h, prev_c, Wx, Wh, b)
        h[i] = prev_h
        cache.append(cache_)
    h = np.swapaxes(h, 0, 1)  # swap back for correct format
    return h, cache


def lstm_backward(dh, cache):
    """
    Run a backward pass for LSTM from the derivative of all hidden states dh.
    Arguments:
        dh: gradient of all hidden states with shape (N, T, H)
        cache: cache used for back-prop
    Outputs:
        dx: gradient of input data with shape (N, T, D)
        dh0: gradient of initial hidden state with shape (N, H)
        dWx: gradient of weight matrix for input data with shape (D, H)
        dWh: gradient of weight matrix for hidden states with shape (H, H)
        db: gradient of bias with shape (H,)
    """
    dh = dh.copy()  # very important!
    N, T, H = dh.shape
    D = cache[0][0].shape[-1]  # extract parameter D
    dh = np.swapaxes(dh, 0, 1)  # swap axes for easier loops
    # initialization of derivatives
    dx, dWx, dWh, db, dprev_h = np.zeros((T, N, D)), np.zeros((D, 4*H)), np.zeros((H, 4*H)), np.zeros((4*H,)), np.zeros((N, H))
    dprev_c = np.zeros(dprev_h.shape)
    for t in reversed(range(T)):
        dh[t] += dprev_h
        dx[t], dprev_h, dprev_c, dWx_, dWh_, db_ = lstm_step_backward(dh[t], dprev_c, cache[t])
        dWx += dWx_
        dWh += dWh_
        db += db_
    dh0 = dprev_h
    dx = np.swapaxes(dx, 0, 1)
    return dx, dh0, dWx, dWh, db


def temporal_affine_forward(x, W, b):
    """
    Run a forward pass for temporal affine layer. The dimensions are consistent with RNN/LSTM forward passes.
    Arguments:
        x: input data with shape (N, T, D)
        W: weight matrix for input data with shape (D, M)
        b: bias with shape (M,)
    Outputs:
        out: output data with shape (N, T, M)
        cache: cache for back-prop
    """
    N, T, D = x.shape
    M = b.shape[0]
    out = np.dot(x.reshape(N * T, D), W).reshape(N, T, M) + b
    cache = x, W, b, out
    return out, cache


def temporal_affine_backward(dout, cache):
    """
    Run a backward pass for temporal affine layer. The dimensions are consistent with RNN/LSTM forward passes.
    Arguments:
        dout: gradient of output data with shape (N, T, M)
        cache: cache for back-prop
    Outputs:
        dx: gradient of input data with shape (N, T, D)
        dW: gradient of weight matrix with shape (D, M)
        db: gradient of bias with shape (M,)
    """
    x, W, b, out = cache
    N, T, D = x.shape
    M = b.shape[0]
    dx = np.dot(dout.reshape(N * T, M), W.T).reshape(N, T, D)
    dw = np.dot(dout.reshape(N * T, M).T, x.reshape(N * T, D)).T
    db = dout.sum(axis=(0, 1))
    return dx, dw, db


def temporal_softmax_loss(x, y, mask):
    """
    This function is adapted from CS231n.
    A temporal version of softmax loss for use in RNNs.
    The vocabulary has size V for each time step of a time series of length T, with a batch size of N.
    Cross-entropy loss is calculated, summed and averaged over all time steps across the batch.
    Arguments:
    - x: input scores for all vocabulary elements with shape of (N, T, V)
    - y: ground-truth indices at each time step with shape of (N, T), each element of which is in [0, V)
    - mask: boolean array with shape of (N, T) indicating whether the scores at x[n, t] should contribute to the loss
    Outputs:
    - loss: float of loss
    - dx: gradient of loss with respect to scores x
    """
    N, T, V = x.shape
    x_flat = x.reshape(N * T, V)
    y_flat = y.reshape(N * T)
    mask_flat = mask.reshape(N * T)
    probs = np.exp(x_flat - np.max(x_flat, axis=1, keepdims=True))
    probs /= np.sum(probs, axis=1, keepdims=True)
    loss = -np.sum(mask_flat * np.log(probs[np.arange(N * T), y_flat])) / N
    dx_flat = probs.copy()
    dx_flat[np.arange(N * T), y_flat] -= 1
    dx_flat /= N
    dx_flat *= mask_flat[:, None]
    dx = dx_flat.reshape(N, T, V)
    return loss, dx


def word_embedding_forward(x, W):
    """
    Run a forward pass for word embeddings.
    The dimensions are consistent with parameters in temporal softmax loss.
    Arguments:
    - x: integer array with shape of (N, T) giving indices of words, each of which lies in [0, V)
    - W: weight matrix with shape of (V, D) giving word vectors for all words.
    Outputs:
    - out: array with shape of (N, T, D) giving word vectors for all input words.
    - cache: cache for back-prop
    """
    out = W[x, :]
    cache = x, W
    return out, cache


def word_embedding_backward(dout, cache):
    """
    Run a backward pass for word embeddings.
    The dimensions are consistent with parameters in temporal softmax loss.
    Arguments:
    - dout: gradient of output with shape of (N, T, D)
    - cache: cache used for back-prop
    Outputs:
    - dW: gradient of weight matrix with shape of (V, D)
    """
    x, W = cache
    dW = np.zeros(W.shape)
    np.add.at(dW, x, dout)
    return dW

import numpy as np
from builtins import object
#from layers import *


class RNNImageCaption(object):
    """
    Define a RNN_image_captioning class, the instance of which outputs captions given image features.
    """
    def __init__(self, word_to_idx, input_dim=512, wordvec_dim=128, hidden_dim=128, cell_type='rnn', dtype=np.float32):
        """
        Initialization of instance in RNN_image_captioning.
        Arguments:
             word_to_idx: dictionary of word-index vocabulary table with V entries
             input_dim: input image feature dimension D
             wordvec_dim: word vector dimension W
             hidden_dim: hidden state dimension H in RNN
             cell_type: either 'rnn' or 'lstm' setting the RNN type
             dtype: numpy datatype - float32 for training and float64 for numerical gradient check
        """
        if cell_type not in ['rnn', 'lstm']:
            raise ValueError('Unknown cell type of "%s"' % cell_type)
        self.cell_type = cell_type
        self.input_dim = input_dim
        self.wordvec_dim = wordvec_dim
        self.hidden_dim = hidden_dim
        self.dtype = dtype
        self.params = {}
        # save indices of NULL, START and END
        self.null = word_to_idx['<NULL>']
        self.start = word_to_idx.get('<START>')
        self.end = word_to_idx.get('<END>')
        # initialization of word vectors
        self.params['W_embed'] = np.random.randn(len(word_to_idx), wordvec_dim) / 100
        # initialization of hidden state projection parameters for CNN
        self.params['W_proj'] = np.random.randn(input_dim, hidden_dim) / np.sqrt(input_dim)
        self.params['b_proj'] = np.zeros(hidden_dim)
        # initialization of RNN parameters
        dimension_factor = {'rnn':1, 'lstm':4}[cell_type]
        self.params['Wx'] = np.random.randn(wordvec_dim, dimension_factor * hidden_dim) / np.sqrt(wordvec_dim)
        self.params['Wh'] = np.random.randn(hidden_dim, dimension_factor * hidden_dim) / np.sqrt(hidden_dim)
        self.params['b'] = np.zeros(dimension_factor * hidden_dim)
        # initialization of vocab weights
        self.params['W_vocab'] = np.random.randn(hidden_dim, len(word_to_idx)) / np.sqrt(hidden_dim)
        self.params['b_vocab'] = np.zeros(len(word_to_idx))
        # cast dtype
        for para_name, param in self.params.items():
            self.params[para_name] = param.astype(self.dtype)

    def loss(self, features, captions):
        """
        Calculate the training loss for captioning RNN.
        Arguments:
             features: input image features with shape of (N, D)
             captions: an integer array of ground-truth captions with shape of (N, T) with elements in [0, V)
        Outputs:
            loss: float of loss value
            grads: dictionary of gradients of parameters in self.params
        """
        # Cut out the last words of captions as input, and the expected output is everything but the first words.
        # Note that the first element of captions would be the START token.
        captions_in = captions[:, :-1]  # entire caption except for the last words
        captions_out = captions[:, 1:]  # entire caption except for the first words

        mask = (captions_out != self.null)  # Indicating non-NULL indices to be used

        # unpack initialized parameters
        W_embed = self.params['W_embed']
        W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
        Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']
        W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

        # loss calculation
        h0 = np.dot(features, W_proj) + b_proj  # initial hidden state from image features - (N, H)
        x, cache_embed = word_embedding_forward(captions_in, W_embed)  # transform words in captions_in - (N, T, W)
        if self.cell_type == 'rnn':  # use Vanilla RNN to produce hidden states from input word vectors - (N, T, H)
            h, cache_rnn = rnn_forward(x, h0, Wx, Wh, b)
        else:                        # use LSTM to produce hidden states from input word vectors - (N, T, H)
            h, cache_lstm = lstm_forward(x, h0, Wx, Wh, b)
        scores, cache_temporal = temporal_affine_forward(h, W_vocab, b_vocab)  # compute scores - (N, T, V)
        loss, dscores = temporal_softmax_loss(scores, captions_out, mask)  # compute loss, ignoring <NULL> tokens

        # gradients calculation using back-props
        dh, dW_vocab, db_vocab = temporal_affine_backward(dscores, cache_temporal)
        if self.cell_type == 'rnn':
            dx, dh0, dWx, dWh, db = rnn_backward(dh, cache_rnn)
        else:
            dx, dh0, dWx, dWh, db = lstm_backward(dh, cache_lstm)
        dW_embed = word_embedding_backward(dx, cache_embed)
        dW_proj = np.dot(features.T, dh0)
        db_proj = np.sum(dh0, axis=0)

        # put gradients into dictionary
        # note that the keys have the same strings as in parameters for convenience during extraction
        grads = {}
        grads['W_embed'] = dW_embed
        grads['W_proj'], grads['b_proj'] = dW_proj, db_proj
        grads['Wx'], grads['Wh'], grads['b'] = dWx, dWh, db
        grads['W_vocab'], grads['b_vocab'] = dW_vocab, db_vocab

        return loss, grads

    def generate_captions(self, features, max_length=30):
        """
        Generate captions from the image features.
        Arguments:
             features: input image features with shape of (N, D)
             max_length: maximum length T of generated caption
        Outputs:
            captions: array of generated captions with shape of (N, T) and each element lies in [0, V)
        """
        N, D = features.shape
        captions = self.null * np.ones((N, max_length), dtype=np.int32)  # initialize captions to <NULL>s

        # Unpack parameters
        W_embed = self.params['W_embed']
        W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
        Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']
        W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

        # Generate captions
        h0 = np.dot(features, W_proj) + b_proj  # initial hidden state from image features - (N, H)
        captions[:, 0] = self.start  # set <START> tokens to the generated captions
        capt = self.start * np.ones((N, 1), dtype=np.int32)  # set <START> tokens to the generated word for each time step
        prev_h = h0
        prev_c = np.zeros(h0.shape)  # initialize the cell state to zeros
        for t in range(max_length):
            x, _ = word_embedding_forward(capt, W_embed)  # word embedding
            # get next hidden state
            if self.cell_type == 'rnn':
                h, _ = rnn_step_forward(np.squeeze(x), prev_h, Wx, Wh, b)  # note: squeeze for dimension match
                prev_h = h
            else:
                h, c, _ = lstm_step_forward(np.squeeze(x), prev_h, prev_c, Wx, Wh, b)
                prev_h = h
                prev_c = c
            scores, _ = temporal_affine_forward(h[:, np.newaxis, :], W_vocab, b_vocab)  # note: new axis for dimension match
            capt = np.squeeze(np.argmax(scores, axis=2))
            captions[:, t] = capt  # store generated captions

        return captions


# This code is modified from CS231n.
import json
import numpy as np
import h5py
import urllib.request, urllib.error, urllib.parse, tempfile, os
from imageio import imread

DATA_DIR = 'data/coco_captioning'  # define the dataset path


def load_coco_dataset(data_dir=DATA_DIR, PCA_features=True, max_train=None):
    """
    Load Microsoft COCO dataset.
    Arguments:
        data_dir: path to the dataset
        PCA_features: whether use PCA features
        max_train: max number of training data if only a subset is needed
    Outputs:
        data: dictionary containing different datasets with their names
    """
    data = {}
    caption_file = os.path.join(data_dir, 'coco2014_captions.h5')
    with h5py.File(caption_file, 'r') as f:  # read caption file with h5py
        for k, v in f.items():
            data[k] = np.asarray(v)

    # extract training features
    if PCA_features:
        train_feature_file = os.path.join(data_dir, 'train2014_vgg16_fc7_pca.h5')
    else:
        train_feature_file = os.path.join(data_dir, 'train2014_vgg16_fc7.h5')
    with h5py.File(train_feature_file, 'r') as f:
        data['train_features'] = np.asarray(f['features'])

    # extract validation features
    if PCA_features:
        val_feature_file = os.path.join(data_dir, 'val2014_vgg16_fc7_pca.h5')
    else:
        val_feature_file = os.path.join(data_dir, 'val2014_vgg16_fc7.h5')
    with h5py.File(val_feature_file, 'r') as f:
        data['val_features'] = np.asarray(f['features'])

    # extract index-to-word and word-to-index into dictionary
    dict_file = os.path.join(data_dir, 'coco2014_vocab.json')
    with open(dict_file, 'r') as f:
        dict_data = json.load(f)
        for k, v in dict_data.items():
            data[k] = v

    # read image files from website, note that some of them might not be available for now
    train_url_file = os.path.join(data_dir, 'train2014_urls.txt')  # this file includes urls for the training images
    with open(train_url_file, 'r') as f:
        train_urls = np.asarray([line.strip() for line in f])
    data['train_urls'] = train_urls

    val_url_file = os.path.join(data_dir, 'val2014_urls.txt')  # this file includes urls for the validation images
    with open(val_url_file, 'r') as f:
        val_urls = np.asarray([line.strip() for line in f])
    data['val_urls'] = val_urls

    # Maybe subsample the training data
    if max_train is not None:
        num_train = data['train_captions'].shape[0]
        mask = np.random.randint(num_train, size=max_train)
        data['train_captions'] = data['train_captions'][mask]
        data['train_image_idxs'] = data['train_image_idxs'][mask]

    return data


def sample_coco_minibatch(data, batch_size=100, split='train'):
    """
    Sample a small amount of data.
    Arguments:
        data: loaded dataset from COCO
        batch_size: int for batch size
        split: string of either 'train' or 'val' indicating training/validation set
    Outputs:
        captions: ground truth captions of the images
        image_features: features of the images
        urls: image urls for image display
    """
    split_size = data['%s_captions' % split].shape[0]
    mask = np.random.choice(split_size, batch_size)
    captions = data['%s_captions' % split][mask]
    image_idxs = data['%s_image_idxs' % split][mask]
    image_features = data['%s_features' % split][image_idxs]
    urls = data['%s_urls' % split][image_idxs]
    return captions, image_features, urls


def decode_captions(captions, idx_to_word):
    """
    Decode output captions into worded captions.
    Arguments:
        captions: output captions to be decoded
        idx_to_word: dictionary of word-index vocabulary table
    Outputs:
        decoded: decoded worded captions
    """
    singleton = False
    if captions.ndim == 1:
        singleton = True
        captions = captions[None]
    decoded = []
    N, T = captions.shape
    for i in range(N):
        words = []
        for t in range(T):
            word = idx_to_word[captions[i, t]]
            if word != '<NULL>':
                words.append(word)
            if word == '<END>':
                break
        decoded.append(' '.join(words))
    if singleton:
        decoded = decoded[0]
    return decoded


def image_from_url(url):
    """
    Read an image from a URL. Returns a numpy array with the pixel data.
    Arguments:
        url: urls for images for display
    Outputs:
        img: numpy array for the image
    """
    try:
        f = urllib.request.urlopen(url)
        _, fname = tempfile.mkstemp()
        with open(fname, 'wb') as ff:
            ff.write(f.read())
        img = imread(fname)
        # os.remove(fname)
        return img
    except urllib.error.HTTPError as e:
        print('HTTP Error: ', e.code, url)
    except urllib.error.URLError as e:
        print('URL Error: ', e.reason, url)


# This file includes SGD and Adam for parameter update.
import numpy as np
def sgd(w, dw, params={}):
    """
    Perform Vanilla SGD for parameter update.
    Arguments:
        w: numpy array of current weight
        dw: numpy array of gradient of loss w.r.t. current weight
        params: dictionary containing hyper-parameters
            - lr: float of learning rate
    Outputs:
        next_w: updated weight
        params: updated dictionary of hyper-parameters
    """
    # set default parameters
    params.setdefault('lr', 1e-2)
    # update w
    next_w = w - params['lr'] * dw

    return next_w, params

def adam(w, dw, params={}):
    """
    Perform Adam update rule for parameter update.
    This update rule incorporates moving averages of both the gradient and its square and a bias correction term.
    Arguments:
        w: numpy array of current weight
        dw: numpy array of gradient of loss w.r.t. current weight
        params: dictionary containing hyper-parameters
            - lr: float of learning rate
            - beta1: float of decay rate for moving average of first moment of gradient
            - beta2: float of decay rate for moving average of second moment of gradient
            - epsilon: float of a small value used for smoothing to avoid dividing by zero
            - m: numpy array of moving average of gradient with the sameshape of w
            - v: moving average of squared gradient with the sameshape of w
            - t: int of iteration number
    Outputs:
        next_w: updated weight
        params: updated dictionary of hyper-parameters
    """
    # set default parameters
    params.setdefault('lr', 1e-2)
    params.setdefault('beta1', 0.9)
    params.setdefault('beta2', 0.999)
    params.setdefault('epsilon', 1e-8)
    params.setdefault('m', np.zeros_like(w))
    params.setdefault('v', np.zeros_like(w))
    params.setdefault('t', 0)
    # update w
    lr, beta1, beta2, epsilon, m, v, t = \
        params['lr'], params['beta1'], params['beta2'], params['epsilon'], params['m'], params['v'], params['t']
    m = beta1 * m + (1 - beta1) * dw
    v = beta2 * v + (1 - beta2) * dw ** 2
    t += 1
    alpha = params['lr'] * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
    w -= alpha * (m / (np.sqrt(v) + epsilon))
    params['t'] = t
    params['m'] = m
    params['v'] = v
    next_w = w

    return next_w, params


import numpy as np
from builtins import object
#import update_method


class CaptionTrain(object):
    """
    This class defines the training of the caption generator using SGD.
    """
    def __init__(self, data, model, **kwargs):
        """
        Initialization of CaptionTrain instance.
        Arguments:
            data: dictionary of training and validation dataset
            model: model object from RNNImageCaption
            optional arguments:
                update: string of update method of either 'sgd' or 'adam'
                update_params: dictionary of hyper-parameters for update method
                lr_decay: float of learning rate decay
                batch_size: integer of batch size for loss and gradient computation
                num_epochs: integer of number of epochs
                print_freq: integer of loss printing frequency steps
        """
        self.data = data
        self.model = model

        self.update = kwargs.pop('update', 'sgd')
        self.update_params = kwargs.pop('update_params', {})
        self.lr_decay = kwargs.pop('lr_decay', 1.0)
        self.batch_size = kwargs.pop('batch_size', 100)
        self.num_epochs = kwargs.pop('num_epochs', 10)
        self.print_freq = kwargs.pop('print_freq', 10)
        # throw error if more parameters are detected
        if len(kwargs) > 0:
            unreg_args = ', '.join('"%s"' % k for k in list(kwargs.keys()))
            raise ValueError('Unrecognized arguments %s' % unreg_args)
        # throw error if the update method is not supported
        if self.update not in ['sgd', 'adam']:
            raise ValueError('Unsupported update method %s' % self.update)
        #self.update_method = getattr(update_method, self.update)  # get update method from file "update_method.py"
        if self.update == 'sgd':
            self.update_method = sgd
        else:
            self.update_method = adam


        # initialize training parameters
        self.epoch = 0
        self.best_params = {}
        self.best_val_acc = 0.0
        self.loss_history = []
        self.train_acc_history = []
        self.val_acc_history = []

        # perform a deep copy of the update method parameters for each model parameter
        self.update_params_all = {}
        for param in self.model.params:
            self.update_params_all[param] = {k:v for k, v in self.update_params.items()}

    def train(self):
        """
        Train the model.
        """
        num_train = self.data['train_features'].shape[0]
        num_iter_epoch = max(num_train // self.batch_size, 1)
        num_iters = num_iter_epoch * self.num_epochs

        for t in range(num_iters):
            self._gradient_update()
            if t % self.print_freq == 0:
                print('(Iteration %d / %d) loss: %f' % (t + 1, num_iters, self.loss_history[-1]))
            if (t + 1) % num_iter_epoch == 0:
                self.epoch += 1
                for param in self.update_params_all:
                    self.update_params_all[param]['lr'] *= self.lr_decay

    def _gradient_update(self):
        """
        Conduct a gradient update for training.
        """
        # sample minibatch
        captions, image_features, urls = sample_coco_minibatch(self.data, self.batch_size, split='train')
        # compute loss and gradient
        loss, gradients = self.model.loss(image_features, captions)
        self.loss_history.append(loss)
        # parameter update
        for para_name, param in self.model.params.items():
            dparam = gradients[para_name]
            next_param, params = self.update_method(param, dparam, self.update_params_all[para_name])
            self.model.params[para_name] = next_param
            self.update_params_all[para_name] = params

In [None]:
import json
import numpy as np
import h5py
import urllib.request, urllib.error, urllib.parse, tempfile, os
from imageio import imread

DATA_DIR = 'coco_captioning'  # define the dataset path


def load_coco_dataset(data_dir=DATA_DIR, PCA_features=True, max_train=None):
    """
    Load Microsoft COCO dataset.
    Arguments:
        data_dir: path to the dataset
        PCA_features: whether use PCA features
        max_train: max number of training data if only a subset is needed
    Outputs:
        data: dictionary containing different datasets with their names
    """
    data = {}
    caption_file = os.path.join(data_dir, 'coco2014_captions.h5')
    with h5py.File(caption_file, 'r') as f:  # read caption file with h5py
        for k, v in f.items():
            data[k] = np.asarray(v)

    # extract training features
    if PCA_features:
        train_feature_file = os.path.join(data_dir, 'train2014_vgg16_fc7_pca.h5')
    else:
        train_feature_file = os.path.join(data_dir, 'train2014_vgg16_fc7.h5')
    with h5py.File(train_feature_file, 'r') as f:
        data['train_features'] = np.asarray(f['features'])

    # extract validation features
    if PCA_features:
        val_feature_file = os.path.join(data_dir, 'val2014_vgg16_fc7_pca.h5')
    else:
        val_feature_file = os.path.join(data_dir, 'val2014_vgg16_fc7.h5')
    with h5py.File(val_feature_file, 'r') as f:
        data['val_features'] = np.asarray(f['features'])

    # extract index-to-word and word-to-index into dictionary
    dict_file = os.path.join(data_dir, 'coco2014_vocab.json')
    with open(dict_file, 'r') as f:
        dict_data = json.load(f)
        for k, v in dict_data.items():
            data[k] = v

    # read image files from website, note that some of them might not be available for now
    train_url_file = os.path.join(data_dir, 'train2014_urls.txt')  # this file includes urls for the training images
    with open(train_url_file, 'r') as f:
        train_urls = np.asarray([line.strip() for line in f])
    data['train_urls'] = train_urls

    val_url_file = os.path.join(data_dir, 'val2014_urls.txt')  # this file includes urls for the validation images
    with open(val_url_file, 'r') as f:
        val_urls = np.asarray([line.strip() for line in f])
    data['val_urls'] = val_urls

    # Maybe subsample the training data
    if max_train is not None:
        num_train = data['train_captions'].shape[0]
        mask = np.random.randint(num_train, size=max_train)
        data['train_captions'] = data['train_captions'][mask]
        data['train_image_idxs'] = data['train_image_idxs'][mask]

    return data

data = load_coco_dataset(PCA_features=True)
subset_data = load_coco_dataset(max_train=10000)
data['train_captions'].shape, data['train_features'].shape


sub_rnn_model = RNNImageCaption(cell_type='rnn', word_to_idx=data['word_to_idx'],
          input_dim=data['train_features'].shape[1], hidden_dim=512, wordvec_dim=256,)
# train model
sub_rnn_solver = CaptionTrain(subset_data, sub_rnn_model, update='adam', num_epochs=50,
                                batch_size=100, update_params={'lr': 5e-3}, lr_decay=0.95, print_freq=100)
sub_rnn_solver.train()

(Iteration 1 / 41350) loss: 76.808959
(Iteration 101 / 41350) loss: 69.643441
(Iteration 201 / 41350) loss: 64.993081
(Iteration 301 / 41350) loss: 58.692508
(Iteration 401 / 41350) loss: 55.621276
(Iteration 501 / 41350) loss: 52.129130
(Iteration 601 / 41350) loss: 52.450638
(Iteration 701 / 41350) loss: 51.360152
(Iteration 801 / 41350) loss: 50.882756
(Iteration 901 / 41350) loss: 48.966848
(Iteration 1001 / 41350) loss: 48.850169
(Iteration 1101 / 41350) loss: 48.630443
(Iteration 1201 / 41350) loss: 47.669474
(Iteration 1301 / 41350) loss: 48.499142
(Iteration 1401 / 41350) loss: 48.282832
(Iteration 1501 / 41350) loss: 47.180309
(Iteration 1601 / 41350) loss: 45.866480
(Iteration 1701 / 41350) loss: 47.075227
(Iteration 1801 / 41350) loss: 46.000001
(Iteration 1901 / 41350) loss: 45.668971
(Iteration 2001 / 41350) loss: 44.584328
(Iteration 2101 / 41350) loss: 47.253761
(Iteration 2201 / 41350) loss: 45.015524
(Iteration 2301 / 41350) loss: 44.012747
(Iteration 2401 / 41350) los

# Ablation

In [7]:
#make directory and get annotations for training and testing
!mkdir data
!wget http://msvocds.blob.core.windows.net/annotations-1-0-3/captions_train-val2014.zip -P ./data/
!unzip ./data/captions_train-val2014.zip -d ./data/
!rm ./data/captions_train-val2014.zip

!mkdir data/images
!mkdir data/images/train
!mkdir data/images/test

mkdir: cannot create directory ‘data’: File exists
--2022-08-23 14:07:35--  http://msvocds.blob.core.windows.net/annotations-1-0-3/captions_train-val2014.zip
Resolving msvocds.blob.core.windows.net (msvocds.blob.core.windows.net)... 20.60.195.163
Connecting to msvocds.blob.core.windows.net (msvocds.blob.core.windows.net)|20.60.195.163|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19673183 (19M) [application/octet-stream Charset=UTF-8]
Saving to: ‘./data/captions_train-val2014.zip’


2022-08-23 14:07:40 (3.94 MB/s) - ‘./data/captions_train-val2014.zip’ saved [19673183/19673183]

Archive:  ./data/captions_train-val2014.zip
replace ./data/annotations/captions_train2014.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: ./data/annotations/captions_train2014.json  
  inflating: ./data/annotations/captions_val2014.json  


In [12]:
!ls data/annotations

captions_train2014.json  captions_val2014.json


In [8]:
import csv
from shutil import copyfile
from pycocotools.coco import COCO
from tqdm import tqdm

coco = COCO('./data/annotations/captions_train2014.json')

#get ids of training images
with open('TrainImageIds.csv', 'r') as f:
    reader = csv.reader(f)
    trainIds = list(reader)
    
trainIds = [int(i) for i in trainIds[0]]
print(len(trainIds))

loading annotations into memory...
Done (t=0.89s)
creating index...
index created!


FileNotFoundError: ignored

In [2]:
import torch
import torch.nn as nn
from torchvision import models

class EncoderCNN(nn.Module):
    def __init__(self, embed_size = 1024):
        super(EncoderCNN, self).__init__()
        
        # get the pretrained densenet model
        self.densenet = models.densenet121(pretrained=True)
        
        # replace the classifier with a fully connected embedding layer
        self.densenet.classifier = nn.Linear(in_features=1024, out_features=1024)
        
        # add another fully connected layer
        self.embed = nn.Linear(in_features=1024, out_features=embed_size)
        
        # dropout layer
        self.dropout = nn.Dropout(p=0.5)
        
        # activation layers
        self.prelu = nn.PReLU()
        
    def forward(self, images):
        
        # get the embeddings from the densenet
        densenet_outputs = self.dropout(self.prelu(self.densenet(images)))
        
        # pass through the fully connected
        embeddings = self.embed(densenet_outputs)
        
        return embeddings

class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super(DecoderRNN, self).__init__()
        
        # define the properties
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        
        # lstm cell
        self.lstm_cell = nn.LSTMCell(input_size=embed_size, hidden_size=hidden_size)
    
        # output fully connected layer
        self.fc_out = nn.Linear(in_features=self.hidden_size, out_features=self.vocab_size)
    
        # embedding layer
        self.embed = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.embed_size)
    
        # activations
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, features, captions):
        
        # batch size
        batch_size = features.size(0)
        
        # init the hidden and cell states to zeros
        hidden_state = torch.zeros((batch_size, self.hidden_size)).cuda()
        cell_state = torch.zeros((batch_size, self.hidden_size)).cuda()
    
        # define the output tensor placeholder
        outputs = torch.empty((batch_size, captions.size(1), self.vocab_size)).cuda()

        # embed the captions
        captions_embed = self.embed(captions)
        
        # pass the caption word by word
        for t in range(captions.size(1)):

            # for the first time step the input is the feature vector
            if t == 0:
                hidden_state, cell_state = self.lstm_cell(features, (hidden_state, cell_state))
                
            # for the 2nd+ time step, using teacher forcer
            else:
                hidden_state, cell_state = self.lstm_cell(captions_embed[:, t, :], (hidden_state, cell_state))
            
            # output of the attention mechanism
            out = self.fc_out(hidden_state)
            
            # build the output tensor
            outputs[:, t, :] = out
    
        return outputs

In [3]:
# get the losses for vizualization
losses = list()
val_losses = list()

for epoch in range(1, 10+1):
    
    for i_step in range(1, total_step+1):
        
        # zero the gradients
        decoder.zero_grad()
        encoder.zero_grad()
        
        # set decoder and encoder into train mode
        encoder.train()
        decoder.train()
        
        # Randomly sample a caption length, and sample indices with that length.
        indices = train_data_loader.dataset.get_train_indices()
        
        # Create and assign a batch sampler to retrieve a batch with the sampled indices.
        new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
        train_data_loader.batch_sampler.sampler = new_sampler
        
        # Obtain the batch.
        images, captions = next(iter(train_data_loader))
        
        # make the captions for targets and teacher forcer
        captions_target = captions[:, 1:].to(device)
        captions_train = captions[:, :captions.shape[1]-1].to(device)

        # Move batch of images and captions to GPU if CUDA is available.
        images = images.to(device)
        
        # Pass the inputs through the CNN-RNN model.
        features = encoder(images)
        outputs = decoder(features, captions_train)
        
        # Calculate the batch loss
        loss = criterion(outputs.view(-1, vocab_size), captions_target.contiguous().view(-1))
        
        # Backward pass
        loss.backward()
        
        # Update the parameters in the optimizer
        optimizer.step()
        
        # - - - Validate - - -
        # turn the evaluation mode on
        with torch.no_grad():
            
            # set the evaluation mode
            encoder.eval()
            decoder.eval()

            # get the validation images and captions
            val_images, val_captions = next(iter(val_data_loader))

            # define the captions
            captions_target = val_captions[:, 1:].to(device)
            captions_train = val_captions[:, :val_captions.shape[1]-1].to(device)

            # Move batch of images and captions to GPU if CUDA is available.
            val_images = val_images.to(device)

            # Pass the inputs through the CNN-RNN model.
            features = encoder(val_images)
            outputs = decoder(features, captions_train)

            # Calculate the batch loss.
            val_loss = criterion(outputs.view(-1, vocab_size), captions_target.contiguous().view(-1))
        
        # append the validation loss and training loss
        val_losses.append(val_loss.item())
        losses.append(loss.item())
        
        # save the losses
        np.save('losses', np.array(losses))
        np.save('val_losses', np.array(val_losses))
        
        # Get training statistics.
        stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Val Loss: %.4f' % (epoch, num_epochs, i_step, total_step, loss.item(), val_loss.item())
        
        # Print training statistics (on same line).
        print('\r' + stats, end="")
        sys.stdout.flush()
            
    # Save the weights.
    if epoch % save_every == 0:
        print("\nSaving the model")
        torch.save(decoder.state_dict(), os.path.join('./models', 'decoder-%d.pth' % epoch))
        torch.save(encoder.state_dict(), os.path.join('./models', 'encoder-%d.pth' % epoch))

NameError: ignored