<a href="https://colab.research.google.com/github/parmarsuraj99/keras-transformer-flex/blob/master/VirTex/CaptFormer_Exp_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[VirTex - Learning Visual Representations from Textual Annotations](https://github.com/kdexd/virtex)

Karan Desai and Justin Johnson
University of Michigan

Preprint: [arxiv.org/abs/2006.06666](https://arxiv.org/abs/2006.06666)

### Installing HuggingFace Transformers to use a pretrained tokenizer

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |▍                               | 10kB 27.0MB/s eta 0:00:01[K     |▉                               | 20kB 3.4MB/s eta 0:00:01[K     |█▎                              | 30kB 4.5MB/s eta 0:00:01[K     |█▊                              | 40kB 4.8MB/s eta 0:00:01[K     |██▏                             | 51kB 4.0MB/s eta 0:00:01[K     |██▋                             | 61kB 4.5MB/s eta 0:00:01[K     |███                             | 71kB 4.8MB/s eta 0:00:01[K     |███▍                            | 81kB 5.2MB/s eta 0:00:01[K     |███▉                            | 92kB 5.5MB/s eta 0:00:01[K     |████▎                           | 102kB 5.4MB/s eta 0:00:01[K     |████▊                           | 112kB 5.4MB/s eta 0:00:01[K     |█████▏                          | 122kB 5.4M

### Importing the required TensorFlow modules and a visual backbone (ResNet50)

In [2]:
import gc
import functools
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.keras.layers as L
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model

from tensorflow.keras.preprocessing.image import load_img, img_to_array, array_to_img

In [3]:
# Headless ResNet50 (include_top=False) with ImageNet weights for 224x224 RGB input;
# it is passed to CaptioningModel further down as the visual backbone.
visual_backbone = tf.keras.applications.ResNet50(input_shape=(224, 224, 3), include_top=False, weights="imagenet")

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


## Transformer in Keras

Heavily inspired by this TensorFlow Example [Transformer model for language understanding](https://www.tensorflow.org/tutorials/text/transformer)

In [4]:
def scaled_dot_product_attention(query, key, value, mask):
  """Apply scaled dot-product attention and return the attended values.

  Computes ``softmax(query @ key^T / sqrt(d_k) + mask * -1e9) @ value``,
  where ``d_k`` is the size of the last axis of ``key``.

  Args:
    query: query tensor; its last axis must match the last axis of ``key``.
    key: key tensor.
    value: value tensor, combined using the attention weights.
    mask: tensor broadcastable to the attention logits in which non-zero
      entries mark positions to suppress (``-1e9`` is added to their logits
      before the softmax), or ``None`` to attend everywhere.

  Returns:
    The attention-weighted combination of ``value``. The attention weights
    themselves are not returned.
  """
  # Scale the raw similarity scores by sqrt(d_k).
  key_dim = tf.cast(tf.shape(key)[-1], tf.float32)
  scores = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(key_dim)

  # Masked positions get a very large negative logit -> ~0 weight after softmax.
  if mask is not None:
    scores = scores + mask * -1e9

  # Normalise over the key axis (last axis).
  weights = tf.nn.softmax(scores, axis=-1)
  return tf.matmul(weights, value)

In [5]:
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head attention layer.

  Queries, keys and values each go through their own dense projection to
  ``d_model`` units, are split into ``num_heads`` heads of size
  ``d_model // num_heads``, attended with ``scaled_dot_product_attention``,
  merged back together and passed through a final dense projection.
  """

  def __init__(self, d_model, num_heads, name="multi_head_attention"):
    """Create the projections.

    Args:
      d_model: width of every projection; must be divisible by ``num_heads``.
      num_heads: number of attention heads.
      name: Keras layer name.
    """
    super().__init__(name=name)
    self.num_heads = num_heads
    self.d_model = d_model

    # Every head receives an equal slice of the model dimension.
    assert d_model % self.num_heads == 0
    self.depth = d_model // self.num_heads

    # Input projections for query / key / value.
    self.query_dense = tf.keras.layers.Dense(units=d_model)
    self.key_dense = tf.keras.layers.Dense(units=d_model)
    self.value_dense = tf.keras.layers.Dense(units=d_model)

    # Output projection applied after the heads are merged.
    self.dense = tf.keras.layers.Dense(units=d_model)

  def split_heads(self, inputs, batch_size):
    """Reshape (batch, seq, d_model) into (batch, num_heads, seq, depth)."""
    per_head = tf.reshape(
        inputs, shape=(batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, query, key, value, mask):
    """Attend ``query`` over ``key``/``value``.

    Args:
      query, key, value: batch-first tensors (the batch size is read from
        axis 0 of ``query``).
      mask: forwarded unchanged to ``scaled_dot_product_attention``;
        may be ``None``.

    Returns:
      Tensor reshaped to (batch, -1, d_model) after the output projection.
    """
    n_batch = tf.shape(query)[0]

    # Project, then split each projection into heads.
    q_heads = self.split_heads(self.query_dense(query), n_batch)
    k_heads = self.split_heads(self.key_dense(key), n_batch)
    v_heads = self.split_heads(self.value_dense(value), n_batch)

    print(f"Q:{q_heads.shape}, K:{k_heads.shape}, V:{v_heads.shape}")
    attended = scaled_dot_product_attention(q_heads, k_heads, v_heads, mask)

    # (batch, heads, seq, depth) -> (batch, seq, heads, depth) -> (batch, seq, d_model)
    merged = tf.reshape(tf.transpose(attended, perm=[0, 2, 1, 3]),
                        (n_batch, -1, self.d_model))

    return self.dense(merged)

In [6]:
# Testing MHA: queries (y2) and keys/values (y) deliberately have different sequence lengths

temp_mha = MultiHeadAttention(d_model=512, num_heads=8)
y = tf.random.uniform((20, 30, 512))  # (batch_size, key/value sequence length, d_model)
y2 = tf.random.uniform((20, 32, 512))  # (batch_size, query sequence length, d_model)
out = temp_mha(y2, y, y, mask=None)
out.shape

Q:(20, 8, 32, 64), K:(20, 8, 30, 64), V:(20, 8, 30, 64)


TensorShape([20, 32, 512])

In [7]:
def point_wise_feed_forward_network(d_model, dff):
  """Build the two-layer position-wise feed-forward block.

  Args:
    d_model: number of units of the second (output) dense layer.
    dff: number of units of the first, ReLU-activated dense layer.

  Returns:
    A ``tf.keras.Sequential`` of ``Dense(dff, relu)`` followed by
    ``Dense(d_model)``.
  """
  expand = tf.keras.layers.Dense(dff, activation='relu')  # (..., dff)
  project = tf.keras.layers.Dense(d_model)  # (..., d_model)
  return tf.keras.Sequential([expand, project])


**We'll use Decoder layer only**

**Note:**
LayerNormalization is applied *before* each attention and feed-forward sub-layer (pre-norm) rather than after it.

In [8]:
class DecoderLayer(tf.keras.layers.Layer):
  """Pre-norm Transformer decoder layer.

  Three sub-layers are applied in order: masked self-attention, attention
  over the encoder output, and a two-layer feed-forward network. Each one is
  computed as ``x + dropout(sublayer(layer_norm(x)))``: the residual branch
  carries the *un-normalised* input, as in the standard pre-norm formulation.

  Inputs and outputs are sequence-first; they are transposed to batch-first
  internally because ``MultiHeadAttention`` reads the batch size from axis 0.
  """

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    """Create the sub-layers.

    Args:
      d_model: model width (attention projections and FFN output).
      num_heads: number of attention heads.
      dff: hidden width of the feed-forward network.
      rate: dropout rate used by every dropout in this layer.
    """
    super(DecoderLayer, self).__init__()

    self.self_attn = MultiHeadAttention(d_model = d_model, num_heads=num_heads)
    self.mha = MultiHeadAttention(d_model=d_model, num_heads=num_heads)

    # Feed-forward network: Dense(dff, relu) -> dropout -> Dense(d_model)
    self.ffn1 = tf.keras.layers.Dense(dff, activation='relu')
    self.dropout = tf.keras.layers.Dropout(rate)
    self.ffn2 = tf.keras.layers.Dense(d_model)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)

  def call(self, tgt, enc_output, training=True,
           look_ahead_mask=None, padding_mask=None):
    """Run one decoder layer.

    Args:
      tgt: target tensor, transposed with perm [1, 0, 2] on entry.
      enc_output: encoder tensor, transposed with perm [1, 0, 2] on entry.
      training: forwarded to every dropout layer.
      look_ahead_mask: mask for the self-attention (non-zero = blocked),
        or ``None``.
      padding_mask: accepted for interface compatibility.
        NOTE(review): it is currently not applied to either attention.

    Returns:
      Tensor with the same axis order as the incoming ``tgt``.
    """
    # Switch to batch-first for the attention layers.
    tgt = tf.transpose(tgt, [1, 0, 2])
    enc_output = tf.transpose(enc_output, [1, 0, 2])

    # LayerNorm first, then masked self-attention; the residual uses the
    # un-normalised tgt.
    tgt2 = self.layernorm1(tgt)
    print(f"TGT:{tgt2.shape}")
    tgt2 = self.self_attn(tgt2, tgt2, tgt2, mask=look_ahead_mask)
    tgt = tgt + self.dropout1(tgt2, training=training)

    # LayerNorm first, then attention over the encoder output.
    tgt2 = self.layernorm2(tgt)
    print(f"TGT:{tgt2.shape}, ENC: {enc_output.shape}")
    tgt2 = self.mha(tgt2, enc_output, enc_output, mask=None)
    print(f"target: {tgt.shape}, enc_op:{enc_output.shape}")
    tgt = tgt + self.dropout2(tgt2, training=training)

    # LayerNorm first, then the feed-forward network.
    tgt2 = self.layernorm3(tgt)
    tgt2 = self.ffn2(self.dropout(self.ffn1(tgt2), training=training))
    tgt = tgt + self.dropout3(tgt2, training=training)

    # Back to the caller's sequence-first layout.
    return tf.transpose(tgt, [1, 0, 2])

In [9]:
class Decoder(tf.keras.layers.Layer):
  """Stack of ``num_layers`` ``DecoderLayer`` blocks applied one after another."""

  def __init__(self, num_layers, d_model, num_heads, dff, rate=0.1):
    """Create the layer stack.

    Args:
      num_layers: number of ``DecoderLayer`` blocks.
      d_model, num_heads, dff, rate: forwarded to every ``DecoderLayer``.
    """
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                       for _ in range(num_layers)]
    # Created for attribute compatibility; it is not applied in call().
    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, enc_output, training=None,
           look_ahead_mask=None, padding_mask=None):
    """Pass ``x`` through every decoder layer in order.

    ``training`` and both masks now default to ``None`` (matching the
    optional arguments of ``DecoderLayer.call``), so the stack can be called
    without them; callers that pass them are unaffected.

    Args:
      x: target tensor handed to the first layer.
      enc_output: encoder tensor handed to every layer.
      training: forwarded to every layer.
      look_ahead_mask: forwarded to every layer.
      padding_mask: forwarded to every layer.

    Returns:
      The output of the last decoder layer.
    """
    for layer in self.dec_layers:
      x = layer(x, enc_output, training, look_ahead_mask, padding_mask)

    # Each DecoderLayer transposes on entry and transposes back on exit, so x
    # keeps the axis order it arrived in.
    return x


In [10]:
%%time
sample_decoder = Decoder(num_layers=2, d_model=512, num_heads=8, 
                         dff=2048)

# Sequence-first dummy inputs: (sequence length, batch size, d_model)
tmp_memory = tf.random.uniform((60, 64, 512), dtype=tf.float32, minval=0, maxval=200)
tmp_tgt = tf.random.uniform((62, 64, 512), dtype=tf.float32, minval=0, maxval=200)

# Look-ahead mask: a 1 marks a blocked position, because
# scaled_dot_product_attention adds mask * -1e9 to the logits. Only the
# strictly-upper triangle (future positions) may be blocked; including the
# diagonal would stop every token from attending to itself and would leave
# the first row fully masked.
tgt_mask = 1 - tf.linalg.band_part(tf.ones([62, 62]), -1, 0)

output = sample_decoder(tmp_tgt, 
                        enc_output=tmp_memory, 
                        training=False,
                        look_ahead_mask=tgt_mask, 
                        padding_mask=None
                    )

output.shape

TGT:(64, 62, 512)
Q:(64, 8, 62, 64), K:(64, 8, 62, 64), V:(64, 8, 62, 64)
TGT:(64, 62, 512), ENC: (64, 60, 512)
Q:(64, 8, 62, 64), K:(64, 8, 60, 64), V:(64, 8, 60, 64)
target: (64, 62, 512), enc_op:(64, 60, 512)
TGT:(64, 62, 512)
Q:(64, 8, 62, 64), K:(64, 8, 62, 64), V:(64, 8, 62, 64)
TGT:(64, 62, 512), ENC: (64, 60, 512)
Q:(64, 8, 62, 64), K:(64, 8, 60, 64), V:(64, 8, 60, 64)
target: (64, 62, 512), enc_op:(64, 60, 512)
CPU times: user 124 ms, sys: 7.5 ms, total: 132 ms
Wall time: 209 ms


In [11]:
class WordAndPositionalEmbedding(tf.keras.layers.Layer):
    r"""
    Keras layer producing learned word embeddings plus learned position
    embeddings for caption tokens. Each token is mapped to a word embedding
    and to a positional embedding selected by its index in the caption. The
    two are summed, layer-normalized and passed through dropout; finally the
    vectors at ``padding_idx`` positions are multiplied by zero.

    Parameters
    ----------
    vocab_size: int
        Size of token vocabulary.
    hidden_size: int
        Size of token embedding vectors.
    max_caption_length: int, optional (default = 30)
        Number of rows in the positional embedding lookup table; captions
        must not be longer than this.
    rate: float, optional (default = 0.0)
        Dropout rate applied after layer normalization.
    padding_idx: int, optional (default = 0)
        Token index treated as padding; output vectors at positions holding
        this index are zeroed.
        NOTE(review): the word embedding is built with ``mask_zero=True``
        independently of this value -- confirm that is intended when
        ``padding_idx != 0``.
    """

    def __init__(self, 
                 vocab_size: int, 
                 hidden_size:int, 
                 max_caption_length: int = 30, 
                 rate:float = 0.0,
                 padding_idx: int=0):
        super(WordAndPositionalEmbedding, self).__init__()

        self.vocab_size = vocab_size
        self.padding_idx = padding_idx

        self.words = L.Embedding(vocab_size, hidden_size, mask_zero=True)

        self.positions = L.Embedding(max_caption_length, hidden_size)
        self.layer_norm = L.LayerNormalization(
            epsilon=1e-8,
        )

        self.dropout = L.Dropout(rate = rate)


    def call(self, tokens):
        r"""
        Get combined word and positional embeddings for input tokens.

        Parameters
        ----------
        tokens: tf.Tensor
            Integer tensor of shape ``(batch_size, caption_length)`` holding
            a batch of caption token indices.

        Returns
        -------
        tf.Tensor
            Tensor of shape ``(batch_size, caption_length, hidden_size)``
            with the token embeddings; padding positions are all zeros.
        """
        print(tokens.shape)
        position_indices = self._create_position_indices(tokens)
        print(position_indices.shape)

        word_embeddings = self.words(tokens)
        positional_embeddings = self.positions(position_indices)

        embeddings = self.layer_norm(word_embeddings + positional_embeddings)
        embeddings = self.dropout(embeddings)

        # Zero out the embeddings of padding tokens.
        token_mask = tf.expand_dims(tokens != self.padding_idx, -1)
        embeddings = embeddings * tf.cast(token_mask, dtype=embeddings.dtype)
        return embeddings

    def _create_position_indices(self, tokens):
        r"""
        Return a tensor shaped like ``tokens`` whose every row is
        ``0, 1, ..., caption_length - 1``.

        The shape is read with ``tf.shape`` rather than ``tokens.shape`` so
        this also works when the batch dimension is not statically known
        (for example inside a traced function).
        """
        token_shape = tf.shape(tokens)
        positions = tf.cast(tf.range(token_shape[1]), tokens.dtype)
        # shape: (batch_size, caption_length)
        return tf.broadcast_to(tf.expand_dims(positions, 0), token_shape)

In [12]:
class TextualHead(tf.keras.layers.Layer):
    r"""
    Common base class for textual heads. It is a plain Keras layer that only
    records the vocabulary size and hidden size, so every head exposes them
    (and :attr:`textual_feature_size`) uniformly.
    """
    def __init__(self, vocab_size: int, hidden_size: int):
        super().__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

    @property
    def textual_feature_size(self):
        r"""
        Width of the features this head works with. It is simply
        :attr:`hidden_size`; other modules read it to size the layers they
        stack on top of (or feed into) the head.
        """
        return self.hidden_size


class LinearTextualHead(TextualHead):
    r"""
    Textual head containing a single linear layer projecting from textual
    feature size to output vocabulary size.
    """
    def __init__(
        self,
        vocab_size: int,
        hidden_size: int,
    ):
        # Was ``hidden_Size`` (NameError as soon as the class is instantiated).
        super(LinearTextualHead, self).__init__(vocab_size, hidden_size)
        # ``output`` is a read-only property on Keras layers and cannot be
        # assigned, so the projection is stored as ``outputL`` -- the same
        # attribute name TransformerTextualHead uses.
        self.outputL = L.Dense(vocab_size)

    def call(self,
             caption_tokens,
             caption_lengths,
             visual_features):
        r"""
        Project ``visual_features`` to vocabulary logits.

        ``caption_tokens`` and ``caption_lengths`` are accepted so the call
        signature matches the other textual heads; they are not used.
        """
        output_logits = self.outputL(visual_features)
        return output_logits


In [13]:
class TransformerTextualHead(TextualHead):
    r"""
    Textual head that embeds caption tokens, decodes them with a stack of
    Transformer decoder layers attending over the visual features, and
    projects the result to vocabulary logits.

    Parameters
    ----------
    vocab_size: int
        Size of token vocabulary (also the width of the output logits).
    hidden_size: int
        Model width of the embeddings and decoder layers.
    num_layers: int
        Number of decoder layers.
    attention_heads: int
        Number of attention heads per decoder layer.
    feedforward_size: int
        Hidden width of the feed-forward network in each decoder layer.
    dropout: float, optional (default = 0.1)
        Dropout rate for the embedding and the decoder layers.
    norm_type: str, optional (default = "pre")
        Accepted for interface compatibility; currently unused.
    padding_idx: int, optional (default = 0)
        Token index of the padding token; forwarded to the embedding layer.
    max_caption_length: int, optional (default = 30)
        Size of the positional embedding table.
    """
    def __init__(
        self,
        vocab_size: int,
        hidden_size:int,
        num_layers: int,
        attention_heads: int,
        feedforward_size: int,
        dropout: float = 0.1,
        norm_type:str="pre",
        padding_idx: int=0,
        max_caption_length: int = 30
    ):
        super().__init__(vocab_size, hidden_size)
        self.num_layers = num_layers
        self.attention_heads =attention_heads
        self.feedforward_size = feedforward_size
        self.dropout = dropout
        self.padding_idx = padding_idx

        # padding_idx is forwarded so the embedding zeroes the same token
        # index this head was configured with (previously it always fell
        # back to the embedding's own default).
        self.embedding = WordAndPositionalEmbedding(
            self.vocab_size,
            self.textual_feature_size,
            max_caption_length = max_caption_length,
            rate = dropout,
            padding_idx = padding_idx,
        )

        # Despite the attribute name, this is the decoder stack.
        self.encoder = Decoder(self.num_layers, self.textual_feature_size,
            self.attention_heads,
            dff = self.feedforward_size,
            rate = dropout)

        # Named ``outputL`` because ``output`` is a read-only Keras property.
        # NOTE: the output weights are not tied to the word embedding.
        self.outputL = L.Dense(vocab_size)

    def call(self,
             caption_tokens,
             caption_lengths,
             visual_features,
             training=None
             ):
        r"""
        Compute vocabulary logits for every caption position.

        Parameters
        ----------
        caption_tokens: tf.Tensor
            Integer tensor of shape ``(batch_size, caption_length)``.
        caption_lengths: tf.Tensor
            Integer tensor of shape ``(batch_size,)`` with the number of
            real (non-padding) tokens per caption.
        visual_features: tf.Tensor
            Batch-first tensor ``(batch_size, num_positions, hidden_size)``.
        training: bool, optional
            Forwarded to the decoder stack; ``None`` leaves the decision to
            Keras.

        Returns
        -------
        tf.Tensor
            Logits of shape ``(batch_size, caption_length, vocab_size)``.
        """
        max_caption_length = caption_tokens.shape[1]
        print(max_caption_length)

        # True where the (1-based) position index exceeds the caption length,
        # i.e. at padding positions.
        ones = tf.ones_like(caption_tokens)
        caption_mask = tf.expand_dims(caption_lengths, 1) < tf.cumsum(ones, 1)

        caption_embeddings = self.embedding(caption_tokens)

        unidirectional_mask = self._generate_future_mask(max_caption_length)

        print("cap_vis_mask:", caption_embeddings.shape, visual_features.shape, unidirectional_mask.shape)

        # The decoder stack expects sequence-first inputs.
        caption_embeddings = tf.transpose(caption_embeddings, [1, 0, 2])
        visual_features = tf.transpose(visual_features, [1, 0, 2])
        print(caption_embeddings.shape, visual_features.shape)

        textual_features = self.encoder(caption_embeddings,
                                        visual_features,
                                        training=training,
                                        look_ahead_mask=unidirectional_mask, 
                                        padding_mask=caption_mask)
        # Back to batch-first before projecting to the vocabulary.
        textual_features = tf.transpose(textual_features, [1, 0, 2])
        op = self.outputL(textual_features)

        return op

    def  _generate_future_mask(self, size:int):
        r"""
        Return a ``(size, size)`` look-ahead mask in which entry ``(i, j)``
        is 1 when ``j > i`` (a future position that must be blocked) and 0
        otherwise.

        Only the strictly-upper triangle is set: the attention adds
        ``mask * -1e9`` to its logits, so a 1 on the diagonal would stop
        every token from attending to itself and leave the first row fully
        masked.
        """
        mask = 1.0 - tf.linalg.band_part(
            tf.ones([size, size]), -1, 0
        )
        return mask


## Forward Pass

In [283]:
class CaptioningModel(tf.keras.layers.Layer):
    r"""
    End-to-end forward pass: images go through the visual backbone and a
    dense projection, descriptions are tokenized, and the textual head
    predicts a token index for every caption position.

    Parameters
    ----------
    tokenizer:
        Object providing ``batch_encode_plus`` (a HuggingFace tokenizer).
    textual_head: TransformerTextualHead
        Head called as ``textual_head(tokens, lengths, visual_features)``.
    visual_backbone:
        Callable mapping a batch of images to a channels-last feature map.
    max_caption_length: int, optional (default = 30)
        Captions are truncated / padded to this many tokens.
    rate: float, optional (default = 0.0)
        Accepted for interface compatibility; currently unused.
    """

    def __init__(self, 
                 tokenizer, 
                 textual_head: TransformerTextualHead,
                 visual_backbone,
                 max_caption_length: int = 30, 
                 rate:float = 0.0,
                ):
        super(CaptioningModel, self).__init__()

        self.tokenizer = tokenizer
        self.visual_backbone = visual_backbone
        self.max_caption_length = max_caption_length
        self.textual_head = textual_head
        # Projects backbone channels to the width the textual head expects.
        self.visual_projection = L.Dense(self.textual_head.textual_feature_size)

    def call(self, images, descriptions):
        r"""
        Predict caption tokens.

        Uses only its arguments and the objects stored on ``self``; the
        previous version read the notebook globals ``img``, ``tokenizer`` and
        ``trf`` and hard-coded a batch size of 64 and a length of 50.

        Parameters
        ----------
        images:
            Batch of images accepted by the visual backbone.
        descriptions: list of str
            One caption per image.

        Returns
        -------
        tf.Tensor
            Arg-max token index for every caption position,
            shape ``(batch_size, max_caption_length)``.
        """
        feature_map = self.visual_backbone(images)
        enc = self.tokenizer.batch_encode_plus(descriptions, max_length=self.max_caption_length,truncation=True, 
                                               pad_to_max_length=True,  return_tensors="tf")["input_ids"]

        # (batch_size, height, width, n_channel) -> (batch_size, height * width, n_channel)
        # Flattening the two spatial axes of the channels-last map directly
        # gives the same tensor as the former transpose/reshape/transpose.
        batch_size = tf.shape(feature_map)[0]
        rs = tf.reshape(feature_map, [batch_size, -1, feature_map.shape[-1]])

        # (batch_size, height * width, n_channel) -> (batch_size, height * width, visual features)
        projected = self.visual_projection(rs)

        # Every caption is padded/truncated to max_caption_length, so that is
        # the length reported for each one (as the hard-coded 50 did before).
        caption_lengths = tf.fill([batch_size], self.max_caption_length)
        caps = self.textual_head(enc, caption_lengths, projected)

        preds_tokens = tf.argmax(caps, 2)

        return preds_tokens

        



In [None]:
from transformers import GPT2TokenizerFast, BertTokenizerFast
tokenizer = GPT2TokenizerFast.from_pretrained("distilgpt2")

In [None]:
# Register a dedicated '[PAD]' token so captions can be padded to a fixed
# length (batch_encode_plus is later called with pad_to_max_length=True).
# NOTE(review): this presumably grows the tokenizer vocabulary by one and the
# new token's id is presumably not 0, whereas the embedding layer's
# padding_idx defaults to 0 -- confirm vocab_size / padding_idx downstream.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

### Data prep

In [None]:
!wget https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip
!wget https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip

In [None]:
#!unzip /content/Flickr8k_Dataset.zip
!unzip /content/Flickr8k_text.zip -d Flickr8k_text

In [None]:
import string

# load doc into memory
def load_doc(filename):
    """Return the whole content of the text file ``filename`` as one string.

    The file is opened read-only inside a ``with`` block, so the handle is
    closed even if reading raises.
    """
    with open(filename, 'r') as file:
        return file.read()

# extract descriptions for images
def load_descriptions(doc):
    """Map each image id to the first description found for it.

    Every line of ``doc`` is split on whitespace: the first token is the
    image file name (everything from its first '.' on is dropped to get the
    id) and the remaining tokens, re-joined with single spaces, are the
    description. Only the first description seen for an id is kept.

    Lines shorter than two characters are skipped, as are whitespace-only
    lines (which previously raised ``IndexError``).

    Returns:
        dict mapping image id -> description string.
    """
    mapping = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        # `len(line) < 2` keeps the original filter; `not tokens` additionally
        # guards the tokens[0] access below against whitespace-only lines.
        if len(line) < 2 or not tokens:
            continue
        # remove the file extension (and any '#n' suffix after it) from the id
        image_id = tokens[0].split('.')[0]
        # store the first description for each image
        if image_id not in mapping:
            mapping[image_id] = ' '.join(tokens[1:])
    return mapping

def clean_descriptions(descriptions):
    """Normalise every description in ``descriptions`` in place.

    Each description is split on whitespace; every word is lower-cased and
    stripped of ``string.punctuation`` characters, words left with one
    character or none are dropped (stray 's', 'a', bare punctuation), and
    the remaining words are re-joined with single spaces.

    The dict is modified in place; nothing is returned.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    # Iterate over a snapshot of the keys; only values are replaced.
    for image_id in list(descriptions):
        kept = []
        for raw_word in descriptions[image_id].split():
            word = raw_word.lower().translate(strip_punctuation)
            if len(word) > 1:
                kept.append(word)
        descriptions[image_id] = ' '.join(kept)

# save descriptions to file, one per line
def save_doc(descriptions, filename):
    """Write ``descriptions`` to ``filename``, one ``"<key> <description>"`` per line.

    Lines are joined with '\\n' (no trailing newline). The file is written
    inside a ``with`` block so the handle is flushed and closed even if the
    write raises.
    """
    lines = [key + ' ' + desc for key, desc in descriptions.items()]
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))

# caption file unpacked from Flickr8k_text.zip by the unzip cell above
filename = 'Flickr8k_text/Flickr8k.token.txt'
# read the whole caption file into one string
doc = load_doc(filename)
# build {image_id: first description seen for that image}
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))
# normalise the descriptions in place (lower-case, punctuation and
# one-character tokens removed)
clean_descriptions(descriptions)
# vocabulary = distinct whitespace-separated words over all cleaned descriptions
all_tokens = ' '.join(descriptions.values()).split()
vocabulary = set(all_tokens)
print('Vocabulary Size: %d' % len(vocabulary))
# write one "<image_id> <description>" line per image
save_doc(descriptions, 'descriptions.txt')

In [None]:
!du -shc /content/Flicker8k_Dataset

In [None]:
from tqdm.auto import tqdm

In [None]:
from os import listdir
from pickle import dump
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.layers import Input

# extract features from each photo in the directory
def extract_features(directory):
    """Run every image in ``directory`` through a headless ResNet50.

    Each file is loaded at 224x224, converted to an array, given a leading
    batch axis, preprocessed and passed to ``model.predict``.

    Args:
        directory: path of a folder containing only image files.

    Returns:
        dict mapping image id (file name up to its first '.') to the array
        returned by ``model.predict`` for that single image.
    """
    # load the model
    model = ResNet50(input_shape=(224, 224, 3), include_top=False, weights="imagenet")
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line.
    model.summary()
    # extract features from each photo
    features = dict()
    for name in tqdm(listdir(directory)):
        # load an image from file
        filename = directory + '/' + name
        image = load_img(filename, target_size=(224, 224))
        # convert the image pixels to a numpy array
        image = img_to_array(image)
        # add a leading batch axis: (H, W, C) -> (1, H, W, C)
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        # NOTE(review): preprocess_input is imported from the vgg16 module
        # although the model is ResNet50 -- confirm this is the intended
        # preprocessing for the backbone.
        image = preprocess_input(image)
        # get features
        feature = model.predict(image, verbose=0)
        # get image id
        image_id = name.split('.')[0]
        # store feature
        features[image_id] = feature
    return features

# extract features from all images
directory = 'Flicker8k_Dataset'
features = extract_features(directory)
print('Extracted Features: %d' % len(features))
# save to file; the context manager closes the handle even if pickling fails
with open('features.pkl', 'wb') as features_file:
    dump(features, features_file)

### Data Loading and testing

In [35]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [36]:
import numpy as np
import pandas as pd

In [37]:
import pandas as pd
# Each line of descriptions.txt is "<image_id> <caption>": read it as a single
# column, then split on the first space only. `n` is passed by keyword because
# newer pandas releases no longer accept it positionally.
df = pd.read_csv('/content/drive/My Drive/ImageCaption/descriptions.txt', names=['id'])
df[['id','caption']] = df["id"].str.split(" ", n=1, expand=True)
# drop any leading/trailing '-' characters left in the caption
df["caption"] = df["caption"].str.strip("-")
print(df)

                         id                                            caption
0     1000268201_693b08cb0e  child in pink dress is climbing up set of stai...
1     1001773457_577c3a7d70             black dog and spotted dog are fighting
2     1002674143_1b742ab4b8  little girl covered in paint sits in front of ...
3     1003163366_44323f5815        man lays on bench while his dog sits by him
4     1007129816_e794419615         man in an orange hat starring at something
...                     ...                                                ...
8087   990890291_afc72be141    man does wheelie on his bicycle on the sidewalk
8088    99171998_7cc800ceef             group is sitting around snowy crevasse
8089    99679241_adc853a5c0  grey bird stands majestically on beach while w...
8090   997338199_7343367d7f                    person stands near golden walls
8091   997722733_0cb5439472                 man in pink shirt climbs rock face

[8092 rows x 2 columns]


In [38]:
import os

In [39]:
imgs=os.listdir("/content/drive/My Drive/ImageCaption/Flicker8k_Dataset"); imgs[:10]

['378170167_9b5119d918.jpg',
 '378453580_21d688748e.jpg',
 '379006645_b9a2886b51.jpg',
 '380034515_4fbdfa6b26.jpg',
 '380041023_0dfd712ef1.jpg',
 '380515798_c2abbf46b0.jpg',
 '380527679_574749123d.jpg',
 '380537190_11d6c0a412.jpg',
 '380590140_25b9889772.jpg',
 '381052465_722e00807b.jpg']

In [40]:
!pip install lycon

Collecting lycon
[?25l  Downloading https://files.pythonhosted.org/packages/83/90/b3ff2cdd02dbb0a4ae25f77d6fe8ed8012e7896e59ed59eec78ccf9a92ad/lycon-0.2.0.tar.gz (129kB)
[K     |██▌                             | 10kB 27.6MB/s eta 0:00:01[K     |█████                           | 20kB 2.9MB/s eta 0:00:01[K     |███████▋                        | 30kB 3.9MB/s eta 0:00:01[K     |██████████                      | 40kB 4.3MB/s eta 0:00:01[K     |████████████▋                   | 51kB 3.4MB/s eta 0:00:01[K     |███████████████▏                | 61kB 3.9MB/s eta 0:00:01[K     |█████████████████▋              | 71kB 4.2MB/s eta 0:00:01[K     |████████████████████▏           | 81kB 4.6MB/s eta 0:00:01[K     |██████████████████████▊         | 92kB 4.9MB/s eta 0:00:01[K     |█████████████████████████▏      | 102kB 4.6MB/s eta 0:00:01[K     |███████████████████████████▊    | 112kB 4.6MB/s eta 0:00:01[K     |██████████████████████████████▎ | 122kB 4.6MB/s eta 0:00:01[K     

In [41]:
import lycon

In [194]:
def get_sample(df: pd.DataFrame, img_dir: str=None, batch_size=64):
    """Draw a random batch of (image, description) pairs.

    ``batch_size`` row indices are drawn independently with
    ``np.random.randint`` (so a row may be picked more than once). For each
    one, column 0 of ``df`` is taken as the image id and column 1 as the
    description; ``<img_dir>/<image id>.jpg`` is loaded, resized to 224x224
    and divided by 255.

    Args:
        df: frame whose first column holds image ids and second column
            holds descriptions.
        img_dir: directory containing the ``.jpg`` files.
        batch_size: number of samples to draw.

    Returns:
        Tuple ``(images, descriptions)``: a numpy array stacking the image
        arrays and a list with the matching descriptions.
    """
    n_rows = len(df)
    print(n_rows)

    images, captions = [], []
    for _ in range(batch_size):
        idx = np.random.randint(0, n_rows, 1)
        print(idx)

        # column 0 -> image id, column 1 -> description
        img_id, img_desc = (df.iloc[idx, col].values[0] for col in (0, 1))
        print(img_id, img_desc)

        picture = load_img(os.path.join(img_dir, img_id+".jpg")).resize([224, 224])

        pixels = img_to_array(picture)
        pixels /= 255.0  # divide pixel values by 255

        images.append(pixels)
        captions.append(img_desc)

    return np.array(images), captions

In [195]:
%%time
img, desc = get_sample(df, "/content/drive/My Drive/ImageCaption/Flicker8k_Dataset")

8092
[1319]
2250555512_71670078f5 dark haired man in his twenties drinks green liquid from plastic mug
[102]
111497985_38e9f88856 kid rock climbing against the backdrop of green valley
[6016]
3514184232_b336414040 baseball player is making play nearby large sign and boundary of the field
[5181]
3346289227_198fced308 male on rollerblades skating down railing next to stairs
[2407]
2603690144_7a28b1d13c boy and dog are running down hill away from crowd
[4993]
3308018795_68a97a425c group of bike riders are riding on the street
[6284]
3562282690_cd2a95fe9e black and white dog running on the beach while man stands behind it
[4131]
3124838157_7ef96745b7 man and two women pose outside retail store
[6986]
3713177334_32f3245fd8 group of people gathered around mural in an urban area
[2944]
2788652511_4f10060e07 boy with an orange shirt lies on bodyboard in the surf
[7730]
549520317_af3d5c32eb child in red jacket sitting atop slide
[3781]
3036971334_78187a9570 blond man jumping off cliff into some

### Forward Pass

In [293]:
cap = CaptioningModel(tokenizer, trf, visual_backbone, max_caption_length=50)

In [294]:
trf = TransformerTextualHead(vocab_size=50258, hidden_size=256, num_layers=6, attention_heads=8, feedforward_size=256, max_caption_length=50)

In [297]:
%%time
caps = cap(img, desc)

50
(64, 50)
(64, 50)
cap_vis_mask: (64, 50, 256) (64, 49, 256) (50, 50)
(50, 64, 256) (49, 64, 256)
TGT:(64, 50, 256)
Q:(64, 8, 50, 32), K:(64, 8, 50, 32), V:(64, 8, 50, 32)
TGT:(64, 50, 256), ENC: (64, 49, 256)
Q:(64, 8, 50, 32), K:(64, 8, 49, 32), V:(64, 8, 49, 32)
target: (64, 50, 256), enc_op:(64, 49, 256)
TGT:(64, 50, 256)
Q:(64, 8, 50, 32), K:(64, 8, 50, 32), V:(64, 8, 50, 32)
TGT:(64, 50, 256), ENC: (64, 49, 256)
Q:(64, 8, 50, 32), K:(64, 8, 49, 32), V:(64, 8, 49, 32)
target: (64, 50, 256), enc_op:(64, 49, 256)
TGT:(64, 50, 256)
Q:(64, 8, 50, 32), K:(64, 8, 50, 32), V:(64, 8, 50, 32)
TGT:(64, 50, 256), ENC: (64, 49, 256)
Q:(64, 8, 50, 32), K:(64, 8, 49, 32), V:(64, 8, 49, 32)
target: (64, 50, 256), enc_op:(64, 49, 256)
TGT:(64, 50, 256)
Q:(64, 8, 50, 32), K:(64, 8, 50, 32), V:(64, 8, 50, 32)
TGT:(64, 50, 256), ENC: (64, 49, 256)
Q:(64, 8, 50, 32), K:(64, 8, 49, 32), V:(64, 8, 49, 32)
target: (64, 50, 256), enc_op:(64, 49, 256)
TGT:(64, 50, 256)
Q:(64, 8, 50, 32), K:(64, 8, 50, 3