In [1]:
import numpy as np
import tensorflow as tf

from transformer import *

from transformers import GPT2Tokenizer, BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [2]:
visual_backbone = tf.keras.applications.ResNet50(input_shape=(224, 224, 3), include_top=False, weights="imagenet")

In [3]:
%%time
sample_decoder = Decoder(num_layers=2, d_model=512, num_heads=8, 
                         dff=2048)

tmp_memory = tf.random.uniform((60, 64, 512), dtype=tf.float32, minval=0, maxval=200)
tmp_tgt = tf.random.uniform((62, 64, 512), dtype=tf.float32, minval=0, maxval=200)

tgt_mask = tf.linalg.band_part(tf.ones([62, 62]), 0, -1)

output = sample_decoder(tmp_tgt, 
                        enc_output=tmp_memory, 
                        training=False,
                        look_ahead_mask=tgt_mask, 
                        padding_mask=None
                    )

output.shape

TGT:(64, 62, 512)
Q:(64, 8, 62, 64), K:(64, 8, 62, 64), V:(64, 8, 62, 64)
TGT:(64, 62, 512), ENC: (64, 60, 512)
Q:(64, 8, 62, 64), K:(64, 8, 60, 64), V:(64, 8, 60, 64)
target: (64, 62, 512), enc_op:(64, 60, 512)
TGT:(64, 62, 512)
Q:(64, 8, 62, 64), K:(64, 8, 62, 64), V:(64, 8, 62, 64)
TGT:(64, 62, 512), ENC: (64, 60, 512)
Q:(64, 8, 62, 64), K:(64, 8, 60, 64), V:(64, 8, 60, 64)
target: (64, 62, 512), enc_op:(64, 60, 512)
Wall time: 1.55 s


TensorShape([62, 64, 512])

In [4]:
img = tf.random.uniform([1, 224,224,3]); img.shape

TensorShape([1, 224, 224, 3])

In [5]:
ps = visual_backbone(img); ps.shape
ps = tf.transpose(ps, [0, 3, 1, 2]); ps.shape

TensorShape([1, 2048, 7, 7])

In [6]:
rs = tf.reshape(ps, [ps.shape[0], ps.shape[1], -1])
rs = tf.transpose(rs, [0, 2, 1]); rs.shape

TensorShape([1, 49, 2048])

In [7]:
projected = L.Dense(256)(rs); projected.shape

TensorShape([1, 49, 256])

In [8]:
enc = tokenizer.encode("hello Hi, How is it going", max_length=49, pad_to_max_length=True,  return_tensors="tf")

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [9]:
from textual_heads import TransformerTextualHead

In [10]:
trf = TransformerTextualHead(vocab_size=28996, hidden_size=256, num_layers=6, attention_heads=8, feedforward_size=256, max_caption_length=50)

In [11]:
projected.shape; enc.shape

TensorShape([1, 49])

In [12]:
%%time
caps = trf(enc, [50], projected, training=False)

49
(1, 49)
(1, 49)
cap_vis_mask: (1, 49, 256) (1, 49, 256) (49, 49)
(49, 1, 256) (49, 1, 256)
TGT:(1, 49, 256)
Q:(1, 8, 49, 32), K:(1, 8, 49, 32), V:(1, 8, 49, 32)
TGT:(1, 49, 256), ENC: (1, 49, 256)
Q:(1, 8, 49, 32), K:(1, 8, 49, 32), V:(1, 8, 49, 32)
target: (1, 49, 256), enc_op:(1, 49, 256)
TGT:(1, 49, 256)
Q:(1, 8, 49, 32), K:(1, 8, 49, 32), V:(1, 8, 49, 32)
TGT:(1, 49, 256), ENC: (1, 49, 256)
Q:(1, 8, 49, 32), K:(1, 8, 49, 32), V:(1, 8, 49, 32)
target: (1, 49, 256), enc_op:(1, 49, 256)
TGT:(1, 49, 256)
Q:(1, 8, 49, 32), K:(1, 8, 49, 32), V:(1, 8, 49, 32)
TGT:(1, 49, 256), ENC: (1, 49, 256)
Q:(1, 8, 49, 32), K:(1, 8, 49, 32), V:(1, 8, 49, 32)
target: (1, 49, 256), enc_op:(1, 49, 256)
TGT:(1, 49, 256)
Q:(1, 8, 49, 32), K:(1, 8, 49, 32), V:(1, 8, 49, 32)
TGT:(1, 49, 256), ENC: (1, 49, 256)
Q:(1, 8, 49, 32), K:(1, 8, 49, 32), V:(1, 8, 49, 32)
target: (1, 49, 256), enc_op:(1, 49, 256)
TGT:(1, 49, 256)
Q:(1, 8, 49, 32), K:(1, 8, 49, 32), V:(1, 8, 49, 32)
TGT:(1, 49, 256), ENC: (1, 49, 2

In [13]:
tokenizer.batch_decode(tf.argmax(caps, 2))

['Owens Owensament Owenstectec Owens Owens Owenstectectectectectectectectectectec Owens Owens Owens Owens Owens Owens Owens Owens Owens Owens Owens Owens Owens Owens Owens Owens Owens Owens Owens Owens Owens Owens Owens Owens Owens Owens Owens Owens Owens']