In [2]:
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt

from glob import glob
import numpy as np 

from PIL import Image
import pickle
import time

In [4]:
# Model settings
NAME = 'COCO' # Flicker30k
IMAGE_MODEL = 'vgg16'
BATCH_SIZE = 64
BUFFER_SIZE = 1000
embedding_dim = 256
units = 512
top_k = 10000
vocab_size = top_k + 1
num_steps = 400 if NAME == 'COCO' else 160 
features_shape = 512 if IMAGE_MODEL == 'vgg16' else 2048 #2048 for inception_v3
attention_features_shape = 81 if IMAGE_MODEL == 'vgg16' else 64 #64 for inception_v3
max_length = 45 if NAME == 'COCO' else 75

In [5]:
# Load the numpy files
def load_data(batch, data='COCO', im_model='vgg16'):
    if data == 'COCO':
        root = '/Users/mamu867/PNNL_Mac/Springboard/image_caption_generator/data/raw/COCO/train_vectors/'
    else:
        root = '/Users/mamu867/PNNL_Mac/Springboard/image_caption_generator/data/raw/Flickr30k/flickr30k_images/train_vectors/'
    query = root + '*_{}_*_X_y_{:04d}_*'.format(im_model, batch)
    f_path = glob(query)
    data = np.load(f_path[0], allow_pickle=True)[()]
    return data['X'].numpy(), data['y']

In [7]:
class CNN_Encoder(tf.keras.Model):
    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        self.fc = tf.keras.layers.Dense(embedding_dim)
    def call(self, x):
        x = self.fc(x)
        x = tf.nn.relu(x)
        return x

In [8]:
class RNN_Decoder(tf.keras.Model):
    def __init__(self, embedding_dim, units, vocab_size):
        super(RNN_Decoder, self).__init__()
        self.units = units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
        self.fc1 = tf.keras.layers.Dense(self.units)
        self.fc2 = tf.keras.layers.Dense(vocab_size)

    def call(self, x, features):
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(x)
        x = self.fc1(output)
        x = tf.reshape(x, (-1, x.shape[2]))
        x = self.fc2(x)
        return x, state, attention_weights
    def reset_state(self, batch_size):
        return tf.zeros((batch_size, self.units))

In [9]:
encoder = CNN_Encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

In [10]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

In [11]:
checkpoint_path = "/Users/mamu867/PNNL_Mac/Springboard/image_caption_generator/data/processed/COCO/checkpoints/train"
ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

In [12]:
start_epoch = 0
if ckpt_manager.latest_checkpoint:
    start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
    ckpt.restore(ckpt_manager.latest_checkpoint)

In [13]:
with open('/Users/mamu867/PNNL_Mac/Springboard/image_caption_generator/data/interim/COCO/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [14]:
@tf.function
def train_step(img_tensor, target):
    loss = 0
    hidden = decoder.reset_state(batch_size=target.shape[0])
    dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * target.shape[0], 1)
    
    with tf.GradientTape() as tape:
        features = encoder(img_tensor)
        for i in range(1, target.shape[1]):
            predictions, hidden, _ = decoder(dec_input, features, hidden)
            loss += loss_function(target[:, i], predictions)
            dec_input = tf.expand_dims(target[:, i], 1)

    total_loss = (loss / int(target.shape[1]))
    trainable_variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, trainable_variables)
    optimizer.apply_gradients(zip(gradients, trainable_variables))
    return loss, total_loss



In [None]:
loss_plot = []

In [15]:
EPOCHS = 20
for epoch in range(start_epoch, EPOCHS):
    start = time.time()
    total_loss = 0
    for batch in range(1, num_steps+1):
        img_tensor, target = load_data(batch, data='COCO')
        batch_loss, t_loss = train_step(img_tensor, target)
        total_loss += t_loss

        if batch % 100 == 0:
            average_batch_loss = batch_loss.numpy()/int(target.shape[1])
            print(f'Epoch {epoch+1} Batch {batch} Loss {average_batch_loss:.4f}')
    # storing the epoch end loss value to plot later
    loss_plot.append(total_loss / num_steps)

    if epoch % 5 == 0:
      ckpt_manager.save()

    print(f'Epoch {epoch+1} Loss {total_loss/num_steps:.6f}')
    print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')

Epoch 1 Batch 100 Loss 2.3629
Epoch 1 Batch 200 Loss 2.4344
Epoch 1 Batch 300 Loss 1.7938
Epoch 1 Batch 400 Loss 2.6674


NameError: name 'loss_plot' is not defined