##**Some Important Dependencies and References**

In the captioning task we use an LSTM (Long Short-Term Memory) network and a CNN (Convolutional Neural Network). We use transfer learning to take advantage of these two pretrained models:

*   **InceptionV3** - https://arxiv.org/abs/1512.00567
*   **Glove** - https://nlp.stanford.edu/projects/glove/

The dataset and the other necessary dependencies are downloaded from the following links:


*   **Glove Embeddings** - https://nlp.stanford.edu/projects/glove/
*   **Flickr8K Dataset** - https://www.kaggle.com/adityajn105/flickr8k/download
*   **Flickr8K Captions** - https://www.kaggle.com/adityajn105/flickr8k?select=captions.txt

After that, the whole dataset is uploaded to Google Drive. The link below points to the dataset and the model architecture:

**Google Drive:** https://drive.google.com/drive/folders/10zWRo3lGhDdGf0sb2qQzlK3fWXcKp775?usp=sharing

**Reference for algorithm that we used:** Andrej Karpathy's Dissertation: https://cs.stanford.edu/people/karpathy/main.pdf



###**1. Install TensorFlow (GPU Build)**

In [1]:
# Installing Tensorflow GPU Version 2.5
 
!pip install tensorflow-gpu
exit(0)

Collecting tensorflow-gpu
[?25l  Downloading https://files.pythonhosted.org/packages/1d/a2/5ccf0a418eb22e0a2ae9edc1e7f5456d0a4b8b49524572897564b4030a9b/tensorflow_gpu-2.5.0-cp37-cp37m-manylinux2010_x86_64.whl (454.3MB)
[K     |████████████████████████████████| 454.3MB 39kB/s 
Installing collected packages: tensorflow-gpu
Successfully installed tensorflow-gpu-2.5.0


###**2. Import required Libraries**

In [1]:
import os
import numpy as np
import string

# Library that expands a wildcard pattern into the list of pathnames matching the pattern
import glob

# For output a smart progress bar by wrapping around any iterable
from tqdm import tqdm

# MobileNet and InceptionV3 from Keras and Tensorflow
from tensorflow.keras.applications import MobileNet
import tensorflow.keras.applications.mobilenet  
from tensorflow.keras.applications.inception_v3 import InceptionV3
import tensorflow.keras.applications.inception_v3

# Pre-processing Image Library
import tensorflow.keras.preprocessing.image

# For serializing and deserializing a Python object structure
import pickle
from time import time

# PIL - Python Imaging Library: For opening, manipulating, and saving images
from PIL import Image

# Importing layers and dependencies
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (LSTM, Embedding, TimeDistributed, Dense, RepeatVector, Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization)
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import Input, layers
from tensorflow.keras import optimizers

# Pad Sequence and Visualise the model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import add
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt

# Caption sentinel tokens and training configuration
# START/STOP are wrapped around every training caption; generation begins
# from START and ends when STOP is predicted.
START = "startseq"
STOP = "endseq"
# Base epoch count used by the training cell (it runs EPOCHS*2 passes, then EPOCHS more)
EPOCHS = 10
# True -> InceptionV3 is used as the image encoder, False -> MobileNet
USE_INCEPTION = True

###**3. Mounting Google Drive**

In [2]:
# Mount Google Drive (Colab only) so the dataset folder under
# /content/gdrive/ is reachable from this runtime.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [3]:
# Change the working directory to the dataset folder on the mounted drive

%cd /content/gdrive/MyDrive/Image_Captioning/Flickr8k Dataset

/content/gdrive/MyDrive/Image_Captioning/Flickr8k Dataset


In [4]:
# Root folder of the captioning dataset on the mounted drive; the caption,
# image, GloVe and cache paths in later cells are all joined onto it.

root_captioning = "/content/gdrive/MyDrive/Image_Captioning/Flickr8k Dataset"

###**4. Pre-processing Data**

In [5]:
# Parse the caption file and clean every caption: lower-case it, strip
# punctuation, and drop one-character and non-alphabetic tokens.
#
# Produces:
#   lookup     - dict: image id (file name without extension) -> list of cleaned captions
#   max_length - length in words of the longest cleaned caption
#   lex        - set of every distinct word appearing in the cleaned captions

null_punct = str.maketrans('', '', string.punctuation)
lookup = dict()
max_length = 0

with open(os.path.join(root_captioning, 'Flickr8k_captions',
                       'Flickr8k.token.txt'), 'r') as fp:
  for line in fp:
    tok = line.split()
    # A usable line needs an image token plus at least one caption word.
    # Checking the token count (rather than the character count of the line)
    # also skips whitespace-only lines, where tok[0] would raise IndexError.
    if len(tok) < 2:
      continue

    # "<name>.jpg#<n>" -> "<name>"
    image_id = tok[0].split('.')[0]

    # Cleanup description
    desc = []
    for word in tok[1:]:
      word = word.lower().translate(null_punct)
      if len(word) > 1 and word.isalpha():
        desc.append(word)
    max_length = max(max_length, len(desc))

    lookup.setdefault(image_id, []).append(' '.join(desc))

# Collect the vocabulary of all cleaned captions
lex = set()
for captions in lookup.values():
  for caption in captions:
    lex.update(caption.split())

In [6]:
# Stats on the dataset downloaded and processed

# Number of distinct image ids that have captions
print(len(lookup))

# Vocabulary size: number of distinct words across all cleaned captions
print(len(lex))

# Maximum length of a caption in words
print(max_length) 

8092
8763
32


###**5. Listing the Image Files**

In [7]:
# Full paths of every .jpg file in the dataset's image folder
img = glob.glob(os.path.join(root_captioning,'Flickr8k_images', '*.jpg'))

In [8]:
# How many images we have
print(len(img))

8091


###**6. Read all image names and use the predefined train/test sets**

In [9]:
# Read the predefined train/test splits (one image file name per line) and
# partition the image files found on disk accordingly.
#
# train_images / test_images - sets of file names listed in the split files
# train_img / test_img       - file names that are listed AND present on disk

train_images_path = os.path.join(root_captioning,
            'Flickr8k_captions','Flickr_8k.trainImages.txt')
# Context managers make sure the split files are closed again
with open(train_images_path, 'r') as split_file:
  train_images = set(split_file.read().strip().split('\n'))

test_images_path = os.path.join(root_captioning,
            'Flickr8k_captions','Flickr_8k.testImages.txt')
with open(test_images_path, 'r') as split_file:
  test_images = set(split_file.read().strip().split('\n'))

train_img = []
test_img = []

for image_path in img:
  file_name = os.path.basename(image_path)
  if file_name in train_images:
    train_img.append(file_name)
  elif file_name in test_images:
    test_img.append(file_name)

In [10]:
# Splitting Dataset status and size

print(len(train_images))
print(len(test_images))

6000
1000


###**7. Build the Sequences**

In [11]:
# Include a start and stop token at the beginning/end of every training caption
# We will later use the start token to begin the process of generating a caption
# Encountering the stop token in the generated text will let us know the process is complete
#
# New caption lists are built here instead of editing the lists stored in
# `lookup` in place: that keeps `lookup` untouched and makes this cell safe to
# re-run (an in-place edit would wrap the captions in the tokens a second time).

train_descriptions = {
    k: [f'{START} {caption} {STOP}' for caption in captions]
    for k, captions in lookup.items()
    if f'{k}.jpg' in train_images
}

In [12]:
len(train_descriptions)

6000

###**8. Transfer Learning with a Computer Vision Neural Network for Building the Model**

In [13]:
# GloVe is used for the text embedding; a pretrained ImageNet CNN
# (InceptionV3 by default, MobileNet otherwise) extracts the image features.
# This cell fixes the encoder, its input size, the flattened feature length
# and the matching preprocessing function.

if USE_INCEPTION:
  # Full InceptionV3, cut off at its second-to-last layer
  inception = InceptionV3(weights='imagenet')
  encode_model = Model(inception.input, inception.layers[-2].output)
  WIDTH, HEIGHT = 299, 299
  # InceptionV3 has 2048 features
  OUTPUT_DIM = 2048
  preprocess_input = tensorflow.keras.applications.inception_v3.preprocess_input
else:
  encode_model = MobileNet(weights='imagenet',include_top=False)
  WIDTH, HEIGHT = 224, 224
  # MobileNet has 50K features; this will increase processing time
  OUTPUT_DIM = 50176
  preprocess_input = tensorflow.keras.applications.mobilenet.preprocess_input

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels.h5


In [14]:
# Model Summary
encode_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 299, 299, 3) 0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 149, 149, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 149, 149, 32) 96          conv2d[0][0]                     
__________________________________________________________________________________________________
activation (Activation)         (None, 149, 149, 32) 0           batch_normalization[0][0]        
______________________________________________________________________________________________

###**9. Creating Training Set**

In [15]:
def encodeImage(img):
  """Encode a PIL image into a flat feature vector with the CNN encoder.

  Args:
    img: PIL image to encode.

  Returns:
    1-D numpy array of length OUTPUT_DIM with the encoder's output for the
    image.
  """
  # Resize all images to a standard size (specified by the image encoding network).
  # Image.LANCZOS is the filter that the old Image.ANTIALIAS name referred to;
  # ANTIALIAS itself no longer exists in Pillow >= 10.
  img = img.resize((WIDTH, HEIGHT), Image.LANCZOS)

  # Convert a PIL image to a numpy array
  x = tensorflow.keras.preprocessing.image.img_to_array(img)

  # Add a leading batch axis so the single image forms a batch of one
  x = np.expand_dims(x, axis=0)

  # Perform any preprocessing needed by InceptionV3 or others
  x = preprocess_input(x)

  # Run the encoder and flatten its output into the vector shape expected
  # by the captioning network's image input
  x = encode_model.predict(x)
  x = np.reshape(x, OUTPUT_DIM )
  return x

###**10. Training the Model**

In [16]:
# Formatted time string
def hms_string(sec_elapsed):
    """Format a duration in seconds as ``H:MM:SS.ss``.

    Args:
        sec_elapsed: elapsed time in seconds (int or float).

    Returns:
        String with whole hours, zero-padded whole minutes, and seconds
        zero-padded to two integer digits and two decimals.
    """
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    return f"{hours}:{minutes:>02}:{seconds:>05.2f}"

In [17]:
# Encode every training image once and cache the result on disk; later runs
# just load the cache. `encoding_train` maps image file name -> feature vector.
train_path = os.path.join(root_captioning,"data",f'train{OUTPUT_DIM}.pkl')
if not os.path.exists(train_path):
  start = time()
  encoding_train = {}
  # Loop variables are deliberately not called `img` / `id`: `img` is the
  # global list of image paths built earlier (rebinding it here would break a
  # re-run of the train/test split cell) and `id` is a Python builtin.
  for file_name in tqdm(train_img):
    image_path = os.path.join(root_captioning,'Flickr8k_images', file_name)
    pil_image = tensorflow.keras.preprocessing.image.load_img(image_path, target_size=(HEIGHT, WIDTH))
    encoding_train[file_name] = encodeImage(pil_image)
  # Make sure the cache folder exists before writing into it
  os.makedirs(os.path.dirname(train_path), exist_ok=True)
  with open(train_path, "wb") as fp:
    pickle.dump(encoding_train, fp)
  print(f"\nGenerating training set took: {hms_string(time()-start)}")
else:
  # NOTE: pickle.load can execute arbitrary code; only load cache files that
  # this notebook wrote itself.
  with open(train_path, "rb") as fp:
    encoding_train = pickle.load(fp)

###**11. Similar process for Test Images**

In [18]:
# Encode every test image once and cache the result on disk; later runs just
# load the cache. `encoding_test` maps image file name -> feature vector.
test_path = os.path.join(root_captioning,"data",f'test{OUTPUT_DIM}.pkl')
if not os.path.exists(test_path):
  start = time()
  encoding_test = {}
  # Loop variables are deliberately not called `img` / `id`: `img` is the
  # global list of image paths built earlier (rebinding it here would break a
  # re-run of the train/test split cell) and `id` is a Python builtin.
  for file_name in tqdm(test_img):
    image_path = os.path.join(root_captioning,'Flickr8k_images', file_name)
    pil_image = tensorflow.keras.preprocessing.image.load_img(image_path, \
                target_size=(HEIGHT, WIDTH))
    encoding_test[file_name] = encodeImage(pil_image)
  # Make sure the cache folder exists before writing into it
  os.makedirs(os.path.dirname(test_path), exist_ok=True)
  with open(test_path, "wb") as fp:
    pickle.dump(encoding_test, fp)
  print(f"\nGenerating testing set took: {hms_string(time()-start)}")
else:
  # NOTE: pickle.load can execute arbitrary code; only load cache files that
  # this notebook wrote itself.
  with open(test_path, "rb") as fp:
    encoding_test = pickle.load(fp)

In [19]:
# Flatten the per-image caption lists into one list of training captions.
# Training has two sides, the images and the captions; this is the caption side.
all_train_captions = [
    caption
    for captions in train_descriptions.values()
    for caption in captions
]
len(all_train_captions)

30000

In [20]:
# Keep only the words that occur at least ten times in the training captions
# and report how much the vocabulary shrinks.

word_count_threshold = 10

# Occurrence count of every token (captions are split on single spaces)
word_counts = {}
for sent in all_train_captions:
    for w in sent.split(' '):
        if w in word_counts:
            word_counts[w] += 1
        else:
            word_counts[w] = 1
nsents = len(all_train_captions)

# Dict iteration keeps first-seen order, so vocab order is deterministic
vocab = [w for w, count in word_counts.items() if count >= word_count_threshold]
print('preprocessed words %d ==> %d' % (len(word_counts), len(vocab)))

preprocessed words 7578 ==> 1651


In [21]:
# idxtoword maps an index number to its word;
# wordtoidx performs the opposite lookup (word -> index number).
# Indices start at 1, so index 0 is never assigned to a word.

wordtoidx = {w: ix for ix, w in enumerate(vocab, start=1)}
idxtoword = {ix: w for w, ix in wordtoidx.items()}

# One more than the number of words, to account for the unused index 0
vocab_size = len(idxtoword) + 1
vocab_size

1652

In [22]:
# Maximum length of a caption in tokens, including the START and STOP tokens.
#
# The value is recomputed from the cleaned captions instead of doing
# `max_length += 2`, which would grow by two every time this cell is re-run.
# START/STOP are excluded from the word count so the result is the same
# whether or not some captions in `lookup` already carry those tokens.

max_length = 2 + max(
    (sum(1 for w in caption.split() if w not in (START, STOP))
     for captions in lookup.values()
     for caption in captions),
    default=0,
)
print(max_length)

34


###**12. Data Generator**

In [23]:
# Flickr8K has several captions per picture; every caption of every photo is
# turned into training samples here.

def data_generator(descriptions, photos, wordtoidx, \
                   max_length, num_photos_per_batch, vocab_size=None):
  """Endlessly yield training batches for the captioning model.

  For every caption, one sample is produced per prefix: the input is the
  photo's feature vector plus the first i words (padded to max_length), and
  the target is the one-hot encoded word i+1.

  Args:
    descriptions: dict mapping image id (without '.jpg') -> list of captions.
    photos: dict mapping image file name ('<id>.jpg') -> feature vector.
    wordtoidx: dict mapping word -> integer index; words missing from it are
      skipped.
    max_length: length the input word sequences are padded to.
    num_photos_per_batch: number of photos whose samples form one batch.
    vocab_size: number of classes for the one-hot targets. Defaults to
      len(wordtoidx) + 1, so the generator no longer depends on a global
      variable of that name.

  Yields:
    ([photo_features, padded_sequences], one_hot_next_words) as numpy arrays.
  """
  if vocab_size is None:
    vocab_size = len(wordtoidx) + 1

  # x1 - Training data for photos
  # x2 - The partial caption that goes with each photo
  # y - The next word of the caption (one-hot)
  x1, x2, y = [], [], []
  n=0
  while True:
    for key, desc_list in descriptions.items():
      n+=1
      photo = photos[key+'.jpg']
      for desc in desc_list:
        # Convert the caption into a list of word indices.
        seq = [wordtoidx[word] for word in desc.split(' ') \
               if word in wordtoidx]
        # Generate a training case for every possible prefix and next word
        for i in range(1, len(seq)):
          in_seq, out_seq = seq[:i], seq[i]
          in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
          out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
          x1.append(photo)
          x2.append(in_seq)
          y.append(out_seq)
      if n==num_photos_per_batch:
        yield ([np.array(x1), np.array(x2)], np.array(y))
        # Start collecting the next batch
        x1, x2, y = [], [], []
        n=0

###**13. Loading Glove Directory Path**

In [24]:
# Load the 200-dimensional GloVe vectors into `embeddings_index`
# (word -> float32 numpy vector).
glove_dir = os.path.join(root_captioning,'glove.6B')
embeddings_index = {}

# The context manager closes the file even if parsing a line fails
with open(os.path.join(glove_dir, 'glove.6B.200d.txt'), encoding="utf-8") as glove_file:
    for line in tqdm(glove_file):
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print(f'Found {len(embeddings_index)} word vectors.')

400000it [00:23, 16961.47it/s]

Found 400000 word vectors.





###**14. Building the Neural Network**

In [25]:
# Build an embedding matrix from GloVe.
# This matrix is later copied directly into the weights of the network's
# embedding layer.

embedding_dim = 200

# One 200-dim row per word index in our vocabulary (vocab_size rows in total)
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for token, row in wordtoidx.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        # Words not found in the embedding index keep their all-zero row
        continue
    embedding_matrix[row] = glove_vector

In [26]:
# Matrix Dimensions: It is 1652 (the size of the vocabulary) by 200 (number of features Glove generated for each word)
embedding_matrix.shape

(1652, 200)

In [27]:
# Captioning network with two inputs that are merged before the output layer.

# Image branch: encoded image vector -> dropout -> 256-unit dense layer
inputs1 = Input(shape=(OUTPUT_DIM,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# Text branch: padded sequence of word indices -> embedding -> dropout -> LSTM.
# mask_zero=True treats index 0 as padding (word indices start at 1).
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)
# Merge both 256-dim branches by element-wise addition, then predict a
# softmax distribution over the vocabulary for the next word
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
caption_model = Model(inputs=[inputs1, inputs2], outputs=outputs)

In [28]:
embedding_dim

200

In [29]:
caption_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 34)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 2048)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 34, 200)      330400      input_3[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 2048)         0           input_2[0][0]                    
____________________________________________________________________________________________

In [30]:
# Load the GloVe matrix into the embedding layer and freeze it so training
# does not change the pretrained vectors.
# The layer is looked up by type rather than by the positional index
# `layers[2]`, which silently points at a different layer if the layer
# ordering of the model ever changes.
embedding_layer = next(
    layer for layer in caption_model.layers if isinstance(layer, Embedding)
)
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False
caption_model.compile(loss='categorical_crossentropy', optimizer='adam')

###**15. Train the Neural Network**

In [31]:
# Number of photos whose captions are packed into one generator batch
number_pics_per_bath = 3
# Number of generator batches needed to go over all training photos once
steps = len(train_descriptions)//number_pics_per_bath

In [32]:
# Train the captioning model in two phases and cache the weights on disk;
# if cached weights already exist they are loaded instead of retraining.
model_path = os.path.join(root_captioning,"data",'caption-model.hdf5')
if not os.path.exists(model_path):
  # Phase 1: EPOCHS*2 passes with the optimizer's initial learning rate.
  # A fresh generator is created for every pass. Model.fit accepts generators
  # directly; fit_generator is deprecated.
  for i in tqdm(range(EPOCHS*2)):
      generator = data_generator(train_descriptions, encoding_train, wordtoidx, max_length, number_pics_per_bath)
      caption_model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)

  # Phase 2: EPOCHS more passes with a lower learning rate and larger batches.
  # `learning_rate` is the optimizer's actual hyper-parameter name (`lr` is
  # only a legacy alias). The phase-2 batch settings live in their own names
  # so the globals used by phase 1 are not changed by running this cell.
  caption_model.optimizer.learning_rate = 1e-4
  fine_tune_pics_per_batch = 4
  fine_tune_steps = len(train_descriptions)//fine_tune_pics_per_batch

  for i in range(EPOCHS):
      generator = data_generator(train_descriptions, encoding_train, wordtoidx, max_length, fine_tune_pics_per_batch)
      caption_model.fit(generator, epochs=1, steps_per_epoch=fine_tune_steps, verbose=1)

  # Make sure the cache folder exists before writing the weights
  os.makedirs(os.path.dirname(model_path), exist_ok=True)
  caption_model.save_weights(model_path)
else:
  caption_model.load_weights(model_path)

###**16. Generating Captions**

In [33]:
def generateCaption(photo):
    """Greedily generate a caption for one encoded photo.

    Starting from the START token, the model is asked for the most likely
    next word, which is appended to the text and fed back in, until STOP is
    predicted or max_length words have been generated.

    Args:
        photo: encoded image, shaped as a batch of one for the model's
            image input.

    Returns:
        The generated caption as a space-separated string, without the
        START/STOP tokens.
    """
    words = [START]
    for _ in range(max_length):
        sequence = [wordtoidx[w] for w in words if w in wordtoidx]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = caption_model.predict([photo,sequence], verbose=0)
        # Index 0 is the padding slot and has no word; .get() turns that case
        # into a clean stop instead of a KeyError.
        word = idxtoword.get(int(np.argmax(yhat)))
        if word is None or word == STOP:
            break
        words.append(word)
    # Only START is stripped here. STOP is never appended, so a caption that
    # reaches max_length without STOP keeps its last real word.
    return ' '.join(words[1:])

###**17. Evaluate Performance on Test Data from Flickr8k**

In [34]:
import random

# Show up to 25 randomly chosen images together with their generated captions.
# NOTE(review): the section heading mentions test data, but the samples are
# drawn from `encoding_train` - confirm whether `encoding_test` was intended.
#
# Sampling the keys directly (instead of indices from a hard-coded
# range(0, 6000)) works for any number of encoded images and avoids
# rebuilding the key list on every iteration.
sample_pics = random.sample(list(encoding_train), min(25, len(encoding_train)))

for pic in sample_pics:
  image = encoding_train[pic].reshape((1,OUTPUT_DIM))
  # Print the same path that is actually read below
  image_path = os.path.join(root_captioning,'Flickr8k_images', pic)
  print(image_path)
  x=plt.imread(image_path)
  plt.imshow(x)
  plt.show()
  print("Caption:",generateCaption(image))
  print("_____________________________________")

Output hidden; open in https://colab.research.google.com to view.