**STEP 1) CONNECTING GOOGLE COLAB TO GOOGLE DRIVE**

In [None]:
from google.colab import drive

# Google Drive is attached under this path; every Drive path used later in the
# notebook starts with it.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


**STEP 2) IMPORTING LIBRARIES**

In [None]:
from pickle import load
import datetime, os
import matplotlib.pyplot as plt
import numpy as np
import string
import glob
from tensorflow.keras.layers import add, LSTM, Embedding, Dense, Dropout
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import Input
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint,EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from nltk.translate.bleu_score import corpus_bleu
from time import time

**STEP 3) SETTING UP WORKING PLATFORM**

In [None]:
# Skip this code block if you have already cloned the github repo in your GDrive


# Change working directory to your Google Drive
os.chdir('/content/drive/MyDrive/')

# Cloning the repository in your Google Drive.
# If you are doing inference right after doing training then no need to clone as during training process, this GitHub repo is cloned.
!git clone https://github.com/malayjoshi13/Describer.git

# STOP!

Before moving ahead, open this link: https://drive.google.com/drive/folders/13YJjbA-iMBkM6NOEbdYpJs56q4JQ9FQx?usp=sharing. Save a shortcut to this folder (it is used later in this script) inside the "Describer" directory that was just created in your Google Drive by the command above. Then move forward.

In [None]:
# Work from inside the cloned repository so that the relative paths used in the
# rest of the notebook resolve against it.
REPO_DIR = '/content/drive/MyDrive/Describer/'
os.chdir(REPO_DIR)

# Folder (inside the "Describer" directory in your Google Drive) from which the
# trained model and its companion files are fetched. Two options:
#     "temporary"                (evaluate a model you trained yourself)
#     "default_model_checkpoint" (evaluate the default pre-trained model)
chkp_location = "./default_model_checkpoint/"

**STEP 4) SOME PARAMETERS TO BE USED HERE**

In [None]:
# Hyper-parameters of the captioning model rebuilt later in this notebook.
# They have to agree with the saved weights that are loaded into it.
max_length=34
vocab_size=1652
embedding_dim=200
image_feature_shape=(2048,)


def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards.

    NOTE(review): pickle can execute arbitrary code while loading. Only point
    this at checkpoint files you trust (your own training output or the
    project's shared folder).
    """
    with open(path, 'rb') as handle:
        return load(handle)


# word -> integer index mapping
filename = chkp_location+'word_to_index.pkl'
word_to_index = _load_pickle(filename)

# integer index -> word mapping
filename1 = chkp_location+'index_to_word.pkl'
index_to_word = _load_pickle(filename1)

# embedding matrix passed as the initial weights of the Embedding layer
filename2 = chkp_location+'all_captions_GLOVE_embedding.pkl'
embedding_matrix = _load_pickle(filename2)

# Reading names of testing images present in "TestImagesName.txt" file and saving them to "text_content" variable
with open('dataset/TestImagesName.txt', 'r') as names_file:
    text_content = names_file.read()

# Reading all image encodings from "all_images_encodings.pkl" file and storing it into "image_content" variable
image_content = _load_pickle(chkp_location+'all_images_encodings.pkl')

**STEP 5) EVALUATING THE TRAINED MODEL**

**STEP 5.1) Reading names of test images present in "TestImagesName.txt"**

In [None]:
# "test_images_name" holds one entry per non-empty line of "text_content"
# (the contents of "TestImagesName.txt"): the part of the line before its
# first '.', i.e. the image name without the file extension.
test_images_name = [
    entry.split('.')[0]
    for entry in text_content.split('\n')
    if len(entry) >= 1
]

**STEP 5.2) Separating encodings of test images from total images encodings**

In [None]:
# For every name in "test_images_name" (built in step 5.1), look up its encoding
# in "image_content" and store the pair in "test_images_encodings":
#     key   -> name of the test image
#     value -> image_content[name], the encoding stored for that image
test_images_encodings = dict()
for image_name in test_images_name:
    test_images_encodings[image_name] = image_content[image_name]

**STEP 5.3) Separating test captions from captions corresponding to total images**

In [None]:
# Build "test_captions": image name -> list of its captions, keeping only the
# images whose names are listed in "test_images_name".
# Each non-empty line of "processed_captions.txt" is split on whitespace; the
# first token is the image name and the remaining tokens form one caption.

test_captions = dict()

# Set membership is O(1); testing against the list would rescan it for every caption line.
test_image_ids = set(test_images_name)

with open(chkp_location+'processed_captions.txt', 'r') as f:
    for line in f:
        bits = line.split()
        if not bits:
            # blank line (e.g. a trailing newline at the end of the file): nothing to parse
            continue
        idd, caps_token = bits[0], bits[1:]
        if idd in test_image_ids:
            test_captions.setdefault(idd, list()).append(' '.join(caps_token))
# This "test_captions" dictionary having test_images_name and their captions pairs looks like:
# {'1000268201_693b08cb0e': ['startseq child in pink dress....entry way endseq', 'startseq girl....building endseq', 'startseq little girl....endseq',.....], '1001773457_577c3a7d70': ['startseq black dog and spotted dog are fighting endseq',...}

**STEP 5.4) Re-initializing the model and loading saved weights to the model**

In [None]:
# Rebuild the captioning network and load the saved weights into it.
# NOTE(review): the layers below presumably have to mirror the architecture used
# during training so that the weights file fits -- confirm against the training notebook.

# image-encoding pipeline: image feature vector -> dropout -> 256-unit dense layer
inputs1 = Input(shape=image_feature_shape) # image_feature_shape is set in STEP 4
layer1 = Dropout(0.5)(inputs1)
layer2 = Dense(256, activation='relu')(layer1)

# caption-pipeline: sequence of max_length word indices -> frozen embedding
# (initialised from embedding_matrix, index 0 masked) -> dropout -> 256-unit LSTM
inputs2 = Input(shape=(max_length,))
layerA = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False, mask_zero=True)(inputs2)
layerB = Dropout(0.5)(layerA)
layerC = LSTM(256)(layerB)

# decoder (feed forward) model: the two 256-dimensional branches are summed
# element-wise, then mapped to a softmax over the vocab_size words
merging_point = add([layer2, layerC])
activator = Dense(256, activation='relu')(merging_point)
outputs = Dense(vocab_size, activation='softmax')(activator)

trained_model = Model(inputs=[inputs1, inputs2], outputs=outputs)

# load the weights stored in "weights/best.h5" under the chosen checkpoint folder
trained_model.load_weights(chkp_location+'weights/best.h5')

# The "best.h5" file holds the weights that the model learned during training.
# In the evaluation below, the model receives the encoding of a test image and
# predicts, one word at a time, a caption for it.
# The predicted caption is then compared with the captions already available for
# that image in the test dataset, and the closeness of the two is reported as
# BLEU scores.

**STEP 5.5) Initiating evaluation/test process**<br><br>

--> Evaluating using Greedy Search

In [None]:
def greedy_search(test_img_encodings):
    """Caption every test image with greedy decoding and collect BLEU inputs.

    For each image in the global `test_captions`, the caption starts as
    "startseq" and is extended with the single most probable next word
    predicted by the global `trained_model`, until "endseq" is predicted or
    `max_length` words have been generated.

    Args:
        test_img_encodings: mapping from image name to the image encoding fed
            to the model; must contain every key of `test_captions`.

    Returns:
        (actual, predicted), index-aligned over the test images:
        actual[i] is the list of tokenised reference captions of image i and
        predicted[i] is the tokenised generated caption. The "startseq" and
        "endseq" markers are removed from both, so the pair can be passed
        directly to nltk's `corpus_bleu`.
    """
    actual, predicted = list(), list()

    for idd, cap_list in test_captions.items():
        # Use the encodings passed by the caller (not a module-level global);
        # the encoding is the same for every decoding step, so build it once.
        image_encoding = np.array([test_img_encodings[idd]])
        words = ['startseq']

        for _ in range(max_length):
            # Map the words generated so far to their integer indices.
            hints_of_caption = [word_to_index[word] for word in words if word in word_to_index]
            # Pad (or truncate) to exactly max_length entries. pad_sequences pads
            # with zeros on the left by default.
            # NOTE(review): beam_search pads with padding='post'; confirm which
            # convention was used during training.
            padded_hints_of_caption = pad_sequences([hints_of_caption], maxlen=max_length)

            # Probability distribution over the vocabulary for the next word.
            yhat = trained_model.predict([image_encoding, np.array(padded_hints_of_caption)], verbose=0)

            # Greedy choice: keep only the most probable word.
            word = index_to_word[np.argmax(yhat)]
            words.append(word)
            if word == 'endseq':
                break

        # Drop the leading "startseq". Drop the trailing "endseq" only when it was
        # actually produced: if decoding stopped because max_length was reached,
        # the last word is a real word and must be kept.
        if words[-1] == 'endseq':
            words = words[:-1]
        predicted.append(words[1:])

        # Tokenise the references the same way as the prediction, i.e. without the
        # "startseq"/"endseq" markers, so that they do not distort BLEU's length penalty.
        references = [[token for token in d.split() if token not in ('startseq', 'endseq')]
                      for d in cap_list]
        actual.append(references)

    # Return only after ALL test images have been processed.
    return actual, predicted

--> Evaluating using Beam Search

In [None]:
def beam_search(beam_index, test_img_encodings):
    """Caption every test image with beam search and collect BLEU inputs.

    For each image in the global `test_captions`, `beam_index` partial captions
    are kept at every step. Each one is extended with its `beam_index` most
    probable next words (according to the global `trained_model`) and only the
    `beam_index` best-scoring extensions survive. Decoding stops when the
    captions reach `max_length` tokens; the best-scoring caption is kept and cut
    at its first "endseq".

    Args:
        beam_index: number of candidate captions kept at every step.
        test_img_encodings: mapping from image name to the image encoding fed
            to the model; must contain every key of `test_captions`.

    Returns:
        (actual, predicted), index-aligned over the test images:
        actual[i] is the list of tokenised reference captions of image i and
        predicted[i] is the tokenised generated caption. The "startseq" and
        "endseq" markers are removed from both, so the pair can be passed
        directly to nltk's `corpus_bleu`.
    """
    actual, predicted = list(), list()

    for idd, cap_list in test_captions.items():
        # The image encoding is the same for every decoding step: build it once.
        image_encoding = np.array([test_img_encodings[idd]])

        # Each beam entry is [list of word indices, accumulated score].
        start = [word_to_index["startseq"]]
        start_word = [[start, 0.0]]
        while len(start_word[0][0]) < max_length:
            temp = []
            for s in start_word:
                # NOTE(review): greedy_search relies on pad_sequences' default
                # left padding while this uses padding='post'; confirm which
                # convention was used during training.
                par_caps = pad_sequences([s[0]], maxlen=max_length, padding='post')
                preds = trained_model.predict([image_encoding, np.array(par_caps)], verbose=0)
                # Getting the top <beam_index>(n) predictions and creating a
                # new list so as to put them via the model again
                word_preds = np.argsort(preds[0])[-beam_index:]
                for w in word_preds:
                    next_cap, prob = s[0][:], s[1]
                    next_cap.append(w)
                    # NOTE(review): the score is the SUM of word probabilities,
                    # not the sum of log-probabilities used by textbook beam
                    # search. Kept as is so that results stay comparable.
                    prob += preds[0][w]
                    temp.append([next_cap, prob])

            # Sort ascending by score and keep the <beam_index> best candidates
            # (the best one ends up last).
            start_word = sorted(temp, reverse=False, key=lambda l: l[1])
            start_word = start_word[-beam_index:]

        best_indices = start_word[-1][0]
        intermediate_caption = [index_to_word[i] for i in best_indices]

        # Keep the words up to (excluding) the first "endseq" ...
        final_caption = []
        for token in intermediate_caption:
            if token == 'endseq':
                break
            final_caption.append(token)
        # ... and drop the leading "startseq".
        final_caption = final_caption[1:]

        # These appends must run once PER IMAGE, i.e. inside the loop; otherwise
        # only the last image would be scored.
        predicted.append(final_caption)

        # Tokenise the references the same way as the prediction, i.e. without the
        # "startseq"/"endseq" markers, so that they do not distort BLEU's length penalty.
        references = [[token for token in d.split() if token not in ('startseq', 'endseq')]
                      for d in cap_list]
        actual.append(references)

    return actual, predicted

**STEP 5.6) TRAINED MODEL's PERFORMANCE --> BLEU scores**

--> Evaluating using Greedy Search

In [None]:
# Caption the test images with greedy search, then report corpus-level BLEU
# for n-gram orders 1 to 4.
actual_list, predicted_list = greedy_search(test_images_encodings)

# (output template, n-gram weights handed to corpus_bleu)
# NOTE(review): the BLEU-3 weights sum to 0.9 rather than 1.
bleu_reports = [
    ('BLEU-1: %f', (1.0, 0, 0, 0)),
    ('BLEU-2: %f', (0.5, 0.5, 0, 0)),
    ('BLEU-3: %f', (0.3, 0.3, 0.3, 0)),
    ('BLEU-4: %f', (0.25, 0.25, 0.25, 0.25)),
]
for template, ngram_weights in bleu_reports:
    print(template % corpus_bleu(actual_list, predicted_list, weights=ngram_weights))

BLEU-1: 0.795413
BLEU-2: 0.666974
BLEU-3: 0.581795
BLEU-4: 0.399388


--> Evaluating using Beam Search

In [None]:
# Caption the test images with beam search (beam width 3), then report
# corpus-level BLEU for n-gram orders 1 to 4.
actual_list1, predicted_list1 = beam_search(3, test_images_encodings)

# (output template, n-gram weights handed to corpus_bleu)
# NOTE(review): the BLEU-3 weights sum to 0.9 rather than 1.
bleu_reports = [
    ('BLEU-1: %f', (1.0, 0, 0, 0)),
    ('BLEU-2: %f', (0.5, 0.5, 0, 0)),
    ('BLEU-3: %f', (0.3, 0.3, 0.3, 0)),
    ('BLEU-4: %f', (0.25, 0.25, 0.25, 0.25)),
]
for template, ngram_weights in bleu_reports:
    print(template % corpus_bleu(actual_list1, predicted_list1, weights=ngram_weights))