Side note: all the cells until the `Pipelining Preprocessing Steps` are similar to the cells in `...phase_1.ipynb` file, as we've worked on preprocessing steps to be used in phase 2 when writing the code for phase 1

# Cells for Google Colab

In [1]:
import os
runningFromColab = False
if 'CGROUP_MEMORY_EVENTS' in os.environ and 'colab' in os.environ['CGROUP_MEMORY_EVENTS']:
  runningFromColab = True

In [2]:
if runningFromColab:
  from google.colab import drive
  drive.mount('/content/drive')

In [3]:
if runningFromColab:
  %cd /content/drive/MyDrive/ColabProjects

In [4]:
if runningFromColab:
  !git clone https://github.com/OdyAsh/nlp-image-captioning.git

In [5]:
if runningFromColab:
  %cd /content/drive/MyDrive/ColabProjects/nlp-image-captioning

In [6]:
if runningFromColab:
  !git pull
  # if it DOES NOT say "Already up to date.", then you need to close this notebook file (i.e., the browser tab) and open it again for it to change 

In [7]:
# if runningFromColab:
#   try:
#     import condacolab
#     condacolab.install()
#   except:
#     !pip install -q condacolab
#     import condacolab
#     condacolab.install()
#     # now restart the kernel

In [8]:
# if runningFromColab:
#   !conda env create -f environment.yml
#   # !conda update conda -y -q
#   # !source /usr/local/etc/profile.d/conda.sh
#   # !conda init 
#   # !conda install -n root _license -y -q
#   # !source activate myenv

In [9]:
# if runningFromColab:
#   import sys
#   sys.path.insert(0, '/usr/local/bin/conda')

# Imports & Global Functions/Variables

In [10]:
from pprint import pprint
from glob import glob
from time import time
import os
import pickle
import regex as re
import string
import nltk
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector,\
#                          Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
# from tensorflow.keras.layers import Bidirectional
# from tensorflow.keras.layers import Add # merge.add
from tensorflow.keras.applications import inception_v3 # inception_v3.preprocess_input
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras import preprocessing # preprocessing.image, preprocessing.sequence, preprocessing.text.Tokenizer, preprocessing.sequence.pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras import Input, layers
from tensorflow.keras import optimizers
from tensorflow.keras.utils import to_categorical

# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.autograd import Variable

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
def pklSave(contentToBeSaved, fullPath):
    with open(fullPath, 'wb') as f:
        pickle.dump(contentToBeSaved, f)

def pklLoad(fullPath):
    with open(fullPath, 'rb') as f:
        content = pickle.load(f)
    return content

def pklForceLoad(path, dtype = 'dict'):
    try:
        content = pklLoad(path)
        return content
    except Exception as e:
        if dtype == 'list':
            pklSave([], path)
            return []
        else:
            pklSave({}, path)
            return {}

# more about naming standards for path components here: https://stackoverflow.com/questions/2235173/what-is-the-naming-standard-for-path-components
def joinPaths(baseDirectory, relativePath):
    return os.path.normpath(os.path.join(baseDirectory, relativePath))

In [13]:
datasetImgsBasePath = 'dataset/Flicker8k_Dataset/'
fullImgsPath = glob(datasetImgsBasePath + '*.jpg')
fullImgsPaths = [os.path.normpath(path) for path in fullImgsPath]
len(fullImgsPaths)

8091

# Data Collection
The dataset is obtained from [here](https://forms.illinois.edu/sec/1713398)

In [14]:
# checking the 5 captions per image
filename = "dataset/Flicker8k_TextFiles/Flickr8k.token.txt"
with open(filename, 'r') as f:
    doc = f.read()
lines = doc.split('\n')
print('first image\'s captions:')
pprint(lines[:5])
print('\nattention image\'s captions:')
pprint(lines[5:10])
print('\nand so forth...')

first image's captions:
['1000268201_693b08cb0e.jpg#0\tA child in a pink dress is climbing up a set of '
 'stairs in an entry way .',
 '1000268201_693b08cb0e.jpg#1\tA girl going into a wooden building .',
 '1000268201_693b08cb0e.jpg#2\tA little girl climbing into a wooden playhouse '
 '.',
 '1000268201_693b08cb0e.jpg#3\tA little girl climbing the stairs to her '
 'playhouse .',
 '1000268201_693b08cb0e.jpg#4\tA little girl in a pink dress going into a '
 'wooden cabin .']

attention image's captions:
['1001773457_577c3a7d70.jpg#0\tA black dog and a spotted dog are fighting',
 '1001773457_577c3a7d70.jpg#1\tA black dog and a tri-colored dog playing with '
 'each other on the road .',
 '1001773457_577c3a7d70.jpg#2\tA black dog and a white dog with brown spots '
 'are staring at each other in the street .',
 '1001773457_577c3a7d70.jpg#3\tTwo dogs of different breeds looking at each '
 'other on the road .',
 '1001773457_577c3a7d70.jpg#4\tTwo dogs on pavement moving toward each other .']

an

The captions above are for these two images:

<img src="project_media/1000268201_693b08cb0e.jpg" width="100" />

<img src="project_media/1001773457_577c3a7d70.jpg" width="150" />

# Data Cleaning
Includes:
* `imgToCaptions` dictionary
* `cleanCaptions()` to remove stopwords/punctuations
* `createVocab()` to limit vocab size based on word frequency
* `train`, `val`, and `test` `ImgToCaptions` which prepends `startseq` and appends `endseq`

In [15]:
# getting these captions in a dictionary; 
# where the key is the image's name (without .jpg) and the value is a list of 5 captions

imgToCaptions = dict()
for line in lines:
    idAndCaption = re.split("\..+\t", line)
    if len(idAndCaption) < 2:
        continue
    imgId, caption = idAndCaption
    if imgId not in imgToCaptions:
        imgToCaptions[imgId] = list()
    imgToCaptions[imgId].append(caption)
    
imgToCaptions['1000268201_693b08cb0e']

['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

In [16]:
# removing punctuation using maketrans (i.e., translation table)
# more about maketrans method: https://www.w3schools.com/python/ref_string_maketrans.asp#:~:text=The%20third%20parameter%20in%20the%20mapping%20table%20describes%20characters%20that%20you%20want%20to%20remove%20from%20the%20string%3A


def cleanCaptions(imgToCaptions, levelOfStopwordsPresence=1):
    table = str.maketrans('', '', string.punctuation) # third argument: removes any character in this list: '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    for key, descList in imgToCaptions.items():
        # when this for loop is done, all 5 captions of an image will be cleaned
        for i in range(len(descList)):
            desc = descList[i]
            desc = desc.split(' ')
            desc = [word.lower() for word in desc]
            desc = [word.translate(table) for word in desc] # remove punctuation from each token
            stopswordsToRemove = []
            if levelOfStopwordsPresence == 1:
                stopswordsToRemove = ['a', 'an', 'the']
            elif levelOfStopwordsPresence >= 2:
                stopswordsToRemove = set(stopwords.words('english'))
            desc = [word for word in desc if word not in stopswordsToRemove]
            desc = [word for word in desc if word.isalpha()] # remove tokens with numbers in them
            descList[i] =  ' '.join(desc) # store as string

# cleanCaptions(imgToCaptions, levelOfStopwordsPresence=1)
# pklSave(imgToCaptions, 'dataset/pickles/imgToCaptionsSWKept.pickle')
imgToCaptions = pklLoad('dataset/pickles/imgToCaptionsSWKept.pickle')
imgToCaptions['1000268201_693b08cb0e']

['child in pink dress is climbing up set of stairs in entry way',
 'girl going into wooden building',
 'little girl climbing into wooden playhouse',
 'little girl climbing stairs to her playhouse',
 'little girl in pink dress going into wooden cabin']

example with stopwords removed:
<br><br>
'little girl climbing stairs playhouse',

<br>
example with only ['a', 'an', 'the'] removed:
<br><br>
'little girl climbing stairs to her playhouse',
<br><br>
from the lack of context seen above, we've decided to keep the rest of the stopwords

In [17]:
# creating vocab of unique words (where each word occured at least freqThreshold number of times)
def createVocab(imgToCaptions, freqThreshold = 10):
    vocab = set()
    for key in imgToCaptions.keys():
        [vocab.update(desc.split()) for desc in imgToCaptions[key]]
    print(f'Original vocabulary (i.e., unique words) size: {len(vocab)}')

    # keeping words that appear at least freqThrehold number of times
    vocabWordFreq = {key: 0 for key in vocab}
    for key, descs in imgToCaptions.items():
        for desc in descs:
            descList = desc.split(' ')
            for word in descList:
                if word != '':
                    vocabWordFreq[word] += 1
    
    vocab = set()
    i = 0
    for word, freq in vocabWordFreq.items():
        if freq >= freqThreshold:
            i += 1
            vocab.add(word)
            vocabWordFreq[word] = freq
    print(f'Vocabulary size after removing less frequent words (< {freqThreshold} words): {len(vocab)}')

    vocabWordFreqRemoved = {word: freq for word, freq in vocabWordFreq.items() if word not in vocabWordFreq}

    return vocab, vocabWordFreq, vocabWordFreqRemoved

def createVocabTxtFiles(vocabWordFreq, vocabWordFreqRemoved, filePrefix="vocabFreqThreshold"):
    with open(f'dataset/{filePrefix}.txt', 'w') as f:
        f.write(str(dict(sorted(vocabWordFreq.items(), key=lambda x: x[1], reverse=True))))
    with open(f'dataset/{filePrefix}Removed.txt', 'w') as f:
        f.write(str(dict(sorted(vocabWordFreqRemoved.items(), key=lambda x: x[1], reverse=True))))

freqThreshold = 5
vocab, vocabWordFreq, vocabWordFreqRemoved = createVocab(imgToCaptions, freqThreshold)
createVocabTxtFiles(vocabWordFreq, vocabWordFreqRemoved, filePrefix=f"vocabFreqThreshold{freqThreshold}")

Original vocabulary (i.e., unique words) size: 8366
Vocabulary size after removing less frequent words (< 5 words): 2945


In [18]:
# function to get filenames of images from a text file (without the extension)
def getImgsIdsList(txtPath):
    with open(txtPath, 'r') as f:
        doc = f.read()
    ImgsIds = []
    for line in doc.split('\n'):
        imgId = line.split('.')[0]
        ImgsIds.append(imgId)
    ImgsIds = [id for id in ImgsIds if id != '']
    return ImgsIds

trainImgsIds = getImgsIdsList('dataset/Flicker8k_TextFiles/Flickr_8k.trainImages.txt')
valImgsIds = getImgsIdsList('dataset/Flicker8k_TextFiles/Flickr_8k.devImages.txt')
testImgsIds = getImgsIdsList('dataset/Flicker8k_TextFiles/Flickr_8k.testImages.txt')
print(f'Train Dataset: {len(trainImgsIds)}')
print(f'Validation Dataset: {len(valImgsIds)}')
print(f'Test Dataset: {len(testImgsIds)}')

Train Dataset: 6000
Validation Dataset: 1000
Test Dataset: 1000


In the code below, we use `startseq` and `endseq` for the following reasons:
* startseq : Will indicate the start of the caption generation process
* endseq : to stop predicting words as soon as it appears

In [19]:
trainImgToCaptions = dict()
valImgToCaptions = dict()
testImgToCaptions = dict()
for imgId in trainImgsIds:
    trainImgToCaptions[imgId] = ['startseq ' + desc + ' endseq' for desc in imgToCaptions[imgId]]
for imgId in valImgsIds:
    valImgToCaptions[imgId] = ['startseq ' + desc + ' endseq' for desc in imgToCaptions[imgId]]
for imgId in testImgsIds:
    testImgToCaptions[imgId] = ['startseq ' + desc + ' endseq' for desc in imgToCaptions[imgId]]
print(f'images in training set: {len(trainImgToCaptions)}\n')
print(f'images in validation set: {len(valImgToCaptions)}\n')
print(f'images in testing set: {len(testImgToCaptions)}\n')
print('example from training set:')
pprint(trainImgToCaptions['2513260012_03d33305cf'])

images in training set: 6000

images in validation set: 1000

images in testing set: 1000

example from training set:
['startseq black dog is running after white dog in snow endseq',
 'startseq black dog chasing brown dog through snow endseq',
 'startseq two dogs chase each other across snowy ground endseq',
 'startseq two dogs play together in snow endseq',
 'startseq two dogs running through low lying body of water endseq']


# Preparing File Paths

In [20]:
trainImgsPaths = [joinPaths(datasetImgsBasePath, imgId)+'.jpg' for imgId in trainImgToCaptions.keys()]
valImgsPaths = [joinPaths(datasetImgsBasePath, imgId)+'.jpg' for imgId in valImgToCaptions.keys()]
testImgsPaths = [joinPaths(datasetImgsBasePath, imgId)+'.jpg' for imgId in testImgToCaptions.keys()]
len(trainImgsPaths), len(valImgsPaths), len(testImgsPaths)

(6000, 1000, 1000)

# Data Pre-Processing
Includes:
* Pre-Processing Images
* Pre-Processing Captions

## Pre-Processing Images
Includes:
* Loading Google's `InceptionV3` model
* Preprocessing the image
* Encoding the image by inputting it to `InceptionV3` to get a `2048` feature vector of the image

In [21]:
# getting the feature vector of each image using the InceptionV3 CNN model created by Google Research
inception_model = InceptionV3(weights='imagenet') # getting the InceptionV3 model trained on imagenet data
inception_model.layers[-1].output

<KerasTensor: shape=(None, 1000) dtype=float32 (created by layer 'predictions')>

In [22]:
modelForFeatureExtraction = Model(inception_model.input, inception_model.layers[-2].output) # removing the last layer (output softmax layer)
modelForFeatureExtraction.layers[-1].output

<KerasTensor: shape=(None, 2048) dtype=float32 (created by layer 'avg_pool')>

In [23]:
# function to preprocess the input image
def preprocess(imgPath):
    pilImg = preprocessing.image.load_img(imgPath, target_size=(299, 299)) # Convert all the images to size 299x299 as expected by the inception v3 model
    x = preprocessing.image.img_to_array(pilImg) # Convert PIL image to numpy array of 3-dimensions
    x = np.expand_dims(x, axis=0) # Add one more dimension; from (299, 299, 3) to (1, 299, 299, 3)
    x = inception_v3.preprocess_input(x) # takes in (batch_size, height, width, channels), returns same dimensions, but does some preprocessing operations, like scaling values to be from -1 to 1
    return x

# function to encode a given image (from its path) into a vector of size (2048, )
def encode(imgPath, modelForFeatureExtraction):
    imgPath = preprocess(imgPath) # preprocess the image
    featureVec = modelForFeatureExtraction.predict(imgPath) # Get the encoding vector for the image
    featureVec = np.reshape(featureVec, featureVec.shape[1]) # reshape from (1, 2048) to (2048, )
    return featureVec

In [24]:
# Call the funtion to encode all the train images (dictionary where an image id --> feature vector of length 2048)
# This will take a while on CPU - Execute this only once (took around 13 minutes on my high-end laptop)
def encodeImgToFeatures(imgsPaths, modelForFeatureExtraction):
    imgToFeatures = dict()
    for imgPath in imgsPaths:
        imgToFeatures[imgPath[len(datasetImgsBasePath):]] = encode(imgPath, modelForFeatureExtraction)
    return imgToFeatures

# trainImgToFeatures = encodeImgToFeatures(trainImgsPaths, modelForFeatureExtraction)
# valImgToFeatures = encodeImgToFeatures(valImgsPaths, modelForFeatureExtraction)
# testImgToFeatures = encodeImgToFeatures(testImgsPaths, modelForFeatureExtraction)
# pklSave(trainImgToFeatures, 'dataset/pickles/trainImgToFeatures.pickle')
# pklSave(valImgToFeatures, 'dataset/pickles/valImgToFeatures.pickle')
# pklSave(testImgToFeatures, 'dataset/pickles/testImgToFeatures.pickle')
trainImgToFeatures = pklLoad('dataset/pickles/trainImgToFeatures.pickle')
valImgToFeatures = pklLoad('dataset/pickles/valImgToFeatures.pickle')
testImgToFeatures = pklLoad('dataset/pickles/testImgToFeatures.pickle')
len(trainImgToFeatures), len(valImgToFeatures), len(testImgToFeatures), trainImgToFeatures['2513260012_03d33305cf.jpg'].shape

(6000, 1000, 1000, (2048,))

## Pre-Processing Captions
Includes:
* `mapIdxAndWord()` to map indices to words and vice versa, where the words are obtained from `createVocab()`
* `maxCaptionLength()` to get the caption with the most amount of words, to be used later to pad input sequences <br> (explained in `Preparing Model Generator` section)

In [25]:
# creating two dictionaries: word to index, and index to word

def mapIdxAndWord(vocab):
    idxToWord = {}
    wordToIdx = {}
    idx = 1
    for word in vocab:
        wordToIdx[word] = idx
        idxToWord[idx] = word
        idx += 1
    return idxToWord, wordToIdx

vocab, _, _ = createVocab(imgToCaptions, freqThreshold=5)
idxToWord, wordToIdx = mapIdxAndWord(vocab)
vocabSize = len(idxToWord) + 1 # one for appended 0's; represents "startseq" (explained in "Preparing Model Generator" section)
vocabSize

Original vocabulary (i.e., unique words) size: 8366
Vocabulary size after removing less frequent words (< 5 words): 2945


2946

In [26]:
# getting the length of the longest caption; as we will later need to encode each word into a fixed sized vector

# convert a dictionary of clean captions to a list of captions
def toCaptionsList(ImgToCaptions):
	captionsList = list()
	for imgId in ImgToCaptions.keys():
		[captionsList.append(caption) for caption in ImgToCaptions[imgId]]
	return captionsList

# calculate the length of the description with the most words
def maxCaptionLength(ImgToCaptions):
    captions = toCaptionsList(ImgToCaptions)
    captionsLengths = [len(caption.split()) for caption in captions]
    return max(captionsLengths)

# determine the maximum sequence length
maxCapLen = maxCaptionLength(trainImgToCaptions)
print(f'Description Length: {maxCapLen}')

Description Length: 32


In [27]:
maxCaptionLength(testImgToCaptions)

29

# Word Embedding

## Using Glove Embedding

In [28]:
# run the commented code lines below once to create word_to_glove_embedding dictionary, then load it using pklLoad()

# Load Glove vectors
glove_dir = './dataset/glove_word_embeddings/'
# word_to_glove_embedding = {} # empty dictionary
# with open(os.path.join(glove_dir, 'glove.6B.200d.txt'), encoding="utf-8") as f:
#     for line in f:
#         word_and_embedding_of_word = line.split()
#         word = word_and_embedding_of_word[0]
#         coefs = np.asarray(word_and_embedding_of_word[1:], dtype='float32')
#         word_to_glove_embedding[word] = coefs
# pklSave(word_to_glove_embedding, './dataset/pickles/word_to_glove_embedding.pickle')

word_to_glove_embedding = pklLoad('./dataset/pickles/word_to_glove_embedding.pickle')

print('Found %s word vectors.' % len(word_to_glove_embedding))

Found 400000 word vectors.


In [29]:
def get_embedding_matrix(vocabSize, embedding_dim=200):
    # Get embedding_dim dense vector for each word in our vocabulary
    embedding_matrix = np.zeros((vocabSize, embedding_dim))

    words_with_no_embedding = []
    for word, i in wordToIdx.items():
        #if i < max_words:
        embedding_vector = word_to_glove_embedding.get(word)
        if embedding_vector is None:
            # Words not found in the embedding index will be all zeros
            words_with_no_embedding.append(word)
            continue
        embedding_matrix[i] = embedding_vector[:embedding_dim]

    return embedding_matrix, words_with_no_embedding

In [30]:
# for debugging: the code above will be re-written in preprocessing_pipeline()

#hyperparameter:
embedding_dim = 200 # Note: 200 is the maximum number of dimensions specified in glove.6B.200d.txt

embedding_matrix, words_with_no_embedding = get_embedding_matrix(vocabSize, embedding_dim)
print('number of words with no embeddings:', len(words_with_no_embedding))
print('which are:')
print(words_with_no_embedding)
print('embedding matrix dimensions (vocab size x embedding dimensions):')
print(embedding_matrix.shape)

number of words with no embeddings: 15
which are:
['wakeboards', 'plushie', 'corndogs', 'somthing', 'wakeboarder', 'outstreached', 'rollerblader', 'surfboarder', 'floaties', 'bicycler', 'parasails', 'dalmation', 'waterskier', 'inground', 'windsurfs']
embedding matrix dimensions (vocab size x embedding dimensions):
(2946, 200)


# Pipelining Preprocessing Steps

In [31]:
def preprocessing_pipeline(freq_threshold, model_for_feature_extraction, embedding_dim=200, load_features=False): # Note: in embedding_dim, 200 is the maximum number of dimensions specified in glove.6B.200d.txt

    filename = "dataset/Flicker8k_TextFiles/Flickr8k.token.txt"
    with open(filename, 'r') as f:
        doc = f.read()
    lines = doc.split('\n')

    # getting these captions in a dictionary; 
    # where the key is the image's name (without .jpg) and the value is a list of 5 captions
    imgToCaptions = dict()
    for line in lines:
        idAndCaption = re.split("\..+\t", line)
        if len(idAndCaption) < 2:
            continue
        imgId, caption = idAndCaption
        if imgId not in imgToCaptions:
            imgToCaptions[imgId] = list()
        imgToCaptions[imgId].append(caption)

    freqThreshold = freq_threshold
    vocab, vocabWordFreq, vocabWordFreqRemoved = createVocab(imgToCaptions, freqThreshold)
    # createVocabTxtFiles(vocabWordFreq, vocabWordFreqRemoved, filePrefix=f"vocabFreqThreshold{freqThreshold}")
    idxToWord, wordToIdx = mapIdxAndWord(vocab)
    vocabSize = len(idxToWord) + 1 # one for appended 0's; represents "startseq" (explained in "Preparing Model Generator" section)

    trainImgsIds = getImgsIdsList('dataset/Flicker8k_TextFiles/Flickr_8k.trainImages.txt')
    valImgsIds = getImgsIdsList('dataset/Flicker8k_TextFiles/Flickr_8k.devImages.txt')
    testImgsIds = getImgsIdsList('dataset/Flicker8k_TextFiles/Flickr_8k.testImages.txt')

    trainImgToCaptions = dict()
    valImgToCaptions = dict()
    testImgToCaptions = dict()
    for imgId in trainImgsIds:
        trainImgToCaptions[imgId] = ['startseq ' + desc + ' endseq' for desc in imgToCaptions[imgId]]
    for imgId in valImgsIds:
        valImgToCaptions[imgId] = ['startseq ' + desc + ' endseq' for desc in imgToCaptions[imgId]]
    for imgId in testImgsIds:
        testImgToCaptions[imgId] = ['startseq ' + desc + ' endseq' for desc in imgToCaptions[imgId]]

    datasetImgsBasePath = 'dataset/Flicker8k_Dataset/'
    fullImgsPath = glob(datasetImgsBasePath + '*.jpg')
    trainImgsPaths = [joinPaths(datasetImgsBasePath, imgId)+'.jpg' for imgId in trainImgToCaptions.keys()]
    valImgsPaths = [joinPaths(datasetImgsBasePath, imgId)+'.jpg' for imgId in valImgToCaptions.keys()]
    testImgsPaths = [joinPaths(datasetImgsBasePath, imgId)+'.jpg' for imgId in testImgToCaptions.keys()]

    if load_features:
        trainImgToFeatures = pklLoad('dataset/pickles/trainImgToFeatures.pickle')
        valImgToFeatures = pklLoad('dataset/pickles/valImgToFeatures.pickle')
        testImgToFeatures = pklLoad('dataset/pickles/testImgToFeatures.pickle')   
    else: 
        trainImgToFeatures = encodeImgToFeatures(trainImgsPaths, model_for_feature_extraction) # shape of each encoded image: (2048,); returned by Google's Inception Model
        valImgToFeatures = encodeImgToFeatures(valImgsPaths, model_for_feature_extraction)
        testImgToFeatures = encodeImgToFeatures(testImgsPaths, model_for_feature_extraction)
        # pklSave(trainImgToFeatures, 'dataset/pickles/trainImgToFeatures.pickle')
    # pklSave(valImgToFeatures, 'dataset/pickles/valImgToFeatures.pickle')
    # pklSave(testImgToFeatures, 'dataset/pickles/testImgToFeatures.pickle')

    # determine the maximum sequence length
    maxCapLen = maxCaptionLength(trainImgToCaptions)

    embedding_matrix, words_with_no_embedding = get_embedding_matrix(vocabSize, embedding_dim)
    print('number of words with no embeddings:', len(words_with_no_embedding))
    print('which are:')
    print(words_with_no_embedding)
    print('embedding matrix dimensions (vocab size x embedding dimensions):')
    print(embedding_matrix.shape)

    return (imgToCaptions, vocab, vocabWordFreq, vocabWordFreqRemoved, 
            idxToWord, wordToIdx, vocabSize, maxCapLen,
            embedding_matrix, words_with_no_embedding,
            trainImgsIds, valImgsIds, testImgsIds, 
            trainImgToFeatures, valImgToFeatures, testImgToFeatures, 
            trainImgToCaptions, valImgToCaptions, testImgToCaptions)

# LSTM Model

In [32]:
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector,\
                         Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
import tensorflow as tf

def create_lstm_model(embedding_matrix, vocabSize, droput_rate = 0.5, dense_layer_units = 256, hidden_layers_activation = 'relu'): 
    inputs1 = Input(shape=(2048,)) # 2048 is fixed, as this is an image's feature vector size returned by Google's Inception model
    fe1 = Dropout(droput_rate)(inputs1)
    fe2 = Dense(dense_layer_units, activation=hidden_layers_activation)(fe1)
    inputs2 = Input(shape=(maxCapLen,))
    se1 = Embedding(vocabSize, embedding_dim, mask_zero=True)(inputs2)
    se2 = Dropout(droput_rate)(se1)
    se3 = LSTM(dense_layer_units)(se2) 
    merge_img_and_txt_features = concatenate([fe2, se3])
    decoder = Dense(dense_layer_units, activation=hidden_layers_activation)(merge_img_and_txt_features)
    outputs = Dense(vocabSize, activation='softmax')(decoder)
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    
    # setting the Embedding layer to not be trainable, as the glove embeddings fetched from the txt file are already pre-trained results
    model.layers[2].set_weights([embedding_matrix])
    model.layers[2].trainable = False

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['acc']) 

    return model

# gru Model

In [40]:
from keras.layers import GRU, Embedding, TimeDistributed, Dense, RepeatVector,\
                         Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
import tensorflow as tf

def create_gru_model(embedding_matrix, vocabSize, droput_rate = 0.5, dense_layer_units = 256, hidden_layers_activation = 'relu'): 
    inputs1 = Input(shape=(2048,)) # 2048 is fixed, as this is an image's feature vector size returned by Google's Inception model
    fe1 = Dropout(droput_rate)(inputs1)
    fe2 = Dense(dense_layer_units, activation=hidden_layers_activation)(fe1)
    inputs2 = Input(shape=(maxCapLen,))
    se1 = Embedding(vocabSize, embedding_dim, mask_zero=True)(inputs2)
    se2 = Dropout(droput_rate)(se1)
    se3 = GRU(dense_layer_units)(se2)
    merge_img_and_txt_features = concatenate([fe2, se3])
    decoder = Dense(dense_layer_units, activation=hidden_layers_activation)(merge_img_and_txt_features)
    outputs = Dense(vocabSize, activation='softmax')(decoder)
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    
    # setting the Embedding layer to not be trainable, as the glove embeddings fetched from the txt file are already pre-trained results
    model.layers[2].set_weights([embedding_matrix])
    model.layers[2].trainable = False

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['acc']) 

    return model

In [33]:
# failed attempt to create attention model

# from keras.layers import LSTM, Embedding, Dropout, concatenate, Bidirectional
                         
# from keras_self_attention import SeqSelfAttention

# def create_attention_model(embedding_matrix, vocabSize, droput_rate = 0.5, dense_layer_units = 256, hidden_layers_activation = 'relu'): 
#     inputs1 = Input(shape=(2048,)) # 2048 is fixed, as this is an image's feature vector size returned by Google's Inception model
#     fe1 = Dropout(droput_rate)(inputs1)
#     fe2 = Dense(dense_layer_units, activation=hidden_layers_activation)(fe1)
#     inputs2 = Input(shape=(maxCapLen,))
#     se1 = Embedding(vocabSize, embedding_dim, mask_zero=True)(inputs2)
#     se2 = Dropout(droput_rate)(se1)
#     se3 = LSTM(dense_layer_units, return_sequences=True)(se2) 
#     se4 = SeqSelfAttention(attention_activation='sigmoid')(se3)
#     se5 = Flatten()(se4)
#     print(se5)
#     merge_img_and_txt_features = concatenate([fe2, se5])
#     print(merge_img_and_txt_features)
#     decoder = Dense(dense_layer_units, activation=hidden_layers_activation)(merge_img_and_txt_features)
#     outputs = Dense(vocabSize, activation='softmax')(decoder)
#     model = Model(inputs=[inputs1, inputs2], outputs=outputs)

#     # setting the Embedding layer to not be trainable, as the glove embeddings fetched from the txt file are already pre-trained results
#     model.layers[1].set_weights([embedding_matrix])
#     model.layers[1].trainable = False

#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['acc']) 


#     return model

The attempted attention model shown above failed with this code error:

<img src='./project_media/failed_attention_model.png' width='800' />

# Preparing Model Generator

In [34]:
# data generator, intended to be used in a call to model.fit_generator()
def dataGenerator(imgToCaptions, imgToFeatures, wordToIdx, vocabSize, maxCaptionLength, imgsBatchSize):
    X1, X2, y = list(), list(), list()
    n = 0
    # loop forever over images
    while True:
        for imgId, captions in imgToCaptions.items():
            n += 1
            imgFeatures = imgToFeatures[imgId+'.jpg'] # retrieve the image's feature vector
            for caption in captions:
                seq = [wordToIdx[word] for word in caption.split(' ') if word in wordToIdx] # encode the caption into a sequence of numbers instead of words
                for i in range(1, len(seq)): # split one sequence into multiple X, y pairs
                    inSeq, outSeq = seq[:i], seq[i] # split into input and output pair
                    inSeq = preprocessing.sequence.pad_sequences([inSeq], maxlen=maxCaptionLength, padding="pre")[0] # pad input sequence
                    outSeq = to_categorical([outSeq], num_classes=vocabSize)[0] # (one-hot) encodes the output sequence (note: to_categorical() is a keras-related function)
                    X1.append(imgFeatures) # store the values
                    X2.append(inSeq)
                    y.append(outSeq)
            # yield the batch data
            if n == imgsBatchSize:
                yield [[np.array(X1), np.array(X2)], np.array(y)] # "yield" saves the function's state, returns [[..]], then continues function from that statement when function is called again
                X1, X2, y = list(), list(), list()
                n = 0

Explanation of inner-most for loop above:

<img src="project_media/xi_as_words.png" width="600" />

However, since we're using `wordToIdx` mapping, the table above will be:

<img src="project_media/xi_as_idxss.png" width="600" />

Note 1: the table above is for the case of `post` padding. However, we'll assume `pre` padding, as it is [generally advised](https://stackoverflow.com/questions/46298793/how-does-choosing-between-pre-and-post-zero-padding-of-sequences-impact-results#:~:text=I%20always%20recommend%20using%20pre%2Dpadding%20over%20post%2Dpadding%2C%20even%20for%20CNNs%2C%20unless%20the%20problem%20specifically%20requires%20post%2Dpadding.)

Note 2: under the hood, `target word` is one-hot encoded representation of the numerical values displayed in the table above; this is because one-hot encoding the target word allows us to represent this probability distribution as a vector of probabilities over the entire vocabulary, which can be directly compared to the predicted probability distribution output by the neural network  


# Model Training

## LSTM Model

In [35]:
# hyperparameters:
freq_threshold = 5
inception_model = InceptionV3(weights='imagenet') # getting the InceptionV3 model trained on imagenet data
model_for_feature_extraction = Model(inception_model.input, inception_model.layers[-2].output) # removing the last layer (output softmax layer)
imgs_batch_size = 32
epochs = 30
embedding_dim = 200

(imgToCaptions, vocab, vocabWordFreq, vocabWordFreqRemoved, 
idxToWord, wordToIdx, vocabSize, maxCapLen,
embedding_matrix, words_with_no_embedding,
trainImgsIds, valImgsIds, testImgsIds, 
trainImgToFeatures, valImgToFeatures, testImgToFeatures, 
trainImgToCaptions, valImgToCaptions, testImgToCaptions) = preprocessing_pipeline(freq_threshold, model_for_feature_extraction, embedding_dim=embedding_dim, load_features=True) # set load_features=True if you won't change "weights" argument of "model" variable to avoid bottleneck of using Google's Inception Model

lstm_wordToIdx = wordToIdx.copy()

train_datagen = dataGenerator(trainImgToCaptions, trainImgToFeatures, wordToIdx, vocabSize, maxCapLen, imgs_batch_size)
val_datagen = dataGenerator(valImgToCaptions, valImgToFeatures, wordToIdx, vocabSize, maxCapLen, imgs_batch_size)
test_datagen = dataGenerator(testImgToCaptions, testImgToFeatures, wordToIdx, vocabSize, maxCapLen, imgs_batch_size)

Original vocabulary (i.e., unique words) size: 9630
Vocabulary size after removing less frequent words (< 5 words): 3107
number of words with no embeddings: 15
which are:
['wakeboards', 'plushie', 'corndogs', 'somthing', 'wakeboarder', 'outstreached', 'rollerblader', 'surfboarder', 'floaties', 'bicycler', 'parasails', 'dalmation', 'waterskier', 'inground', 'windsurfs']
embedding matrix dimensions (vocab size x embedding dimensions):
(3108, 200)


In [36]:
# model hyperparameters:
droput_rate = 0.5
dense_layer_units = 256
hidden_layers_activation = 'relu'

lstm_model = create_lstm_model(embedding_matrix, vocabSize, droput_rate, dense_layer_units, hidden_layers_activation)

In [38]:
# hyperparameters:
steps = len(trainImgToCaptions) // imgs_batch_size
vsteps = len(valImgToCaptions) // imgs_batch_size

tensorboardCallback = tf.keras.callbacks.TensorBoard(log_dir='models/lstm_logs', histogram_freq=1)
# earlyStoppingCallback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
checkpointCallback = tf.keras.callbacks.ModelCheckpoint('./models/lstm_weights/lstm_epochs_of-{epoch:04d}-and_val_loss_of-{val_loss:.4f}.h5', 
                                                        verbose=0, save_weights_only=True, save_freq='epoch')

train_datagen = dataGenerator(trainImgToCaptions, trainImgToFeatures, wordToIdx, vocabSize, maxCapLen, imgs_batch_size)
lstm_history = lstm_model.fit(train_datagen, epochs=epochs, steps_per_epoch=steps, verbose=2, 
                              callbacks=[tensorboardCallback, checkpointCallback], #, earlyStoppingCallback
                              validation_data=val_datagen, validation_steps=vsteps)

Epoch 1/30
187/187 - 61s - loss: 4.8994 - acc: 0.1876 - val_loss: 4.1860 - val_acc: 0.2496 - 61s/epoch - 328ms/step
Epoch 2/30
187/187 - 60s - loss: 3.9206 - acc: 0.2655 - val_loss: 3.7244 - val_acc: 0.2909 - 60s/epoch - 321ms/step
Epoch 3/30
187/187 - 62s - loss: 3.5576 - acc: 0.2964 - val_loss: 3.5321 - val_acc: 0.3122 - 62s/epoch - 329ms/step
Epoch 4/30
187/187 - 63s - loss: 3.3503 - acc: 0.3159 - val_loss: 3.4288 - val_acc: 0.3259 - 63s/epoch - 336ms/step
Epoch 5/30
187/187 - 64s - loss: 3.2070 - acc: 0.3287 - val_loss: 3.3809 - val_acc: 0.3339 - 64s/epoch - 342ms/step
Epoch 6/30
187/187 - 64s - loss: 3.0987 - acc: 0.3385 - val_loss: 3.3434 - val_acc: 0.3405 - 64s/epoch - 342ms/step
Epoch 7/30
187/187 - 65s - loss: 3.0080 - acc: 0.3458 - val_loss: 3.3200 - val_acc: 0.3433 - 65s/epoch - 347ms/step
Epoch 8/30
187/187 - 64s - loss: 2.9317 - acc: 0.3545 - val_loss: 3.3054 - val_acc: 0.3466 - 64s/epoch - 343ms/step
Epoch 9/30
187/187 - 64s - loss: 2.8615 - acc: 0.3616 - val_loss: 3.3202

In [52]:
pklSave(lstm_history.history, './models/lstm_history.pickle')
lstm_history = pklLoad('./models/lstm_history.pickle')

In [None]:
# after training the model, you can comment the lines above, and load the model using:
lstm_model.load_weights(f'models/lstm_weights/lstm_model_with_{epochs}_epochs.h5')

## gru Model

In [41]:
# hyperparameters to tune
freq_threshold = 5
inception_model = InceptionV3(weights='imagenet') # getting the InceptionV3 model trained on imagenet data
model_for_feature_extraction = Model(inception_model.input, inception_model.layers[-2].output) # removing the last layer (output softmax layer)
imgs_batch_size = 32
epochs = 30
embedding_dim = 200

(imgToCaptions, vocab, vocabWordFreq, vocabWordFreqRemoved, 
idxToWord, wordToIdx, vocabSize, maxCapLen,
embedding_matrix, words_with_no_embedding,
trainImgsIds, valImgsIds, testImgsIds, 
trainImgToFeatures, valImgToFeatures, testImgToFeatures, 
trainImgToCaptions, valImgToCaptions, testImgToCaptions) = preprocessing_pipeline(freq_threshold, model_for_feature_extraction, embedding_dim=embedding_dim, load_features=True) # set load_features=True if you won't change "weights" argument of "model" variable to avoid bottleneck of using Google's Inception Model

gru_wordToIdx = wordToIdx.copy()

train_datagen = dataGenerator(trainImgToCaptions, trainImgToFeatures, wordToIdx, vocabSize, maxCapLen, imgs_batch_size)
val_datagen = dataGenerator(valImgToCaptions, valImgToFeatures, wordToIdx, vocabSize, maxCapLen, imgs_batch_size)
test_datagen = dataGenerator(testImgToCaptions, testImgToFeatures, wordToIdx, vocabSize, maxCapLen, imgs_batch_size)

Original vocabulary (i.e., unique words) size: 9630
Vocabulary size after removing less frequent words (< 5 words): 3107
number of words with no embeddings: 200
which are:
['Asian', 'Old', 'Girls', 'Cyclists', 'In', 'Men', 'RV', 'Easter', 'Boys', 'Person', 'Child', 'wakeboards', 'Domino', 'Adults', 'University', 'Football', 'People', 'Legos', 'plushie', 'Blond', 'An', 'Blue', 'Four', 'Closeup', 'Black', 'Yellow', 'Lady', 'Kid', 'Tan', 'Marx', 'BMX', 'Rollerblades', 'Where', 'Baseball', 'Sheltie', 'Speedo', 'Blonde', 'Crowd', 'Seven', 'corndogs', 'Brown', 'ATM', 'Frisbee', 'A', 'Distant', 'Dogs', 'mid-jump', 'European', 'somthing', 'Three', 'Children', 'Doberman', 'Snow', 'Two', 'Seattle', 'Jeep', 'Snowboarder', 'Santa', 'Bikers', 'DJ', 'Teen', 'Man', 'Biker', 'Dalmation', 'Dark', 'Light', 'Dog', 'Jack', 'Someone', 'Washington', 'The', 'Skiers', 'Number', 'Japanese', 'Hockey', 'Hawaiian', 'Female', 'New', 'Indian', 'Boy', 'Big', 'Jesus', 'Racing', 'wakeboarder', 'Group', 'Little', 'ATV'

In [42]:
# model hyperparameters:
droput_rate = 0.5
dense_layer_units = 256
hidden_layers_activation = 'relu'

gru_model = create_gru_model(embedding_matrix, vocabSize, droput_rate, dense_layer_units, hidden_layers_activation)

In [44]:
# hyperparameters:
steps = len(trainImgToCaptions) // imgs_batch_size
vsteps = len(valImgToCaptions) // imgs_batch_size

tensorboardCallback = tf.keras.callbacks.TensorBoard(log_dir='models/gru_logs', histogram_freq=1)
# earlyStoppingCallback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
checkpointCallback = tf.keras.callbacks.ModelCheckpoint('./models/gru_weights/gru_epochs_of-{epoch:04d}-and_val_loss_of-{val_loss:.4f}.h5', 
                                                        verbose=0, save_weights_only=True, save_freq='epoch')

train_datagen = dataGenerator(trainImgToCaptions, trainImgToFeatures, wordToIdx, vocabSize, maxCapLen, imgs_batch_size)
gru_history = gru_model.fit(train_datagen, epochs=epochs, steps_per_epoch=steps, verbose=2, 
                                        callbacks=[tensorboardCallback, checkpointCallback], # , earlyStoppingCallback
                                        validation_data=val_datagen, validation_steps=vsteps)

Epoch 1/30
187/187 - 63s - loss: 3.7445 - acc: 0.2834 - val_loss: 3.5765 - val_acc: 0.3090 - 63s/epoch - 339ms/step
Epoch 2/30
187/187 - 64s - loss: 3.4199 - acc: 0.3110 - val_loss: 3.3979 - val_acc: 0.3268 - 64s/epoch - 342ms/step
Epoch 3/30
187/187 - 64s - loss: 3.2328 - acc: 0.3277 - val_loss: 3.3075 - val_acc: 0.3376 - 64s/epoch - 343ms/step
Epoch 4/30
187/187 - 66s - loss: 3.0973 - acc: 0.3395 - val_loss: 3.2582 - val_acc: 0.3457 - 66s/epoch - 351ms/step
Epoch 5/30
187/187 - 67s - loss: 2.9942 - acc: 0.3488 - val_loss: 3.2268 - val_acc: 0.3521 - 67s/epoch - 360ms/step
Epoch 6/30
187/187 - 68s - loss: 2.9113 - acc: 0.3561 - val_loss: 3.2119 - val_acc: 0.3542 - 68s/epoch - 363ms/step
Epoch 7/30
187/187 - 67s - loss: 2.8357 - acc: 0.3633 - val_loss: 3.2288 - val_acc: 0.3527 - 67s/epoch - 360ms/step
Epoch 8/30
187/187 - 68s - loss: 2.7715 - acc: 0.3689 - val_loss: 3.2426 - val_acc: 0.3521 - 68s/epoch - 362ms/step
Epoch 9/30
187/187 - 67s - loss: 2.7123 - acc: 0.3744 - val_loss: 3.2563

In [53]:
pklSave(gru_history.history, './models/gru_history.pickle')
gru_history = pklLoad('./models/gru_history.pickle')

In [None]:
# after training the model, you can comment the lines above, and load the model using:
gru_model.load_weights(f'models/gru_weights/gru_model_with_{epochs}_epochs.h5')

# Image Inference Function

In [74]:
def greedy_search(model, img_features, max_cap_len, word_to_idx, idx_to_word):
    in_text = 'startseq'
    # we reshape to (1,2048) instead of (2048,) as #rows must indicate the #samples
    img_features = img_features.reshape(1, 2048) 
    for i in range(maxCapLen):
        sequence = [word_to_idx[w] for w in in_text.split() if w in word_to_idx]
        # this returns shape (1, max_cap_len)
        sequence = preprocessing.sequence.pad_sequences([sequence], maxlen=max_cap_len) 
        yhat = model.predict([img_features, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = idx_to_word[yhat]
        in_text += ' ' + word
        if word == 'endseq':
            break
    final = in_text.split()
    final_caption_list = final[1:-1]
    final_caption_string = ' '.join(final)
    return final_caption_list, final_caption_string

# Rogue & Bleu Metrics Function

In [92]:
import tensorflow_text as text
from nltk.translate.bleu_score import sentence_bleu

def calculate_metrics(metric_type, model, test_img_paths, test_img_to_features, test_img_to_captions, max_cap_len, word_to_idx):
    idx_to_word = {v: k for k, v in lstm_wordToIdx.items()}
    rogue_scores = []
    bleu_scores = []
    predicted_captions = []
    for img_path in test_img_paths:
        # basename() gets filename (id and .jpg), splitext() gives us tuple of (id, extension), so we write [0] to get id only
        img_id = os.path.splitext(os.path.basename(img_path))[0] 
        img_captions_including_startend_seqs = test_img_to_captions[img_id]
        img_features = test_img_to_features[img_id+'.jpg']

        img_captions = []
        for caption in img_captions_including_startend_seqs:
            img_captions.append(caption[1:-1]) # removing 'startseq' and 'endseq' from the 5 captions of an image

        prediction_as_list, prediction_as_string = greedy_search(model, img_features, max_cap_len, word_to_idx, idx_to_word)

        if metric_type == 'rogue' or "all":
            # rogue-l documentation: https://www.tensorflow.org/text/api_docs/python/text/metrics/rouge_l
            # the metric measures how similar two sequences are, based on the length of the longest common subsequence (LCS)
            # therefore, we take the max similarity between the hypothesis (y_pred) and one of the 5 captions of an image (y_true) 
            y_pred = tf.ragged.constant([prediction_as_list])
            print(y_pred)
            max_f1 = 0
            for y_true in img_captions:
                y_true = y_true.split(' ')
                y_true = tf.ragged.constant([y_true])
                # 0.5 means both precision and recall are given the same weight. more details here: https://www.tensorflow.org/text/tutorials/text_similarity#:~:text=ROUGE%2DL%20has%20an%20additional%20hyperparameter%2C%20alpha%2C%20which%20determines%20the%20weight%20of%20the%20harmonic%20mean%20used%20for%20computing%20the%20F%2DMeasure.%20Values%20closer%20to%200%20treat%20Recall%20as%20more%20important%20and%20values%20closer%20to%201%20treat%20Precision%20as%20more%20important 
                f1, precision, recall = text.metrics.rouge_l(y_pred, y_true, alpha=0.5)
                # converting a tensor object (i.e., Tensor([x])) to just a float (i.e., x)
                f1 = float(f1[0])
                if f1 > max_f1:
                    max_f1 = f1     
            rogue_scores.append(max_f1)
        if metric_type == 'bleu' or 'all':
            # sentence_bleu tutorial: https://www.digitalocean.com/community/tutorials/bleu-score-in-python
            y_pred = prediction_as_list
            y_true_all_img_captions = []
            for caption in img_captions:
                y_true_all_img_captions.append(caption.split(' '))

            bleu_score = sentence_bleu(y_true_all_img_captions, y_pred)
            rogue_scores.append(bleu_score)

        predicted_captions.append(prediction_as_string)

    if metric_type == 'all':
        return rogue_scores, bleu_scores, predicted_captions
    elif metric_type == 'rogue':
        return rogue_scores, bleu_scores, predicted_captions
    elif metric_type == 'bleu':
        return bleu_scores, predicted_captions
    else:
        return predicted_captions

# Test Set Results

In [None]:
images_dir = './dataset/Flicker8k_Dataset/'
test_imgs_encoded = pklLoad('./dataset/pickles/testImgToFeatures.pickle')

## LSTM Results

In [None]:
# hyperparameters used when model was trained:
droput_rate = 0.5
dense_layer_units = 256
hidden_layers_activation = 'relu'

current_lstm_model = create_lstm_model(embedding_matrix, vocabSize, droput_rate, dense_layer_units, hidden_layers_activation)
current_lstm_model.load_weights(f'models/lstm_weights/lstm_epochs_of-0030-and_val_loss_of-3.7172.h5')

In [93]:
# rogue and bleu results

rogue_scores, bleu_scores, predicted_captions = calculate_metrics('all', current_lstm_model, testImgsPaths, testImgToFeatures, testImgToCaptions, 
                                                   maxCapLen, lstm_wordToIdx)

<tf.RaggedTensor [[b'brown', b'dog', b'is', b'running', b'on', b'the', b'grass', b'.',
  b'of', b'a', b'body', b'of', b'water', b'.', b'.', b'.', b'.', b'.',
  b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.',
  b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.']]>


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


<tf.RaggedTensor [[b'water', b'squirts', b'with', b'a', b'red', b'and', b'white', b'life',
  b'jacket', b'.', b'out', b'of', b'the', b'water', b'.', b'.', b'.',
  b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'pool', b'.', b'.',
  b'pool', b'.', b'.', b'pool', b'.', b'.', b'pool', b'.', b'.', b'.',
  b'.']]>
<tf.RaggedTensor [[b'older', b'man', b'in', b'a', b'red', b'shirt', b'and', b'red',
  b'shorts', b'is', b'walking', b'on', b'a', b'sidewalk', b'with', b'a',
  b'colorful', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.',
  b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.']]>
<tf.RaggedTensor [[b'people', b'are', b'sitting', b'at', b'a', b'table', b'in', b'front',
  b'of', b'a', b'building', b'.', b'.', b'.', b'.', b'.', b'.', b'.',
  b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.',
  b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.']]>
<tf.RaggedTensor [[b'in', b'Sooners', b'uniform', b'Sooners', b'Sooners', b'Sooners',
 

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


<tf.RaggedTensor [[b'people', b'are', b'walking', b'along', b'a', b'sidewalk', b'.', b'by',
  b'a', b'fence', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.',
  b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.',
  b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.']]>
<tf.RaggedTensor [[b'man', b'with', b'a', b'nose', b'piercing', b'and', b'beard', b'.',
  b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.',
  b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.',
  b'.', b'.', b'.', b'.', b'.', b'.', b'.']]>
<tf.RaggedTensor [[b'dogs', b'are', b'playing', b'in', b'a', b'grassy', b'area', b'.',
  b'.', b'of', b'a', b'tree', b'.', b'.', b'.', b'.', b'.', b'.', b'.',
  b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.',
  b'from', b'the', b'ground', b'.', b'from', b'the', b'ground', b'.']]>
<tf.RaggedTensor [[b'older', b'man', b'in', b'a', b'red', b'shirt', b'and', b'jeans',
  b'stands', b'outside', b'of', b'a',

In [None]:
# checking out the average of all scores:
print('average of all rogue scores:')
print(round(sum(rogue_scores) / len(rogue_scores), 5))

# checking out result of 1st test image:
print('1st rogue score:')
print(rogue_scores[0])

print()

# checking out the average of all scores:
print('average of all bleu scores:')
print(round(sum(bleu_scores) / len(bleu_scores), 5))

# checking out result of 1st test image:
print('1st bleu score:')
print(bleu_scores[0])

In [63]:
# checking out a predicted/actual caption along with an image used:

print('predicted caption:')
print(predicted_captions[0])
print('actual captions:')
img_id = os.path.splitext(os.path.basename(testImgsPaths[0]))[0] 
print(testImgToCaptions[img_id])
print('the image itself:')
x = plt.imread(testImgsPaths[0])
plt.imshow(x)
plt.show()

## GRU Model

In [None]:
# hyperparameters used when model was trained:
droput_rate = 0.5
dense_layer_units = 256
hidden_layers_activation = 'relu'

current_gru_model = create_gru_model(embedding_matrix, vocabSize, droput_rate, dense_layer_units, hidden_layers_activation)
current_gru_model.load_weights(f'models/gru_weights/gru_epochs_of-0030-and_val_loss_of-3.5945.h5')

In [None]:
# rogue and bleu results

rogue_scores, bleu_scores, predicted_captions = calculate_metrics('all', current_gru_model, testImgsPaths, testImgToFeatures, testImgToCaptions, 
                                                   maxCapLen, lstm_wordToIdx)

In [None]:
# checking out the average of all scores:
print('average of all rogue scores:')
print(round(sum(rogue_scores) / len(rogue_scores), 5))

# checking out result of 1st test image:
print('1st rogue score:')
print(rogue_scores[0])

print()

# checking out the average of all scores:
print('average of all bleu scores:')
print(round(sum(bleu_scores) / len(bleu_scores), 5))

# checking out result of 1st test image:
print('1st bleu score:')
print(bleu_scores[0])
# checking out a predicted/actual caption along with an image used:

print('predicted caption:')
print(predicted_captions[0])
print('actual captions:')
img_id = os.path.splitext(os.path.basename(testImgsPaths[0]))[0] 
print(testImgToCaptions[img_id])
print('the image itself:')
x = plt.imread(testImgsPaths[0])
plt.imshow(x)
plt.show()