In [None]:
try:
    from google.colab import drive
    drive.mount('/content/drive/', force_remount=True)
    COLAB = True
    print("Note: using Google CoLab")
    %tensorflow_version 2.x
except:
    print("Note: not using Google CoLab")
    COLAB = False

Mounted at /content/drive/
Note: using Google CoLab


In [None]:
# Pipeline configuration. The Drive paths assume Google Drive is mounted at /content/drive/.
# Directory entries must keep their trailing slash: file names are appended to them directly.
config = {
	'images_path': '/content/drive/My Drive/projects/captions/Flicker8k_Dataset/', # Directory of images; keep the trailing slash(/)
	'train_data_path': '/content/drive/My Drive/projects/captions/Flickr8k_text/Flickr_8k.trainImages.txt', # One training image file name per line
	'val_data_path': '/content/drive/My Drive/projects/captions/Flickr8k_text/Flickr_8k.devImages.txt', # One validation image file name per line
	'captions_path': '/content/drive/My Drive/projects/captions/Flickr8k_text/Flickr8k.token.txt', # Raw captions: `<image>#<n>` followed by the caption
	'tokenizer_path': '/content/drive/My Drive/projects/captions/data_temp/tokenizer.pkl', # Pickled tokenizer read back by the training cell
	'model_data_path': '/content/drive/My Drive/projects/captions/data_temp/', # Directory for generated features/captions/checkpoints; keep the trailing slash(/)
	'model_load_path': '/content/drive/My Drive/projects/captions/data_temp/model_inception_v3_epoch-20_train_loss-2.4050_val_loss-3.0527.hdf5', # Saved checkpoint (not read by the training code in this notebook)
	'num_of_epochs': 5,
	'max_length': 34, # Set manually after training; training itself recomputes max_length from the captions
	'batch_size': 64, # Number of images (not caption samples) consumed per generator batch
	'beam_search_k':3, # Beam width used when evaluating with beam search
	'test_data_path': 'test_data/', # Keep the trailing slash(/)
	'model_type': 'inception_v3', # inception_v3 or vgg16
	'random_seed': 1035
}

# Hyper-parameters of the caption decoder built by RNNModel / AlternativeRNNModel
rnnConfig = {
	'embedding_size': 300,
	'LSTM_units': 256,
	'dense_units': 256,
	'dropout': 0.3
}

In [None]:
import numpy as np
import os
from pickle import dump
import string
from tqdm import tqdm
from keras.preprocessing.image import load_img, img_to_array
from datetime import datetime as dt

# Utility function for pretty printing
def mytime(with_date=False):
	"""Return the current local time as a string for log prefixes.

	The clock is read exactly once so that all fields belong to the same
	instant (reading it per field can mix values from two different seconds,
	minutes or days).

	Args:
		with_date: when True, prefix the time with 'Y-M-D '.

	Returns:
		'H:M:S' or 'Y-M-D H:M:S'; fields are not zero padded.
	"""
	now = dt.now()
	_str = '{}:{}:{}'.format(now.hour, now.minute, now.second)
	if with_date:
		_str = '{}-{}-{} {}'.format(now.year, now.month, now.day, _str)
	return _str

def extract_features(path, model_type):
	"""Encode every image in a directory with the selected CNN.

	Args:
		path: directory containing the images (every entry returned by
			os.listdir is loaded as an image).
		model_type: 'inception_v3' or 'vgg16'.

	Returns:
		A dictionary of the form
		{
			image_id1 : image_features1,
			image_id2 : image_features2,
			...
		}
		where image_id is the file name up to its first '.' and the features
		are whatever model.predict returns for a batch containing that one image.

	Raises:
		ValueError: if model_type is not one of the supported names.
	"""
	if model_type == 'inception_v3':
		from keras.applications.inception_v3 import preprocess_input
		target_size = (299, 299)
	elif model_type == 'vgg16':
		from keras.applications.vgg16 import preprocess_input
		target_size = (224, 224)
	else:
		# Fail early with a clear message instead of an UnboundLocalError further down
		raise ValueError("Unsupported model_type {!r}; expected 'inception_v3' or 'vgg16'".format(model_type))
	# Get CNN Model (defined by CNNModel in this notebook)
	model = CNNModel(model_type)
	features = dict()
	# Extract features from each photo
	for name in tqdm(os.listdir(path)):
		# Loading and resizing image; os.path.join works with or without a trailing slash on `path`
		filename = os.path.join(path, name)
		image = load_img(filename, target_size=target_size)
		# Convert the image pixels to a numpy array
		image = img_to_array(image)
		# Add a leading batch dimension of size 1 for the model
		image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
		# Apply the preprocessing that matches the chosen CNN
		image = preprocess_input(image)
		# Pass image into model to get encoded features
		feature = model.predict(image, verbose=0)
		# Store encoded features under the file name without its extension
		image_id = name.split('.')[0]
		features[image_id] = feature
	return features

def load_captions(filename):
	"""Parse the raw captions file into a dictionary keyed by image id.

	Glimpse of the file:
		1000268201_693b08cb0e.jpg#0	A child in a pink dress is climbing up a set of stairs in an entry way .
		1000268201_693b08cb0e.jpg#1	A girl going into a wooden building .
		1000268201_693b08cb0e.jpg#2	A little girl climbing into a wooden playhouse .

	Args:
		filename: path of the captions file.

	Returns:
		A dictionary of the form
		{
			image_id1 : [caption1, caption2, etc],
			image_id2 : [caption1, caption2, etc],
			...
		}
		where image_id is the first token of the line up to its first '.'.
	"""
	# `with` guarantees the file is closed even if reading fails
	with open(filename, 'r') as file:
		doc = file.read()
	captions = dict()
	# Process line by line
	_count = 0
	for line in doc.split('\n'):
		# Split line on white space
		tokens = line.split()
		# Skip very short lines; also skip whitespace-only lines, which have no
		# tokens and would otherwise raise IndexError below
		if len(line) < 2 or not tokens:
			continue
		# Take the first token as the image id, the rest as the caption
		image_id, image_caption = tokens[0], tokens[1:]
		# Extract filename from image id
		image_id = image_id.split('.')[0]
		# Convert caption tokens back to caption string
		image_caption = ' '.join(image_caption)
		# Create the list if needed, then store the caption
		captions.setdefault(image_id, list()).append(image_caption)
		_count = _count+1
	print('{}: Parsed captions: {}'.format(mytime(),_count))
	return captions

def clean_captions(captions):
	"""Normalise every caption in place.

	Each caption is split on white space, lower-cased and stripped of
	punctuation; tokens of one character or less and tokens containing
	non-alphabetic characters are dropped, and the rest is re-joined with
	single spaces. The lists inside `captions` are modified; nothing is returned.
	"""
	# Translation table that deletes all punctuation characters
	strip_punct = str.maketrans('', '', string.punctuation)

	def _clean(text):
		kept = []
		for raw in text.split():
			token = raw.lower().translate(strip_punct)
			# Drop hanging 's'/'a' style leftovers and anything with digits in it
			if len(token) > 1 and token.isalpha():
				kept.append(token)
		return ' '.join(kept)

	for caption_list in captions.values():
		for position, text in enumerate(caption_list):
			caption_list[position] = _clean(text)

def save_captions(captions, filename):
	"""Write captions to a text file, one `id caption` pair per line.

	Example line: 2252123185_487f21e336 stadium full of people watch game

	Args:
		captions: dictionary mapping an image id to its list of captions.
		filename: path of the file to (over)write. Lines are joined with
			'\\n' and no trailing newline is written.
	"""
	lines = [key + ' ' + caption
			 for key, captions_list in captions.items()
			 for caption in captions_list]
	# `with` closes (and flushes) the file even if the write fails
	with open(filename, 'w') as file:
		file.write('\n'.join(lines))

def preprocessData(config):
	"""Generate the cached image features and parsed captions if they are missing.

	Two files are produced under config['model_data_path'] (which must end
	with a slash because file names are appended to it):
		* features_<model_type>.pkl : pickled output of extract_features
		* captions.txt              : output of save_captions
	A file that already exists is left untouched and its step is skipped.

	Args:
		config: dictionary providing 'model_type', 'model_data_path',
			'images_path' and 'captions_path'.
	"""
	print('{}: Using {} model'.format(mytime(),config['model_type'].title()))
	features_path = config['model_data_path']+'features_'+str(config['model_type'])+'.pkl'
	captions_out_path = config['model_data_path']+'captions.txt'
	# Extract features from all images
	if os.path.exists(features_path):
		print('{}: Image features already generated at {}'.format(mytime(), features_path))
	else:
		# A single format call covers the whole message; previously `.format` was
		# applied only to the last string fragment, so a literal '{}' was printed
		print('{}: Generating image features using {} model...'.format(mytime(), config['model_type']))
		features = extract_features(config['images_path'], config['model_type'])
		# Save to file; `with` makes sure the pickle is flushed and closed
		with open(features_path, 'wb') as features_file:
			dump(features, features_file)
		print('{}: Completed & Saved features for {} images successfully'.format(mytime(),len(features)))
	# Load file containing captions and parse them
	if os.path.exists(captions_out_path):
		print('{}: Parsed caption file already generated at {}'.format(mytime(), captions_out_path))
	else:
		print('{}: Parsing captions file...'.format(mytime()))
		captions = load_captions(config['captions_path'])
		# clean_captions(captions) is intentionally not called here:
		# the keras Tokenizer is relied on to handle the cleaning
		# Save captions
		save_captions(captions, captions_out_path)
		print('{}: Parsed & Saved successfully'.format(mytime()))

In [None]:
import numpy as np
from pickle import load, dump
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import random
def load_set(filename):
	"""Load a pre-defined list of image identifiers (ids).

	Files such as Flickr_8k.trainImages.txt and Flickr_8k.devImages.txt hold
	one image file name per line, which is used to filter images and captions.
	Glimpse of the file:
		2513260012_03d33305cf.jpg
		2903617548_d3e38d7f88.jpg
		3338291921_fe7ae0c8f8.jpg

	Args:
		filename: path of the id list.

	Returns:
		A set with the part of every non-empty line before its first '.'.
	"""
	# `with` guarantees the file is closed even if reading fails
	with open(filename, 'r') as file:
		doc = file.read()
	ids = set()
	# Process line by line
	for line in doc.split('\n'):
		# Skip empty lines
		if len(line) < 1:
			continue
		# Get the image identifier(id)
		ids.add(line.split('.')[0])
	return ids

def load_cleaned_captions(filename, ids):
	"""Load the saved captions of the requested images, wrapped in start/end tokens.

	The model generates a caption one word at a time from the words generated
	so far, so it needs a 'first word' to kick off generation and a 'last word'
	to signal the end. 'startseq' and 'endseq' are added here, before the text
	is encoded, so that they are encoded like any other word.

	Glimpse of the file (`id` followed by the caption):
		1000268201_693b08cb0e child in pink dress is climbing up set of stairs in an entry way
		1000268201_693b08cb0e girl going into wooden building

	Args:
		filename: path of the captions file written by save_captions.
		ids: collection of image ids to keep; other lines are ignored.

	Returns:
		(captions, count): a dictionary {image_id: ['startseq ... endseq', ...]}
		and the total number of captions stored in it.
	"""
	# `with` guarantees the file is closed even if reading fails
	with open(filename, 'r') as file:
		doc = file.read()
	captions = dict()
	_count = 0
	# Process line by line
	for line in doc.split('\n'):
		# Split line on white space
		tokens = line.split()
		# Skip blank lines (e.g. a trailing newline), which previously raised IndexError
		if not tokens:
			continue
		# Split id from caption
		image_id, image_caption = tokens[0], tokens[1:]
		# Skip images not in the ids set
		if image_id in ids:
			# Wrap caption in start & end tokens
			caption = 'startseq ' + ' '.join(image_caption) + ' endseq'
			# Create the list if needed, then store
			captions.setdefault(image_id, list()).append(caption)
			_count = _count+1
	return captions, _count

# Load image features
def load_image_features(filename, ids):
	"""Load pickled image features and keep only the requested image ids.

	Args:
		filename: path of a pickle holding a dictionary {image_id: features}.
		ids: iterable of image ids to keep.

	Returns:
		A dictionary {image_id: features} with exactly the requested ids.

	Raises:
		KeyError: if a requested id is missing from the pickle.
	"""
	# NOTE: unpickling can execute arbitrary code; only load files produced by this pipeline.
	# `with` closes the file handle, which the previous bare open() never did.
	with open(filename, 'rb') as features_file:
		all_features = load(features_file)
	# filter features
	return {_id: all_features[_id] for _id in ids}

# Convert a dictionary to a list
def to_lines(captions):
	"""Flatten {image_id: [caption, ...]} into one list of captions.

	Captions keep the dictionary's iteration order and, within an image,
	the order of its list.
	"""
	# A plain flattening comprehension replaces the old comprehension that was
	# run only for its append() side effects
	return [caption for caption_list in captions.values() for caption in caption_list]

def create_tokenizer(captions):
	"""Fit a keras Tokenizer on all the given captions.

	Captions must be encoded as numbers before they can be fed to the model;
	the first step is a consistent mapping from words to unique integers,
	which the Tokenizer learns from the captions.

	Args:
		captions: dictionary {image_id: [caption, ...]}.

	Returns:
		The fitted Tokenizer.
	"""
	fitted = Tokenizer()
	fitted.fit_on_texts(to_lines(captions))
	return fitted

# Calculate the length of the captions with the most words
def calc_max_length(captions):
	"""Return the largest number of white-space separated words in any caption."""
	word_counts = [len(caption.split()) for caption in to_lines(captions)]
	return max(word_counts)

# Create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, captions_list, image):
	'''Expand the captions of one image into (image, partial caption, next word) samples.

	Every caption is encoded and then cut after its 1st, 2nd, ... word; the
	words before the cut are the text input and the word at the cut is the
	target. For example “startseq little girl running in field endseq” yields
	6 input-output pairs:

		X1		X2(text sequence) 								y(word)
		-----------------------------------------------------------------
		image	startseq,										little
		image	startseq, little,								girl
		image	startseq, little, girl,							running
		image	startseq, little, girl, running,				in
		image	startseq, little, girl, running, in,			field
		image	startseq, little, girl, running, in, field,		endseq

	Returns three parallel lists:
		X1 : the image features, repeated once per sample
		X2 : the partial caption, padded to max_length by pad_sequences
		y  : the next word, one-hot encoded over the vocabulary
	'''
	# +1 as in the rest of the notebook: the one-hot vector has a slot for index 0
	num_classes = len(tokenizer.word_index) + 1
	image_inputs, text_inputs, targets = [], [], []
	for caption in captions_list:
		# Integer-encode the caption
		encoded = tokenizer.texts_to_sequences([caption])[0]
		# One sample per cut position
		for cut in range(1, len(encoded)):
			prefix = pad_sequences([encoded[:cut]], maxlen=max_length)[0]
			next_word = to_categorical([encoded[cut]], num_classes=num_classes)[0]
			image_inputs.append(image)
			text_inputs.append(prefix)
			targets.append(next_word)
	return image_inputs, text_inputs, targets

# Data generator, intended to be used in a call to model.fit_generator()
def data_generator(images, captions, tokenizer, max_length, batch_size, random_seed):
	"""Endlessly yield training batches built from `batch_size` images at a time.

	Args:
		images: dictionary {image_id: features}; features[0] is used as the
			image input of every sample of that image.
		captions: dictionary {image_id: [caption, ...]}; its key order defines
			the order in which images are visited.
		tokenizer: fitted tokenizer passed on to create_sequences.
		max_length: padded length of the text input.
		batch_size: number of IMAGES per batch (each image contributes one
			sample per caption word, so batches hold many more samples).
		random_seed: seed of the private RNG used to shuffle each image's captions.

	Yields:
		([image_inputs, text_inputs], next_words) as numpy arrays. A tuple is
		yielded because tf.keras only unpacks (inputs, targets) from tuples;
		standalone Keras accepts it as well.
	"""
	# Private RNG: reproducible per generator and does not reseed the global
	# `random` state shared with the rest of the notebook
	rng = random.Random(random_seed)
	# Image ids
	image_ids = list(captions.keys())
	_count=0
	assert batch_size<= len(image_ids), 'Batch size must be less than or equal to {}'.format(len(image_ids))
	while True:
		if _count >= len(image_ids):
			# Generator exceeded or reached the end so restart it
			_count = 0
		# Batch list to store data
		input_img_batch, input_sequence_batch, output_word_batch = list(), list(), list()
		# Slicing clips at the end of the list, so the last batch may be smaller
		for image_id in image_ids[_count:_count+batch_size]:
			# Retrieve the image features
			image = images[image_id][0]
			# Shuffle a copy so the caller's caption lists are never reordered
			captions_list = list(captions[image_id])
			rng.shuffle(captions_list)
			input_img, input_sequence, output_word = create_sequences(tokenizer, max_length, captions_list, image)
			# Add to batch
			input_img_batch.extend(input_img)
			input_sequence_batch.extend(input_sequence)
			output_word_batch.extend(output_word)
		_count = _count + batch_size
		yield ([np.array(input_img_batch), np.array(input_sequence_batch)], np.array(output_word_batch))

def loadTrainData(config):
	"""Prepare and load everything needed for training.

	Runs preprocessData (which creates captions.txt and the features pickle
	when they are missing), loads the training captions and image features,
	and creates the tokenizer file if it does not exist yet.

	Args:
		config: configuration dictionary ('train_data_path', 'model_data_path',
			'model_type', optionally 'tokenizer_path', plus what
			preprocessData needs).

	Returns:
		(train_image_features, train_captions, max_length) where max_length is
		the word count of the longest training caption.
	"""
	train_image_ids = load_set(config['train_data_path'])
	# Check if we already have preprocessed data saved and if not, preprocess the data.
	# Create and save 'captions.txt' & features.pkl
	preprocessData(config)
	# Load captions
	train_captions, _count = load_cleaned_captions(config['model_data_path']+'captions.txt', train_image_ids)
	# Load image features
	train_image_features = load_image_features(config['model_data_path']+'features_'+str(config['model_type'])+'.pkl', train_image_ids)
	print('{}: Available images for training: {}'.format(mytime(),len(train_image_features)))
	print('{}: Available captions for training: {}'.format(mytime(),_count))
	# Write the tokenizer where the training cell reads it from (config['tokenizer_path']);
	# fall back to the historical location when that key is absent
	tokenizer_path = config.get('tokenizer_path', config['model_data_path']+'tokenizer.pkl')
	if not os.path.exists(tokenizer_path):
		# Prepare tokenizer
		tokenizer = create_tokenizer(train_captions)
		# Save the tokenizer; `with` makes sure the pickle is flushed and closed
		with open(tokenizer_path, 'wb') as tokenizer_file:
			dump(tokenizer, tokenizer_file)
	# Determine the maximum sequence length
	max_length = calc_max_length(train_captions)
	return train_image_features, train_captions, max_length

def loadValData(config):
	"""Load the validation captions and image features.

	Expects captions.txt and the features pickle to already exist under
	config['model_data_path'].

	Returns:
		(val_features, val_captions), both keyed by image id.
	"""
	val_image_ids = load_set(config['val_data_path'])
	data_dir = config['model_data_path']
	# Captions of the validation images only
	val_captions, caption_total = load_cleaned_captions(data_dir+'captions.txt', val_image_ids)
	# Features of the validation images only
	features_file = data_dir+'features_'+str(config['model_type'])+'.pkl'
	val_features = load_image_features(features_file, val_image_ids)
	print('{}: Available images for validation: {}'.format(mytime(),len(val_features)))
	print('{}: Available captions for validation: {}'.format(mytime(),caption_total))
	return val_features, val_captions

In [None]:
import numpy as np
# Keras
from keras.applications.inception_v3 import InceptionV3
from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras.layers import Input, Dense, Dropout, LSTM, Embedding, concatenate, RepeatVector, TimeDistributed, Bidirectional
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
# To measure BLEU Score
from nltk.translate.bleu_score import corpus_bleu

def CNNModel(model_type):
	"""Build the CNN encoder: a pre-trained classifier without its final layer.

	Args:
		model_type: 'inception_v3' or 'vgg16'.

	Returns:
		A Model mapping the network input to the output of its
		second-to-last layer.

	Raises:
		ValueError: if model_type is not one of the supported names.
	"""
	if model_type == 'inception_v3':
		model = InceptionV3()
	elif model_type == 'vgg16':
		model = VGG16()
	else:
		raise ValueError("Unsupported model_type {!r}; expected 'inception_v3' or 'vgg16'".format(model_type))
	# Take the penultimate layer's output directly. The old `model.layers.pop()`
	# only worked where `layers` is the live list; where it is a copy (tf.keras)
	# the pop is a no-op and the classification layer would have been returned.
	model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
	return model

def RNNModel(vocab_size, max_len, rnnConfig, model_type):
	"""Build and compile the caption decoder (merge architecture).

	The image features go through dropout and a dense layer; the partial
	caption goes through an embedding, dropout and an LSTM; both results are
	concatenated and fed to a dense layer and a softmax over the vocabulary.

	Args:
		vocab_size: size of the softmax output / embedding input.
		max_len: length of the (padded) caption input.
		rnnConfig: dictionary with 'embedding_size', 'LSTM_units',
			'dense_units' and 'dropout'.
		model_type: 'inception_v3' (2048 image features) or 'vgg16' (4096).

	Returns:
		The compiled Model taking [image_features, caption] as inputs.

	Raises:
		ValueError: if model_type is not one of the supported names.
	"""
	embedding_size = rnnConfig['embedding_size']
	if model_type == 'inception_v3':
		# The inception_v3 encoder is fed to the RNN Model as a 2048 dimensional vector
		image_input = Input(shape=(2048,))
	elif model_type == 'vgg16':
		# The vgg16 encoder is fed to the RNN Model as a 4096 dimensional vector
		image_input = Input(shape=(4096,))
	else:
		raise ValueError("Unsupported model_type {!r}; expected 'inception_v3' or 'vgg16'".format(model_type))
	image_model_1 = Dropout(rnnConfig['dropout'])(image_input)
	image_model = Dense(embedding_size, activation='relu')(image_model_1)

	caption_input = Input(shape=(max_len,))
	# mask_zero: inputs are zero padded to the same length; the mask makes the LSTM ignore the padding
	caption_model_1 = Embedding(vocab_size, embedding_size, mask_zero=True)(caption_input)
	caption_model_2 = Dropout(rnnConfig['dropout'])(caption_model_1)
	caption_model = LSTM(rnnConfig['LSTM_units'])(caption_model_2)

	# Merging the models and creating a softmax classifier
	final_model_1 = concatenate([image_model, caption_model])
	final_model_2 = Dense(rnnConfig['dense_units'], activation='relu')(final_model_1)
	final_model = Dense(vocab_size, activation='softmax')(final_model_2)

	model = Model(inputs=[image_input, caption_input], outputs=final_model)
	model.compile(loss='categorical_crossentropy', optimizer='adam')
	return model

def AlternativeRNNModel(vocab_size, max_len, rnnConfig, model_type):
	"""Build and compile an alternative caption decoder (inject architecture).

	The image embedding is repeated for every time step and concatenated with
	the per-step caption representation; a bidirectional LSTM then summarises
	the merged sequence before the softmax over the vocabulary.

	Args:
		vocab_size: size of the softmax output / embedding input.
		max_len: length of the (padded) caption input.
		rnnConfig: dictionary with 'embedding_size' and 'LSTM_units'.
		model_type: 'inception_v3' (2048 image features) or 'vgg16' (4096).

	Returns:
		The compiled Model taking [image_features, caption] as inputs.

	Raises:
		ValueError: if model_type is not one of the supported names.
	"""
	embedding_size = rnnConfig['embedding_size']
	# Previously both branches tested 'inception_v3' with 4096 features, so vgg16
	# was never handled and inception_v3 got the wrong input size. The sizes now
	# match RNNModel.
	if model_type == 'inception_v3':
		# The inception_v3 encoder is fed to the RNN Model as a 2048 dimensional vector
		image_input = Input(shape=(2048,))
	elif model_type == 'vgg16':
		# The vgg16 encoder is fed to the RNN Model as a 4096 dimensional vector
		image_input = Input(shape=(4096,))
	else:
		raise ValueError("Unsupported model_type {!r}; expected 'inception_v3' or 'vgg16'".format(model_type))
	image_model_1 = Dense(embedding_size, activation='relu')(image_input)
	# Repeat the image embedding once per caption time step
	image_model = RepeatVector(max_len)(image_model_1)

	caption_input = Input(shape=(max_len,))
	# mask_zero: inputs are zero padded to the same length; the mask makes the LSTM ignore the padding
	caption_model_1 = Embedding(vocab_size, embedding_size, mask_zero=True)(caption_input)
	# return_sequences=True keeps one output per time step so that it can be
	# concatenated with the repeated image embedding below
	caption_model_2 = LSTM(rnnConfig['LSTM_units'], return_sequences=True)(caption_model_1)
	caption_model = TimeDistributed(Dense(embedding_size))(caption_model_2)

	# Merging the models and creating a softmax classifier
	final_model_1 = concatenate([image_model, caption_model])
	final_model_2 = Bidirectional(LSTM(rnnConfig['LSTM_units'], return_sequences=False))(final_model_1)
	final_model = Dense(vocab_size, activation='softmax')(final_model_2)

	model = Model(inputs=[image_input, caption_input], outputs=final_model)
	model.compile(loss='categorical_crossentropy', optimizer='adam')
	return model

def int_to_word(integer, tokenizer):
	"""Map an integer index back to its word.

	Scans tokenizer.word_index and returns the first word whose index equals
	`integer`, or None when no word has that index.
	"""
	matches = (word for word, index in tokenizer.word_index.items() if index == integer)
	return next(matches, None)

def generate_caption(model, tokenizer, image, max_length):
	"""Generate a caption for an image by greedy (argmax) decoding.

	Starting from 'startseq', the text so far is encoded, padded and fed to the
	model together with the image; the most probable word is appended. Decoding
	stops after max_length words, when 'endseq' is produced, or when the
	predicted index has no word.

	Returns:
		The generated text, including 'startseq' and, if reached, 'endseq'.
	"""
	words = ['startseq']
	for _ in range(max_length):
		# Integer-encode and pad everything generated so far
		encoded = tokenizer.texts_to_sequences([' '.join(words)])[0]
		padded = pad_sequences([encoded], maxlen=max_length)
		# Probability distribution over the vocabulary for the next word
		probabilities = model.predict([image,padded], verbose=0)
		# Most probable index, mapped back to its word
		next_word = int_to_word(np.argmax(probabilities), tokenizer)
		if next_word is None:
			# Index without a word: nothing sensible to append
			break
		words.append(next_word)
		if next_word == 'endseq':
			break
	return ' '.join(words)

def generate_caption_beam_search(model, tokenizer, image, max_length, beam_index=3):
	"""Generate a caption for an image using beam search.

	Up to `beam_index` partial captions (beams) are kept. At every step each
	unfinished beam is extended with its `beam_index` most probable next
	words and the best `beam_index` candidates overall survive.

	Beams are scored by the SUM OF LOG-probabilities of their words (i.e. the
	log of the product of probabilities). The previous version summed raw
	probabilities, which rewards length rather than likelihood. Beams that have
	produced 'endseq' are carried over unchanged instead of being extended.

	Args:
		model: trained caption model taking [image, padded_sequence].
		tokenizer: fitted tokenizer (texts_to_sequences / word_index are used).
		image: image features as expected by the model.
		max_length: padded input length; also the maximum caption length.
		beam_index: beam width.

	Returns:
		The best caption as text, from 'startseq' up to and including 'endseq'.
	"""
	start_seq = tokenizer.texts_to_sequences(['startseq'])[0]
	end_index = tokenizer.word_index.get('endseq')
	# Each beam is [word_indices, cumulative_log_probability]
	beams = [[start_seq, 0.0]]
	for _ in range(max_length - len(start_seq)):
		candidates = []
		for seq, score in beams:
			# A finished caption competes as it is
			if seq and seq[-1] == end_index:
				candidates.append([seq, score])
				continue
			padded_seq = pad_sequences([seq], maxlen=max_length)
			preds = model.predict([image,padded_seq], verbose=0)[0]
			# Take top (i.e. which have highest probabilities) `beam_index` predictions
			for word in np.argsort(preds)[-beam_index:]:
				# The small epsilon keeps log() finite for zero probabilities
				word_score = float(np.log(preds[word] + 1e-12))
				candidates.append([seq + [int(word)], score + word_score])
		# Keep the `beam_index` best candidates, best last
		beams = sorted(candidates, key=lambda beam: beam[1])[-beam_index:]
		if all(seq and seq[-1] == end_index for seq, _score in beams):
			break
	best_seq = beams[-1][0]
	final_caption = []
	for index in best_seq:
		word = int_to_word(index, tokenizer)
		if word == 'endseq':
			break
		# Indices without a word (e.g. the padding index) are skipped; they
		# previously ended up as None and broke the final join
		if word is not None:
			final_caption.append(word)
	final_caption.append('endseq')
	return ' '.join(final_caption)

def evaluate_model(model, images, captions, tokenizer, max_length):
	"""Print corpus BLEU-1..4 of the model using greedy (argmax) captions.

	Args:
		model: trained caption model.
		images: dictionary {image_id: features}.
		captions: dictionary {image_id: [reference caption, ...]}.
		tokenizer: fitted tokenizer.
		max_length: padded input length used for generation.
	"""
	actual, predicted = list(), list()
	for image_id, caption_list in tqdm(captions.items()):
		yhat = generate_caption(model, tokenizer, images[image_id], max_length)
		# References and hypothesis are compared as lists of words
		ground_truth = [caption.split() for caption in caption_list]
		actual.append(ground_truth)
		predicted.append(yhat.split())
	print('BLEU Scores :')
	print('A perfect match results in a score of 1.0, whereas a perfect mismatch results in a score of 0.0.')
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	# Uniform weights over 1- to 3-grams must sum to 1 (was 0.3 each)
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(1.0/3, 1.0/3, 1.0/3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

def evaluate_model_beam_search(model, images, captions, tokenizer, max_length, beam_index=3):
	"""Print corpus BLEU-1..4 of the model using beam-search captions.

	Args:
		model: trained caption model.
		images: dictionary {image_id: features}.
		captions: dictionary {image_id: [reference caption, ...]}.
		tokenizer: fitted tokenizer.
		max_length: padded input length used for generation.
		beam_index: beam width passed to generate_caption_beam_search.
	"""
	actual, predicted = list(), list()
	for image_id, caption_list in tqdm(captions.items()):
		yhat = generate_caption_beam_search(model, tokenizer, images[image_id], max_length, beam_index=beam_index)
		# References and hypothesis are compared as lists of words
		ground_truth = [caption.split() for caption in caption_list]
		actual.append(ground_truth)
		predicted.append(yhat.split())
	print('BLEU Scores :')
	print('A perfect match results in a score of 1.0, whereas a perfect mismatch results in a score of 0.0.')
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	# Uniform weights over 1- to 3-grams must sum to 1 (was 0.3 each)
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(1.0/3, 1.0/3, 1.0/3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [None]:
from pickle import load
from tensorflow.keras.callbacks import ModelCheckpoint
import random
# Setting random seeds for reproducibility of results (Python and NumPy RNGs)
random.seed(config['random_seed'])
np.random.seed(config['random_seed'])

# Some simple checking of the configuration types before any expensive work starts
assert type(config['num_of_epochs']) is int, 'Please provide an integer value for `num_of_epochs` parameter in config.py file'
assert type(config['max_length']) is int, 'Please provide an integer value for `max_length` parameter in config.py file'
assert type(config['batch_size']) is int, 'Please provide an integer value for `batch_size` parameter in config.py file'
assert type(config['beam_search_k']) is int, 'Please provide an integer value for `beam_search_k` parameter in config.py file'
assert type(config['random_seed']) is int, 'Please provide an integer value for `random_seed` parameter in config.py file'
assert type(rnnConfig['embedding_size']) is int, 'Please provide an integer value for `embedding_size` parameter in config.py file'
assert type(rnnConfig['LSTM_units']) is int, 'Please provide an integer value for `LSTM_units` parameter in config.py file'
assert type(rnnConfig['dense_units']) is int, 'Please provide an integer value for `dense_units` parameter in config.py file'
assert type(rnnConfig['dropout']) is float, 'Please provide a float value for `dropout` parameter in config.py file'

# Load Data
#   X1 : Image features, keyed by image id
#   X2 : Text features (captions), keyed by image id
# max_length is computed from the training captions (config['max_length'] is not used here)
X1train, X2train, max_length = loadTrainData(config)

X1val, X2val = loadValData(config)

# Load the tokenizer; `with` closes the file handle once it has been read
with open(config['tokenizer_path'], 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)
vocab_size = len(tokenizer.word_index) + 1

# Now that we have the image features from the CNN model, feed them to an RNN model.
# AlternativeRNNModel takes the same arguments and can be swapped in here.
model = RNNModel(vocab_size, max_length, rnnConfig, config['model_type'])
print('RNN Model (Decoder) Summary : ')
# summary() prints by itself and returns None, so it is not wrapped in print()
model.summary()

# Train the model, saving a checkpoint after every epoch that improves val_loss
num_of_epochs = config['num_of_epochs']
batch_size = config['batch_size']
# One generator step consumes `batch_size` images, so steps = ceil(images / batch_size)
steps_train = (len(X2train) + batch_size - 1)//batch_size
steps_val = (len(X2val) + batch_size - 1)//batch_size
model_save_path = config['model_data_path']+"model_"+str(config['model_type'])+"_epoch-{epoch:02d}_train_loss-{loss:.4f}_val_loss-{val_loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(model_save_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks = [checkpoint]

print('steps_train: {}, steps_val: {}'.format(steps_train,steps_val))
print('Batch Size: {}'.format(batch_size))
print('Total Number of Epochs = {}'.format(num_of_epochs))

# Shuffle train data: rebuild the dictionary in a random key order,
# which is the order the generator visits the images in
ids_train = list(X2train.keys())
random.shuffle(ids_train)
X2train_shuffled = {_id: X2train[_id] for _id in ids_train}
X2train = X2train_shuffled

# Create the train data generator
# yields [img_features, text_features] and out_word
generator_train = data_generator(X1train, X2train, tokenizer, max_length, batch_size, config['random_seed'])
# Create the validation data generator
# yields [img_features, text_features] and out_word
generator_val = data_generator(X1val, X2val, tokenizer, max_length, batch_size, config['random_seed'])

# Fit for `num_of_epochs` epochs
# NOTE(review): fit_generator is the standalone-Keras / TF1 API; newer tf.keras uses model.fit — confirm against the installed version
model.fit_generator(generator_train,epochs=num_of_epochs,steps_per_epoch=steps_train,validation_data=generator_val,validation_steps=steps_val,
                    callbacks=callbacks,verbose=1)

# Evaluate the model on validation data and output BLEU scores,
# first with beam search and then with greedy (argmax) decoding
print('Model trained successfully. Running model on validation set for calculating BLEU score using BEAM search with k={}'.format(config['beam_search_k']))
evaluate_model_beam_search(model, X1val, X2val, tokenizer, max_length, beam_index=config['beam_search_k'])
evaluate_model(model, X1val, X2val, tokenizer, max_length)

6:7:5: Using Inception_V3 model
6:7:5: Image features already generated at /content/drive/My Drive/projects/captions/data_temp/features_inception_v3.pkl
6:7:5: Parsed caption file already generated at /content/drive/My Drive/projects/captions/data_temp/captions.txt
6:7:5: Available images for training: 6000
6:7:5: Available captions for training: 30000
6:7:5: Available images for validation: 1000
6:7:5: Available captions for validation: 5000
RNN Model (Decoder) Summary : 
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 2048)         0                                            
___________________________

  0%|          | 0/1000 [00:00<?, ?it/s]

Model trained successfully. Running model on validation set for calculating BLEU score using BEAM search with k=3


100%|██████████| 1000/1000 [15:34<00:00,  1.07it/s]


BLEU Scores :
A perfect match results in a score of 1.0, whereas a perfect mismatch results in a score of 0.0.
BLEU-1: 0.581191
BLEU-2: 0.329493
BLEU-3: 0.226465


  0%|          | 2/1000 [00:00<01:32, 10.76it/s]

BLEU-4: 0.112489


100%|██████████| 1000/1000 [01:33<00:00, 10.69it/s]


BLEU Scores :
A perfect match results in a score of 1.0, whereas a perfect mismatch results in a score of 0.0.
BLEU-1: 0.569917
BLEU-2: 0.317177
BLEU-3: 0.208284
BLEU-4: 0.096259


In [None]:
pip install keras==2.2.4



In [None]:
pip install tensorflow==1.13.1

Collecting tensorflow==1.13.1
[?25l  Downloading https://files.pythonhosted.org/packages/77/63/a9fa76de8dffe7455304c4ed635be4aa9c0bacef6e0633d87d5f54530c5c/tensorflow-1.13.1-cp36-cp36m-manylinux1_x86_64.whl (92.5MB)
[K     |████████████████████████████████| 92.5MB 49kB/s 
Collecting tensorboard<1.14.0,>=1.13.0
[?25l  Downloading https://files.pythonhosted.org/packages/0f/39/bdd75b08a6fba41f098b6cb091b9e8c7a80e1b4d679a581a0ccd17b10373/tensorboard-1.13.1-py3-none-any.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 52.3MB/s 
Collecting tensorflow-estimator<1.14.0rc0,>=1.13.0
[?25l  Downloading https://files.pythonhosted.org/packages/bb/48/13f49fc3fa0fdf916aa1419013bb8f2ad09674c275b4046d5ee669a46873/tensorflow_estimator-1.13.0-py2.py3-none-any.whl (367kB)
[K     |████████████████████████████████| 368kB 50.1MB/s 
Collecting mock>=2.0.0
  Downloading https://files.pythonhosted.org/packages/cd/74/d72daf8dff5b6566db857cfd088907bb0355f5dd2914c4b3ef065c790735/mock-4.0.2-py3-non

In [None]:
pip uninstall tensorflow

Uninstalling tensorflow-2.3.0:
  Would remove:
    /usr/local/bin/estimator_ckpt_converter
    /usr/local/bin/saved_model_cli
    /usr/local/bin/tensorboard
    /usr/local/bin/tf_upgrade_v2
    /usr/local/bin/tflite_convert
    /usr/local/bin/toco
    /usr/local/bin/toco_from_protos
    /usr/local/lib/python3.6/dist-packages/tensorflow-2.3.0.dist-info/*
    /usr/local/lib/python3.6/dist-packages/tensorflow/*
Proceed (y/n)? y
y
  Successfully uninstalled tensorflow-2.3.0
