<a href="https://colab.research.google.com/github/mjain125/AutomaticImageCaptioning/blob/main/automatic_image_captioning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; every dataset path used in this notebook lives under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Force a fresh mount of Drive at the same mount point.
# NOTE(review): only needed when the mount from the previous cell is stale.
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
import numpy as np
from numpy import array
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import string
import os
from PIL import Image
import glob
import pickle
from pickle import dump, load
from time import time
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector,\
                         Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras.layers.wrappers import Bidirectional
from keras.layers.merge import add
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model
from keras import Input, layers
from keras import optimizers
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# loading doc into memory
def load_doc(filename):
    """Read a whole text file and return its contents as a single string.

    Args:
        filename: Path of the file to read; opened read-only in text mode.

    Returns:
        The complete contents of the file as a str.
    """
    # The context manager closes the handle even when read() raises
    with open(filename, 'r') as file:
        return file.read()

# Caption file of the Flickr8k dataset: one "<image file>#<n><TAB><caption>" entry per line
filename = "/content/drive/MyDrive/flicker8k/Flickr_Data/Flickr_TextData/Flickr8k.token.txt"
# loading descriptions
doc = load_doc(filename)
# Preview the first 300 characters to check the file format
print(doc[:300])

1000268201_693b08cb0e.jpg#0	A child in a pink dress is climbing up a set of stairs in an entry way .
1000268201_693b08cb0e.jpg#1	A girl going into a wooden building .
1000268201_693b08cb0e.jpg#2	A little girl climbing into a wooden playhouse .
1000268201_693b08cb0e.jpg#3	A little girl climbing the s


In [None]:
def load_descriptions(doc):
    """Parse the raw caption document into a mapping of image id -> captions.

    Every line is split on whitespace. The first token is the image
    reference; the part before its first '.' becomes the image id. The
    remaining tokens are joined back with single spaces to form the caption.

    Args:
        doc: Full text of the caption file.

    Returns:
        dict mapping each image id (str) to the list of its caption strings,
        in the order they appear in ``doc``.
    """
    mapping = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        # Test the tokens, not the raw line: a whitespace-only line has
        # length >= 2 but no tokens, and would otherwise fail on tokens[0].
        # Lines that carry an id but no caption text are skipped as well.
        if len(tokens) < 2:
            continue
        # extracting the image id from the file reference
        image_id = tokens[0].split('.')[0]
        # converting description tokens back to string
        image_desc = ' '.join(tokens[1:])
        mapping.setdefault(image_id, []).append(image_desc)
    return mapping

# parsing descriptions into {image id: [captions]} and reporting the number of distinct images
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))

Loaded: 8092 


In [None]:
# Peek at a few keys: image ids are the file names without the '.jpg' extension
list(descriptions.keys())[10:15]


['101654506_8eb26cfb60',
 '101669240_b2d3e7f17b',
 '1016887272_03199f49c4',
 '1019077836_6fc9b15408',
 '1019604187_d087bf9a5f']

In [None]:
# Captions of one image as loaded, before any cleaning
descriptions['101654506_8eb26cfb60']

['A brown and white dog is running through the snow .',
 'A dog is running in the snow',
 'A dog running through snow .',
 'a white and brown dog is running through a snow covered field .',
 'The white and brown dog is running over the surface of the snow .']

In [None]:
def clean_descriptions(descriptions):
    """Normalise every caption of ``descriptions`` in place.

    Each caption is split on whitespace, lower-cased and stripped of all
    ``string.punctuation`` characters. Tokens that end up one character or
    shorter, or that contain anything other than letters, are dropped. The
    surviving tokens are joined with single spaces and written back into the
    same list slot.

    Args:
        descriptions: dict mapping image id -> list of caption strings;
            the lists are modified in place.

    Returns:
        None.
    """
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)
    for captions in descriptions.values():
        for idx, caption in enumerate(captions):
            # lower-case each token, then remove its punctuation
            words = (tok.lower().translate(strip_punct) for tok in caption.split())
            # drop hanging one-letter words (e.g. 'a') and tokens with non-letters
            kept = [w for w in words if len(w) > 1 and w.isalpha()]
            captions[idx] = ' '.join(kept)

# cleaning descriptions (modifies the caption lists inside `descriptions` in place)
clean_descriptions(descriptions)

In [None]:
# Same image after cleaning: lower-cased, punctuation and one-letter words removed
descriptions['101654506_8eb26cfb60']

['brown and white dog is running through the snow',
 'dog is running in the snow',
 'dog running through snow',
 'white and brown dog is running through snow covered field',
 'the white and brown dog is running over the surface of the snow']

In [None]:
# converting the loaded descriptions into a vocabulary of words
def to_vocabulary(descriptions):
    """Collect the set of distinct words used across all captions.

    Args:
        descriptions: dict mapping image id -> list of caption strings.

    Returns:
        set of every whitespace-separated token found in any caption.
    """
    all_desc = set()
    # plain loops instead of a list comprehension used only for its side effect
    for desc_list in descriptions.values():
        for desc in desc_list:
            all_desc.update(desc.split())
    return all_desc

# summarizing vocabulary: number of distinct words over all cleaned captions
vocabulary = to_vocabulary(descriptions)
print('Original Vocabulary len: %d' % len(vocabulary))

Original Vocabulary len: 8763


In [None]:
# saving descriptions to file, one per line
def save_descriptions(descriptions, filename):
    """Write every caption to ``filename`` as "<image id> <caption>", one per line.

    Lines are separated by '\\n' with no trailing newline after the last one.

    Args:
        descriptions: dict mapping image id -> list of caption strings.
        filename: Path of the text file to create or overwrite.

    Returns:
        None.
    """
    lines = [key + ' ' + desc
             for key, desc_list in descriptions.items()
             for desc in desc_list]
    # The context manager flushes and closes the file even if write() raises
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))

# Persist the cleaned captions; this file is read back when the training captions are loaded
save_descriptions(descriptions, 'descriptions.txt')

In [None]:
# loading a pre-defined list of photo identifiers
def load_set(filename):
    """Read a split file and return the set of image identifiers it lists.

    Args:
        filename: Path of a text file with one image file name per line.

    Returns:
        set of identifiers, each being the part of a non-empty line that
        precedes its first '.'.
    """
    text = load_doc(filename)
    # empty lines are skipped; duplicates collapse because the result is a set
    return {row.split('.')[0] for row in text.split('\n') if len(row) >= 1}

# loading training dataset (6K): the set of image ids listed in the training split file
filename = '/content/drive/MyDrive/flicker8k/Flickr_Data/Flickr_TextData/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))

Dataset: 6000


In [None]:
# Below path contains all the images (the trailing slash matters: it is used as a prefix below)
images = '/content/drive/MyDrive/flicker8k/Flickr_Data/Images/'
# Creating a list of the full paths of all .jpg files in the directory
img = glob.glob(images + '*?.jpg')

In [None]:
# Show only the first few paths; displaying the whole list floods the cell output
img[:5]

In [None]:
# Below file contains the names of the images to be used as training data
train_images_file = '/content/drive/MyDrive/flicker8k/Flickr_Data/Flickr_TextData/Flickr_8k.trainImages.txt'
# Reading the train image names into a set; the context manager closes the file
with open(train_images_file, 'r') as names_file:
    train_images = set(names_file.read().strip().split('\n'))

# Full paths of all training images: `img` holds the full path of every image,
# and stripping the directory prefix leaves the bare file name that is looked
# up in the training split
train_img = [path for path in img if path[len(images):] in train_images]

In [None]:
# Show only the first few training paths; displaying the whole list floods the cell output
train_img[:5]

In [None]:
# Below file contains the names of the images to be used as test data
test_images_file = '/content/drive/MyDrive/flicker8k/Flickr_Data/Flickr_TextData/Flickr_8k.testImages.txt'
# Reading the test image names into a set; the context manager closes the file
with open(test_images_file, 'r') as names_file:
    test_images = set(names_file.read().strip().split('\n'))

# Full paths of all test images: `img` holds the full path of every image,
# and stripping the directory prefix leaves the bare file name that is looked
# up in the test split
test_img = [path for path in img if path[len(images):] in test_images]

In [None]:
# Show only the first few test paths; displaying the whole list floods the cell output
test_img[:5]

In [None]:
# loading clean descriptions into memory
def load_clean_descriptions(filename, dataset):
    """Load the captions of the images in ``dataset`` and wrap them in sequence markers.

    Each line of the file is split on whitespace: the first token is the
    image id, the rest is the caption. Captions of ids contained in
    ``dataset`` are stored as 'startseq <caption> endseq'.

    Args:
        filename: Path of a file with one "<image id> <caption>" entry per line.
        dataset: Container of image ids to keep (membership is tested with ``in``).

    Returns:
        dict mapping image id -> list of wrapped caption strings.
    """
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        # splitting line by white space
        tokens = line.split()
        # A blank line (e.g. a trailing newline) has no id token; skip it
        # instead of failing on tokens[0]
        if not tokens:
            continue
        # splitting id from description
        image_id, image_desc = tokens[0], tokens[1:]
        # skipping images not in the set
        if image_id in dataset:
            # wrapping description in start/end tokens
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            descriptions.setdefault(image_id, []).append(desc)
    return descriptions

# descriptions of the training images only, each wrapped in 'startseq' ... 'endseq'
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))

Descriptions: train=6000


In [None]:
def preprocess(image_path):
    """Load an image file and turn it into an input batch for InceptionV3.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The result of ``preprocess_input`` applied to the image array with a
        leading batch axis added.
    """
    # Resize to 299x299, the input size expected by the inception v3 model
    pil_img = image.load_img(image_path, target_size=(299, 299))
    # PIL image -> array, then add the batch dimension in front
    batch = np.expand_dims(image.img_to_array(pil_img), axis=0)
    # Apply the preprocessing from the inception module
    return preprocess_input(batch)

In [None]:
# Loading the inception v3 model with ImageNet weights
# NOTE(review): the name `model` is reassigned to the captioning network further down,
# so this cell must be re-run before the feature-extractor cell is executed again.
model = InceptionV3(weights='imagenet')

In [None]:
# Creating a new model, by removing the last layer (output layer) from the inception v3:
# same input, but the output is taken from the second-to-last layer.
# NOTE(review): the saved output of this cell is a NameError, i.e. it was last executed
# before `model` existed — run the InceptionV3 loading cell first.
model_new = Model(model.input, model.layers[-2].output)

NameError: ignored

In [None]:
# Function to encode a given image into a vector of size (2048, )
def encode(image):
    """Compute the feature vector of one image with ``model_new``.

    Args:
        image: Path of the image file (passed straight to ``preprocess``).
            Inside this function the name hides the imported ``image`` module.

    Returns:
        1-D array: the model prediction with its leading batch axis removed.
    """
    batch = preprocess(image)  # preprocessing the image
    features = model_new.predict(batch)  # encoding vector, one row per image in the batch
    # reshaping from (1, 2048) to (2048, )
    return np.reshape(features, features.shape[1])

In [None]:
# Calling the function to encode all the train images
start = time()
encoding_train = {}  # bare image file name -> feature vector
# `img` is the global list of all image paths, so the loop variable must not
# reuse that name: doing so would replace the list with the last path string.
for img_path in train_img:
    encoding_train[img_path[len(images):]] = encode(img_path)
print("Time taken in seconds =", time()-start)

Time taken in seconds = 2788.4534409046173


In [None]:
# Saving the bottleneck train features to disk to save the time of encoding.
# The file name must be the one the notebook loads the training features from
# ("encoded_train_images_mridul.pkl"), otherwise a fresh run reads a stale or missing file.
with open("/content/drive/MyDrive/flicker8k/Flickr_Data/Pickle_Mridul/encoded_train_images_mridul.pkl", "wb") as encoded_pickle:
    pickle.dump(encoding_train, encoded_pickle)

In [None]:
# Calling the function to encode all the test images
start = time()
encoding_test = {}  # bare image file name -> feature vector
# `img` is the global list of all image paths, so the loop variable must not
# reuse that name: doing so would replace the list with the last path string.
for img_path in test_img:
    encoding_test[img_path[len(images):]] = encode(img_path)
print("Time taken in seconds =", time()-start)

Time taken in seconds = 571.1966826915741


In [None]:
# Saving the bottleneck test features to disk to save the time of encoding
# (a pickled dict keyed by bare image file name)
with open("/content/drive/MyDrive/flicker8k/Flickr_Data/Pickle_Mridul/encoded_test_images_mridul.pkl", "wb") as encoded_pickle:
    pickle.dump(encoding_test, encoded_pickle)

In [22]:
# Loading the cached training features; the context manager closes the file handle.
# Only unpickle files written by this notebook: pickle can execute code from untrusted data.
with open("/content/drive/MyDrive/flicker8k/Flickr_Data/Pickle_Mridul/encoded_train_images_mridul.pkl", "rb") as encoded_pickle:
    train_features = load(encoded_pickle)
print('Photos: train=%d' % len(train_features))

Photos: train=6000


In [23]:
# Creating a flat list of all the training captions (every caption of every training image)
all_train_captions = [cap for caps in train_descriptions.values() for cap in caps]
len(all_train_captions)

30000

In [24]:
# Considering only words which occur at least 10 times in the corpus
word_count_threshold = 10
word_counts = {}  # word -> number of occurrences over all training captions
nsents = 0  # number of captions processed
for sent in all_train_captions:
    nsents += 1
    # captions are split on single spaces; 'startseq' and 'endseq' are counted like any other word
    for w in sent.split(' '):
        word_counts[w] = word_counts.get(w, 0) + 1

# Keep the words that reach the threshold, in first-seen order
vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
print('preprocessed words %d -> %d' % (len(word_counts), len(vocab)))

preprocessed words 7578 -> 1651


In [25]:
# Two lookup tables between words and integer indices
ixtoword = {}  # index -> word
wordtoix = {}  # word -> index

# Indices start at 1, leaving 0 unused by any word
ix = 1
for w in vocab:
    wordtoix[w] = ix
    ixtoword[ix] = w
    ix += 1

In [26]:
# Number of distinct indices: every kept word plus the extra index 0
vocab_size = len(ixtoword) + 1 # one for appended 0's
vocab_size

1652

In [27]:
# converting a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    """Flatten the caption lists of all images into one list.

    Args:
        descriptions: dict mapping image id -> list of caption strings.

    Returns:
        A new list with every caption, in dictionary order.
    """
    all_desc = list()
    # extend() instead of a list comprehension used only for its side effect
    for desc_list in descriptions.values():
        all_desc.extend(desc_list)
    return all_desc

# calculating the length of the description with the most words
def max_description_length(descriptions):
    """Return the number of whitespace-separated words in the longest caption.

    Args:
        descriptions: dict mapping image id -> list of caption strings;
            must contain at least one caption.

    Returns:
        int word count of the longest caption.
    """
    lines = to_lines(descriptions)
    return max(len(d.split()) for d in lines)

# determining the maximum sequence length
# The helper carries its own name so that assigning the result to `max_length`
# does not overwrite the function with an integer.
max_length = max_description_length(train_descriptions)
print('Description Length: %d' % max_length)

Description Length: 34


In [33]:
# data generator, intended to be used in a call to model.fit_generator()
def data_generator(descriptions, photos, wordtoix, max_length, num_photos_per_batch):
    """Endlessly yield training batches built from ``num_photos_per_batch`` images each.

    For every caption, each proper prefix of its encoded word sequence becomes
    one sample: the photo feature and the padded prefix are the inputs, and
    the word that follows the prefix, one-hot encoded, is the target.

    Args:
        descriptions: dict mapping image id -> list of caption strings.
        photos: dict mapping '<image id>.jpg' -> feature vector.
        wordtoix: dict mapping word -> integer index; caption words missing
            from it are dropped.
        max_length: length every input sequence is padded to.
        num_photos_per_batch: number of images whose samples form one batch.

    Yields:
        ``[[X1, X2], y]`` where X1 stacks the photo features, X2 the padded
        input sequences and y the one-hot targets.

    Notes:
        The one-hot width is read from the module-level ``vocab_size``.
        Images left over at the end of a pass (fewer than
        ``num_photos_per_batch``) are carried into the next pass.
    """
    X1, X2, y = list(), list(), list()
    n = 0
    # loop forever over images
    while True:
        for key, desc_list in descriptions.items():
            n += 1
            # retrieving the photo feature
            photo = photos[key + '.jpg']
            for desc in desc_list:
                # encoding the sequence
                seq = [wordtoix[word] for word in desc.split(' ') if word in wordtoix]
                # splitting one sequence into multiple X, y pairs
                for i in range(1, len(seq)):
                    # splitting into input and output pair
                    in_seq, out_word = seq[:i], seq[i]
                    # padding input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # One-hot encoding the output word with numpy: `to_categorical`
                    # is not imported anywhere in this notebook, so calling it
                    # raised NameError on the first batch.
                    out_seq = np.zeros(vocab_size, dtype='float32')
                    out_seq[out_word] = 1.0
                    # storing
                    X1.append(photo)
                    X2.append(in_seq)
                    y.append(out_seq)
            # yielding the batch data
            if n == num_photos_per_batch:
                yield [[array(X1), array(X2)], array(y)]
                X1, X2, y = list(), list(), list()
                n = 0

In [34]:
# Loading GloVe vectors
embeddings_index = {}  # word -> float32 vector parsed from the file
# The context manager closes the file even if a malformed line makes parsing raise
with open("/content/drive/MyDrive/flicker8k/Flickr_Data/glove/glove.6B.200d.txt", encoding="utf-8") as f:
    for line in f:
        # each line: the word followed by its vector components
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [35]:
embedding_dim = 200

# One 200-dim row per vocabulary index; row i receives the GloVe vector of the
# word whose index is i. Row 0 belongs to no word and stays all zeros.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in wordtoix.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index keep their all-zero row
        embedding_matrix[i] = embedding_vector

In [36]:
# Sanity check: expected to be (vocab_size, embedding_dim)
embedding_matrix.shape

(1652, 200)

In [37]:
# Image branch: 2048-dim photo feature -> dropout -> 256-unit dense layer
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# Text branch: padded word-index sequence -> embedding (index 0 masked) -> dropout -> LSTM
inputs2 = Input(shape=(max_length,))
# NOTE(review): `embedding_matrix` is not handed to this layer here; presumably the
# weights are set in a later cell — confirm.
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)
# Decoder: element-wise sum of both 256-dim branches -> dense -> softmax over the vocabulary
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
# NOTE(review): this rebinds `model`, which earlier referred to the InceptionV3 network.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)

In [38]:
# Print the layers of the captioning model with their output shapes and parameter counts
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 34)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 2048)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 34, 200)      330400      input_3[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 2048)         0           input_2[0][0]                    
____________________________________________________________________________________________