Step 1:- Import the required libraries

In [1]:
import numpy as np
import pandas as pd
from numpy import array
from pickle import load

from PIL import Image
import pickle
from collections import Counter
import matplotlib.pyplot as plt

import sys, time, os, warnings
warnings.filterwarnings("ignore")
import re

import keras
import tensorflow as tf
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu

from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense, BatchNormalization
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from tensorflow.keras.layers import add
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from pathlib import Path
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

Step 2:- Data loading and Preprocessing

In [3]:

# Directory that is listed below to count the dataset images.
# NOTE(review): the "Flicker8k_Dataset" spelling presumably matches the on-disk folder name — confirm.
image_path = "../Flickr8k_Dataset/Flicker8k_Dataset"
# Text file holding the captions; it is read and parsed in the next code cell.
dir_Flickr_text = "../Flickr8k_text/Flickr8k.token.txt"
# Destination folder for the per-image .npy feature files written during feature extraction.
features_path = '../Flickr8k_Dataset/features'

# Every entry in the image directory is counted, not only *.jpg files.
jpgs = os.listdir(image_path)

print("Total Images in Dataset = {}".format(len(jpgs)))

Total Images in Dataset = 8091


We create a dataframe to store the image id and captions for ease of use.

In [4]:
# Read the whole caption file. The context manager guarantees the handle is
# closed even if read() raises, which the manual open()/close() pair did not.
with open(dir_Flickr_text, 'r') as file:
    text = file.read()

# Each usable line is split on a tab into "<filename>#<index>" and the caption.
# Lines without a tab (e.g. the trailing empty line) are skipped.
datatxt = []
for line in text.split('\n'):
    col = line.split('\t')
    if len(col) == 1:
        continue
    # Separate the image filename from the per-image caption index.
    w = col[0].split("#")
    # Captions are lower-cased here so later text processing is case-insensitive.
    datatxt.append(w + [col[1].lower()])

# Build the frame from the parsed rows, then put the caption index first.
data = pd.DataFrame(datatxt, columns=["filename", "index", "caption"])
data = data.reindex(columns=['index', 'filename', 'caption'])
# Drop rows for this filename; presumably it is a malformed entry in the
# token file (note the ".1" suffix) — TODO confirm.
data = data[data.filename != '2258277193_586949ec62.jpg.1']
# Sorted array of the distinct image filenames that have captions.
uni_filenames = np.unique(data.filename.values)

data.head()

Unnamed: 0,index,filename,caption
0,0,1000268201_693b08cb0e.jpg,a child in a pink dress is climbing up a set o...
1,1,1000268201_693b08cb0e.jpg,a girl going into a wooden building .
2,2,1000268201_693b08cb0e.jpg,a little girl climbing into a wooden playhouse .
3,3,1000268201_693b08cb0e.jpg,a little girl climbing the stairs to her playh...
4,4,1000268201_693b08cb0e.jpg,a little girl in a pink dress going into a woo...


Build the full path to the image file for every caption row.

In [6]:
# One full image path per caption row, so an image with several captions
# appears several times in this list.
all_img_name_vector = [image_path + '/' + annot for annot in data["filename"]]

all_img_name_vector[:10]

['../Flickr8k_Dataset/Flicker8k_Dataset/1000268201_693b08cb0e.jpg',
 '../Flickr8k_Dataset/Flicker8k_Dataset/1000268201_693b08cb0e.jpg',
 '../Flickr8k_Dataset/Flicker8k_Dataset/1000268201_693b08cb0e.jpg',
 '../Flickr8k_Dataset/Flicker8k_Dataset/1000268201_693b08cb0e.jpg',
 '../Flickr8k_Dataset/Flicker8k_Dataset/1000268201_693b08cb0e.jpg',
 '../Flickr8k_Dataset/Flicker8k_Dataset/1001773457_577c3a7d70.jpg',
 '../Flickr8k_Dataset/Flicker8k_Dataset/1001773457_577c3a7d70.jpg',
 '../Flickr8k_Dataset/Flicker8k_Dataset/1001773457_577c3a7d70.jpg',
 '../Flickr8k_Dataset/Flicker8k_Dataset/1001773457_577c3a7d70.jpg',
 '../Flickr8k_Dataset/Flicker8k_Dataset/1001773457_577c3a7d70.jpg']

There are 40,455 image paths in total, one for each caption.

In [8]:
# Sanity check: report how many image paths (one per caption row) were collected.
print(f"len(all_img_name_vector) : {len(all_img_name_vector)}")

len(all_img_name_vector) : 40455


Step 3:- Model Definition

In [9]:
def load_image(image_path):
    """Load one JPEG from disk and prepare it as VGG16 input.

    The file is read, decoded as a 3-channel JPEG, resized to 224x224 and
    passed through the VGG16 ``preprocess_input`` function.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A ``(image, image_path)`` tuple: the preprocessed image tensor and the
        path it was loaded from, returned unchanged so callers can keep
        track of which file each image came from.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, (224, 224))
    return preprocess_input(resized), image_path

# VGG16 with ImageNet weights and without the fully connected classifier head.
image_model = tf.keras.applications.VGG16(weights='imagenet', include_top=False)

# Feature extractor: maps the network input to the output of its last layer.
new_input = image_model.input
hidden_layer = image_model.layers[-1].output
image_features_extract_model = tf.keras.Model(inputs=new_input, outputs=hidden_layer)

image_features_extract_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None, None, 3)]   0         
                                                                 
 block1_conv1 (Conv2D)       (None, None, None, 64)    1792      
                                                                 
 block1_conv2 (Conv2D)       (None, None, None, 64)    36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, None, None, 64)    0         
                                                                 
 block2_conv1 (Conv2D)       (None, None, None, 128)   73856     
                                                                 
 block2_conv2 (Conv2D)       (None, None, None, 128)   147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, None, None, 128)   0     

Map each unique image path through `load_image` to build a batched dataset of preprocessed images:

In [11]:
# De-duplicate the per-caption paths so every image is processed only once;
# sorting makes the processing order deterministic.
encode_train = sorted(set(all_img_name_vector))

# Paths -> (preprocessed image, path) pairs, loaded in parallel and grouped
# into batches of 64.
image_dataset = (
    tf.data.Dataset.from_tensor_slices(encode_train)
    .map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(64)
)

Extract the features and store them in the respective .npy files, then pass those features through the encoder. NPY files store all the information required to reconstruct an array on any computer, including dtype and shape information.

In [None]:
%%time
for img, path in tqdm(image_dataset):
 batch_features = image_features_extract_model(img)
 batch_features = tf.reshape(batch_features,(batch_features.shape[0], -1, batch_features.shape[3]))

 for bf, p in zip(batch_features, path):
   path_of_feature = Path(p.numpy().decode("utf-8"))
   image_name = path_of_feature.stem
   np.save(f'{features_path}/{image_name}', bf.numpy())

 98%|████████████████████████████████████████████████████████████████████████████▊ | 125/127 [1:54:44<01:39, 49.87s/it]