<a href="https://colab.research.google.com/github/namita0210/ResearchOnImageCaptioning/blob/custom-dataset-ml-mastery/image_captioning_vgg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<b>Flickr8k_Dataset</b>: Contains 8092 photographs in JPEG format. <br>
<b>Flickr8k_text</b>: Contains a number of files containing different sources of descriptions for the photographs. <br>

The dataset has the following

<li>pre-defined training dataset (6,000 images)</li>
<li> development dataset (1,000 images)</li>
<li>test dataset (1,000 images).</li>

In [1]:
#Mount the drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#Imports
from os import listdir
from pickle import dump
import string
from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model
import pandas as pd
import matplotlib.pyplot as plt
import numpy as numpy
from pickle import load

<h3>Prepare photo data</h3>

We can load the VGG model in Keras using the VGG class. We will remove the last layer from the loaded model, as this is the model used to predict a classification for a photo. We are not interested in classifying images, but we are interested in the internal representation of the photo right before a classification is made. These are the “features” that the model has extracted from the photo.

In [None]:
# function to load each photo from directory , prepare it for VGG , return a dict of img identifier to img features

def extract_features(directory):

    model = VGG16()
    # re-structure the model - remove last layer (softmax) from vgg model - second to the last layer will give feature description
    # if i had done model.layers[-1].output - i would get the class scores not the features
    model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
    print(model.summary())

    features = dict() # extract features from each photo
    for name in listdir(directory):
        filename = directory + '/' + name # load an image from file
        image = load_img(filename, target_size=(224, 224))
        image = img_to_array(image) # convert the image pixels to a numpy array
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2])) # reshape data for the model - (batch size, height, width, channels)
        image = preprocess_input(image) # prepare the image for the VGG model
        feature = model.predict(image, verbose=0) # get features -> preprocessed img is passed through vgg16 model to extract features using predict function
        image_id = name.split('.')[0] #storing features in a dictionary and assigning unique identifiers to image
        features[image_id] = feature #features->dict
        print('>%s' % name)
    return features

# extract features from all images
directory = '/content/drive/MyDrive/Flickr/Flicker8k_Dataset'
features = extract_features(directory)
print('Extracted Features: %d' % len(features))
# save to file
dump(features, open('features.pkl', 'wb'))

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0   

In [None]:
import string

# Load doc into memory
def load_doc(filename):
    file = open(filename, 'r') # Open the file as read only
    text = file.read() # Read all text
    file.close() # Close the file
    return text

# Extract descriptions for images
def load_descriptions(doc):
    mapping = dict()
    for line in doc.split('\n'):
        tokens = line.split() # Split line by white space
        if len(line) < 2:
            continue
        image_id, image_desc = tokens[0], tokens[1:]  # Take the first token as the image id, the rest as the description
        image_id = image_id.split('.')[0] # Remove filename from image id
        image_desc = ' '.join(image_desc) # Convert description tokens back to string

        if image_id not in mapping: # Create the list if needed
            mapping[image_id] = list()

        mapping[image_id].append(image_desc) # Store description
    return mapping

def clean_descriptions(descriptions):
    table = str.maketrans('', '', string.punctuation) # Prepare translation table for removing punctuation
    for key, desc_list in descriptions.items():
        for i in range(len(desc_list)):
            desc = desc_list[i]
            desc = desc.split() # Tokenize
            desc = [word.lower() for word in desc] # Convert to lower case
            desc = [w.translate(table) for w in desc] # Remove punctuation from each token
            desc = [word for word in desc if len(word) > 1] # Remove hanging 's' and 'a'
            desc = [word for word in desc if word.isalpha()] # Remove tokens with numbers in them
            desc_list[i] = ' '.join(desc) # Store as string

# Convert the loaded descriptions into a vocabulary of words
def to_vocabulary(descriptions):
    all_desc = set() # Build a list of all description strings
    for key in descriptions.keys():
        [all_desc.update(d.split()) for d in descriptions[key]]
    return all_desc

# save descriptions to file, one per line
def save_descriptions(descriptions, filename):
  lines = list()
  for key, desc_list in descriptions.items():
    for desc in desc_list:
      lines.append(key + ' ' + desc)
      data = '\n'.join(lines)
      file = open(filename, 'w')
      file.write(data)
      file.close()

filename = '/content/drive/MyDrive/Flickr/Flickr8k_text/Flickr8k.token.txt'

doc = load_doc(filename) # load descriptions

descriptions = load_descriptions(doc) # parse descriptions - dictionary with img names and 1 description corresponding

print('Loaded: %d ' % len(descriptions))

clean_descriptions(descriptions) # clean descriptions

vocabulary = to_vocabulary(descriptions) # summarize vocabulary
print('Vocabulary Size: %d' % len(vocabulary))

save_descriptions(descriptions, 'descriptions.txt') # save to file

Loaded: 8092 
Vocabulary Size: 8763


Develop Deep Learning Model

In [3]:
#load_set() below will load a pre-defined set of identifiers given the train or development sets filename.
def load_doc(filename):
  file = open(filename,'r')
  text = file.read()
  file.close()
  return text

#load pre defined list of image identifiers
def load_set(filename):
  doc = load_doc(filename)
  dataset=list()
  for line in doc.split('\n'):
    if len(line)<1:
      continue
    identifier = line.split('.')[0]
    dataset.append(identifier)
  return set(dataset)

In [4]:
#load_clean_descriptions() that loads the cleaned text descriptions from ‘descriptions.txt‘ for a given set of identifiers and returns a dictionary of identifiers to lists of text descriptions.
def load_clean_descriptions(filename,dataset):
  doc = load_doc(filename)
  descriptions=dict()
  for line in doc.split('\n'):
    tokens = line.split()
    image_id,image_desc = tokens[0],tokens[1:]
    if image_id  not in dataset:
      if image_id not in descriptions:
        descriptions[image_id]=list()
        desc = 'startseq ' + ' '.join(image_desc)+' endseq '
        descriptions[image_id].append(desc)
  return descriptions

In [5]:
#load_photo_features() that loads the entire set of photo descriptions, then returns the subset of interest for a given set of photo identifiers
def load_photo_features(filename,dataset):
  all_features=load(open(filename,'rb'))
  features={k: all_features[k] for k in dataset if k in all_features}
  return features

In [7]:
# load training dataset (6K)
filename = '/content/drive/MyDrive/Flickr/Flickr8k_text/Flickr_8k.trainImages.txt'

train = load_set(filename)
print('Dataset: %d' % len(train))

# descriptions
train_descriptions = load_clean_descriptions('/content/drive/MyDrive/Flickr/descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))

# photo features
train_features = load_photo_features('/content/drive/MyDrive/Flickr/features.pkl', train)
print('Photos: train=%d' % len(train_features))

Dataset: 6000
Descriptions: train=2092
Photos: train=1458


In [8]:
#Encode the description.txt
def to_lines(descriptions):
  all_desc=list()
  for key in descriptions.keys():
    [all_desc.append(d) for d in descriptions[key]]
  return all_desc

  # fit a tokenizer
def create_tokenizer(descriptions):
  lines = to_lines(descriptions)
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  return tokenizer

  #prepare tokenizer
  tokenizer = create_tokenizer(train_descriptions)
  vocab_size = len(tokenizer.word_index) + 1
  print('Vocabulary Size: %d' % vocab_size)

In [9]:
#create sequences of images, input_sequences and output words for an image
def create_sequences(tokenizer, max_length, descriptions, photos, vocab_size):
  X1,X2,y =  list(),list(),list()
  for key,desc_list in descriptions.items():
    for desc in desc_list:
      seq = tokenizer.texts_to_sequences([desc])[0]
      for i in range(1,len(seq)):
        in_seq,out_seq = seq[:i],seq[i]
        in_seq = pad_sequences([in_seq],maxlen=max_length)[0]
        out_seq = to_categorical([out_seq],num_classes=vocab_size)[0]
        X1.append(photos[key][0])
        X2.append(in_seq)
        y.append(out_seq)

  return array(X1),array(X2),array(y)

  # calculate the length of the description with the most words
def max_length(descriptions):
  lines = to_lines(descriptions)
  return max(len(d.split()) for d in lines)