# Imports

In [1]:
import numpy as np
import pandas as pd
import string
import matplotlib.pyplot as plt
import os
from os import listdir
from pickle import dump, load

import PIL.Image
from IPython.display import Image,display

from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input , Dense , LSTM , Embedding , Dropout
from tensorflow.keras.layers import add
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model

import nltk
import re
from nltk.stem import WordNetLemmatizer

# Data Preprocessing

In [2]:
# Loading a text file

def load_text(fname):
    """Read a whole text file into a single string.

    Parameters
    ----------
    fname : str
        Path of the file to read (opened in text mode, default encoding).

    Returns
    -------
    str
        The complete contents of the file.
    """
    # The context manager guarantees the handle is closed, even when
    # reading raises.
    with open(fname, 'r') as f:
        return f.read()

# Using the previous function, we will create a function to get images&captions

def images_with_captions(fname):
    """Map every image identifier in a captions file to its list of captions.

    Each non-blank line of the file is expected to look like
    ``<image field>\\t<caption>``. The last two characters of the image
    field are dropped before it is used as the key (presumably the ``#<n>``
    caption index used by Flickr8k.token.txt -- confirm against the data).

    Parameters
    ----------
    fname : str
        Path of the tab-separated captions file.

    Returns
    -------
    dict[str, list[str]]
        Image key -> captions, in file order.

    Raises
    ------
    ValueError
        If a non-blank line contains no tab separator.
    """
    img_cap = {}
    for line in load_text(fname).split('\n'):
        # Skip blank lines instead of blindly discarding the last line: the
        # final caption is kept even when the file has no trailing newline.
        if not line.strip():
            continue
        # Split on the first tab only so a tab inside the caption text does
        # not break the two-way unpacking.
        image, caption = line.split('\t', 1)
        img_cap.setdefault(image[:-2], []).append(caption)
    return img_cap

In [3]:
# Data cleaning: Cleaning captions

def clean_data(image_caption):
    """Normalise every caption in ``image_caption`` in place.

    For each caption: hyphens become spaces, punctuation is removed, words
    are lower-cased, and words that are a single character long or contain
    non-alphabetic characters are dropped.

    Parameters
    ----------
    image_caption : dict[str, list[str]]
        Image key -> list of captions. The lists are modified in place.

    Returns
    -------
    dict[str, list[str]]
        The same dictionary object that was passed in.
    """
    strip_punctuation = str.maketrans("", "", string.punctuation)

    def _clean(text):
        # Hyphens are turned into spaces first so "well-known" becomes two words.
        words = text.replace("-", " ").translate(strip_punctuation).split()
        kept = []
        for word in words:
            word = word.lower()
            # Drops one-letter leftovers such as "a" or a stray "s", and any
            # token that contains digits.
            if len(word) > 1 and word.isalpha():
                kept.append(word)
        return " ".join(kept)

    for caps in image_caption.values():
        for idx in range(len(caps)):
            caps[idx] = _clean(caps[idx])

    return image_caption

In [4]:
# Getting all unique words

def vocab(image_caption):
    """Collect the set of distinct whitespace-separated words in all captions.

    Parameters
    ----------
    image_caption : dict[str, list[str]]
        Image key -> list of captions.

    Returns
    -------
    set[str]
        Every unique word that appears in any caption.
    """
    return {
        word
        for caps in image_caption.values()
        for cap in caps
        for word in cap.split()
    }

In [5]:
# Creating our image-caption file

def img_cap_file(image_caption, fname):
    """Write all captions to ``fname``, one ``<image>\\t<caption>`` per line.

    Lines are joined with ``\\n``; no trailing newline is written.

    Parameters
    ----------
    image_caption : dict[str, list[str]]
        Image key -> list of captions.
    fname : str
        Destination path; an existing file is overwritten.
    """
    lines = [
        k + "\t" + cap
        for k, caps in image_caption.items()
        for cap in caps
    ]
    data = "\n".join(lines)

    # The context manager closes the file even if the write fails.
    with open(fname, 'w') as f:
        f.write(data)

In [6]:
# Locations of the caption text files and of the image files.
# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.
text_dir = "C:/ImageCaptionGenerator/Flickr8k_text"
image_dir = "C:/ImageCaptionGenerator/Flickr8k_Dataset"

In [7]:
# Path of the raw captions file inside text_dir
fname = text_dir + "/" + "Flickr8k.token.txt"
# Mapping each image to its list of raw captions
image_caps = images_with_captions(fname)
print("Size of images_caps: ", len(image_caps))
# Cleaning captions. clean_data edits the dict in place and returns it, so
# `cleaned` and `image_caps` are the same (now cleaned) object from here on.
cleaned = clean_data(image_caps)
# Building our vocab: the set of unique words over all cleaned captions
vocabulary = vocab(cleaned)
print("Vocabulary size: ", len(vocabulary))
# Writing the cleaned captions to disk, one "image<TAB>caption" line per caption
img_cap_file(cleaned, "C:/ImageCaptionGenerator/image_with_caption.txt")

Size of images_caps:  8092
Vocabulary size:  8422


In [8]:
# Building a function to extract features
from tensorflow.keras.applications.xception import Xception
from tensorflow.keras.applications.xception import preprocess_input

def extract_features(path):
    """Run every image in a directory through Xception and collect the outputs.

    Parameters
    ----------
    path : str
        Directory whose entries are all treated as image files.

    Returns
    -------
    dict
        File name (as returned by ``os.listdir``) -> the array returned by
        ``model.predict`` for a batch containing that single image.
    """
    # We use the Xception pretrained model without its classification head;
    # pooling='avg' makes it return one pooled vector per image.
    model = Xception(include_top=False, pooling='avg')
    features = {}

    for image in os.listdir(path):
        fname = path + "/" + image
        # Open inside a context manager so the file handle is released, and
        # force three channels: grayscale, palette or RGBA files would
        # otherwise produce an array whose channel count the model rejects.
        with PIL.Image.open(fname) as raw:
            img = raw.convert("RGB").resize((299, 299))
        # Add the leading batch dimension expected by the model.
        batch = np.expand_dims(img, axis=0)
        batch = preprocess_input(batch)

        features[image] = model.predict(batch)

    return features

In [9]:
# Extract the features of every image once and cache them on disk.
features = extract_features(image_dir)
# Write through a context manager so the pickle is flushed and the handle closed.
with open("C:/ImageCaptionGenerator/features.p", "wb") as features_file:
    dump(features, features_file)

In [10]:
# Reload the cached features; the context manager closes the handle afterwards.
# NOTE: pickle can execute arbitrary code -- only load files you created yourself.
with open("C:/ImageCaptionGenerator/features.p", "rb") as features_file:
    features = load(features_file)

In [11]:
# Loading the data

def get_images(fname):
    """Return the image names listed in ``fname``, one per line.

    Parameters
    ----------
    fname : str
        Path of a text file containing one image name per line.

    Returns
    -------
    list[str]
        The non-blank lines of the file, in file order.
    """
    # Filter blank lines rather than unconditionally dropping the last entry:
    # the final image is kept when the file lacks a trailing newline, and
    # stray empty lines never become (invalid) image names.
    return [line for line in load_text(fname).split("\n") if line.strip()]

def get_clean_captions(fname, images):
    """Load the captions of the requested images, wrapped in start/end markers.

    Every non-empty line of ``fname`` is split on whitespace; the first token
    is the image name and the remaining tokens form the caption.

    Parameters
    ----------
    fname : str
        Path of the cleaned captions file.
    images : iterable of str
        Image names whose captions should be returned.

    Returns
    -------
    dict[str, list[str]]
        Image name -> captions, each formatted as
        ``'<start> ' + caption + ' <end>'``.
    """
    # A set makes the per-line membership test O(1) instead of scanning the
    # whole image list for every caption line.
    wanted = set(images)
    captions = {}
    for line in load_text(fname).split("\n"):
        tokens = line.split()
        if not tokens:
            continue

        img, words = tokens[0], tokens[1:]

        if img in wanted:
            desc = '<start> ' + " ".join(words) + ' <end>'
            captions.setdefault(img, []).append(desc)

    return captions

def load_features(images, features_path="C:/ImageCaptionGenerator/features.p"):
    """Load the pickled feature dictionary and keep only the requested images.

    Parameters
    ----------
    images : iterable of str
        Image names to select.
    features_path : str, optional
        Path of the pickle file holding the ``{image name: feature}``
        dictionary. Defaults to the location used by this notebook.

    Returns
    -------
    dict
        Image name -> feature, for every name in ``images``.

    Raises
    ------
    KeyError
        If a requested image has no entry in the pickle file.
    """
    # NOTE: pickle can execute arbitrary code on load -- only use files
    # produced by this notebook.
    with open(features_path, "rb") as features_file:
        all_features = load(features_file)
    # Selecting needed features
    return {k: all_features[k] for k in images}

In [12]:
# File listing the names of the training images, and the cleaned captions file
fname = text_dir + "/" + "Flickr_8k.trainImages.txt"
imgs_caps_path = "C:/ImageCaptionGenerator/image_with_caption.txt"

# Restrict the captions and the extracted features to the training images
training_images = get_images(fname)
training_captions = get_clean_captions(imgs_caps_path, training_images)
training_features = load_features(training_images)

In [13]:
# Creating a list containing all clean captions
def captions_in_list(captions):
    """Flatten a ``{image: [captions]}`` mapping into one flat list of captions.

    Parameters
    ----------
    captions : dict[str, list[str]]
        Image key -> list of captions.

    Returns
    -------
    list[str]
        All captions, grouped by image in the dictionary's iteration order.
    """
    return [cap for caps in captions.values() for cap in caps]

In [14]:
# Creating a tokenizer to give each word an index

def tokenize(captions):
    """Build a Keras ``Tokenizer`` fitted on every caption in ``captions``.

    Parameters
    ----------
    captions : dict[str, list[str]]
        Image key -> list of captions.

    Returns
    -------
    Tokenizer
        A tokenizer fitted on the flattened list of captions.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(captions_in_list(captions))
    return fitted

In [15]:
# Creating a 'tokenizer.p' file
# Fit the tokenizer on the training captions and save it as 'tokenizer.p'
tokenizer = tokenize(training_captions)
# Write through a context manager so the pickle is flushed and the handle closed.
with open('C:/ImageCaptionGenerator/tokenizer.p', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

In [16]:
# Number of words, vocabulary size
# +1 leaves room for index 0, which the Keras Tokenizer never assigns to a word
vocabulary_size = len(tokenizer.word_index) + 1
print('Vocabulary size: ', vocabulary_size)

Vocabulary size:  7318


In [17]:
# Calculating the length of the longest caption

def max_caption_length(captions):
    """Return the number of words in the longest caption.

    Parameters
    ----------
    captions : dict[str, list[str]]
        Image key -> list of captions.

    Returns
    -------
    int
        Largest whitespace-separated word count over all captions.

    Raises
    ------
    ValueError
        If ``captions`` contains no caption at all.
    """
    return max(len(d.split()) for d in captions_in_list(captions))

# The helper has its own name so that this assignment no longer overwrites the
# function it calls; the cell can now be re-run without a TypeError.
# `max_length` stays the integer that the later cells use.
# NOTE(review): measured on `image_caps` (all images, without the
# '<start>'/'<end>' markers added to the training captions) -- confirm intent.
max_length = max_caption_length(image_caps)
max_length

33

# Building The Model

Define our model based on the merge-model in these papers:

* https://arxiv.org/abs/1703.09137
* https://arxiv.org/abs/1708.02043

for more details you can see this post:

* https://machinelearningmastery.com/caption-generation-inject-merge-architectures-encoder-decoder-model/

Our model contains three parts:

1. Photo feature extractor: the images were preprocessed with the Xception model, and we use the features it extracted as the image input.

2. Sequence processor: a word embedding layer that handles the text input, followed by an LSTM layer.

3. Decoder: the feature extractor (1) and the sequence processor (2) each produce a fixed-length vector. The two vectors are merged and processed by a Dense layer to make the final prediction.

We will use Dropout as a regularization technique to reduce overfitting.

In [18]:
def data_generator(captions, features, tokenizer, max_length):
    """Endlessly yield the training samples of one image at a time.

    Parameters
    ----------
    captions : dict[str, list[str]]
        Image key -> list of captions.
    features : dict
        Image key -> feature array; element ``[0]`` of each entry is used.
    tokenizer : Tokenizer
        Fitted tokenizer, forwarded to ``create_sequences``.
    max_length : int
        Padding length, forwarded to ``create_sequences``.

    Yields
    ------
    tuple
        ``([input_image, input_sequence], output_word)`` built from all
        captions of a single image.
    """
    while True:
        for image_id in captions:
            # First (only) row of the stored prediction for this photo
            photo = features[image_id][0]
            in_img, in_seq, out_word = create_sequences(
                tokenizer, max_length, captions[image_id], photo
            )
            yield [in_img, in_seq], out_word

def create_sequences(tokenizer, max_length, desc_list, feature):
    """Expand the captions of one image into (image, prefix) -> next-word samples.

    Every caption is encoded to word indices; each proper prefix of the
    encoded caption becomes one input sequence and the word following that
    prefix becomes the one-hot target. Reads the notebook-level global
    ``vocabulary_size`` for the width of the one-hot vectors.

    Parameters
    ----------
    tokenizer : Tokenizer
        Fitted tokenizer used to encode the captions.
    max_length : int
        Length the input sequences are padded to.
    desc_list : list[str]
        Captions of a single image.
    feature : array-like
        Feature vector of that image, repeated for every sample.

    Returns
    -------
    tuple of np.ndarray
        ``(image_features, padded_input_sequences, one_hot_targets)``.
    """
    photos, prefixes, targets = [], [], []
    for desc in desc_list:
        # Caption as a list of word indices
        encoded = tokenizer.texts_to_sequences([desc])[0]
        # One sample per split point: words before `cut` predict word `cut`
        for cut in range(1, len(encoded)):
            prefix = pad_sequences([encoded[:cut]], maxlen=max_length)[0]
            target = to_categorical([encoded[cut]], num_classes=vocabulary_size)[0]
            photos.append(feature)
            prefixes.append(prefix)
            targets.append(target)
    return np.array(photos), np.array(prefixes), np.array(targets)

In [19]:
# Creating the model

def create_model(vocabulary_size, max_length, feature_dim=2048):
    """Build and compile the merge-architecture captioning model.

    Parameters
    ----------
    vocabulary_size : int
        Size of the embedding input and of the softmax output.
    max_length : int
        Length of the padded input word sequences.
    feature_dim : int, optional
        Length of the image feature vector (default 2048).

    Returns
    -------
    Model
        Compiled model taking ``[image_features, word_sequence]`` and
        returning a probability distribution over the vocabulary.
    """
    # Photo-feature branch: dropout, then projection to 256 units
    input1 = Input(shape=(feature_dim,))
    feat1 = Dropout(0.5)(input1)
    feat2 = Dense(256, activation='relu')(feat1)

    # Text branch: embedding (index 0 masked as padding), dropout, LSTM
    input2 = Input(shape=(max_length,))
    seq1 = Embedding(vocabulary_size, 256, mask_zero=True)(input2)
    seq2 = Dropout(0.5)(seq1)
    seq3 = LSTM(256)(seq2)

    # Decoder: element-wise sum of both 256-d vectors, then dense layers
    decoder1 = add([feat2, seq3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    output = Dense(vocabulary_size, activation='softmax')(decoder2)

    model = Model(inputs=[input1, input2], outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    # summary() prints the table itself and returns None, so wrapping it in
    # print() would only add a stray "None" line to the output.
    model.summary()

    return model

# Training The Model

In [20]:
# Sanity check of the sizes that define the training setup
# (`max_length` is the integer computed above, not a function)
print('Training Dataset: ', len(training_images))
print('Training Captions: ', len(training_captions))
print('Vocabulary Size:', vocabulary_size)
print('Description Length: ', max_length)

Training Dataset:  6000
Training Captions:  6000
Vocabulary Size: 7318
Description Length:  33


In [21]:
# Build the captioning model for the current vocabulary size and caption length
model = create_model(vocabulary_size, max_length)
epochs = 10
# The generator yields the samples of one image per step, so one pass over
# the training images takes len(training_captions) steps
steps = len(training_captions)

for i in range(epochs):
    # A fresh generator is created for every epoch
    generator = data_generator(training_captions, training_features, tokenizer, max_length)
    # Train one epoch at a time so a checkpoint can be saved after each epoch
    model.fit(generator, epochs=1, steps_per_epoch=steps)
    model.save("C:/ImageCaptionGenerator/model_" + str(i+1) + ".h5")

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 33)]         0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 2048)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 33, 256)      1873408     input_2[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 2048)         0           input_1[0][0]                    
______________________________________________________________________________________________





#### To test the model, run the "GenerateCaption.py" file in the terminal as follows:

`python GenerateCaption.py -i "image path"`