## *Required Libraries*

In [None]:
import os
import pickle
import numpy as np
from tqdm import tqdm
from keras.applications.vgg16 import VGG16,preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.utils import to_categorical, plot_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from nltk.translate.bleu_score import corpus_bleu
from PIL import Image
import matplotlib.pyplot as plt

## *Loading Our Model*

In [35]:
model=VGG16() # Loading pre-trained VGG-16 Model
model=Model(inputs=model.inputs, outputs=model.layers[-2] .output) # Restructuring the Model
model.summary() # Model Summary

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0   

## *Loading Our Dataset*

In [71]:
base_dir="E:/Data Science/Projects/Dataset/"

In [37]:
directory=os.path.join(base_dir+'Images')

# Feature Extraction from Images
features={}

for img_name in tqdm(os.listdir(directory)):

    # Loading images from file
    img_path=os.path.join(directory,img_name)
    image=load_img(img_path, target_size=(224,224))

    # Converting image into numpy array
    image=img_to_array(image)

    # Reshaping the Image
    image=image.reshape(1,image.shape[0],image.shape[1],image.shape[2])

    # Preprocessing the image
    image=preprocess_input(image)

    # Feature Extraction
    feature=model.predict(image,verbose=0)
    
    # Getting image ID
    img_id=img_name.split('.')[0]

    # Storing Features
    features[img_id]=feature

In [38]:
# Storing Feature in Pickle 
# pickle.dump(features, open('Extracted_Features.pkl','wb'))

In [39]:
with open('Extracted_Features.pkl','rb') as f:
    features=pickle.load(f)

In [72]:
# Loading Caption Data
with open(os.path.join(base_dir,'captions.txt'),'r') as f:
    next(f)
    caption_doc=f.read()

In [41]:
# Mapping Of Image to Caption
mapping={}
for line in tqdm(caption_doc.split('\n')):
    tokens=line.split(',')
    if len(line)<2:
        continue
    img_name, captions=tokens[0],tokens[1:]
    img_id=img_name.split('.')[0]
    caption=" ".join(captions)

    if img_id not in mapping:
        mapping[img_id]=[]
    
    mapping[img_id].append(caption)

100%|██████████| 40456/40456 [00:00<00:00, 56149.15it/s]


## *Preprocessing Text Data*

In [42]:
def clean(mapping):
    for key,captions in mapping.items():
        for i in range(len(captions)):
            caption=captions[i]
            caption=caption.lower()
            caption=caption.replace('[^A-Za-z]','')
            caption=caption.replace('\s+',' ')
            caption='startseq '+" ".join([word for word in caption.split() if len(word)>1])+' endseq'
            captions[i]=caption

In [43]:
mapping['1000268201_693b08cb0e']

['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

In [44]:
clean(mapping)
mapping['1000268201_693b08cb0e']

['startseq child in pink dress is climbing up set of stairs in an entry way endseq',
 'startseq girl going into wooden building endseq',
 'startseq little girl climbing into wooden playhouse endseq',
 'startseq little girl climbing the stairs to her playhouse endseq',
 'startseq little girl in pink dress going into wooden cabin endseq']

In [45]:
all_captions=[]
for key in mapping:
    for caption in mapping[key]:
        all_captions.append(caption)

## *Tokenization*

In [46]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size=len(tokenizer.word_index)+1

In [47]:
vocab_size

8485

In [48]:
# Getting maximum length of a caption
max_length=max(len(caption.split()) for caption in all_captions)
max_length

35

## *Splitting the Data*

In [49]:
img_ids=list(mapping.keys())
split=int(len(img_ids)*0.90)
train=img_ids[:split]
test=img_ids[split:]

## *Creating Data Generator*

In [50]:
def data_generator(data_keys, mapping, features, tokenizer, max_lenght, vocab_size, batch_size):
    x1, x2, y = list(), list(), list()
    n = 0
    while 1:
        for key in data_keys:
            n += 1
            captions = mapping[key]
            for caption in captions:
                seq = tokenizer.texts_to_sequences([caption])[0]

                for i in range(1, len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    in_seq = pad_sequences([in_seq], maxlen=max_lenght)[0]
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                    x1.append(features[key][0])
                    x2.append(in_seq)
                    y.append(out_seq)

            if n == batch_size:
                x1, x2, y = np.array(x1), np.array(x2), np.array(y)
                yield {"image":x1, "text":x2}, y
                x1, x2, y = list(), list(), list()
                n = 0

## *Model Building*

### *Encoder Model*

*Image Feature Layers*

In [51]:
inputs1=Input(shape=(4096,),name="image")
fe1=Dropout(0.4)(inputs1)
fe2=Dense(256,activation='relu')(fe1)

*Sequence Feature Layer*

In [52]:
inputs2=Input(shape=(max_length,),name="text")
se1=Embedding(vocab_size,256,mask_zero=True)(inputs2)
se2=Dropout(0.4)(se1)
se3=LSTM(256)(se2)

### *Decoder Model*

In [53]:
decoder1=add([fe2,se3])
decoder2=Dense(256,activation='relu')(decoder1)
outputs=Dense(vocab_size, activation='softmax')(decoder2)

In [54]:
model=Model(inputs=[inputs1, inputs2], outputs=outputs)

## *Model Compilation*

In [55]:
model.compile(loss='categorical_crossentropy',optimizer='adam')

## *Plot Model*

In [56]:
# plot_model(model,show_shapes=True)

## *Model Training*

In [None]:
epochs=20
batch_size=32
steps=len(train)//batch_size

for i in range(epochs):
    generator=data_generator(train,mapping, features, tokenizer, max_length,vocab_size,batch_size)
    model.fit(generator,epochs=1,steps_per_epoch=steps,verbose=1)



## *Model Saving*

In [None]:
model.save('model.h5')

In [58]:
from keras.models import load_model
model=load_model('D:/model.h5')

## *Generate Caption for an Image*

In [59]:
def indx_to_word(integer,tokenizer):
    for word,index in tokenizer.word_index.items():
        if index==integer:
            return word
    return None

In [63]:
def predict_caption(model,image,tokenizer,max_length):
    in_text='startseq'
    for i in range(max_length):
        seq=tokenizer.texts_to_sequences([in_text])[0]
        pad_seq=pad_sequences([seq],max_length)
        y_pred=model.predict([image,pad_seq],verbose=0)
        y_pred=np.argmax(y_pred)
        word=indx_to_word(y_pred,tokenizer)
        if word is None:
            break
        in_text+=" "+word
        if word=='endseq':
            break
    return in_text

## *Model Evaluation*

In [None]:
actual, predicted=list(),list()
for key in tqdm(test):
    captions=mapping[key]
    y_pred=predict_caption(model,features[key],tokenizer,max_length)
    y_pred=y_pred.split()
    actual_caption=[caption.split for caption in captions]
    actual.append(actual_caption)
    predicted.append(y_pred)


In [None]:
print('BLEU Score-1:',corpus_bleu(actual,predicted,weights=(1,0,0,0)))
print('BLEU Score-2:',corpus_bleu(actual,predicted,weights=(0.5,0.5,0,0)))
print('BLEU Score-3:',corpus_bleu(actual,predicted,weights=(0.25,0.25,0.25,0.25)))

## *Visualizing The Results*

In [None]:
img_name="10815824_2997e03d76.jpg"
def visualize(img_name):
    img_id=img_name.split('.')[0]
    img_path=os.path.join(base_dir,"Images",img_name)
    image=Image.open(img_path)
    captions=mapping[img_id]

    print('Actual')
    for caption in captions:
        print(caption)

    y_pred=predict_caption(model,features[img_id],tokenizer,max_length)

    print('Predicted')

    print(y_pred)

    plt.imshow(image)

In [None]:
visualize(img_name)