# **Dataset and its structure**

1. Commonly known datasets that can be used for trianing purpose:
    1. Flickr8K
    2. Flick30K
    3. Fick100K
    4. MSCOCO
2. Each dataset may have there own structure of dataset. For Flickr_8K dataset, all the images of training, validation and test set are in one folder. It contains 3 different files i.e Flickr_8k.trainImages.txt, Flickr_8k.testImages.txt , Flickr_8k.devImages.txt  corresponding to each type of dataset i.e train, test and validation set, each file having file_name of images conatined in each dataset. 
3. For example, in Flick8k, Flickr_8k.trainImages.txt file contains file_ids of images in training set. Name of image file is its image id.
4. All the images are in same folder. So to parse images of training dataset, first read trianImages.txt file, read line by line image id and load corresponding image from image dataset folder.
5. Each image is given 5 different captions by 5 different humans. This is because an image can be described in multiple ways.

# **Import Libraries**

In [1]:
import os
import scipy.io
import numpy as np
import pandas as pd
from keras.applications.vgg16 import VGG16
from keras.models import Model

In [2]:
from keras.preprocessing.image import load_img
from keras.layers import Input, Dense, Embedding, LSTM, concatenate, Dropout
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.layers import Add
import matplotlib.pyplot as plt

In [3]:
from keras.preprocessing.text import Tokenizer
import string
import tensorflow as tf
import keras
from nltk.tokenize import word_tokenize

In [4]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from nltk.translate.bleu_score import corpus_bleu

In [5]:
from pickle import dump

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# **Image Feature Extraction**

In [None]:
model = VGG16()
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)# summarize
print(model.summary())


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [None]:
img=load_img('/content/gdrive/MyDrive/archive/Images/3767841911_6678052eb6.jpg',target_size=(224, 224))
img_to_array(img).shape

(224, 224, 3)

In [None]:
def img_preprocess(filename):
  feats={}
  j=0
  for img in os.listdir(filename):
    image=load_img(f'{filename}/{img}',target_size=(224, 224))
    img_arr=img_to_array(image)
    img_arr = img_arr.reshape((1, img_arr.shape[0], img_arr.shape[1], img_arr.shape[2]))
    feat=model.predict(img_arr,verbose=0)#no output shown as verbose=0
    img_id=img.split('.')[0]
    feats[img_id]=feat
    print(j)
    j+=1
  return feats

In [None]:
 features=img_preprocess('/content/gdrive/MyDrive/archive/Images/')

In [6]:
 pkl_file_path = '/content/drive/MyDrive/archive/your_data.pkl'


In [7]:
import pickle
 pkl_file_path = '/content/drive/MyDrive/archive/your_data.pkl'
 with open(pkl_file_path, 'wb') as pkl_file:
    pickle.dump(features, pkl_file)

In [6]:
from pickle import load
all_features = load(open('your_data.pkl', 'rb'))

In [9]:
len(list(all_features.values()))

8101

# **Text Loading and Cleaning**

In [7]:
file_path='/content/drive/MyDrive/archive/captions.txt'
with open('archive/captions.txt', 'r') as file:
    content = file.read()

(content[0:1000])

'image,caption\n1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way .\n1000268201_693b08cb0e.jpg,A girl going into a wooden building .\n1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .\n1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playhouse .\n1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a wooden cabin .\n1001773457_577c3a7d70.jpg,A black dog and a spotted dog are fighting\n1001773457_577c3a7d70.jpg,A black dog and a tri-colored dog playing with each other on the road .\n1001773457_577c3a7d70.jpg,A black dog and a white dog with brown spots are staring at each other in the street .\n1001773457_577c3a7d70.jpg,Two dogs of different breeds looking at each other on the road .\n1001773457_577c3a7d70.jpg,Two dogs on pavement moving toward each other .\n1002674143_1b742ab4b8.jpg,A little girl covered in paint sits in front of a painted rainbow with her hands in a bowl .\n10026741

In [10]:
s = set(string.punctuation)

In [11]:
def clean(caption):
    cleaned = []
    x="startseq"
    for word in word_tokenize(caption):
        if (word.lower() not in s):
            cleaned.append(word.lower())
    for i in cleaned:
        if i==cleaned[len(cleaned)-1]:
            x=x+" "+i+" endseq"
        else:
            x=x+" "+ i
    return x

def create_data(content):
    captions = {}
    for line in content.split('\n'):
        if line == 'image,caption' or line == '':
            continue
        cap = line.split(',')[1:]
        im = str(line.split(',')[0])
        img_id = str(im.split('.')[0])
        if img_id not in captions:
            captions[img_id] = []
        caps = clean(' '.join(cap))
        captions[img_id].append(caps)
    return captions

In [12]:
cap = create_data(content)
len(cap.keys())

8091

In [12]:
cap[list(cap.keys())[0]]

['startseq a child in a pink dress is climbing up a set of stairs in an entry way endseq',
 'startseq a girl going into a wooden building endseq',
 'startseq a little girl climbing into a wooden playhouse endseq',
 'startseq a little girl climbing the stairs to her playhouse endseq',
 'startseq a little girl in a pink dress going into a wooden cabin endseq']

In [14]:
def to_vocabulary(descriptions):
 all_desc = set()
 for key in descriptions.keys():
        [all_desc.update(d.split()) for d in descriptions[key]]
 return all_desc

In [27]:
vocab=list(to_vocabulary(cap))
len(vocab)

8909

In [None]:
import pickle
pkl_file_path = '/content/drive/MyDrive/archive/vocab.pkl'
with open(pkl_file_path, 'wb') as pkl_file:
    pickle.dump(vocab, pkl_file)

In [15]:
def to_lines(descriptions):
 all_desc = []
 for key in descriptions.keys():
    [all_desc.append(d) for d in descriptions[key]]
 return all_desc

# **Tokenizer and creating train and test data**

In [16]:
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [17]:
tokenizer=create_tokenizer(cap)

In [18]:
def max_length(descriptions):
  lines = to_lines(descriptions)
  return max(len(d.split()) for d in lines)

In [13]:
def train_test_split(desc, feats):
    train_desc = {k: v for i, (k, v) in enumerate(desc.items()) if i < 6000}
    test_desc = {k: v for i, (k, v) in enumerate(desc.items()) if i >= 6000}
    selected_features = {key: all_features[key] for key in train_desc.keys() & all_features.keys()}
    train_feats = {key: selected_features[key] for key in train_desc.keys()}

    test_features = {key: all_features[key] for key in test_desc.keys() & all_features.keys()}
    test_feats= {key: test_features[key] for key in test_desc.keys()}
    return train_desc, test_desc, train_feats, test_feats

train_desc, test_desc, train_feats, test_feats = train_test_split(cap, all_features)


In [20]:
max_length = max_length(cap)
print(max_length)

40


In [21]:
def create_sequences(tokenizer, max_length, desc_list, photo, vocab_size):
    X1, X2, y = list(), list(), list()
    for desc in desc_list:
      seq = tokenizer.texts_to_sequences([desc])[0]
      for i in range(1, len(seq)):
        in_seq, out_seq = seq[:i], seq[i]
        in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
        X1.append(photo)
        X2.append(in_seq)
        y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)


# **Define the Model**

In [23]:
def define_model(vocab_size,max_caption_length):
  img_input=Input(shape=(4096,))
  drop=Dropout(0.3)(img_input)
  img_encoder=Dense(256, activation='relu')(drop)

  text_input = Input(shape=(max_caption_length,))
  text_embedding = Embedding(vocab_size, 256, mask_zero=True)(text_input)
  drop=Dropout(0.3)(text_embedding)
  text_encoder = LSTM(256)(drop)

  decoder1 = Add()([img_encoder, text_encoder])
  decoder2 = Dense(256, activation='relu')(decoder1)

  output = Dense(vocab_size, activation='softmax')(decoder2)
  model = Model(inputs=[img_input, text_input], outputs=output)

  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.summary()
 
  return model

In [24]:
def data_generator(descriptions, photos, tokenizer, max_length, vocab_size):
 while 1:
  for key, desc_list in descriptions.items():
    photo = photos[key][0]
    in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo, vocab_size)
    yield [in_img, in_seq], out_word

In [28]:
vocab_len=len(vocab)

In [None]:
train_data = create_sequences(tokenizer, max_length, train_desc, train_feats, vocab_len)

# **Train the model**

In [29]:
model = define_model(vocab_len, max_length)

epochs = 5
steps = len(train_desc)
for i in range(epochs):

 generator = data_generator(train_desc, train_feats, tokenizer, max_length, vocab_len)

 model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)

 model.save('model_' + str(i) + '.keras')

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 40)]                 0         []                            
                                                                                                  
 input_1 (InputLayer)        [(None, 4096)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, 40, 256)              2280704   ['input_2[0][0]']             
                                                                                                  
 dropout (Dropout)           (None, 4096)                 0         ['input_1[0][0]']             
                                                                                              

# **Testing and Evaluating Model**

In [30]:
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def generate_desc(model, tokenizer, photo, max_length):
    in_text = 'startseq'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = word_for_id(yhat, tokenizer)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text


In [42]:
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    actual, predicted = [],[]
    for key, desc_list in descriptions.items():
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        references = [d.split() for d in desc_list]
        actual.append(references)
        predicted.append(yhat.split())

    print('BLEU-1:', corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2:',corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3:',corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4:',corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
    return actual,predicted

In [43]:
actual,predicted=evaluate_model(model, test_desc, test_feats, tokenizer, max_length)

BLEU-1: 0.4851472633042599
BLEU-2: 0.3028213363977868
BLEU-3: 0.20836074254446565
BLEU-4: 0.10134149820114542
