## Importing Libraries

In [None]:
import glob
import os
import string
from time import time

import numpy as np
from numpy import array
import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image

from keras import Input, layers
from keras import optimizers
from tensorflow.keras.optimizers import Adam
from keras.preprocessing import sequence
from keras.preprocessing import image
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Embedding, Dense, Activation, Flatten, Reshape, Dropout
from keras.layers.wrappers import Bidirectional
from keras.layers.merge import add
from keras.applications.inception_v3 import InceptionV3
from keras.applications.inception_v3 import preprocess_input
from keras.models import Model
from tensorflow.keras.utils import to_categorical

from nltk.translate.bleu_score import sentence_bleu

### Finding the Paths to the dataset

In [None]:
# Locations of the caption file, the train/test split files, the image
# directory and the GloVe vectors, relative to the notebook's working directory.
tokens_path = "../input/flickr8k-text/Flickr8k_text/Flickr8k.token.txt"
train_images_path = '../input/flickr8k-text/Flickr8k_text/Flickr_8k.trainImages.txt'
test_images_path = '../input/flickr8k-text/Flickr8k_text/Flickr_8k.testImages.txt'
images_path = '../input/flickr8k-images/Flicker8k_Images/'
glove_path = '../input/glove6b'

# Read the whole caption file; the context manager closes the handle
# (the bare open(...).read() left it open until garbage collection).
with open(tokens_path, 'r') as token_file:
    doc = token_file.read()
print(doc[:410])

### Creating the dictionary of images and their captions

In [None]:
# Map each image id (first token of a line, cut at its first '.') to the
# list of captions found for it in the token file.
descriptions = dict()
for line in doc.split('\n'):
    tokens = line.split()
    # Skip very short lines and whitespace-only lines: the latter have no
    # tokens and would raise IndexError on tokens[0].
    if len(line) <= 2 or not tokens:
        continue
    image_id = tokens[0].split('.')[0]
    image_desc = ' '.join(tokens[1:])
    descriptions.setdefault(image_id, []).append(image_desc)

### Tidy up the captions: remove punctuation, convert to lower case, and split the captions at spaces

In [None]:
# Normalise every caption in place: lower-case each whitespace-separated
# word, delete all punctuation characters, and re-join with single spaces.
table = str.maketrans('', '', string.punctuation)
for captions_for_image in descriptions.values():
    # Slice assignment keeps the same list object inside `descriptions`.
    captions_for_image[:] = [
        ' '.join(word.lower().translate(table) for word in caption.split())
        for caption in captions_for_image
    ]

### Show one of the images and its captions

In [None]:
# Display one sample image, then show its cleaned captions as the cell output.
name = '2398605966_1d0c9e6a20'
pic = name + '.jpg'
x = plt.imread(images_path + pic)
plt.imshow(x)
plt.show()
descriptions[name]

### Create a vocabulary by taking only unique words

In [None]:
# Collect every distinct word that appears in any cleaned caption.
vocabulary = {
    word
    for captions_for_image in descriptions.values()
    for caption in captions_for_image
    for word in caption.split()
}
print('Original Vocabulary Size: %d' % len(vocabulary))

In [None]:
# Flatten the dictionary into "<image_id> <caption>" records and join them
# into one newline-separated string; later cells split it again.
lines = [
    image_key + ' ' + caption
    for image_key, caption_group in descriptions.items()
    for caption in caption_group
]
new_descriptions = '\n'.join(lines)

### Read the training images and the captions and create a training dataset
### Do the same process and create a test dataset

In [None]:
# Read the training split file; the context manager closes the handle
# (the bare open(...).read() never closed it).
with open(train_images_path, 'r') as split_file:
    doc = split_file.read()

# Keep the part of each non-trivial line before its first '.', i.e. the
# image identifier without the file extension.
dataset_train = [line.split('.')[0] for line in doc.split('\n') if len(line) > 1]

# Set for fast membership tests when filtering captions.
train = set(dataset_train)

In [None]:
# Read the test split file; the context manager closes the handle
# (the bare open(...).read() never closed it).
with open(test_images_path, 'r') as split_file:
    doc = split_file.read()

# Keep the part of each non-trivial line before its first '.', i.e. the
# image identifier without the file extension.
dataset_test = [line.split('.')[0] for line in doc.split('\n') if len(line) > 1]

# Set for fast membership tests when filtering captions.
test = set(dataset_test)

In [None]:
# Every .jpg found in the image directory (full paths).
img = glob.glob(images_path + '*.jpg')

# File names listed in the training split; `with` closes the file handle.
with open(train_images_path, 'r') as split_file:
    train_images = set(split_file.read().strip().split('\n'))
# Compare on the bare file name instead of slicing off a fixed-length prefix.
train_img = [path for path in img if os.path.basename(path) in train_images]

# Same selection for the test split.
with open(test_images_path, 'r') as split_file:
    test_images = set(split_file.read().strip().split('\n'))
test_img = [path for path in img if os.path.basename(path) in test_images]

### Create a dictionary of Train Descriptions with mappings from training images id to all its captions
#### We add `startseq` and `endseq` markers at the start and end of every caption so the model can tell where a caption begins and ends when the captions are fed to it


In [None]:
# Map each training image id to its captions, each wrapped in the
# 'startseq' / 'endseq' markers.
train_descriptions = dict()
for line in new_descriptions.split('\n'):
    tokens = line.split()
    # An empty record (e.g. when new_descriptions is '') has no tokens and
    # would raise IndexError on tokens[0]; skip it.
    if not tokens:
        continue
    image_id, image_desc = tokens[0], tokens[1:]
    if image_id in train:
        desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
        train_descriptions.setdefault(image_id, []).append(desc)

### Do the same for the test descriptions

In [None]:
# Map each test image id to its captions, each wrapped in the
# 'startseq' / 'endseq' markers.
test_descriptions = dict()
for line in new_descriptions.split('\n'):
    tokens = line.split()
    # An empty record (e.g. when new_descriptions is '') has no tokens and
    # would raise IndexError on tokens[0]; skip it.
    if not tokens:
        continue
    image_id, image_desc = tokens[0], tokens[1:]
    if image_id in test:
        desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
        test_descriptions.setdefault(image_id, []).append(desc)

In [None]:
# One flat list holding every training caption, in dictionary order.
all_train_captions = [
    caption
    for caption_group in train_descriptions.values()
    for caption in caption_group
]

#### Keep only the words that occur at least 10 times in the training captions

In [None]:
# Minimum number of occurrences a word needs to enter the vocabulary.
word_limit = 10

# Count how often each space-separated token occurs across all training captions.
word_counts = {}
for caption in all_train_captions:
    for token in caption.split(' '):
        if token in word_counts:
            word_counts[token] += 1
        else:
            word_counts[token] = 1

# Number of captions that were counted.
nsents = len(all_train_captions)

# Tokens meeting the threshold, in order of first appearance.
vocab = [token for token, count in word_counts.items() if count >= word_limit]

print('Vocabulary = %d' % (len(vocab)))

#### Create 2 mappings from index to word and word to index

In [None]:
# Word <-> index lookup tables. Indices start at 1, so index 0 is never
# assigned to a word.
word_to_idx = {word: index for index, word in enumerate(vocab, start=1)}
idx_to_word = {index: word for word, index in word_to_idx.items()}

# One more than the number of words, accounting for the unused index 0.
vocab_size = len(idx_to_word) + 1

#### We find the length of the longest description; all other sequences are later padded to that length

In [None]:
# Flat list of every training caption (markers included).
all_desc = [
    caption
    for caption_group in train_descriptions.values()
    for caption in caption_group
]
lines = all_desc

# Length, in whitespace-separated tokens, of the longest training caption.
max_length = max(len(caption.split()) for caption in all_desc)

print('Description Length: %d' % max_length)

### Word vectors map words to a vector space.
#### In the vector space similar words are clustered and different words are separated
#### The basic premise behind GloVe is that we can derive semantic relationships between words from the co-occurrence matrix.

In [None]:
# Map each word in the GloVe file to its vector: the first token on a line
# is the word, the remaining tokens are parsed as float32 coefficients.
embeddings_index = {}
# `with` closes the file once it has been read (the original never did).
with open(os.path.join(glove_path, 'glove.6B.200d.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        # A blank line has no word token; skip it instead of raising IndexError.
        if not values:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

#### Here we build an embedding matrix of shape (vocab_size, 200) = (1660, 200): one 200-d GloVe vector per vocabulary word (words missing from GloVe keep an all-zero row)

In [None]:
# Width of the vectors; must match the GloVe file that was loaded.
embedding_dim = 200

# Row i holds the GloVe vector of the word whose index is i. Rows for words
# without a GloVe vector (and the unused row 0) remain zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_to_idx.items():
    glove_vector = embeddings_index.get(word)
    if glove_vector is None:
        continue
    embedding_matrix[i] = glove_vector

### We are using InceptionV3 model which is pretrained on Imagenet dataset

In [None]:
# Instantiate InceptionV3 with its ImageNet weights.
# NOTE(review): the name `model` is re-bound to the captioning network further
# down the notebook, so re-running the feature-extractor cell after that point
# would pick up the wrong model.
model = InceptionV3(weights='imagenet')

In [None]:
# Feature extractor: same input as the loaded network, but the output is
# taken from its second-to-last layer instead of the final layer.
model_new = Model(inputs=model.input, outputs=model.layers[-2].output)

#### Preprocess each image by resizing it to (299, 299) and applying the InceptionV3 input preprocessing

In [None]:
def preprocess(image_path):
    """Load an image file and prepare it as a one-image batch for InceptionV3.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The image resized to 299x299, converted to an array, given a leading
        batch axis of size 1 and passed through ``preprocess_input``.
    """
    # Import locally under a private alias: later cells re-bind the global
    # name `image` to a feature array, which would break `image.load_img`
    # if this function were called again after those cells have run.
    from keras.preprocessing import image as keras_image

    img = keras_image.load_img(image_path, target_size=(299, 299))
    x = keras_image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return x

### Now we can encode the train and test images with the feature-extractor model

In [None]:
def encode_image(image):
    """Return the feature vector of one image file.

    Args:
        image: Path of the image file; it is forwarded to ``preprocess``.

    Returns:
        The output of ``model_new.predict`` for that single image, reshaped
        from (1, n) to a flat vector of length n.
    """
    batch = preprocess(image)
    features = model_new.predict(batch)
    # Drop the batch axis.
    return features.reshape(features.shape[1])

# Feature vectors keyed by file name (the path with the directory prefix
# sliced off), first for the training images ...
encoding_train = {
    path[len(images_path):]: encode_image(path)
    for path in train_img
}
train_features = encoding_train

# ... and then for the test images.
encoding_test = {
    path[len(images_path):]: encode_image(path)
    for path in test_img
}

### Here we define our model
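#### The text branch embeds the partial caption and encodes it with an LSTM
#### The image branch takes the 2048-d feature vector extracted from the image
#### The image feature vector is fed through dropout and a fully connected layer
#### There are 2 feature-extractor (FE) layers: `fe1` (dropout) and `fe2` (dense)
#### Finally the two branches are added, passed through a dense layer, and a softmax layer predicts the next word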

In [None]:
# Image branch: a 2048-element feature vector -> dropout -> 256-unit dense layer.
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Text branch: a sequence of max_length word indices -> embedding -> dropout
# -> 256-unit LSTM. mask_zero=True makes index 0 a masked (padding) value.
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

# Decoder: element-wise sum of both 256-d branches -> dense layer -> softmax
# over the vocabulary.
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# NOTE(review): this re-binds `model`, which earlier held the InceptionV3
# network. A later cell addresses the embedding layer as model.layers[2], so
# the order in which layers are created here matters.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.summary()

#### Load the GloVe embedding matrix into the embedding layer and freeze it

In [None]:
# Locate the embedding layer by type instead of by position: the positional
# index depends on how Keras happens to order the layers of the graph.
embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))

# Load the pre-built embedding matrix and keep it fixed during training.
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

#### Use Adam optimizer and categorical_crossentropy to calculate loss

In [None]:
# Configure training: Adam optimizer (default settings) with categorical
# cross-entropy as the loss.
model.compile(optimizer='adam', loss='categorical_crossentropy')

#### Create a generator that yields the training data batch by batch

In [None]:
def data_generator(descriptions, photos, word_to_idx, max_length, num_photos_per_batch, vocab_size=None):
    """Endlessly yield training batches for the captioning model.

    Each caption is turned into one sample per prefix: the first ``i`` word
    indices (padded to ``max_length``) are the input and word ``i`` is the
    one-hot target. Words missing from ``word_to_idx`` are dropped.

    Args:
        descriptions: Dict mapping an image id to its list of captions.
        photos: Dict mapping ``image_id + '.jpg'`` to the image feature vector.
        word_to_idx: Dict mapping a word to its integer index.
        max_length: Length every input sequence is padded to.
        num_photos_per_batch: Number of images whose samples form one batch.
        vocab_size: Number of classes for the one-hot targets. Defaults to
            ``len(word_to_idx) + 1``, so the function no longer depends on a
            module-level ``vocab_size`` variable.

    Yields:
        ``([photo_features, input_sequences], one_hot_targets)`` as arrays.

    Raises:
        ValueError: If ``descriptions`` is empty or ``num_photos_per_batch``
            is smaller than 1; in both cases the loop below would otherwise
            spin forever without yielding.
    """
    if not descriptions:
        raise ValueError('descriptions must contain at least one image')
    if num_photos_per_batch < 1:
        raise ValueError('num_photos_per_batch must be at least 1')
    if vocab_size is None:
        vocab_size = len(word_to_idx) + 1

    X1, X2, y = list(), list(), list()
    n = 0
    while True:
        for key, desc_list in descriptions.items():
            n += 1
            photo = photos[key + '.jpg']
            for desc in desc_list:
                seq = [word_to_idx[word] for word in desc.split(' ') if word in word_to_idx]
                # One sample per prefix: predict word i from words 0..i-1.
                for i in range(1, len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    X1.append(photo)
                    X2.append(in_seq)
                    y.append(out_seq)

            # The counter carries over between passes, so a batch may span
            # the end of one pass over `descriptions` and the start of the next.
            if n == num_photos_per_batch:
                yield ([array(X1), array(X2)], array(y))
                X1, X2, y = list(), list(), list()
                n = 0

#### Train the model for 30 epochs with a batch size of 3 photos

In [None]:
# Training configuration.
epochs = 30
batch_size = 3  # number of photos per generated batch
# Whole batches needed to see every training image once.
steps = len(train_descriptions) // batch_size

generator = data_generator(
    descriptions=train_descriptions,
    photos=train_features,
    word_to_idx=word_to_idx,
    max_length=max_length,
    num_photos_per_batch=batch_size,
)
model.fit(generator, steps_per_epoch=steps, epochs=epochs, verbose=1)

#### We use beam search to get the best caption predicted

In [None]:
def beam_search(image, beam_index = 3):
    """Generate a caption for one image with beam search.

    Changes from the previous version:
      * Hypotheses are ranked by the sum of log-probabilities (the log of the
        product of word probabilities). Summing raw probabilities let a
        sequence with one very likely word outrank a uniformly likely one.
      * A hypothesis that has produced 'endseq' is kept as-is instead of being
        extended with further words that are discarded at decode time.
      * The search stops as soon as every kept hypothesis has ended.
      * Index 0 (padding, absent from ``idx_to_word``) is never emitted, which
        previously could raise KeyError while decoding.

    Args:
        image: Feature array passed as the first model input; callers in this
            notebook pass shape (1, 2048).
        beam_index: Number of hypotheses kept after every step.

    Returns:
        The words of the best hypothesis joined by spaces, without the
        'startseq' marker and cut at the first 'endseq'.
    """
    start_idx = word_to_idx["startseq"]
    end_idx = word_to_idx.get("endseq")

    # Each hypothesis is (list of word indices, cumulative log-probability).
    beams = [([start_idx], 0.0)]

    # At most max_length - 1 words can be appended to the start marker.
    for _ in range(max_length - 1):
        if all(seq[-1] == end_idx for seq, _score in beams):
            break

        candidates = []
        for seq, score in beams:
            if seq[-1] == end_idx:
                # Finished hypothesis: carry it over unchanged.
                candidates.append((seq, score))
                continue
            # NOTE(review): padding='post' is kept from the original code while
            # the training generator calls pad_sequences with its default
            # padding side; confirm the mismatch is intended.
            par_caps = pad_sequences([seq], maxlen=max_length, padding='post')
            preds = model.predict([image, par_caps], verbose=0)[0]
            for w in np.argsort(preds)[-beam_index:]:
                w = int(w)
                if w == 0:
                    # Padding index: not a word.
                    continue
                # The small constant avoids log(0) for zero probabilities.
                candidates.append((seq + [w], score + float(np.log(preds[w] + 1e-12))))

        if not candidates:
            break
        # Keep the beam_index best hypotheses, best one last.
        beams = sorted(candidates, key=lambda c: c[1])[-beam_index:]

    best_sequence = beams[-1][0]

    final_caption = []
    for idx in best_sequence[1:]:  # position 0 is the start marker
        word = idx_to_word[idx]
        if word == 'endseq':
            break
        final_caption.append(word)

    return ' '.join(final_caption)

#### We compare several beam widths below; beam index = 7 is the one we use for the rest of the evaluation

In [None]:
# Show one test image and caption it with several beam widths.
pic = '2398605966_1d0c9e6a20.jpg'
# NOTE(review): this re-binds `image`, the name under which the Keras image
# module was imported at the top of the notebook.
image = encoding_test[pic].reshape((1,2048))
x = plt.imread(images_path + pic)
plt.imshow(x)
plt.show()

for k in (3, 5, 7, 10):
    print("Beam Search, K = %d:" % k, beam_search(image, beam_index = k))

In [None]:
# Reference captions stored for the sample image (with start/end markers).
print(test_descriptions['2398605966_1d0c9e6a20'])

In [None]:
# Tokenised reference captions for BLEU. Every stored caption starts with
# 'startseq' and ends with 'endseq'; beam_search never returns those markers,
# so they are sliced off here. Leaving them in makes each reference two
# tokens longer than it really is, which skews BLEU's brevity penalty.
captions = [
    caption.split()[1:-1]
    for caption in test_descriptions['2398605966_1d0c9e6a20']
]

In [None]:
# BLEU of the K = 7 beam-search caption against the reference captions.
# (`sentence_bleu` is imported with the other libraries in the first cell.)
sentence_bleu(captions, beam_search(image, beam_index = 7).split(' '))


### Compute the BLEU score for roughly the first 100 test images and take the average
#### We are getting an average around 50 percent. 

In [None]:
# Sum of sentence-level BLEU over the first `num_eval` test images.
# The previous loop broke out when its counter reached 100 *before* scoring,
# so only 99 images contributed although the total is divided by 100.
num_eval = 100
bleu_score = 0
i = 0  # number of images scored so far
for key, caption_list in test_descriptions.items():
    if i == num_eval:
        break
    i += 1
    # Drop the 'startseq' / 'endseq' markers: generated captions never
    # contain them, and keeping them lengthens every reference by two tokens.
    captions = [caption.split()[1:-1] for caption in caption_list]
    features = encoding_test[key + '.jpg'].reshape((1,2048))
    score = sentence_bleu(captions, beam_search(features, beam_index = 7).split(' '))
    bleu_score += score
    # Running average every 10 images.
    if i % 10 == 0:
        print(bleu_score / i)
        

In [None]:
# Average BLEU: the accumulated score divided by a fixed 100. The divisor
# must match the number of images the loop above actually scored.
print(bleu_score/100)

#### To observe the bias we check the BLEU score of the test images (in two batches of roughly 100)
#### If any image has a score < 0.25 we check which caption was predicted

In [None]:
# Collect low-scoring images among the first 100 test images.
# The previous loop broke out when its counter reached 100 *before* scoring,
# so image number 100 was never evaluated.
edge_cases = []  # flat list: key, score, key, score, ...
i = 0            # number of images scored so far
for key, caption_list in test_descriptions.items():
    if i == 100:
        break
    i += 1
    # Drop the 'startseq' / 'endseq' markers: generated captions never
    # contain them, and keeping them lengthens every reference by two tokens.
    captions = [caption.split()[1:-1] for caption in caption_list]
    features = encoding_test[key + '.jpg'].reshape((1,2048))
    score = sentence_bleu(captions, beam_search(features, beam_index = 7).split(' '))
    # Progress output every 10 images.
    if i % 10 == 0:
        print(score)
    if score < 0.25:
        print(score)
        edge_cases.append(key)
        edge_cases.append(score)

In [None]:
# Flat list of alternating image ids and BLEU scores collected above.
print(edge_cases)


In [None]:
# Inspect one low-scoring image: picture, reference captions, prediction.
name = '1554713437_61b64527dd'
pic = name + '.jpg'
image = encoding_test[pic].reshape((1,2048))
x = plt.imread(images_path + pic)
plt.imshow(x)
plt.show()
print(test_descriptions[name])
print("Beam Search, K = 7:", beam_search(image, beam_index = 7))


In [None]:
# Inspect one low-scoring image: picture, reference captions, prediction.
name = '1679617928_a73c1769be'
pic = name + '.jpg'
image = encoding_test[pic].reshape((1,2048))
x = plt.imread(images_path + pic)
plt.imshow(x)
plt.show()
print(test_descriptions[name])
print("Beam Search, K = 7:", beam_search(image, beam_index = 7))

In [None]:
# Inspect one low-scoring image: picture, reference captions, prediction.
name = '180094434_b0f244832d'
pic = name + '.jpg'
image = encoding_test[pic].reshape((1,2048))
x = plt.imread(images_path + pic)
plt.imshow(x)
plt.show()
print(test_descriptions[name])
print("Beam Search, K = 7:", beam_search(image, beam_index = 7))

In [None]:
# Collect low-scoring images among test images 101-200.
# The previous loop broke out when its counter reached 200 *before* scoring,
# so image number 200 was never evaluated.
edge_cases = []  # flat list: key, score, key, score, ...
i = 0            # 1-based position of the current image
for key, caption_list in test_descriptions.items():
    i += 1
    if i <= 100:
        continue
    if i > 200:
        break
    # Drop the 'startseq' / 'endseq' markers: generated captions never
    # contain them, and keeping them lengthens every reference by two tokens.
    captions = [caption.split()[1:-1] for caption in caption_list]
    features = encoding_test[key + '.jpg'].reshape((1,2048))
    score = sentence_bleu(captions, beam_search(features, beam_index = 7).split(' '))
    # Progress output every 10 images.
    if i % 10 == 0:
        print(score)
    if score < 0.25:
        print(score)
        edge_cases.append(key)
        edge_cases.append(score)


In [None]:
# Flat list of alternating image ids and BLEU scores collected above.
print(edge_cases)

#### Give the predicted captions for a particular image 

In [None]:
def predict(name, beam_index=7):
    """Show a test image, print its reference captions and the predicted one.

    Args:
        name: Image id without the '.jpg' extension; used as the key into
            ``test_descriptions`` and, with '.jpg' appended, into
            ``encoding_test``.
        beam_index: Beam width passed to ``beam_search``. Defaults to 7, the
            value that used to be hard-coded.
    """
    pic = name + '.jpg'
    # Local name chosen so it does not shadow the imported `image` module.
    features = encoding_test[pic].reshape((1,2048))
    x = plt.imread(images_path + pic)
    plt.imshow(x)
    plt.show()
    print(test_descriptions[name])
    print("Beam Search, K = %d:" % beam_index, beam_search(features, beam_index = beam_index))

#### These are some of the examples of the images we found which are biased

In [None]:
# Run the prediction display for each selected example image, in order.
for example_id in (
    '191003285_edd8d0cf58',
    '197504190_fd1fc3d4b7',
    '2248487950_c62d0c81a9',
    '2309860995_c2e2a0feeb',
    '2176980976_7054c99621',
    '2196107384_361d73a170',
    '2274992140_bb9e868bb8',
):
    predict(example_id)