## Assignment 3 - Image Captioning

## Import Libraries

In [None]:
import os
from os import listdir
import pickle
import numpy as np
from tqdm.notebook import tqdm
from tensorflow.keras.applications.inception_v3 import InceptionV3,preprocess_input
from tensorflow.keras.applications.resnet50 import ResNet50,preprocess_input
from tensorflow.keras.preprocessing.image import load_img,img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical,plot_model
from tensorflow.keras.layers import Input,Dense,LSTM,Embedding,Dropout,add

## Creating Feature Extraction Model

In [None]:
# Label printed before training to identify this run.
# NOTE(review): no attention layer is defined in the captioning model later in this notebook.
mod='attention-ResNet50'
# Load the ResNet50 model with its default weights
model_r=ResNet50()
# Restructure the model: drop the final classification layer so the network
# returns the activations of the second-to-last layer as the image feature vector
model_r=Model(inputs=model_r.inputs,outputs=model_r.layers[-2].output)
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output
model_r.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels.h5
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 conv1_pad (ZeroPadding2D)   (None, 230, 230, 3)          0         ['input_1[0][0]']             
                                                                                                  
 conv1_conv (Conv2D)         (None, 112, 112, 64)         9472      ['conv1_pad[0][0]']           
                                                                                                  
 conv1_bn (BatchNormalizati  (None, 112, 112, 64)       

### Image Feature Extraction

In [None]:
# Extract features from each photo in the directory
def extract_features(directory):
    """Run every image file in ``directory`` through the feature extractor.

    Each image is resized to 224x224, converted to an array, given a leading
    batch axis, passed through ``preprocess_input`` and then through the
    module-level ``model_r``.

    Args:
        directory: path of the folder that contains the image files.

    Returns:
        dict mapping the image id (file name up to the first '.') to the
        array returned by ``model_r.predict`` for that image.
    """
    features = dict()
    for name in listdir(directory):
        filename = os.path.join(directory, name)
        # Skip sub-directories and other non-file entries; load_img would fail on them
        if not os.path.isfile(filename):
            continue
        # Load the image resized to the 224x224 input used throughout this notebook
        image = load_img(filename, target_size=(224, 224))
        # Convert the image pixels to a numpy array
        image = img_to_array(image)
        # Add a leading batch axis: (h, w, c) -> (1, h, w, c)
        image = np.expand_dims(image, axis=0)
        # Apply the preprocessing function imported alongside the model
        image = preprocess_input(image)
        # Get features
        feature = model_r.predict(image, verbose=0)
        # Image id is the file name up to the first '.', matching the caption mapping keys
        image_id = name.split('.')[0]
        features[image_id] = feature
    return features

In [None]:
# Run the feature extractor over the whole image folder and report how many
# images were processed
directory = '../input/d/sayanf/flickr8k/Flickr8k_Dataset'
features = extract_features(directory)
print('Extracted Features: {:d}'.format(len(features)))

Extracted Features: 8091


In [None]:
# Save features to file
# (the context manager closes the file handle even if pickling fails)
with open('features.pkl', 'wb') as features_file:
    pickle.dump(features, features_file)

In [None]:
# Restore the feature dictionary from a previously generated pickle file.
# NOTE(review): pickle.load can execute arbitrary code contained in the file,
# so only load pickles that come from a trusted source.
features_path='/kaggle/input/trianed/features.pkl'
with open(features_path,mode='rb') as f:
    features=pickle.load(f)

In [None]:
# Load doc into memory
def load_doc(filename):
    """Return the complete text content of ``filename``.

    Args:
        filename: path of the text file to read.

    Returns:
        The whole file as a single string.

    Raises:
        OSError: if the file cannot be opened.
    """
    # The context manager closes the file even when read() raises; the explicit
    # encoding keeps the result independent of the platform default
    with open(filename, 'r', encoding='utf-8') as file:
        return file.read()

# Location of the Flickr8k caption (token) file
filename = '../input/d/sayanf/flickr8k/Flickr8k_text/Flickr8k.token.txt'
# Load descriptions: the whole file is read into one string and split into lines later
doc = load_doc(filename)

In [None]:
# Create mapping for image to captions
mapping={}
# Process lines
for line in tqdm(doc.split('\n')):
    # Split the line on whitespace: the first token is the image reference,
    # the remaining tokens are the caption words
    tokens=line.split()
    # Skip blank lines; the extra `not tokens` check also covers whitespace-only
    # lines, which would otherwise fail on tokens[0]
    if len(line)<2 or not tokens:
        continue
    image_id,caption=tokens[0],tokens[1:]
    # Remove extension from image_id (keep everything before the first '.')
    image_id=image_id.split('.')[0]
    # Convert caption list to string
    caption=' '.join(caption)
    # Create the list on first sight of the image, then store the caption
    mapping.setdefault(image_id,[]).append(caption)
len(mapping)

  0%|          | 0/40461 [00:00<?, ?it/s]

8092

## Text preprocessing

In [None]:
def clean(mapping):
    """Normalise every caption in ``mapping`` in place.

    Each caption is lower-cased, stripped of digits and special characters,
    reduced to words longer than one character and wrapped in the
    ``<SOS>`` / ``<EOS>`` markers.

    Args:
        mapping: dict of image id -> list of caption strings; the lists are
            modified in place.

    Returns:
        None.

    Note:
        The previous version called ``str.replace`` with regex patterns, which
        only replaces the literal pattern text and therefore removed nothing.
        With real regex substitution the vocabulary changes, so a model trained
        on the old captions has to be retrained with the new tokenizer.
    """
    import re

    for key,captions in mapping.items():
        for i in range(len(captions)):
            # Take one caption at a time
            caption=captions[i]
            # Convert to lower case
            caption=caption.lower()
            # Delete digits and special characters (whitespace is kept so the
            # words stay separated)
            caption=re.sub(r'[^a-z\s]','',caption)
            # Collapse runs of whitespace into a single space
            caption=re.sub(r'\s+',' ',caption)
            # Drop one-character words and add start and end tags
            caption='<SOS> ' +' '.join([word for word in caption.split() if len(word)>1])+' <EOS>'
            captions[i]=caption


In [None]:
# Show the captions of one sample image before and after cleaning
sample_key='2258277193_586949ec62'
print('-------- Before Proccess --------')
print(mapping[sample_key])
# Clean every caption in place
clean(mapping)
print('-------- After Proccess --------')
print(mapping[sample_key])

-------- Before Proccess --------
['people waiting for the subway', 'Some people looking out windows in a large building .', 'Three people are waiting on a train platform .', 'Three people standing at a station .', 'two woman and one man standing near train tracks .']
-------- After Proccess --------
['<SOS> people waiting for the subway <EOS>', '<SOS> some people looking out windows in large building <EOS>', '<SOS> three people are waiting on train platform <EOS>', '<SOS> three people standing at station <EOS>', '<SOS> two woman and one man standing near train tracks <EOS>']


In [None]:
# Flatten the per-image caption lists into one list, keeping the mapping order
all_captions = [caption for captions in mapping.values() for caption in captions]
print("All captions: " + str(len(all_captions)))
print(all_captions[:10])

All captions: 40460
['<SOS> child in pink dress is climbing up set of stairs in an entry way <EOS>', '<SOS> girl going into wooden building <EOS>', '<SOS> little girl climbing into wooden playhouse <EOS>', '<SOS> little girl climbing the stairs to her playhouse <EOS>', '<SOS> little girl in pink dress going into wooden cabin <EOS>', '<SOS> black dog and spotted dog are fighting <EOS>', '<SOS> black dog and tri-colored dog playing with each other on the road <EOS>', '<SOS> black dog and white dog with brown spots are staring at each other in the street <EOS>', '<SOS> two dogs of different breeds looking at each other on the road <EOS>', '<SOS> two dogs on pavement moving toward each other <EOS>']


In [None]:
# Fit a tokenizer on every cleaned caption
tokenizer=Tokenizer()
tokenizer.fit_on_texts(all_captions)
# One more than the number of distinct words; this value is used as the size of
# the embedding input and of the softmax output further down
vocab_size=1+len(tokenizer.word_index)
print("Vocab Size: " + str(vocab_size))

Vocab Size: 8485


In [None]:
# Longest caption, measured in whitespace-separated words (tags included)
max_length=max(map(lambda text: len(text.split()), all_captions))
print("Max Length of Caption: " + str(max_length))

Max Length of Caption: 34


In [None]:
# Ordered (non-shuffled) 90/10 split of the image ids into train and test
image_ids=list(mapping)
split=int(0.90*len(image_ids))
train,test=image_ids[:split],image_ids[split:]
print(len(test))
print(len(train))

810
7282


## Data Generator
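The training data is generated in batches, so the full set of training samples never has to be held in memory at once (which would crash the session).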

In [None]:
def data_generator(data_keys,mapping,features,tokenizer,max_length,vocab_size,batch_size):
    """Endlessly yield training batches built from ``batch_size`` images each.

    For every caption of every image, each prefix of the encoded caption is
    one sample: the padded prefix is the text input and the one-hot encoding
    of the following token is the target.

    Args:
        data_keys: re-iterable collection of image ids to draw from.
        mapping: dict of image id -> list of caption strings.
        features: dict of image id -> feature array; row 0 is used.
        tokenizer: object providing ``texts_to_sequences``.
        max_length: length the input sequences are padded to.
        vocab_size: number of classes for the one-hot targets.
        batch_size: number of images (not samples) per yielded batch.

    Yields:
        ``[X1, X2], y`` where X1 holds image features, X2 the padded input
        sequences and y the one-hot next-word targets.

    Raises:
        ValueError: on first iteration if no id in ``data_keys`` has features
            (the loop below would otherwise spin forever without yielding).
    """
    # Only images with extracted features can produce samples. Filtering up
    # front keeps them from counting towards the batch size and removes the
    # membership test from the innermost loop.
    usable_keys=[key for key in data_keys if key in features]
    if not usable_keys:
        raise ValueError('data_generator: none of the given keys has extracted features')

    X1,X2,y=list(),list(),list()
    n=0
    while 1:
        for key in usable_keys:
            n += 1
            image_feature=features[key][0]
            # Process each caption
            for caption in mapping[key]:
                seq=tokenizer.texts_to_sequences([caption])[0]
                # Split sequence into X y pairs: prefix -> next token
                for i in range(1,len(seq)):
                    in_seq=pad_sequences([seq[:i]],maxlen=max_length)[0]
                    # Encode out sequence
                    out_seq=to_categorical([seq[i]],num_classes=vocab_size)[0]
                    # Store the sequences
                    X1.append(image_feature)
                    X2.append(in_seq)
                    y.append(out_seq)
            if n == batch_size:
                yield [np.array(X1),np.array(X2)],np.array(y)
                # Start collecting the next batch
                X1,X2,y=list(),list(),list()
                n=0

## LSTM Model

In [None]:
# Encoder model
# Image feature layers: a 2048-value feature vector per image is regularised
# with dropout and projected to 256 units
inputs1=Input(shape=(2048,))
fe1=Dropout(0.4)(inputs1)
fe2=Dense(256,activation='relu')(fe1)

# Sequence feature layer: the padded caption prefix (max_length token ids) is
# embedded into 256 dimensions, with index 0 treated as masked padding, and
# summarised by an LSTM into a single 256-unit vector
inputs2=Input(shape=(max_length,))
se1=Embedding(vocab_size,256,mask_zero=True)(inputs2)
se2=Dropout(0.4)(se1)
se3=LSTM(256)(se2)

# Decoder model: the image and text representations are merged by element-wise
# addition, passed through a dense layer and mapped to a softmax distribution
# over the vocabulary (the next-word prediction)
decoder1=add([fe2,se3])
decoder2=Dense(256,activation='relu')(decoder1)
outputs=Dense(vocab_size,activation='softmax')(decoder2)

# Two-input model; categorical cross-entropy matches the one-hot targets
# produced by data_generator
model=Model(inputs=[inputs1,inputs2],outputs=outputs)
model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])

# plot the model
# plot_model(model,show_shapes=True)


In [None]:
# Train the model
epochs,batch_size=30,64
# One step consumes batch_size images, so this is the number of full batches per pass
steps=len(train)//batch_size

print(mod)

generator=data_generator(
    data_keys=train,
    mapping=mapping,
    features=features,
    tokenizer=tokenizer,
    max_length=max_length,
    vocab_size=vocab_size,
    batch_size=batch_size,
)
history=model.fit(generator,epochs=epochs,steps_per_epoch=steps,verbose=1)

attention-ResNet50
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
# Save the trained captioning model to 'best_model_simple.h5' in the working directory
model.save('best_model_simple.h5')

  saving_api.save_model(


## Caption Prediction

In [None]:
def idx_to_word(integer,tokenizer):
    """Return the word whose tokenizer index equals ``integer``.

    Args:
        integer: token index to look up.
        tokenizer: object with a ``word_index`` dict (word -> index) and,
            optionally, an ``index_word`` dict (index -> word).

    Returns:
        The matching word, or None when the index is not in the vocabulary.
    """
    # Use the reverse index when the tokenizer provides one: a single dict
    # lookup instead of scanning the whole vocabulary on every decoding step
    index_word=getattr(tokenizer,'index_word',None)
    if index_word:
        return index_word.get(integer)
    # Fallback for tokenizer-like objects that only expose word_index
    for word,index in tokenizer.word_index.items():
        if index==integer:
            return word
    return None

In [None]:
# Load a previously trained captioning model; this replaces the `model` object
# built and trained in the cells above.
# Imported from tensorflow.keras, like every other Keras import in this notebook,
# so the model is loaded by the same Keras implementation that built and saved it.
from tensorflow.keras.models import load_model
model = load_model('/kaggle/input/trianed/best_model_simple.h5')

In [None]:
# Generate caption for the image
def predict_caption(model, image, tokenizer, max_length):
    """Greedily decode a caption for one image feature array.

    Starting from the '<SOS>' tag, the model is asked for the next word at
    most ``max_length`` times; the most probable word is appended each time.
    Decoding stops early when the predicted index has no word or when the
    'eos' token is produced.

    Args:
        model: captioning model taking ``[image, sequence]`` as input.
        image: feature array passed to the model as the image input.
        tokenizer: tokenizer used to encode the text generated so far.
        max_length: padding length and maximum number of decoding steps.

    Returns:
        The generated text, including the leading '<SOS>' and, if reached,
        the trailing 'eos'.
    """
    words=['<SOS>']
    for _ in range(max_length):
        # Encode and pad everything generated so far
        encoded=tokenizer.texts_to_sequences([' '.join(words)])[0]
        padded=pad_sequences([encoded],max_length)
        # Pick the most probable next word
        probabilities=model.predict([image,padded],verbose=0)
        next_word=idx_to_word(np.argmax(probabilities),tokenizer)
        if next_word is None:
            break
        words.append(next_word)
        # The end tag terminates the caption
        if next_word=='eos':
            break
    return ' '.join(words)

In [None]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.gleu_score import corpus_gleu
from nltk.translate.ribes_score import sentence_ribes


def _caption_words(caption):
    """Split a caption into words without its start/end markers.

    Cleaned reference captions are wrapped in '<SOS>' ... '<EOS>', while
    generated captions start with '<SOS>' and end with the tokenizer's 'eos'.
    Removing the markers from both sides keeps them from counting as n-gram
    matches or mismatches in the scores.
    """
    words=caption.split()
    if words and words[0]=='<SOS>':
        words=words[1:]
    if words and words[-1] in ('<EOS>','eos'):
        words=words[:-1]
    return words


# Validate with test data
actual,predicted=list(),list()
act,pre=[],[]

for key in tqdm(test):
    # An image without extracted features cannot be captioned
    if key not in features:
        continue

    # Reference captions as word lists
    reference_words=[_caption_words(caption) for caption in mapping[key]]

    # Predict the caption for image
    y_pred=predict_caption(model,features[key],tokenizer,max_length)
    predicted_words=_caption_words(y_pred)

    # Append words to the list
    actual.append(reference_words)
    predicted.append(predicted_words)

    # Plain-text versions kept for manual inspection
    act.append(' '.join(' '.join(words) for words in reference_words))
    pre.append(' '.join(predicted_words))


# Calculate the scores. BLEU-n is reported as the cumulative score, i.e. the
# geometric mean of the 1..n-gram precisions with uniform weights.
#print(act)
#print(pre)
print('GLEU: %f '% corpus_gleu(actual,predicted))
print('BLEU-1: %f '% corpus_bleu(actual,predicted,weights=(1.0,0,0,0)))
print('BLEU-2: %f '% corpus_bleu(actual,predicted,weights=(0.5,0.5,0,0)))
print('BLEU-3: %f '% corpus_bleu(actual,predicted,weights=(1/3,1/3,1/3,0)))
print('BLEU-4: %f '% corpus_bleu(actual,predicted,weights=(0.25,0.25,0.25,0.25)))

  0%|          | 0/810 [00:00<?, ?it/s]

GLEU: 0.143202 
BLEU-1: 0.477278 
BLEU-2: 0.189615 
BLEU-3: 0.075793 
BLEU-4: 0.025066 


In [None]:
# Caption a single image loaded from disk
features_ext={}

# Load the image at the 224x224 input size and convert it to a numpy array
img_path='/kaggle/input/test-set/test_set/climb_10.png'
image=img_to_array(load_img(img_path,target_size=(224,224)))

# Add the leading batch axis: (h, w, c) -> (1, h, w, c)
image=image.reshape((1,)+image.shape)

# Apply the preprocessing function imported alongside the feature extractor
image=preprocess_input(image)

# Extract and store the features
feature=model_r.predict(image,verbose=0)
features_ext['img']=feature

# Generate and show the caption
description = predict_caption(model, feature, tokenizer, max_length)
print(description)

<SOS> man in red shirt is climbing rock wall eos


In [None]:
import glob
# Caption every image of the external test set; results are collected in
# `final` as file name -> generated caption
final={}

# sorted() makes the processing (and output) order reproducible; glob alone
# returns files in arbitrary order
for filename in sorted(glob.glob('/kaggle/input/test-set/test_set/*')):
    image=load_img(filename,target_size=(224,224))
    # Convert image pixels to numpy array
    image=img_to_array(image)
    image=image.reshape((1,image.shape[0],image.shape[1],image.shape[2]))
    # Apply the preprocessing function imported alongside the feature extractor
    image=preprocess_input(image)
    # Extract features (kept local; the dataset `features` dict is not touched)
    feature=model_r.predict(image,verbose=0)

    # Reuse the shared greedy decoder instead of a copy of its loop
    in_text=predict_caption(model,feature,tokenizer,max_length)

    image_name=os.path.basename(filename)
    print(image_name, in_text)
    final[image_name]=in_text

notting_1.jpg <SOS> people are standing in front of large flowers eos
snowboard_10.jpg <SOS> snowboarder slides down slope eos
image_22.jpg <SOS> two men in white uniforms are playing ball eos
climb_11.png <SOS> person in red jacket is pointing to the summit eos
wet-dog.jpg <SOS> dog runs through the water eos
images (2).jpg <SOS> woman in white shirt is standing on the edge of the water eos
image2.jpg <SOS> man in aerodynamic gear riding bike on the edge of road eos
7246b633f0ba13b9e28d0105e9e912bd.jpg <SOS> many people are watching basketball competition eos
images (4).jpg <SOS> young boy wearing blue shirt and blue shirt is smiling eos
Snow_Dogs_CarolynKaster_AP-19013763595378.jpg <SOS> dog is running through the snow eos
image_04.jpg <SOS> two men are jumping to catch frisbee eos
snowboard_13.jpg <SOS> man in yellow jacket is skiing down snowy hill eos
045759d0c8aa5aa26b9492da524146fb.jpg <SOS> busy busy street corner with people are watching eos
images (13).jpg <SOS> the woman is 

In [None]:
# Strip the start/end markers from every generated caption.
# Filtering the words directly avoids the first-word flag of the earlier
# version, which was read before assignment whenever a caption did not start
# with '<SOS>'.
new = dict()

for key, value in final.items():
    new[key] = ' '.join(word for word in value.split() if word not in ('<SOS>', 'eos'))

In [None]:
# Write one "file name, caption" line per image to captions.txt
with open("captions.txt", "w") as output:
    output.writelines(key + ', ' + value + '\n' for key, value in new.items())

In [None]:
from PIL import Image
import cv2
import matplotlib.pyplot as plt


def generate_caption(image_name):
    """Print the reference and predicted captions of a dataset image and show it.

    Args:
        image_name: file name of the image inside
            '/kaggle/input/flickr8k/Images/' (e.g. '<image id>.jpg').

    Raises:
        FileNotFoundError: if the image file cannot be read.
        KeyError: if the image id has no captions or no extracted features.
    """
    image_id=image_name.split('.')[0]
    image_path='/kaggle/input/flickr8k/Images/' + image_name
    # cv2.imread returns None instead of raising when the file cannot be read
    image=cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError('Could not read image: ' + image_path)

    # Reference captions
    for caption in mapping[image_id]:
        print(caption)

    # Predict the caption
    y_pred=predict_caption(model,features[image_id],tokenizer,max_length)
    print('predicted statement')
    print(y_pred)
    # OpenCV loads images in BGR channel order while matplotlib expects RGB;
    # convert so the colours are displayed correctly
    plt.imshow(cv2.cvtColor(image,cv2.COLOR_BGR2RGB))

In [None]:
# print(mod)
# generate_caption('1067675215_7336a694d6.jpg')