# Your Personal Image Captioner
Image Captioning is the process of generating a textual description for a given image. It has been a very important and fundamental task in the Deep Learning domain and has a huge number of applications. For instance, image captioning technologies can be used to create an application to help people who have low or no eyesight.

So, go ahead and try it out. Here's how it works:
1. Upload an image of your choice.
2. Click on the 'Get Caption' button.
3. Wait for the results!!!


In [15]:
#import libraries
from PIL import Image
import numpy as np
import tensorflow.keras.preprocessing.image

def encodeImage(img, WIDTH, HEIGHT, preprocess_input, encode_model, OUTPUT_DIM):
    """
    Encode a PIL image into the feature vector consumed by the caption model.

    :param img: input PIL image
    :param WIDTH: width (in pixels) the image is resized to
    :param HEIGHT: height (in pixels) the image is resized to
    :param preprocess_input: function applied to the batched pixel array
        before it is passed to the encoder
    :param encode_model: image-encoding model; its ``predict`` output is
        the image feature set
    :param OUTPUT_DIM: shape the encoder output is reshaped to
    :return: encoded image, reshaped to OUTPUT_DIM
    """
    # Uploaded files may be grayscale, palette-based or carry an alpha
    # channel; normalise to three colour channels before encoding.
    if img.mode != "RGB":
        img = img.convert("RGB")
    # Resize all images to a standard size (specified by the image
    # encoding network). Image.ANTIALIAS was removed in Pillow 10;
    # Image.LANCZOS is the same resampling filter.
    img = img.resize((WIDTH, HEIGHT), Image.LANCZOS)
    # Convert a PIL image to a numpy array
    x = tensorflow.keras.preprocessing.image.img_to_array(img)
    # Add a leading batch axis so the array forms a batch of one image
    x = np.expand_dims(x, axis=0)
    # Perform any preprocessing needed by InceptionV3 or others
    x = preprocess_input(x)
    # Call InceptionV3 (or other) to extract the smaller feature set for
    # the image.
    x = encode_model.predict(x)  # Get the encoding vector for the image
    # Shape to correct form to be accepted by LSTM captioning network.
    x = np.reshape(x, OUTPUT_DIM)
    return x

In [16]:
#import libraries
from tensorflow.keras import Input, layers
from tensorflow.keras.layers import add
from tensorflow.keras.layers import (LSTM, Embedding, 
    TimeDistributed, Dense, RepeatVector, 
    Activation, Flatten, Reshape, concatenate,  
    Dropout, BatchNormalization)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np


def create_model(OUTPUT_DIM, vocab_size, embedding_dim, max_length, embedding_matrix):
    """
    Build and compile the caption model.

    The model has two inputs: the encoded image features and the partial
    caption as a padded sequence of word indices. Its output is a softmax
    over the vocabulary.

    :param OUTPUT_DIM: the length of the encoded image feature vector
    :param vocab_size: the size of the vocabulary
    :param embedding_dim: the dimension of the word embeddings
    :param max_length: the maximum length of a caption
    :param embedding_matrix: weights loaded into the embedding layer,
        which is then frozen
    :return: caption model is returned
    """
    # Image-feature branch
    inputs1 = Input(shape=(OUTPUT_DIM,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    # Partial-caption branch. Keep a handle on the embedding layer so its
    # weights can be set directly rather than via a positional index into
    # cap_model.layers, which depends on Keras' internal layer ordering.
    inputs2 = Input(shape=(max_length,))
    embedding_layer = Embedding(vocab_size, embedding_dim, mask_zero=True)
    se1 = embedding_layer(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    # Merge both branches and predict a word
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    cap_model = Model(inputs=[inputs1, inputs2], outputs=outputs)

    # Load the supplied embeddings and freeze them before compiling
    embedding_layer.set_weights([embedding_matrix])
    embedding_layer.trainable = False
    cap_model.compile(loss='categorical_crossentropy', optimizer='adam')
    return cap_model

def generateCaption(photo, caption_model, max_length, wordtoidx, idxtoword, START, STOP):
    """
    Function to return the caption after the prediction

    Words are generated greedily, one per step: the most probable next word
    is appended and the extended sequence is fed back in, until STOP is
    produced or max_length words have been generated.

    :param photo: encoded input image, passed to the model together with
        the padded word-index sequence
    :param caption_model: our caption model
    :param max_length: maximum length of a sequence
    :param wordtoidx: word to index lookup table
    :param idxtoword: index to word lookup table
    :param START: starting token
    :param STOP: stopping token
    :return: caption generated from the model, without the START token and
        without the trailing STOP token (if one was produced)
    """
    words = START.split()
    for _ in range(max_length):
        # Words missing from the lookup table are skipped
        sequence = [wordtoidx[w] for w in words if w in wordtoidx]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = caption_model.predict([photo, sequence], verbose=0)
        word = idxtoword[int(np.argmax(yhat))]
        words.append(word)
        if word == STOP:
            break

    # Drop the leading START token
    caption = words[1:]
    # Only strip the last token when it really is STOP; if the loop ran out
    # of steps instead, the last token is a genuine caption word.
    if caption and caption[-1] == STOP:
        caption = caption[:-1]
    return ' '.join(caption)

In [17]:
import os 
from pathlib import Path
# Locations of the trained caption-model weights and the pickled caption
# metadata, both kept in the "data" folder under the current directory.
root_captioning = Path()
data_dir = os.path.join(root_captioning, "data")
model_path = os.path.join(data_dir, 'caption-model.hdf5')
caption_data_path = os.path.join(data_dir, 'caption_data.pickle')

In [18]:
import pickle
# Load the pickled caption metadata.
# NOTE: unpickling can execute arbitrary code, so only load a
# caption_data file that comes from a trusted source.
with open(caption_data_path, mode="rb") as pickle_file:
    caption_data = pickle.load(pickle_file)

In [47]:
from tensorflow.keras.applications.inception_v3 import InceptionV3
import tensorflow.keras.applications.inception_v3
# Image encoder: InceptionV3 with ImageNet weights, truncated so that it
# outputs its second-to-last layer instead of the final layer.
encode_model = InceptionV3(weights='imagenet')
encode_model = Model(
    inputs=encode_model.input,
    outputs=encode_model.layers[-2].output,
)
preprocess_input = tensorflow.keras.applications.inception_v3.preprocess_input

# Image size fed to the encoder and the size of the feature vector that is
# passed on to the caption model.
WIDTH = HEIGHT = 299
OUTPUT_DIM = 2048

# Caption-model settings; everything except the embedding dimension comes
# from the pickled caption data.
embedding_dim = 200
vocab_size = caption_data['vocab_size']
max_length = caption_data['max_length']
embedding_matrix = caption_data['embedding_matrix']
wordtoidx = caption_data['wordtoidx']
idxtoword = caption_data['idxtoword']

# Tokens marking the beginning and the end of a caption.
START = "startseq"
STOP = "endseq"

In [24]:
# Rebuild the caption network, then restore its weights from model_path.
caption_model = create_model(
    OUTPUT_DIM=OUTPUT_DIM,
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    max_length=max_length,
    embedding_matrix=embedding_matrix,
)
caption_model.load_weights(model_path)

In [48]:
import ipywidgets as widgets
import io
# Widgets that make up the captioning user interface.
btn_upload = widgets.FileUpload()  # file picker for the image
btn_fpoints = widgets.Button(description='Get Caption')  # starts caption generation
lbl_orig = widgets.Label()  # heading for the uploaded image
out_pl = widgets.Output()  # area in which the uploaded image is shown
lbl_pred = widgets.Label()  # displays the generated caption

In [49]:
def on_click_caption(change):
    """
    Button callback: show the uploaded image and display its caption.

    :param change: argument supplied by the button's click event (unused)
    """
    uploads = btn_upload.value
    # ipywidgets 7 exposes uploads as a {filename: file_info} dict, while
    # ipywidgets 8 exposes them as a tuple of file_info dicts.
    if isinstance(uploads, dict):
        files = list(uploads.values())
    else:
        files = list(uploads)
    if not files:
        # Nothing to caption yet; tell the user instead of raising.
        lbl_pred.value = 'Please upload an image first.'
        return

    # When several files are present, caption the last one.
    img = Image.open(io.BytesIO(files[-1]['content']))
    lbl_orig.value = 'Uploaded Image:'
    out_pl.clear_output()
    with out_pl:
        display(img)

    # The caption model expects the features as a batch of one vector.
    features = encodeImage(
        img, WIDTH, HEIGHT, preprocess_input, encode_model, OUTPUT_DIM
    ).reshape((1, OUTPUT_DIM))
    caption = generateCaption(
        features, caption_model, max_length, wordtoidx, idxtoword, START, STOP
    )
    lbl_pred.value = f"Caption: {caption}"


btn_fpoints.on_click(on_click_caption)  # what to do when button is clicked

In [50]:
# Stack the widgets vertically. This stays the last expression of the cell
# so that the notebook renders the resulting box.
widgets.VBox(
    children=[
        widgets.Label('Image Captioner'),
        btn_upload,
        btn_fpoints,
        lbl_orig,
        out_pl,
        lbl_pred,
    ]
)

VBox(children=(Label(value='Upload an image.'), FileUpload(value={}, description='Upload'), Button(description…