In [3]:
import pandas as pd
import numpy as np
import cv2
import keras
import json
import pickle
from keras.applications.vgg16 import VGG16
from keras.applications.resnet50 import ResNet50, preprocess_input, decode_predictions
from keras.preprocessing import image
from keras.models import Model, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Dense, Dropout, Embedding, LSTM
from keras.layers.merge import add

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Captioning model restored from a saved .h5 file; predict_caption() calls
# model.predict() on it for every generated word.
MODEL_PATH = '/home/rish/Downloads/Image-Caption/model_9.h5'
model = load_model(MODEL_PATH)

In [4]:
# Complete ResNet50 network with ImageNet weights for 224x224 RGB input.
# It is only a donor of layers: model_resnet is carved out of it afterwards.
model_temp = ResNet50(
    weights="imagenet",
    input_shape=(224, 224, 3),
)

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels.h5


In [5]:
# Feature extractor: shares model_temp's input but stops at its second-to-last
# layer (presumably the pooled features just before the classifier - confirm
# with model_temp.summary()).
model_resnet = Model(
    model_temp.input,
    model_temp.layers[-2].output,
)

In [6]:
def preprocess_img(img):
    """Load an image file and prepare it as a single-item batch for ResNet50.

    Args:
        img: Path of the image file, handed to ``image.load_img``.

    Returns:
        The array produced by ResNet50's ``preprocess_input`` for the image
        resized to 224x224, with an extra leading axis of length 1.
    """
    loaded = image.load_img(img, target_size=(224, 224))
    pixels = image.img_to_array(loaded)
    # The network predicts on batches, so wrap the single image in one.
    batch = np.expand_dims(pixels, axis=0)
    # Normalisation
    return preprocess_input(batch)

In [7]:
def encode_image(img):
    """Turn an image file into the feature vector produced by ``model_resnet``.

    Args:
        img: Path of the image file, forwarded to ``preprocess_img``.

    Returns:
        The prediction of ``model_resnet`` reshaped to ``(1, n)``, where ``n``
        is the size of the prediction's second axis.
    """
    batch = preprocess_img(img)
    features = model_resnet.predict(batch)
    # Force a 2-D (1, n) layout regardless of any trailing axes.
    return features.reshape(1, features.shape[1])

In [9]:
# Vocabulary lookup tables used by predict_caption(): word -> index and the
# reverse index -> word mapping.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles that come from a trusted source.
with open('/home/rish/Downloads/Image-Caption/word_to_idx.pkl', 'rb') as w2i:
    word_to_idx = pickle.load(w2i)

with open('/home/rish/Downloads/Image-Caption/idx_to_word.pkl', 'rb') as i2w:
    idx_to_word = pickle.load(i2w)

In [10]:
def predict_caption(photo):
    """Generate a caption for an encoded image by greedy decoding.

    Starting from the ``startseq`` token, the model is asked for the next word
    given the image features and the words generated so far. The most probable
    word is appended and the process repeats until ``endseq`` is predicted or
    35 words have been generated.

    Args:
        photo: Image features passed as the first input of ``model.predict``
            (presumably the ``(1, n)`` array from ``encode_image`` - confirm).

    Returns:
        The generated words joined by single spaces, without the ``startseq``
        and ``endseq`` markers.

    Raises:
        KeyError: If the predicted index has no entry in ``idx_to_word``.
    """
    max_len = 35

    words = ["startseq"]
    for _ in range(max_len):
        # Encode the words generated so far; words missing from the vocabulary are skipped.
        sequence = [word_to_idx[w] for w in words if w in word_to_idx]
        sequence = pad_sequences([sequence], maxlen=max_len, padding='post')

        ypred = model.predict([photo, sequence])
        # Word with max prob always - Greedy Sampling
        word = idx_to_word[int(ypred.argmax())]

        # Stop before storing the end marker, so nothing has to be sliced off
        # afterwards. The previous unconditional [1:-1] slice also dropped the
        # last real word whenever the loop ran out without predicting "endseq".
        if word == "endseq":
            break
        words.append(word)

    # Drop only the leading "startseq" marker.
    return ' '.join(words[1:])

In [11]:
# Inspect the word -> index vocabulary; the dict is echoed as this cell's output.
word_to_idx

{'in': 1,
 'the': 2,
 'on': 3,
 'is': 4,
 'and': 5,
 'dog': 6,
 'with': 7,
 'man': 8,
 'of': 9,
 'two': 10,
 'white': 11,
 'black': 12,
 'boy': 13,
 'are': 14,
 'woman': 15,
 'girl': 16,
 'to': 17,
 'wearing': 18,
 'at': 19,
 'people': 20,
 'water': 21,
 'red': 22,
 'young': 23,
 'brown': 24,
 'an': 25,
 'his': 26,
 'blue': 27,
 'dogs': 28,
 'running': 29,
 'through': 30,
 'playing': 31,
 'while': 32,
 'shirt': 33,
 'down': 34,
 'standing': 35,
 'ball': 36,
 'little': 37,
 'grass': 38,
 'snow': 39,
 'child': 40,
 'person': 41,
 'jumping': 42,
 'over': 43,
 'three': 44,
 'front': 45,
 'sitting': 46,
 'holding': 47,
 'up': 48,
 'field': 49,
 'small': 50,
 'by': 51,
 'large': 52,
 'green': 53,
 'one': 54,
 'group': 55,
 'yellow': 56,
 'her': 57,
 'walking': 58,
 'children': 59,
 'men': 60,
 'into': 61,
 'air': 62,
 'beach': 63,
 'near': 64,
 'mouth': 65,
 'jumps': 66,
 'another': 67,
 'for': 68,
 'street': 69,
 'runs': 70,
 'its': 71,
 'from': 72,
 'riding': 73,
 'stands': 74,
 'as': 75,


In [12]:
# Inspect the index -> word vocabulary; the dict is echoed as this cell's output.
idx_to_word

{1: 'in',
 2: 'the',
 3: 'on',
 4: 'is',
 5: 'and',
 6: 'dog',
 7: 'with',
 8: 'man',
 9: 'of',
 10: 'two',
 11: 'white',
 12: 'black',
 13: 'boy',
 14: 'are',
 15: 'woman',
 16: 'girl',
 17: 'to',
 18: 'wearing',
 19: 'at',
 20: 'people',
 21: 'water',
 22: 'red',
 23: 'young',
 24: 'brown',
 25: 'an',
 26: 'his',
 27: 'blue',
 28: 'dogs',
 29: 'running',
 30: 'through',
 31: 'playing',
 32: 'while',
 33: 'shirt',
 34: 'down',
 35: 'standing',
 36: 'ball',
 37: 'little',
 38: 'grass',
 39: 'snow',
 40: 'child',
 41: 'person',
 42: 'jumping',
 43: 'over',
 44: 'three',
 45: 'front',
 46: 'sitting',
 47: 'holding',
 48: 'up',
 49: 'field',
 50: 'small',
 51: 'by',
 52: 'large',
 53: 'green',
 54: 'one',
 55: 'group',
 56: 'yellow',
 57: 'her',
 58: 'walking',
 59: 'children',
 60: 'men',
 61: 'into',
 62: 'air',
 63: 'beach',
 64: 'near',
 65: 'mouth',
 66: 'jumps',
 67: 'another',
 68: 'for',
 69: 'street',
 70: 'runs',
 71: 'its',
 72: 'from',
 73: 'riding',
 74: 'stands',
 75: 'as',


In [26]:
# Feature vector of a sample image on disk, used to try out predict_caption().
sample_image_path = '/home/rish/Downloads/download (1).png'
encode = encode_image(sample_image_path)

In [27]:
# Caption the sample image from its encoded features; the string is shown as the cell output.
predict_caption(encode)

'two dogs are running on the grass'

In [20]:
def caption_this_image(image):
    """Produce a caption for an image file in one call.

    Args:
        image: Path of the image file, forwarded to ``encode_image``.

    Returns:
        Whatever ``predict_caption`` returns for the encoded image.
    """
    # The parameter shadows the imported `image` module only inside this
    # function; the helpers called here still see the module-level name.
    return predict_caption(encode_image(image))