In [None]:
from os import listdir
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from pickle import dump
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
from keras.models import Model
from keras.layers import Input
from keras.layers import Flatten
import string
from pickle import load
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint
from keras.utils import plot_model
from numpy import argmax
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu 
from PIL import Image 
import matplotlib.pyplot as plt
import numpy as np
  

# Prepare photo data


In [None]:
#extract features from each photo in directory
def extract_features(directory):
    """Extract one VGG16 feature array for every image file in ``directory``.

    The network is cut one layer before its output layer, so each photo is
    described by the activations feeding the classifier rather than by the
    class scores themselves.

    Args:
        directory: Path of a folder; every entry returned by ``listdir`` is
            loaded as an image.

    Returns:
        dict mapping image id (file name up to the first '.') to the array
        returned by ``model.predict`` for that single image.
    """
    base=VGG16()
    # Pick the second-to-last layer explicitly. `model.layers.pop()` does nothing
    # on Keras versions where `layers` returns a copy, which silently kept the
    # classifier output as the "feature".
    model=Model(inputs=base.inputs,outputs=base.layers[-2].output)
    features=dict()
    for name in listdir(directory):
        filename=directory+'/'+name
        image=load_img(filename,target_size=(224,224))
        image=img_to_array(image)
        # add a leading batch dimension of size 1
        image=image.reshape(1,image.shape[0],image.shape[1],image.shape[2])
        image=preprocess_input(image)
        feature=model.predict(image,verbose=0)
        image_id=name.split('.')[0]
        features[image_id]=feature
    return features
# Extract features for every photo once and cache them on disk.
dirc='../input/flickr8k-sau/Flickr_Data/Images/'
features=extract_features(dirc)
# A context manager guarantees the pickle file is flushed and closed.
with open('features.pkl','wb') as features_file:
    dump(features,features_file)
print(len(features))
        

In [None]:
#load photo features
def load_photo_features(filename,dataset):
    """Load pickled photo features and keep only the ids listed in ``dataset``.

    Args:
        filename: Path of a pickle file holding a dict keyed by image id.
        dataset: Iterable of image ids to keep.

    Returns:
        dict mapping each id in ``dataset`` to its stored feature.

    Raises:
        KeyError: if an id in ``dataset`` is missing from the pickled dict.
    """
    # NOTE: unpickling can execute arbitrary code; only load files written by
    # this notebook.
    with open(filename,'rb') as handle:
        all_features=load(handle)        #load all features
    return {k: all_features[k] for k in dataset} #filter features
    

# Prepare text data

In [None]:
#load doc into memory
def load_doc(filename):
    """Return the whole content of the text file ``filename`` as one string.

    The file is opened read-only and is closed even if reading fails.
    """
    with open(filename,'r') as file:
        return file.read()
 
#extract descriptions for images
def load_descriptions(filename):
    """Parse a caption file into a dict of image id -> list of captions.

    Each usable line is split on whitespace: the first token is the image file
    name (only the part before its first '.' is kept as the id) and the
    remaining tokens are re-joined with single spaces to form the caption.

    Args:
        filename: Path of the caption text file.

    Returns:
        dict mapping image id to the list of its caption strings, in file order.
    """
    doc=load_doc(filename)
    mapping=dict()
    for line in doc.split('\n'):
        tokens=line.split()
        # Skip very short lines, and whitespace-only lines that would otherwise
        # fail on tokens[0].
        if len(line)<2 or not tokens:
            continue
        image_id,image_desc=tokens[0],tokens[1:]
        image_id=image_id.split('.')[0]
        mapping.setdefault(image_id,list()).append(' '.join(image_desc))
    return mapping

def clean_descriptions(descriptions):
    """Normalise every caption in ``descriptions`` in place.

    Each caption is split on whitespace; every word is lower-cased and stripped
    of ``string.punctuation`` characters; words of length <= 1 or containing
    non-alphabetic characters are dropped; the survivors are re-joined with
    single spaces. Nothing is returned: the lists are mutated.
    """
    strip_punct=str.maketrans('','',string.punctuation)
    for captions in descriptions.values():
        for position,caption in enumerate(captions):
            kept=[]
            for raw_word in caption.split():
                word=raw_word.lower().translate(strip_punct)
                # drop leftovers such as "s"/"a" and anything with digits
                if len(word)>1 and word.isalpha():
                    kept.append(word)
            captions[position]=' '.join(kept)
            
#convert loaded desc into vocab of words
def to_vocabulary(descriptions):
    """Return the set of distinct whitespace-separated words over all captions.

    Args:
        descriptions: dict mapping image id to a list of caption strings.
    """
    all_desc=set()
    # plain loops instead of a list comprehension run only for its side effects
    for desc_list in descriptions.values():
        for d in desc_list:
            all_desc.update(d.split())
    return all_desc

#save descriptions to file, one per line
def save_descriptions(descriptions,filename):
    """Write all captions to ``filename``, one "<image id> <caption>" per line.

    Lines are joined with '\\n' and no trailing newline is written.

    Args:
        descriptions: dict mapping image id to a list of caption strings.
        filename: Path of the text file to create or overwrite.
    """
    lines=[key+ ' '+ desc
           for key, desc_list in descriptions.items()
           for desc in desc_list]
    data='\n'.join(lines)
    # a context manager closes the file even if the write fails
    with open(filename,'w') as file:
        file.write(data)

filen='../input/flickr8k-sau/Flickr_Data/Flickr_TextData/Flickr8k.token.txt'
descriptions=load_descriptions(filen) #parse descriptions
print(len(descriptions))
clean_descriptions(descriptions)
vocabulary = to_vocabulary(descriptions)
print(len(vocabulary))
# Save under the name the training and evaluation cells read ('descriptions.txt').
# Writing 'descriptions1.txt' left those cells without their input file on a
# fresh Restart & Run All.
save_descriptions(descriptions, 'descriptions.txt')

    
            

    

# Deep learning model
### 1. Loading data
### 2. Defining the model
### 3. Fitting the model


In [None]:
#load a predefined list of photo identifiers
def load_set(filename):
    """Read a list of photo file names and return the set of their identifiers.

    The file is expected to hold one name per line; empty lines are ignored and
    the identifier is whatever precedes the first '.' on the line.
    """
    doc=load_doc(filename)
    return {entry.split('.')[0] for entry in doc.split('\n') if len(entry) >= 1}

#load clean description into memory
def load_clean_descriptions(filename,dataset):
    """Load cleaned captions for the images in ``dataset`` and wrap them in markers.

    Each non-blank line is split on whitespace into an image id (first token)
    and a caption (remaining tokens). Captions whose id is in ``dataset`` are
    stored as 'startseq <caption> endseq'.

    Args:
        filename: Path of the cleaned caption file ("<id> <caption>" per line).
        dataset: Container of image ids to keep (membership-tested with ``in``).

    Returns:
        dict mapping image id to the list of its wrapped captions.
    """
    doc=load_doc(filename)
    descriptions=dict()
    for line in doc.split('\n'):
        tokens=line.split() #split line by white space
        if not tokens:
            # blank line (e.g. a trailing newline) - nothing to unpack
            continue
        image_id,image_desc=tokens[0],tokens[1:] #split id with descrp
        if image_id in dataset:
            desc='startseq '+ ' '.join(image_desc)+' endseq'    #wrap descr in tokens
            descriptions.setdefault(image_id,list()).append(desc)
    return descriptions
        
#convert a dictionary of clean_descriptions to a list of descriptions
def to_lines(descriptions):
    """Flatten a dict of image id -> caption list into one list of captions.

    Captions keep the dict's iteration order and the order within each list.
    """
    all_desc=list()
    # extend() instead of a list comprehension run only for its side effects
    for desc_list in descriptions.values():
        all_desc.extend(desc_list)
    return all_desc
        

In [None]:
#fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
    """Return a ``Tokenizer`` fitted on every caption found in ``descriptions``."""
    caption_tokenizer=Tokenizer()
    caption_tokenizer.fit_on_texts(to_lines(descriptions))
    return caption_tokenizer

#length of description with most words
def max_len(descriptions):
    """Return the number of words in the longest caption of ``descriptions``.

    Raises ``ValueError`` (from ``max``) when there are no captions at all.
    """
    word_counts=[len(caption.split()) for caption in to_lines(descriptions)]
    return max(word_counts)

#create sequences of images, input_sequences and output words for an image
def create_sequences(tokenizer,max_length,desc_list,photo,vocab_size):
    """Expand one image's captions into (photo, word-prefix, next-word) samples.

    Every caption is integer-encoded with ``tokenizer``; for each position
    ``i >= 1`` the first ``i`` tokens (padded to ``max_length``) form the input
    sequence and token ``i`` (encoded with ``to_categorical`` over
    ``vocab_size`` classes) is the target.

    Returns:
        Three arrays built from the collected samples: the photo repeated once
        per sample, the padded input sequences, and the encoded target words.
    """
    photo_inputs,text_inputs,next_words=[],[],[]
    for caption in desc_list:
        encoded=tokenizer.texts_to_sequences([caption])[0]
        for split_at in range(1,len(encoded)):
            prefix=pad_sequences([encoded[:split_at]],maxlen=max_length)[0]
            target=to_categorical([encoded[split_at]],num_classes=vocab_size)[0]
            photo_inputs.append(photo)
            text_inputs.append(prefix)
            next_words.append(target)
    return array(photo_inputs),array(text_inputs),array(next_words)


In [None]:
#define the captioning model
def define_model(vocab_size,max_lenth):
    """Build and compile the merge-style captioning model.

    Two inputs are combined: a 4096-long photo feature vector (dropout ->
    Dense 256) and a word-index sequence (Embedding 256 with ``mask_zero`` ->
    dropout -> LSTM 256). The two 256-wide branches are added, passed through a
    Dense 256 layer and a softmax over ``vocab_size`` words.

    Args:
        vocab_size: Number of output classes and embedding input dimension.
        max_lenth: Length of the word-index input sequence. The spelling of the
            parameter name is kept so existing keyword callers keep working.

    Returns:
        The compiled model (categorical cross-entropy loss, Adam optimizer).
    """
    #feature extractor model
    inputs1=Input(shape=(4096,))
    fe1=Dropout(0.5)(inputs1)
    fe2=Dense(256,activation='relu')(fe1)
    #sequence model
    # Use the argument; the body previously read the notebook-global
    # `max_length`, silently ignoring what the caller passed.
    inputs2=Input(shape=(max_lenth,))
    se1=Embedding(vocab_size,256,mask_zero=True)(inputs2)
    se2=Dropout(0.5)(se1)
    se3=LSTM(256)(se2)
    #decoder model
    decoder1=add([fe2,se3])
    decoder2=Dense(256,activation='relu')(decoder1)
    outputs=Dense(vocab_size,activation='softmax')(decoder2)
    #tie it together [image,seq][word]
    model=Model(inputs=[inputs1,inputs2],outputs=outputs)
    model.compile(loss='categorical_crossentropy',optimizer='adam')
    return model

#data generator, used in a call to model.fit_generator()
def data_generator(descriptions,photos,tokenizer,max_length,vocab_size):
    """Endlessly yield one training batch per image.

    For every image id in ``descriptions`` the first row of its stored photo
    feature is expanded with ``create_sequences`` and yielded as
    ``[[photo_batch, sequence_batch], target_batch]``. After the last image the
    loop starts over, so the generator never terminates.
    """
    while True: #loop forever over the images
        for image_id,captions in descriptions.items():
            photo_vector=photos[image_id][0]     #first row of the stored feature
            img_batch,seq_batch,word_batch=create_sequences(
                tokenizer,max_length,captions,photo_vector,vocab_size)
            yield [[img_batch,seq_batch],word_batch]


In [None]:
# train dataset
# Everything assigned in this cell (train, train_descriptions, train_features,
# tokenizer, vocab_size, max_length) is a notebook global read by later cells.
 
# load training dataset (6K)
filename = '../input/flickr8k-sau/Flickr_Data/Flickr_TextData/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
# descriptions: 'descriptions.txt' must already exist in the working directory
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
# photo features: 'features.pkl' must already exist in the working directory
train_features = load_photo_features('features.pkl', train)
print('Photos: train=%d' % len(train_features))
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
# +1 presumably leaves room for index 0 (the Embedding uses mask_zero=True) - confirm
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# determine the maximum sequence length
max_length = max_len(train_descriptions)
print('Description Length: %d' % max_length)

#test the data generator: pull a single batch and show the array shapes
generator = data_generator(train_descriptions, train_features, tokenizer, max_length, vocab_size)
inputs, outputs = next(generator)
print(inputs[0].shape)
print(inputs[1].shape)
print(outputs.shape)

In [None]:


#define the model
model=define_model(vocab_size,max_length)
#train the model and run epochs manually & save after each epoch
epochs=20
# one generator batch per training image makes up an epoch
steps=len(train_descriptions)
for i in range(epochs):
    # a fresh generator is created for every epoch
    generator=data_generator(train_descriptions,train_features,tokenizer,max_length,vocab_size)
    model.fit_generator(generator,epochs=1,steps_per_epoch=steps,verbose=1) #fit for 1 epoch
    # checkpoints are written as model_0.h5 ... model_<epochs-1>.h5
    model.save('model_'+str(i)+'.h5')


In [None]:
#map an integer to a word
def word_for_id(integer,tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` equals ``integer``.

    The mapping is scanned in iteration order and the first match wins;
    ``None`` is returned when no word has that index.
    """
    matches=(word for word,index in tokenizer.word_index.items() if index==integer)
    return next(matches,None)

#generate a description for an image
def generate_desc(model,tokenizer,photo,max_length):
    """Greedily generate a caption for ``photo``, one word per step.

    Starting from 'startseq', the text so far is encoded and padded to
    ``max_length``, the model predicts the next word (argmax of its output),
    and that word is appended. Generation stops after ``max_length`` steps,
    when the predicted index has no word, or once 'endseq' has been appended.

    Returns:
        The generated text, including the leading 'startseq' marker.
    """
    words=['startseq'] #seed the generation process
    for _ in range(max_length):
        encoded=tokenizer.texts_to_sequences([' '.join(words)])[0]
        padded=pad_sequences([encoded],maxlen=max_length)
        probabilities=model.predict([photo,padded],verbose=0)
        next_word=word_for_id(argmax(probabilities),tokenizer)
        if next_word is None:
            break
        words.append(next_word)
        if next_word=='endseq':
            break
    return ' '.join(words)

#evaluate the skill of the model
def evaluate_model(model,descriptions,photos,tokenizer,max_length):
    """Generate a caption for every image and print corpus BLEU-1..4.

    Args:
        model: Trained captioning model passed on to ``generate_desc``.
        descriptions: dict mapping image id to its reference caption strings.
        photos: dict mapping image id to the photo feature fed to the model.
        tokenizer: Tokenizer used to encode and decode words.
        max_length: Maximum caption length passed on to ``generate_desc``.

    Returns:
        None. The four scores are printed.
    """
    actual,predicted= list(),list()
    for key,desc_list in descriptions.items():
        yhat=generate_desc(model,tokenizer,photos[key],max_length) #generate descrip
        #store actual and predicted, both as token lists
        actual.append([d.split() for d in desc_list])
        predicted.append(yhat.split())
    #calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    # Uniform weights over the first three n-gram orders; the previous
    # (0.3, 0.3, 0.3, 0) summed to 0.9 and overstated the score.
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(1/3, 1/3, 1/3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
        
        

In [None]:
# test dataset (evaluation)
# Relies on `tokenizer` and `max_length` defined by the training-data cell.
 
# load test set
filename = '../input/flickr8k-sau/Flickr_Data/Flickr_TextData/Flickr_8k.testImages.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))
# descriptions: 'descriptions.txt' must already exist in the working directory
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))
# photo features
test_features = load_photo_features('features.pkl', test)
print('Photos: test=%d' % len(test_features))

#load and evaluate
# index of the saved checkpoint to evaluate (model_19.h5)
i=19
mod=load_model('model_'+str(i)+'.h5')
evaluate_model(mod,test_descriptions,test_features,tokenizer,max_length)

# Generate new caption

In [None]:
#load doc into memory  load_doc()
#load a pre-defined list of photo identifiers   load_set()
#load clean descriptions into memory   load_clean_descriptions()
#convert a dictionary to a list of descriptions   to_lines()
#fit a tokenizer    create_tokenizer()
#load training dataset
#prepare tokenizer tokenizer=create_tokenizer(train_descriptions)
# A context manager guarantees the pickle file is flushed and closed.
with open('tokenizer.pkl','wb') as tokenizer_file: #saving the tokenizer
    dump(tokenizer,tokenizer_file)
 
#extract features from each photo in directory extract_features()
def ext_features(filename):
    """Extract the VGG16 feature array for the single image at ``filename``.

    The network is cut one layer before its output layer. The image is loaded
    at 224x224, given a leading batch dimension of size 1, and preprocessed
    with the VGG16 ``preprocess_input``.

    Returns:
        The array returned by ``model.predict`` for that one image.
    """
    base=VGG16()
    # Pick the second-to-last layer explicitly. `model.layers.pop()` does nothing
    # on Keras versions where `layers` returns a copy, which silently kept the
    # classifier output as the "feature".
    model=Model(inputs=base.inputs,outputs=base.layers[-2].output)
    image=load_img(filename,target_size=(224,224))
    image=img_to_array(image)
    image=image.reshape(1,image.shape[0],image.shape[1],image.shape[2])
    image=preprocess_input(image)
    feature=model.predict(image,verbose=0)
    return feature
#map integer to a word   word_for_id()
#generate a description for an image generate_desc()

# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote.
with open('tokenizer.pkl','rb') as tokenizer_file:
    tokenizer=load(tokenizer_file) #load the tokenizer
max_length= 34 #predefine the max sequence length
# `i` is the checkpoint index set in the evaluation cell
mod=load_model('model_'+str(i)+'.h5')#load the model load_model()
photo_path='../input/flickr8k-sau/Flickr_Data/Images/1020651753_06077ec457.jpg'
photo = ext_features(photo_path) #load and prepare the photograph
# read the pixels inside the context manager so the image file is closed afterwards
with Image.open(photo_path) as photo_image:
    img_array = np.array(photo_image)
plt.imshow(img_array)

description = generate_desc(mod, tokenizer, photo, max_length)#generate description
print(description)