In [9]:
import os
import pickle
import warnings
from os import listdir
from pickle import dump

# Silence library warnings before the heavy third-party imports below.
warnings.filterwarnings('ignore')

import keras
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.callbacks import ModelCheckpoint
from keras.layers import Input,Dense,LSTM,Embedding,Dropout
from keras.layers.merge import add
from keras.models import Model, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical,plot_model
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu
from tensorflow.keras.utils import img_to_array
from tensorflow.keras.utils import load_img




In [None]:


# Extract a feature vector from every photo in the directory
def extract_features(directory):
    """Run every image in `directory` through VGG16 and collect the outputs.

    Args:
        directory: path of a folder whose entries are all image files.

    Returns:
        dict mapping image id (file name up to its first '.') to the array
        returned by ``model.predict`` for that image.
    """
    # Load the complete pre-trained network, classifier head included. The
    # captioning model in this notebook takes a 4096-wide photo input, which
    # the convolution-only variant (include_top=False) cannot provide.
    model = VGG16(weights='imagenet')
    # Cut the network at its penultimate layer by selecting that layer's
    # output explicitly; popping from `model.layers` does not rewire the
    # model's output.
    model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
    model.summary()  # summary() prints on its own; wrapping it in print() adds "None"

    features = dict()
    for name in listdir(directory):
        filename = os.path.join(directory, name)
        image = load_img(filename, target_size=(224, 224))
        image = img_to_array(image)  # image pixels -> numpy array
        # add a leading batch axis of size 1
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        image = preprocess_input(image)  # VGG-specific preprocessing
        feature = model.predict(image, verbose=0)
        image_id = name.split('.')[0]  # image id = file name without extension
        features[image_id] = feature
        print('>%s' % name)
    return features



# Extract the features from all the images
directory = '/kaggle/input/flicker-8k-image-dataset-captionstxt/Images'
features = extract_features(directory)
print('Extracted Features --> %d' % len(features))
# Save under the name the evaluation cell loads ('features.pkl'); the context
# manager guarantees the pickle is flushed and the handle closed.
with open('features.pkl', 'wb') as feature_file:
    dump(features, feature_file)

In [None]:
import string

# Load a document into memory
def load_doc(filename):
    """Return the complete text content of the file at `filename`.

    The file is opened read-only in text mode and is closed even if reading
    raises.
    """
    with open(filename, 'r') as file:
        return file.read()

# Build the image-id -> descriptions mapping from the captions text
def load_descriptions(doc):
    """Parse caption text into ``{image_id: [description, ...]}``.

    Each line is split on whitespace: the first token names the image
    (everything from its first '.' onwards is dropped) and the remaining
    tokens form the description.

    Args:
        doc: full text of the captions file.

    Returns:
        dict mapping image id to the list of its descriptions, in file order.
    """
    # NOTE(review): if the captions file is comma separated
    # ("name.jpg,caption text"), the first caption word is glued to the image
    # token and is dropped by the '.' split below - confirm the file format.
    mapping = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        # A usable line needs an image token plus at least one caption word.
        # Testing the token count (not the raw line length) also skips
        # whitespace-only lines, which used to raise IndexError.
        if len(tokens) < 2:
            continue
        image_id, image_desc = tokens[0], tokens[1:]
        # strip the file extension (and anything after it) from the image id
        image_id = image_id.split('.')[0]
        mapping.setdefault(image_id, list()).append(' '.join(image_desc))
    return mapping



def clean_descriptions(descriptions):
    """Normalise every description in place.

    For each description: split on whitespace, lower-case, strip
    punctuation, drop tokens of length <= 1 and drop tokens that are not
    purely alphabetic. The cleaned text replaces the original entry.

    Args:
        descriptions: dict mapping image id to a list of description strings;
            the lists are modified in place.

    Returns:
        None.
    """
    # translation table that deletes every punctuation character
    table = str.maketrans('', '', string.punctuation)
    for key, desc_list in descriptions.items():
        # walk every description of the image (this loop was missing, which
        # left the index `i` undefined)
        for i in range(len(desc_list)):
            desc = desc_list[i]
            # tokenize
            desc = desc.split()
            # convert to lower case
            desc = [word.lower() for word in desc]
            # remove punctuation from each token
            desc = [w.translate(table) for w in desc]
            # remove hanging 's' and 'a'
            desc = [word for word in desc if len(word) > 1]
            # remove tokens with numbers in them
            desc = [word for word in desc if word.isalpha()]
            # store as string
            desc_list[i] = ' '.join(desc)
        
# Collect the set of distinct words used across all loaded descriptions
def to_vocabulary(descriptions):
    """Return the set of whitespace-separated words found in any description.

    Args:
        descriptions: dict mapping image id to a list of description strings.
    """
    vocabulary = set()
    for captions in descriptions.values():
        for caption in captions:
            vocabulary.update(caption.split())
    return vocabulary


# Save descriptions to file, one "<image_id> <description>" per line
def save_descriptions(descriptions, filename):
    """Write every description to `filename`, one per line.

    Each line has the form ``"<image_id> <description>"``; lines are joined
    with '\\n' and no trailing newline is written.

    Args:
        descriptions: dict mapping image id to a list of description strings.
        filename: path of the text file to (over)write.
    """
    lines = [
        key + ' ' + desc
        for key, desc_list in descriptions.items()
        for desc in desc_list
    ]
    # the context manager closes the file even if the write fails
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))
    
filename = '/kaggle/input/flicker-8k-image-dataset-captionstxt/captions.txt'
# Load the raw captions file
doc = load_doc(filename)
# Parse the descriptions
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))
# Clean the descriptions in place
clean_descriptions(descriptions)
# Summarize the vocabulary (it has to be built before it can be measured)
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))

# Save to the file the later cells read back ('descriptions.txt')
save_descriptions(descriptions, 'descriptions.txt')

        

In [None]:
from pickle import load
# Load a document into memory
def load_doc(filename):
    """Return the complete text content of the file at `filename`.

    Re-declared in this cell; it shadows the `load_doc` defined in the
    description-preparation cell. The file is opened read-only in text mode
    and is closed even if reading raises.
    """
    with open(filename, 'r') as file:
        return file.read()


# Load a pre-defined list of photo identifiers
def load_set(filename):
    """Return the set of image identifiers listed in `filename`.

    Every non-empty line contributes one identifier: the text before the
    line's first '.'.

    Args:
        filename: path of a text file with one image reference per line.

    Returns:
        set of identifier strings.
    """
    doc = load_doc(filename)
    dataset = set()
    # process line by line
    for line in doc.split('\n'):
        # skip empty lines
        if len(line) < 1:
            continue
        # the identifier is everything before the first '.'
        identifier = line.split('.')[0]
        dataset.add(identifier)
    return dataset

# Load the cleaned descriptions of the images in `dataset`
def load_clean_descriptions(filename, dataset):
    """Load descriptions for the images in `dataset`, wrapped in sequence markers.

    Each non-blank line of the file is split on whitespace; the first token
    is the image id and the rest is the description. Descriptions of images
    in `dataset` are stored as ``'startseq <description> endseq'``.

    Args:
        filename: path of the descriptions file.
        dataset: collection of image ids to keep.

    Returns:
        dict mapping image id to the list of its wrapped descriptions.
    """
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # blank lines (e.g. a trailing newline) carry no image id
        if not tokens:
            continue
        # split id from description
        image_id, image_desc = tokens[0], tokens[1:]
        # skip images not in the set
        if image_id in dataset:
            # The markers must be separated from the caption by spaces,
            # otherwise they fuse with the first and last caption words.
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            descriptions.setdefault(image_id, list()).append(desc)
    return descriptions



# Load the photo features of the images in `dataset`
def load_photo_features(filename, dataset):
    """Load pickled photo features and keep only those listed in `dataset`.

    Args:
        filename: path of a pickle file holding a mapping of image id to
            feature.
        dataset: iterable of image ids to keep.

    Returns:
        dict with one entry per id in `dataset`.

    Raises:
        KeyError: if an id in `dataset` is missing from the pickled mapping.
    """
    # SECURITY: unpickling executes arbitrary code embedded in the file; only
    # load feature files this notebook produced itself.
    with open(filename, 'rb') as feature_file:
        all_features = load(feature_file)
    # filter features
    features = {k: all_features[k] for k in dataset}
    return features


# Load the image identifiers used for training (read from the captions file)
filename = '/kaggle/input/flicker-8k-image-dataset-captionstxt/captions.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
# Descriptions: read the cleaned file and restrict it to the training ids
# (load_clean_descriptions needs both the path and the id set)
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))

# Photo features: same pickle the evaluation cell loads
train_features = load_photo_features('features.pkl', train)
print('Photos: train=%d' % len(train_features))



# Flatten a dictionary of descriptions into a single list of strings
def to_lines(descriptions):
    """Return all descriptions of all images as one flat list.

    Args:
        descriptions: dict mapping image id to a list of description strings.

    Returns:
        list of description strings, grouped by image in dict order.
    """
    flattened = list()
    for captions in descriptions.values():
        flattened.extend(captions)
    return flattened

# Fit a tokenizer on the caption descriptions
def create_tokenizer(descriptions):
    """Return a Keras `Tokenizer` fitted on every description.

    Args:
        descriptions: dict mapping image id to a list of description strings.
    """
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()  # was misspelled `Tokenzier`, a NameError
    tokenizer.fit_on_texts(lines)
    return tokenizer

# Get the vocabulary size
tokenizer = create_tokenizer(train_descriptions)
# +1 as in the original cell: one more than the number of indexed words
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)

# Word count of the longest description
def max_length(descriptions):
    """Return the largest whitespace-separated word count of any description.

    Args:
        descriptions: dict mapping image id to a list of description strings.

    Raises:
        ValueError: if there are no descriptions at all.
    """
    word_counts = (len(caption.split()) for caption in to_lines(descriptions))
    return max(word_counts)

# Create image / input-sequence / output-word training triples for one image
def create_sequences(tokenizer, max_length, desc_list,photo):
    """Expand the descriptions of one image into supervised training samples.

    Every description is integer-encoded and split into one sample per prefix
    (the first i tokens as input, token i as the target).

    Args:
        tokenizer: fitted tokenizer providing `texts_to_sequences`.
        max_length: length every input sequence is padded to.
        desc_list: list of description strings of the image.
        photo: feature of the image; repeated once per generated sample.

    Returns:
        Tuple of three numpy arrays ``(x1, x2, y)``: photo features, padded
        input sequences and one-hot encoded next words.

    Note:
        The one-hot width is read from the notebook-level `vocab_size`.
    """
    x1, x2, y = list(), list(), list()
    # walk through each description for the image
    for desc in desc_list:
        # encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        # split one sequence into multiple x,y pairs
        for i in range(1, len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # encode output sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # store
            x1.append(photo)
            x2.append(in_seq)
            y.append(out_seq)
    # `array` was never imported; use the numpy alias the notebook imports
    return np.array(x1), np.array(x2), np.array(y)


# Define the captioning model
def define_model(vocab_size,max_length):
    """Build and compile the merge-style captioning network.

    The photo branch maps a 4096-wide feature vector to 256 units; the text
    branch embeds the partial caption and feeds it through an LSTM(256). Both
    branches are added and decoded into a softmax over the vocabulary.

    Args:
        vocab_size: size of the output softmax and of the embedding input.
        max_length: length of the (padded) input word sequence.

    Returns:
        The compiled Keras `Model` taking ``[photo_features, sequence]``.
    """
    # feature extractor model
    inputs1 = Input(shape=(4096,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    # sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size,256,mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    # decoder model: merge the photo branch (fe2) with the text branch (se3);
    # the original referenced an undefined `fe3` and misspelled `decoder1`
    decoder1 = add([fe2,se3])
    decoder2 = Dense(256,activation='relu')(decoder1)
    outputs = Dense(vocab_size,activation='softmax')(decoder2)

    # tie it together
    model = Model(inputs=[inputs1,inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # summarize the model (summary() prints by itself)
    model.summary()
    # hand the compiled model back; without this the function returned None
    return model
    

## Now we check the BLEU Scores for the Model

In [None]:
# Sentence-level BLEU score calculation using NLTK.
# `reference` holds two tokenized reference sentences and `candidate` one
# tokenized hypothesis; sentence_bleu is called with its default weights.
from nltk.translate.bleu_score import sentence_bleu
reference = [['Three','people','running','on','the','road'],['running','on','the','road']]
candidate = ['Person','is','running','on','road']
score = sentence_bleu(reference,candidate)
print(score)

In [None]:
# Cumulative BLEU score calculation using NLTK
# (sentence_bleu lives in nltk.translate.bleu_score; `nltk.bleu_score` does not exist)
from nltk.translate.bleu_score import sentence_bleu
reference = [['Three','people','running','on','the','road']]
# last token was misspelled 'raod', which silently lowered the n-gram overlap
candidate = ['Person','is','running','on','road']
# cumulative n-gram scores weight the orders 1..n uniformly (weights sum to 1)
print('Cumulative 1-gram: %f' % sentence_bleu(reference, candidate, weights=(1,0,0,0)))
print('Cumulative 2-gram: %f' % sentence_bleu(reference, candidate, weights=(0.5,0.5,0,0)))
print('Cumulative 3-gram: %f' % sentence_bleu(reference, candidate, weights=(1/3,1/3,1/3,0)))
print('Cumulative 4-gram: %f' % sentence_bleu(reference, candidate, weights=(0.25,0.25,0.25,0.25)))



In [None]:
# Map an integer index back to its word
def word_for_id(integer, tokenizer):
    """Return the first word whose index in `tokenizer.word_index` equals `integer`.

    Args:
        integer: index to look up.
        tokenizer: object exposing a `word_index` mapping of word -> index.

    Returns:
        The matching word, or None when no word has that index.
    """
    matches = (
        word
        for word, index in tokenizer.word_index.items()
        if index == integer
    )
    return next(matches, None)

# Generate a description for an image
def generate_desc(model,tokenizer,photo,max_length):
    """Greedily generate a caption for `photo`, one word at a time.

    Starting from 'startseq', the text so far is encoded, padded and passed
    to the model together with the photo; the highest-scoring word is
    appended. Generation stops after `max_length` steps, when the predicted
    index has no word, or once 'endseq' is produced.

    Args:
        model: trained model accepting ``[photo, sequence]`` in `predict`.
        tokenizer: fitted tokenizer used for encoding and index lookup.
        photo: feature input of the image, passed to the model unchanged.
        max_length: padding length and maximum number of generated words.

    Returns:
        The generated text, including the leading 'startseq' (and the
        trailing 'endseq' when it was predicted).
    """
    # seed the generation process
    in_text = 'startseq'
    # iterate over the whole length of the sequence
    for i in range(max_length):
        # integer encode the input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad input
        sequence = pad_sequences([sequence], maxlen=max_length)
        # predict the next word
        yhat = model.predict([photo,sequence],verbose=0)
        # convert the probabilities to the index of the best word
        # (`argmax` was never imported; use the numpy alias)
        yhat = np.argmax(yhat)
        # map integer to word
        word = word_for_id(yhat,tokenizer)
        # stop if we cannot map the word (`in None` raised TypeError)
        if word is None:
            break
        # append as input for generating the next word
        in_text += ' ' + word
        # stop at the end-of-sequence marker (was misspelled 'enfseq', so
        # generation never stopped early)
        if word == 'endseq':
            break
    return in_text




# Evaluate the skill of the model
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    """Print corpus-level BLEU-1..4 of generated captions against references.

    For every image a caption is generated with `generate_desc` and compared
    with all of the image's reference descriptions.

    Args:
        model: trained captioning model.
        descriptions: dict mapping image id to its reference descriptions.
        photos: mapping of image id to the photo feature fed to the model.
        tokenizer: fitted tokenizer.
        max_length: maximum caption length used for generation.

    Returns:
        None; the four scores are printed.
    """
    actual, predicted = list(), list()
    # step over the whole set
    for key, desc_list in descriptions.items():
        # generate description
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        # store actual and predicted, both tokenized
        references = [d.split() for d in desc_list]
        actual.append(references)
        predicted.append(yhat.split())
    # calculate BLEU scores; cumulative n-gram weights are uniform and sum to
    # 1 (BLEU-3 previously used 0.3 each, summing to 0.9 and inflating the score)
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0,0,0,0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5,0.5,0,0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(1/3,1/3,1/3,0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25,0.25,0.25,0.25)))
    
    
# Prepare the training set
# load the data
filename = '/kaggle/input/flicker-8k-image-dataset-captionstxt/captions.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))

# descriptions
train_descriptions = load_clean_descriptions('descriptions.txt',train)
print('Descriptions: train=%d' % len(train_descriptions))

# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary SIze: %d' % vocab_size)
# determine the max sequence length
# `max_length` still ends up bound to the integer length, as before, but it is
# computed without calling the `max_length` helper: the old
# `max_length = max_length(...)` failed with "'int' object is not callable"
# the second time this cell was run.
max_length = max(len(line.split()) for line in to_lines(train_descriptions))
print('Description Length: %d' % max_length)

# prepare the testing set
# NOTE(review): the test identifiers are read from the same captions file as
# the training identifiers above, so the model is evaluated on the images it
# was trained on - supply a separate test split if one is available.
filename = '/kaggle/input/flicker-8k-image-dataset-captionstxt/captions.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))
# descriptions (function name was misspelled `load_cean_descriptions`)
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))
# photo features
test_features = load_photo_features('features.pkl',test)
print('Photos: test=%d' % len(test_features))

# load the model which has minimum loss; in this case it was model_18
filename = 'model_18.h5'
model = load_model(filename)
# evaluate model
evaluate_model(model, test_descriptions, test_features, tokenizer, max_length)


In [None]:
# Strip the 'startseq' / 'endseq' markers from a generated caption
# NOTE(review): `description` is not assigned in any visible cell - presumably
# it is the text returned by generate_desc(); confirm before running.
query = description
stopwords = ['startseq','endseq']
querywords = query.split()

# keep every word that is not one of the markers (case-insensitive match)
resultwords = list(filter(lambda word: word.lower() not in stopwords, querywords))
result = ' '.join(resultwords)

print(result)

## Now you can upload any image and check the outcome