## *Required Libraries*

In [None]:
import os
import pickle
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
from nltk.translate.bleu_score import corpus_bleu
from PIL import Image
from tensorflow.keras.applications.vgg16 import VGG16,preprocess_input
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical, plot_model
from tqdm import tqdm

warnings.filterwarnings('ignore')

## *Loading Our Model*

In [None]:
# Instantiate VGG16 with its default arguments.
base_model = VGG16()

# Cut the network at its second-to-last layer so that layer's activations
# become the output, i.e. the model acts as an image feature extractor.
penultimate_layer = base_model.layers[-2]
vgg_model = Model(inputs=base_model.inputs, outputs=penultimate_layer.output)

# Show the architecture of the truncated model.
vgg_model.summary()

## *Loading Our Dataset*

In [None]:
# Root folder of the dataset (holds 'captions.txt' and the 'Images' folder).
# Raw string: the backslashes of the Windows path must stay literal instead of
# being parsed as (invalid) escape sequences such as "\D" and "\P".
dataset_dir = r"E:\Data Science\Projects\Dataset"

## *Extract features from Image*

In [None]:
# features = {}
# directory = os.path.join(dataset_dir, 'Images')

# for img_name in tqdm(os.listdir(directory)):
#     # load the image from file
#     img_path = directory + '/' + img_name
#     image = load_img(img_path, target_size=(224, 224))
#     # convert image pixels to numpy array
#     image = img_to_array(image)
#     # reshape data for model
#     image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
#     # preprocess image for vgg
#     image = preprocess_input(image)
#     # extract features
#     feature = vgg_model.predict(image,verbose=0)
#     # get image ID
#     image_id = img_name.split('.')[0]
#     # store feature
#     features[image_id] = feature

In [None]:
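# Storing the extracted features in a pickle file (disabled).
# NOTE: `working_dir` is not defined anywhere in this notebook; define it before uncommenting.
# pickle.dump(features, open(working_dir+'Extracted_Features.pkl','wb'))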

In [None]:
# Load the previously extracted image features from disk.
# Raw string: keeps the Windows backslashes literal instead of relying on
# invalid escape sequences ("\D", "\P", "\m", "\E") being passed through.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(r"E:\Data Science\Projects\model\Extracted_Features.pkl", 'rb') as f:
    features = pickle.load(f)

In [None]:
# Read the caption file into one string, discarding its first (header) line.
captions_path = os.path.join(dataset_dir, 'captions.txt')
with open(captions_path, 'r') as caption_file:
    next(caption_file)  # skip the header row
    caption_doc = caption_file.read()

In [None]:
# Build a dict: image id (file name without extension) -> list of its captions.
mapping = {}
for line in tqdm(caption_doc.split('\n')):
    # Ignore lines that are empty or a single character long.
    if len(line) < 2:
        continue
    # The first comma-separated field is the image file name; everything after
    # it belongs to the caption.
    img_name, *caption_parts = line.split(',')
    img_id = img_name.split('.')[0]
    # Caption pieces that were separated by commas are re-joined with spaces.
    mapping.setdefault(img_id, []).append(" ".join(caption_parts))

## *Preprocessing Text Data*

In [None]:
def clean(mapping):
    """Normalise every caption in ``mapping`` in place.

    Each caption is lower-cased, stripped of every character that is not a
    lowercase letter or whitespace, reduced to the words longer than one
    character, and wrapped in the ``startseq`` / ``endseq`` markers.

    Calling the function again on already cleaned captions leaves them
    unchanged, so re-running the notebook cell does not stack extra markers.

    Args:
        mapping: dict mapping an image id to a list of caption strings. The
            lists are modified in place.

    Returns:
        None.
    """
    for captions in mapping.values():
        for i, caption in enumerate(captions):
            caption = caption.lower()
            # str.replace() matches its argument literally, so a regex pattern
            # passed to it never matches; re.sub is required. Whitespace is
            # kept so the words stay separated.
            caption = re.sub(r'[^a-z\s]', '', caption)
            # split() already collapses runs of whitespace.
            words = [word for word in caption.split() if len(word) > 1]
            # Drop markers left by a previous call before adding them again.
            if len(words) >= 2 and words[0] == 'startseq' and words[-1] == 'endseq':
                words = words[1:-1]
            captions[i] = ' '.join(['startseq', *words, 'endseq'])

In [None]:
# Show the captions of one sample image before cleaning.
mapping['1000268201_693b08cb0e']

In [None]:
# Clean all captions in place, then show the same sample image's captions
# to compare with the raw version displayed in the previous cell.
clean(mapping)
mapping['1000268201_693b08cb0e']

In [None]:
# Flatten the per-image caption lists into a single list of captions.
all_captions = [caption for key in mapping for caption in mapping[key]]

## *Tokenization*

In [None]:
# Fit a word-level tokenizer on every caption.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)

# Vocabulary size is one more than the number of entries in word_index.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Save the fitted tokenizer (not the model) to 'Tokenizer.pkl' in the current
# working directory.
with open('Tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [None]:
# Display the vocabulary size (len(tokenizer.word_index) + 1).
vocab_size

In [None]:
# Longest caption, measured in whitespace-separated words.
max_length = max(map(lambda caption: len(caption.split()), all_captions))
max_length

## *Splitting the Data*

In [None]:
# Split the image ids in their existing order: the first 90% are used for
# training, the remainder for testing.
img_ids = list(mapping)
split = int(0.90 * len(img_ids))
train, test = img_ids[:split], img_ids[split:]

## *Creating Data Generator*

In [None]:
def data_generator(data_keys, mapping, features, tokenizer, max_lenght, vocab_size, batch_size):
    """Endlessly yield training batches built from ``batch_size`` images each.

    For every caption of every image, each prefix of the encoded caption
    becomes one sample: the padded prefix is the text input and the token
    following the prefix, one-hot encoded, is the target.

    Args:
        data_keys: iterable of image ids to cycle through.
        mapping: dict of image id -> list of caption strings.
        features: dict of image id -> feature array; element ``[0]`` of the
            array is used as the image input.
        tokenizer: object providing ``texts_to_sequences``.
        max_lenght: length every text prefix is padded to (parameter name
            kept as spelled for compatibility with existing callers).
        vocab_size: number of classes of the one-hot target.
        batch_size: number of images (not samples) per yielded batch.

    Yields:
        A tuple ``({"image": array, "text": array}, targets)``.
    """
    image_rows, text_rows, target_rows = [], [], []
    images_seen = 0
    while True:
        for key in data_keys:
            images_seen += 1
            for caption in mapping[key]:
                encoded = tokenizer.texts_to_sequences([caption])[0]

                # One sample per split point: tokens before it are the
                # input, the token at it is the target.
                for cut in range(1, len(encoded)):
                    prefix = pad_sequences([encoded[:cut]], maxlen=max_lenght)[0]
                    target = to_categorical([encoded[cut]], num_classes=vocab_size)[0]

                    image_rows.append(features[key][0])
                    text_rows.append(prefix)
                    target_rows.append(target)

            # Emit the accumulated samples once enough images were processed,
            # then start collecting the next batch from scratch.
            if images_seen == batch_size:
                yield {"image": np.array(image_rows), "text": np.array(text_rows)}, np.array(target_rows)
                image_rows, text_rows, target_rows = [], [], []
                images_seen = 0

## *Model Building*

### *Encoder Model*

*Image Feature Layers*

In [None]:
# Image branch: a 4096-element feature vector (input named "image", matching
# the key yielded by data_generator) goes through dropout (rate 0.4) and is
# projected to 256 units with ReLU.
inputs1=Input(shape=(4096,),name="image")
fe1=Dropout(0.4)(inputs1)
fe2=Dense(256,activation='relu')(fe1)

*Sequence Feature Layer*

In [None]:
# Text branch: a token-id sequence of length max_length (input named "text",
# matching the key yielded by data_generator) is embedded into 256 dimensions,
# passed through dropout (rate 0.4) and summarised by a 256-unit LSTM.
# mask_zero=True: presumably so the zeros added by pad_sequences are ignored - confirm.
inputs2=Input(shape=(max_length,),name="text")
se1=Embedding(vocab_size,256,mask_zero=True)(inputs2)
se2=Dropout(0.4)(se1)
se3=LSTM(256,use_cudnn=False)(se2)

### *Decoder Model*

In [None]:
# Decoder: combine the 256-unit image and text branch outputs with `add`,
# apply a 256-unit ReLU layer, then a softmax over vocab_size classes
# (one per vocabulary index) to predict the next word.
decoder1=add([fe2,se3])
decoder2=Dense(256,activation='relu')(decoder1)
outputs=Dense(vocab_size, activation='softmax')(decoder2)

In [None]:
# Assemble the captioning model: inputs are [image features, text sequence],
# output is the softmax distribution over the vocabulary.
model=Model(inputs=[inputs1, inputs2], outputs=outputs)

## *Model Compilation*

In [None]:
# Categorical cross-entropy matches the one-hot targets that data_generator
# builds with to_categorical; Adam is used as the optimizer.
model.compile(loss='categorical_crossentropy',optimizer='adam')

## *Plot Model*

In [None]:
# plot_model(model,show_shapes=True)

## *Model Training*

In [None]:
# Training configuration.
epochs = 20
batch_size = 32
# Batches per epoch: number of whole groups of `batch_size` training images.
steps = len(train) // batch_size

for epoch_number in range(1, epochs + 1):
    # A fresh generator is created for every epoch, so each epoch starts
    # again from the first training image.
    generator = data_generator(train, mapping, features, tokenizer, max_length, vocab_size, batch_size)
    print("Epoch {}/{}".format(epoch_number, epochs))
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)

## *Model Saving*

In [None]:
# Save the trained model to 'model.h5' in the current working directory.
# NOTE(review): the next cell loads the model from a different, absolute path -
# confirm both refer to the same file.
model.save('model.h5')

In [None]:
# Load the saved model from disk.
# Raw string: keeps the Windows backslashes literal instead of relying on
# invalid escape sequences ("\D", "\P", "\m") being passed through unchanged.
model = load_model(r"E:\Data Science\Projects\model\model.h5")

## *Generate Caption for an Image*

In [None]:
def indx_to_word(integer,tokenizer):
    """Return the word whose vocabulary index is ``integer``.

    Uses the tokenizer's ``index_word`` reverse mapping when it is available
    and non-empty, which avoids scanning the whole vocabulary for every
    generated word. Otherwise falls back to a linear search of
    ``tokenizer.word_index``.

    Args:
        integer: vocabulary index to look up.
        tokenizer: object exposing ``word_index`` (word -> index) and,
            optionally, ``index_word`` (index -> word).

    Returns:
        The matching word, or None when no word has that index.
    """
    reverse_index = getattr(tokenizer, 'index_word', None)
    if reverse_index:
        return reverse_index.get(integer)
    # Fallback for tokenizer-like objects that only provide word_index.
    for word,index in tokenizer.word_index.items():
        if index==integer:
            return word
    return None

In [None]:
def predict_caption(model, image, tokenizer, max_length):
    """Generate a caption for one image by greedy word-by-word decoding.

    Starting from 'startseq', the text generated so far is encoded, padded to
    ``max_length`` and fed to the model together with the image features; the
    highest-scoring index is turned into a word and appended. Decoding stops
    after ``max_length`` steps, when 'endseq' is produced, or when the
    predicted index has no word.

    Args:
        model: object with a ``predict([image, padded_sequence], verbose=0)`` method.
        image: image features passed to the model as its first input.
        tokenizer: tokenizer used to encode the partial caption.
        max_length: padding length and maximum number of decoding steps.

    Returns:
        The generated caption, including the leading 'startseq' and, if it
        was produced, the trailing 'endseq'.
    """
    words = ['startseq']
    for _ in range(max_length):
        token_ids = tokenizer.texts_to_sequences([" ".join(words)])[0]
        padded = pad_sequences([token_ids], maxlen=max_length)
        scores = model.predict([image, padded], verbose=0)
        # Greedy choice: index with the highest predicted score.
        next_word = indx_to_word(np.argmax(scores), tokenizer)
        if next_word is None:
            break
        words.append(next_word)
        if next_word == 'endseq':
            break
    return " ".join(words)


## *Model Evaluation*

In [None]:
# Collect, for every test image, the tokenised reference captions and the
# tokenised predicted caption in the shape expected by corpus_bleu.
actual, predicted = [], []

for key in tqdm(test):
    reference_captions = mapping[key]
    hypothesis_tokens = predict_caption(model, features[key], tokenizer, max_length).split()

    # One list of token lists per image (all of its reference captions).
    actual.append([reference.split() for reference in reference_captions])
    predicted.append(hypothesis_tokens)

In [None]:
# Cumulative corpus-level BLEU for 1- to 4-grams. Each label now matches the
# n-gram weights that are actually passed to corpus_bleu.
# NOTE(review): references and predictions both still contain the
# 'startseq'/'endseq' markers, which likely inflates the scores - confirm
# whether they should be stripped before scoring.
print('BLEU Score-1:',corpus_bleu(actual,predicted,weights=(1,0,0,0)))
print('BLEU Score-2:',corpus_bleu(actual,predicted,weights=(0.5,0.5,0,0)))
print('BLEU Score-3:',corpus_bleu(actual,predicted,weights=(1/3,1/3,1/3,0)))
print('BLEU Score-4:',corpus_bleu(actual,predicted,weights=(0.25,0.25,0.25,0.25)))

## *Visualizing The Results*

In [None]:
def visualize(img_name):
    """Print the reference and predicted captions of an image and draw it.

    Args:
        img_name: file name of the image inside ``<dataset_dir>/Images``;
            the part before the first '.' is used as the image id for the
            ``mapping`` and ``features`` lookups.

    Returns:
        None. Output goes to stdout and to the current matplotlib figure.
    """
    img_id = img_name.split('.')[0]
    image = Image.open(os.path.join(dataset_dir, "Images", img_name))
    references = mapping[img_id]

    # Header followed by one reference caption per line.
    print('Actual', *references, sep='\n')

    predicted_caption = predict_caption(model, features[img_id], tokenizer, max_length)

    print('Predicted')
    print(predicted_caption)

    plt.imshow(image)

In [None]:
# Show the actual and predicted captions for one example image.
img_name="1015584366_dfcec3c85a.jpg"
visualize(img_name)