# ***Building Image Caption Generator Using LSTM***

## ***1-Importing packages and Libraries***

In [1]:
from tensorflow.keras.layers import Input , Dense , Embedding , LSTM , Dropout , add
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.xception import Xception
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.layers import TextVectorization
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam
from nltk.stem.snowball import stopwords
from tensorflow.keras.models import Model
from matplotlib import image as mpimg
import matplotlib.pyplot as plt
import tensorflow_text as text
import tensorflow_hub as hub
import tensorflow as tf
from PIL import Image
import numpy as np
import seaborn
import random
import pickle
#import spacy
import nltk
import gzip
import cv2
import os
import re
import json
#nltk.download("all")

  from pkg_resources import parse_version


In [2]:
# set up tensorflow to use apple mps


In [3]:
# !uv run python -m spacy download en_core_web_md

## ***2-Preprocessing text***

* Punctuation removal







In [4]:
def remove_punc(text) :
  """Return ``text`` with every character that is neither a word character nor whitespace removed."""
  non_word_or_space = r'[^\w\s]'
  return re.sub(non_word_or_space, '', text)

* Lowercasing

In [5]:
def to_lower_case(text) :
  """Return a lower-cased copy of ``text``."""
  lowered = text.lower()
  return lowered

* Removing Stopwords

In [6]:
# Download the NLTK 'stopwords' corpus, which stopwords.words('english') reads in the next cell
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tmyciels/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# English stopword list from NLTK, used as the filter vocabulary by remove_stopwords
stopwords_list = stopwords.words('english')
def remove_stopwords(text) :
  """Drop stopwords and tokens of two characters or fewer; re-join the rest with single spaces."""
  kept_tokens = []
  for token in text.split() :
    if token in stopwords_list or len(token) <= 2 :
      continue
    kept_tokens.append(token)
  return " ".join(kept_tokens)

* Removing numbers

In [8]:
def remove_numbers(text) :
  """Return ``text`` with every ASCII digit (0-9) deleted."""
  digit_pattern = re.compile(r'[0-9]')
  return digit_pattern.sub('', text)

* Removing multiple whitespaces

In [9]:
def remove_multiple_spaces(text) :
  """Collapse each run of spaces into one space and trim leading/trailing whitespace."""
  collapsed = re.sub(' +', ' ', text)
  return collapsed.strip()

In [10]:
# gathering all the text cleaning steps in one function
def clean_text(text) :
  """Run the whole caption-cleaning pipeline on one string.

  Steps, in order: punctuation removal, lower-casing, stopword removal,
  digit removal, space collapsing.
  """
  pipeline = (remove_punc, to_lower_case, remove_stopwords, remove_numbers, remove_multiple_spaces)
  for step in pipeline :
    text = step(text)
  return text

## ***3-Preprocessing data***

In [11]:
def read_file(path) :
  """Read a text file and return its content split on newline characters.

  A file that ends with a newline produces a trailing empty string.
  """
  with open(path, 'r') as handle :
    contents = handle.read()
  return contents.split('\n')

In [12]:
# converting image_captions data into dict where keys = images and values = captions 
def get_data_dictionary(data) :
  """Group captions by image name.

  Parameters
  ----------
  data : iterable of str
      Lines holding an image identifier and a caption separated by a tab.
      The last two characters of the identifier are dropped to obtain the
      image name (presumably a '#<n>' caption index - confirm against the
      token file).

  Returns
  -------
  dict
      Image name -> list of its captions, in input order.

  Raises
  ------
  ValueError
      If a non-blank line contains no tab separator.
  """
  descriptions = {}
  for line in data :
    # read_file() returns a trailing '' for files that end with a newline; skip blanks
    if not line.strip() :
      continue
    # split on the first tab only, so a caption that itself contains a tab stays whole
    image_name , caption = line.split('\t', 1)
    descriptions.setdefault(image_name[:-2], []).append(caption)
  return descriptions

In [13]:
# using predefined text preprocessing functions to clean the captions text
def clean_captions(descriptions) :
  """Replace every caption with its clean_text() version, in place, and return the same dict."""
  for caption_list in descriptions.values() :
    for position in range(len(caption_list)) :
      caption_list[position] = clean_text(caption_list[position])
  return descriptions

In [14]:
# writing down data dictionary into external file
def write_file(path,data) :
  """Write the image -> captions mapping to ``path``.

  One "image<TAB>caption" line is produced per caption; lines are joined
  with newlines and no trailing newline is written.
  """
  serialized = '\n'.join(
    image + '\t' + caption
    for image , captions in data.items()
    for caption in captions
  )
  with open(path,'w') as out :
    out.write(serialized)

In [15]:
from pathlib import Path
# Read the raw caption token file (one entry per line) from the working directory
# NOTE(review): folder is named "flicker_30k" but the file is "Flickr8k.token.txt" - confirm the intended dataset
data = read_file(Path.cwd() / "flicker_30k" / "Flickr8k.token.txt")
# Group the captions per image, then clean every caption in place
descriptions = get_data_dictionary(data)
descriptions = clean_captions(descriptions)
# Persist the cleaned captions in the working directory
path = Path.cwd() / "cleaned_data.txt"
write_file(path,descriptions)

## ***4-Extracting Images Features***

In [16]:
# Load VGG16 and rebuild it so that its output is the second-to-last layer's output
# (i.e. the final output layer is left out), then print the layer summary
features_extractor = VGG16()
features_extractor = Model(inputs = features_extractor.inputs  , outputs =  features_extractor.layers[-2].output)
features_extractor.summary()

2025-06-02 01:25:53.484606: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2025-06-02 01:25:53.484642: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 18.00 GB
2025-06-02 01:25:53.484646: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 6.00 GB
I0000 00:00:1748820353.484954  631959 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1748820353.485149  631959 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [17]:
# Folder holding the images, and the name of every entry inside it
# NOTE(review): os.listdir returns all directory entries, not only image files - confirm the folder content
images_path = Path.cwd() / "flickr30k_images" / "flickr30k_images"
images_names = os.listdir(images_path)

In [18]:
from tqdm.auto import tqdm

In [19]:
# Using pretrained model to extract images features and building dict where key:images_names and values:images_features
def preprocess_image(model,images_path,images_list) :
  """Extract a feature array for every readable image in ``images_list``.

  Parameters
  ----------
  model : object exposing ``predict(batch, verbose=...)``
  images_path : directory that contains the images
  images_list : iterable of file names inside ``images_path``

  Returns
  -------
  dict
      File name -> ``model.predict`` output for that image.

  Entries that cannot be opened as an image (other files, sub-folders) are
  reported and skipped instead of aborting the whole extraction run.
  """
  features = {}
  for img in tqdm(images_list):
    path = os.path.join(images_path,img)
    try :
      # the with-block closes the file handle; convert('RGB') guarantees three
      # channels for grayscale / RGBA / CMYK inputs
      with Image.open(path) as raw_image :
        image = raw_image.convert('RGB').resize((224,224))
    except OSError as err :
      # covers PIL.UnidentifiedImageError as well as directories / unreadable files
      print(f"Skipping {img}: {err}")
      continue
    image = np.expand_dims(image, axis = 0)
    # scale pixel values from [0, 255] to [-1, 1]
    # NOTE(review): keras.applications.vgg16 ships its own preprocess_input - confirm this scaling is intended
    image = image / 127.5 - 1
    # verbose = 0: do not print a status bar for every image
    feature = model.predict(image, verbose = 0)
    features[img] = feature
  return features

In [None]:
features = preprocess_image(features_extractor,images_path,images_names)
path = Path.cwd() / 'images_features.bin'

# saving images_features dict into .bin file; the with-block makes sure the
# file is flushed and closed once the dump is finished
with open(path,'wb') as features_file :
  pickle.dump(features,features_file)

  0%|          | 0/31785 [00:00<?, ?it/s]

2025-06-02 01:25:55.634564: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [None]:
# features_extractor.save("features_extractor.h5")

## ***5-Loading prepared files***

In [None]:
# adding <start> and <end> to each caption
def load_tokens(path,images) :
  """Load the captions of the selected images, wrapped in <start>/<end> markers.

  Parameters
  ----------
  path : file with one "image<TAB>caption" entry per line
  images : iterable of image names to keep

  Returns
  -------
  dict
      Image name -> list of "<start> caption <end>" strings.

  Raises
  ------
  ValueError
      If a non-blank line contains no tab separator.
  """
  # set membership is O(1); testing against the original list costs O(len(images)) per line
  wanted_images = set(images)
  tokens = {}
  for line in read_file(path) :
    # read_file() returns a trailing '' for files that end with a newline; skip blanks
    if not line.strip() :
      continue
    img , caption = line.split('\t', 1)
    if img in wanted_images :
      tokens.setdefault(img, []).append("<start> "+caption+" <end>")
  return tokens

In [None]:
# listing all available images
def list_images(path) :
   """Return the distinct image names found in the file at ``path``, in first-seen order.

   Each non-blank line must hold an image name and a caption separated by a
   tab; a ValueError is raised otherwise.
   """
   # a dict keeps insertion order and gives O(1) duplicate checks
   # (the list-based "not in" test was quadratic in the number of images)
   unique_images = {}
   for line in read_file(path) :
      # read_file() returns a trailing '' for files that end with a newline; skip blanks
      if not line.strip() :
         continue
      img , _caption = line.split('\t', 1)
      unique_images.setdefault(img, None)
   return list(unique_images)

# raw string: the backslashes of the Windows path must not be parsed as escape sequences
# NOTE(review): absolute machine-specific path - confirm it matches where cleaned_data.txt was written
all_images_list = list_images(r"E:\Khaled\Data\Projects\Image Caption Generator - GPU\cleaned_data.txt")

## ***6-Splitting Images into Training, Validation & Testing sets***

In [None]:
# training_images , testing_images = train_test_split(all_images_list , test_size = .1 , shuffle = True)
# cross_validation_images , testing_images = train_test_split(testing_images , test_size = .5 , shuffle = True)

In [None]:
def write_list_to_file(input_list, file_name):
    """Write ``str(item)`` for every item of ``input_list`` to ``file_name``, one per line."""
    with open(file_name, "w") as out:
        out.writelines(str(item) + "\n" for item in input_list)

In [None]:
# write_list_to_file(training_images, 'training_images_list.txt')
# write_list_to_file(cross_validation_images, 'cross_validation_images_list.txt')
# write_list_to_file(testing_images, 'testing_images_list.txt')

In [None]:
import pickle
training_images = read_file(r"E:\Khaled\Data\Projects\Image Caption Generator - GPU\training_images_list.txt")
# loading training images_captions dict
training_tokens = load_tokens(r"E:\Khaled\Data\Projects\Image Caption Generator - GPU\cleaned_data.txt",training_images)
# loading extracted images features; the with-block closes the file after reading
# NOTE: pickle.load can execute arbitrary code - only load files produced by this notebook
with open(r"E:\Khaled\Data\Projects\Image Caption Generator - GPU\images_features.bin",'rb') as features_file :
  features = pickle.load(features_file)

## ***7-Building text vectorization model***

In [None]:
# extracting all captions into one list
def fetch_captions(tokens) :
  """Flatten the per-image caption lists of ``tokens`` into one list, keeping their order."""
  return [cap for caps in tokens.values() for cap in caps]

In [None]:
# Flat list of every training caption (markers included), used to fit the vectorizer
captions = fetch_captions(training_tokens)

In [None]:
# Number of whitespace-separated tokens in each caption
sentences_length = []
for caption in captions :
  sentences_length.append(len(caption.split()))

# Longest caption (in tokens); used later as the padded sequence length
max_length = max(sentences_length)
# All captions concatenated into one space-separated string
full_text = ' '.join(captions)

In [None]:
# Wrap the caption list in a tf.data dataset so the layer can be adapted on it
text_dataset = tf.data.Dataset.from_tensor_slices(captions)

# preparing TextVectorization layer to be used to tokenize captions (integer token ids as output)
vectorize_layer = TextVectorization(output_mode = 'int' )
vectorize_layer.adapt(text_dataset)

# building vocab using TextVectorization layer: token list and its size
vocabulary = list(vectorize_layer.get_vocabulary())
vocab_size = vectorize_layer.vocabulary_size()

In [None]:
# Pickle the config and weights; the with-block closes the file so the data is fully written
with open("tv_layer.pkl", "wb") as tv_layer_file :
  pickle.dump({'config': vectorize_layer.get_config(),'weights': vectorize_layer.get_weights()}, tv_layer_file)

In [None]:
# tokenizing captions and saving it back to dict where keys:images and values:sequences
# NOTE(review): the loop variable `captions` rebinds the notebook-level `captions` list, and `i` is never used
training_images_sequences = {}
i = 0
for img , captions in training_tokens.items():
    training_images_sequences[img] = []
    for caption in captions :
        # vectorize one caption at a time and take the single row of the result as a plain list
        sequence =  vectorize_layer(tf.constant([caption])).numpy().tolist()[0]
        training_images_sequences[img].append(sequence)

In [None]:
# with open('training_images_sequences.json', 'w') as f:
#     # Serialize the dictionary to JSON and write it to the file
#     json.dump(training_images_sequences, f)

In [None]:
# NOTE(review): this rebinds `training_tokens` to the content of the JSON file
# (presumably the image -> token-id sequences dumped by the commented-out cell above - confirm)
with open(r"E:\Khaled\Data\Projects\Image Caption Generator - GPU\training_images_sequences.json", 'r') as f:
    # Load the JSON data from the file and deserialize it to a Python object
    training_tokens = json.load(f)

## ***8-Building data generator***

In [None]:
def data_generator(tokens_keys,tokens,features,vocab_size,max_length,batch_size) :
    """Endlessly yield training batches for the captioning model.

    For every image in ``tokens_keys`` that has an entry in ``features``, each
    of its sequences is expanded into (prefix -> next token) samples: the
    prefix is post-padded to ``max_length`` and the next token is one-hot
    encoded over ``vocab_size`` classes.

    Parameters
    ----------
    tokens_keys : iterable of image names, in the order to visit them
    tokens : mapping image name -> list of token-id sequences
    features : mapping image name -> array whose first row is the image feature
    vocab_size : number of classes for the one-hot target
    max_length : padded length of every prefix
    batch_size : number of *images* (not samples) gathered per yielded batch

    Yields
    ------
    tuple
        ``([image_features, padded_prefixes], one_hot_next_tokens)`` as numpy arrays.
    """
    input_1 , input_2 , output = [] , [] , []
    n = 0
    while 1 :
        for img in tokens_keys :
            sequences = tokens[img]
            n += 1
            if img in features :
                feature = features[img][0]
                for sequence in sequences :
                    for index in range(1,len(sequence)) :
                        input_b = pad_sequences([sequence[:index]], maxlen = max_length, padding='post')[0]
                        output_w = to_categorical([sequence[index]],num_classes=vocab_size)[0]
                        input_1.append(feature)
                        input_2.append(input_b)
                        output.append(output_w)

            if n == batch_size :
                # Only the array conversion is guarded. The yield stays outside the
                # try-block so that GeneratorExit / KeyboardInterrupt raised at the
                # yield point are not swallowed (a bare except there breaks close()).
                try :
                    batch = [np.array(input_1), np.array(input_2)], np.array(output)
                except (ValueError, MemoryError) :
                    print("Skipped")
                    batch = None
                input_1 , input_2 , output = [] , [] , []
                n = 0
                if batch is not None :
                    yield batch

## ***9-Building Captioning Model***

In [None]:
no_of_features = 4096

def build_model(no_of_features,max_length,output_size,learning_rate) :
  """Assemble and compile the two-input captioning network.

  The image branch (dropout + dense) and the caption branch (embedding +
  dropout + LSTM) are summed, passed through a dense layer and finished with
  a softmax over ``output_size`` classes. The model is compiled with
  categorical cross-entropy and Adam at ``learning_rate``.
  """
  # image features branch
  image_input = Input(shape=(no_of_features,))
  image_dropout = Dropout(.4)(image_input)
  image_branch = Dense(256, activation = 'relu')(image_dropout)

  # token sequence branch
  sequence_input = Input(shape=(max_length,))
  embedded = Embedding(output_size,300,input_length = max_length , mask_zero = True )(sequence_input)
  embedded_dropout = Dropout(.4) (embedded)
  sequence_branch = LSTM(256,activation='tanh') (embedded_dropout)

  # merge both branches and predict the next token
  merged = add([image_branch,sequence_branch])
  hidden = Dense(256 , activation ='relu')(merged)
  token_probabilities = Dense(output_size , activation ='softmax')(hidden)

  model = Model(inputs = [image_input,sequence_input] , outputs = token_probabilities )
  model.compile(loss = 'categorical_crossentropy' , optimizer = Adam(learning_rate=learning_rate))

  return model

In [None]:
# Build the captioning model: vocab_size output classes, learning rate 0.001
captioning_model = build_model(no_of_features,max_length,vocab_size, .001)

In [None]:
# Draw the model graph, including the tensor shapes of each layer
plot_model(captioning_model , show_shapes = True)

In [None]:
# Print the textual layer-by-layer summary of the captioning model
captioning_model.summary()

## ***10-Checking GPU Power***

In [None]:
# Report how many GPU devices TensorFlow can see, then list them
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print(tf.config.experimental.list_physical_devices('GPU'))

In [None]:
# Restrict TensorFlow to the first GPU when one is available
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus :
  try :
    tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
  except RuntimeError as err :
    # raised when the TensorFlow runtime is already initialised (e.g. a model was built in an earlier cell)
    print(f"Could not change the visible devices: {err}")
else :
  print("No GPU found, keeping the default devices")

## ***11-Model Training***

In [None]:
# Define a custom callback class to track the training loss history
class LossHistory(tf.keras.callbacks.Callback):
    """Keras callback that records the training loss reported at the end of every epoch.

    ``losses`` is re-created at the start of each training run, so after a
    ``fit`` call it holds one entry per epoch of that call only.
    """

    def on_train_begin(self, logs=None):
        """Start an empty loss history when training begins."""
        self.losses = []

    def on_epoch_end(self, epoch, logs=None):
        """Append the 'loss' entry of ``logs`` (None when absent) to the history."""
        # logs defaults to None instead of a shared mutable dict; tolerate a missing logs argument
        self.losses.append((logs or {}).get('loss'))

history = LossHistory()

In [None]:
# Accumulates one training-loss value per epoch across the training loop
model_loss = []

In [None]:
# number of images gathered per generator batch (data_generator yields once per batch_size images)
batch_size = 64
# steps_per_epoch has to be a whole number; floor division counts only the full
# batches the generator can produce in one pass (at least one step)
steps = max(1, len(training_tokens) // batch_size)

for i in range(50) :
  # shuffling training data before each epoch
  tokens_keys = list(training_tokens.keys())
  random.shuffle(tokens_keys)
  data = data_generator(tokens_keys,training_tokens,features,vocab_size,max_length,batch_size)

  captioning_model.fit(data , epochs = 1 , steps_per_epoch=steps , verbose =1, callbacks=[history])

  # extracting epoch model loss and saving it into txt file
  # (history.losses is reset by every fit call, so entry 0 is this epoch's loss)
  loss = history.losses
  model_loss.append(loss[0])
  write_list_to_file(model_loss, 'model_loss.txt')

captioning_model.save('model_49.h5')

In [None]:
# loading txt file into list
def read_file_to_list(file_path):
    """Return the lines of a text file, each with surrounding whitespace stripped."""
    with open(file_path, 'r') as handle:
        return [raw_line.strip() for raw_line in handle]

In [None]:
# Reload the saved per-epoch losses; read_file_to_list returns them as strings
model_loss = read_file_to_list(r"E:\Khaled\Data\Projects\Image Caption Generator - GPU\saved_models\model_loss.txt")

## ***12-Inference using greedy algorithm***

In [None]:
# Replace the in-memory captioning model with the one stored on disk
captioning_model = tf.keras.models.load_model(r"E:\Khaled\Data\Projects\Image Caption Generator - GPU\saved_models\model_49.h5")

In [None]:
# importing saved TextVectorization layer; the with-block closes the pickle file after reading
# NOTE: pickle.load can execute arbitrary code - only load files produced by this notebook
with open("tv_layer.pkl", "rb") as tv_layer_file :
  from_disk = pickle.load(tv_layer_file)
vectorize_layer = TextVectorization.from_config(from_disk['config'])
# adapt on a dummy dataset first (presumably so the layer state exists before set_weights - confirm)
vectorize_layer.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
vectorize_layer.set_weights(from_disk['weights'])

# restoring vocab using TextVectorization layer
vocabulary = list(vectorize_layer.get_vocabulary())
vocab_size = vectorize_layer.vocabulary_size()

In [None]:
# loading saved features extractor model
# NOTE(review): the cell that saves features_extractor.h5 is commented out earlier in the notebook - confirm the file exists
features_extractor = tf.keras.models.load_model(r"E:\Khaled\Data\Projects\Image Caption Generator - GPU\saved_models\features_extractor.h5")

In [None]:
def get_features_from_image(image_path,model) :
  """Return ``model.predict`` output for the image stored at ``image_path``.

  The image is converted to RGB, resized to 224x224, given a leading batch
  axis and scaled from [0, 255] to [-1, 1] before prediction.
  """
  # the with-block closes the file handle; convert('RGB') guarantees three
  # channels for grayscale / RGBA / CMYK inputs (common for web images)
  with Image.open(image_path) as raw_image :
    img = raw_image.convert('RGB').resize((224,224))
  img = np.expand_dims(img,axis = 0)
  img = img / 127.5 - 1
  features = model.predict(img)
  return features

def get_word(index,vocab) :
  """Return the vocabulary entry stored at position ``index``."""
  return vocab[index]

In [None]:
def get_caption(path,features_extractor,vectorize_layer,captioning_model):
  """Generate a caption for the image at ``path`` with greedy decoding.

  Starting from '<start>', the most probable next token is appended until
  the end token is predicted or ``max_length`` tokens have been produced.

  Returns
  -------
  str
      '<start> ... <end>' when the end token was predicted, otherwise the
      caption built so far without an '<end>' marker.
  """
  my_features = get_features_from_image(path,features_extractor)
  # Look the end token up instead of assuming it sits at index 2; fall back to
  # 2 when no 'end' entry exists.
  # NOTE(review): assumes the vectorizer stores "<end>" as "end" (the beam-search cell checks for "end" too)
  end_index = vocabulary.index('end') if 'end' in vocabulary else 2
  caption = '<start>'
  for _ in range(max_length) :
    sequenced_caption = vectorize_layer(tf.constant([caption])).numpy().tolist()
    padded_sequenced_caption = pad_sequences(sequenced_caption , maxlen = max_length, padding='post')[0]
    padded_sequenced_caption = np.resize(padded_sequenced_caption,(1,max_length))
    # verbose = 0: no progress bar for every generated token
    output = captioning_model.predict([my_features , padded_sequenced_caption], verbose = 0)
    index = np.argmax(output)
    if index == end_index :
      return caption + ' <end>'
    caption = caption + ' ' + get_word(index,vocabulary)
  return caption

In [None]:
def get_caption_show_image(image_path=None) :
    """Print the generated caption of an image and display the image.

    Parameters
    ----------
    image_path : path of the image to caption; when None, an image is picked
        at random from the testing images list.
    """
    if image_path is None :
        # drop blank entries (read_file returns a trailing '' for newline-terminated files)
        images_names = [name for name in read_file(r"E:\Khaled\Data\Projects\Image Caption Generator - GPU\testing_images_list.txt") if name.strip()]
        # random.choice never indexes past the end (randint(0, len) is inclusive and could)
        image_path = os.path.join(r"E:\Khaled\Data\Projects\Image Caption Generator - GPU\flickr30k_images",random.choice(images_names))

    caption = get_caption(image_path,features_extractor,vectorize_layer,captioning_model)
    # remove the markers only when they are present: a caption that reached
    # max_length has no '<end>' and must not lose its last characters
    if caption.startswith('<start>') :
        caption = caption[len('<start>'):]
    if caption.endswith('<end>') :
        caption = caption[:-len('<end>')]
    print(caption.strip())

    image = plt.imread(image_path)
    fig,ax = plt.subplots()
    ax.imshow(image)
    ax.axis('off')

    plt.show()

### ***12.1-Testing using random Images from testing set***

In [None]:
# No path is passed, so the function picks an image from the testing list at random
get_caption_show_image()

In [None]:
# No path is passed, so the function picks an image from the testing list at random
get_caption_show_image()

In [None]:
# No path is passed, so the function picks an image from the testing list at random
get_caption_show_image()

In [None]:
# No path is passed, so the function picks an image from the testing list at random
get_caption_show_image()

In [None]:
# No path is passed, so the function picks an image from the testing list at random
get_caption_show_image()

### ***12.2-Testing using Images downloaded from web***

In [None]:
# Caption one specific image file, given by its path
get_caption_show_image(r"C:\Users\User\Desktop\pexels-chevanon-photography-1108099.jpg")

In [None]:
# Caption one specific image file, given by its path
get_caption_show_image(r"C:\Users\User\Desktop\pexels-pixabay-2209.jpg")

In [None]:
# Caption one specific image file, given by its path
get_caption_show_image(r"C:\Users\User\Desktop\pexels-pixabay-2346.jpg")

In [None]:
# Caption one specific image file, given by its path
get_caption_show_image(r"C:\Users\User\Desktop\pexels-pixabay-248547.jpg")

In [None]:
# Caption one specific image file, given by its path
get_caption_show_image(r"C:\Users\User\Desktop\pexels-milena-de-narvaez-ayllon-2889030.jpg")

## ***13-Inference using beam search algorithm***

In [None]:
# getting top k probabilities and indexes
def get_word_preds(sentence,testing_image,beam_size) :
    """Predict the next token for ``sentence`` and return the best ``beam_size`` candidates.

    Returns
    -------
    tuple
        ``(preds, indexes)``: the raw model prediction and the vocabulary
        indexes of the ``beam_size`` highest scores, in ascending score order.
    """
    token_ids = vectorize_layer(tf.constant([sentence])).numpy().tolist()
    model_input = pad_sequences(token_ids , maxlen = max_length, padding='post')[0]
    model_input = np.resize(model_input,(1,max_length))
    preds = captioning_model.predict([testing_image , model_input])
    top_indexes = np.argsort(preds[0])[-beam_size:]
    return preds,top_indexes

In [None]:
# getting top k captions using beam search algorithm
def get_caption_with_beam(image_path,beam_size) :
    """Caption an image with beam search, print the finished captions and show the image.

    Parameters
    ----------
    image_path : path of the image to caption
    beam_size : number of hypotheses kept alive per decoding step

    Returns
    -------
    list of (sentence, probability)
        The finished hypotheses, best first. Hypotheses that did not reach
        the end token within ``max_length`` steps are not included.
    """
    # extracting image feature
    my_testing_image = get_features_from_image(image_path,features_extractor)

    cap = "<start>"
    final_captions = []

    # first step: expand the start token into the beam_size best one-word hypotheses
    preds,word_preds_indexes = get_word_preds(cap,my_testing_image,beam_size)
    new_hypotheses = [(cap + " " + get_word(w,vocabulary), preds[0][w]) for w in word_preds_indexes]
    new_hypotheses.sort(key=lambda x: x[1], reverse=True)
    current_k_sentences = dict(new_hypotheses[:beam_size])

    for _ in range(max_length-1) :
        all_sentences = []

        for sentence, sentence_prob in current_k_sentences.items() :
            # A hypothesis is finished when its LAST WORD is "end"; comparing the
            # last three characters would also stop on words such as "friend".
            words = sentence.split()
            if words and words[-1] == "end" :
                final_captions.append((sentence,sentence_prob))
                # every finished caption frees one slot of the beam
                beam_size = beam_size-1
                if beam_size == 0 :
                    break
                continue

            preds,word_preds_indexes = get_word_preds(sentence,my_testing_image,beam_size)

            # score of an extended hypothesis = parent probability * next-token probability
            new_hypotheses = [
                (sentence + " " + get_word(w,vocabulary), sentence_prob*preds[0][w])
                for w in word_preds_indexes
            ]
            new_hypotheses.sort(key=lambda x: x[1], reverse=True)
            all_sentences.extend(new_hypotheses[:beam_size])

        # keep only the best beam_size hypotheses over all parents
        all_sentences = sorted(all_sentences, key=lambda x: x[1], reverse=True)[:beam_size]
        current_k_sentences = dict(all_sentences)

        if beam_size == 0 :
            break

    # printing top k captions without the leading start marker and the trailing end token
    final_captions = sorted(final_captions, key=lambda x: x[1], reverse=True)
    for finished_caption,prob in final_captions :
        print(" ".join(finished_caption.split()[1:-1]))

    # displaying testing image
    image = plt.imread(image_path)
    fig,ax = plt.subplots()
    ax.imshow(image)
    ax.axis('off')
    plt.show()

    # callers assign the result, so hand the finished captions back
    return final_captions

In [None]:
# Beam search with a beam size of 6; the call prints the captions and displays the image
final_captions = get_caption_with_beam(r"C:\Users\User\Desktop\Kids-now-spend-twice-as-much-time-playing-indoors-than-outdoors.jpg",6)

In [None]:
# Beam search with a beam size of 6; the call prints the captions and displays the image
final_captions = get_caption_with_beam(r"C:\Users\User\Desktop\pexels-tarikul-raana-3619972.jpg",6)

## ***14-Visualizing Model Performance***

In [None]:
# Convert every stored loss value to a float rounded to three decimals
model_loss = [round(float(value),3) for value in model_loss ]

In [None]:
# Line plot of the recorded loss values against their position in the list
fig , ax = plt.subplots()
ax.plot(range(len(model_loss)),model_loss)
ax.set_xlabel("No of epochs")
ax.set_ylabel("Loss")
ax.set_title("Training Loss")
plt.show()

## ***15-BLEU Score***

In [None]:
from nltk.translate.bleu_score import corpus_bleu

def evaluation_func(captioning_model,vectorize_layer,features_extractor,images_folder,images_set_path,
                    tokens_path=r"E:\Khaled\Data\Projects\Image Caption Generator - GPU\cleaned_data.txt") :
  """Compute the corpus-level BLEU-2 score of greedy captions for one image set.

  Parameters
  ----------
  captioning_model, vectorize_layer, features_extractor : passed on to get_caption
  images_folder : directory that contains the image files
  images_set_path : text file listing the image names to evaluate, one per line
  tokens_path : cleaned captions file with the reference captions
      (defaults to the location used throughout this notebook)

  Returns
  -------
  float
      corpus_bleu with weights (0.5, 0.5, 0, 0).
  """
  images_names = read_file(images_set_path)
  images_tokens = load_tokens(tokens_path,images_names)
  actual , predicted = [] , []
  for image , actual_captions in images_tokens.items() :
    image_path = os.path.join(images_folder,image)
    generated_caption = get_caption(image_path,features_extractor,vectorize_layer,captioning_model)
    # NOTE(review): references and prediction both still carry the <start>/<end>
    # markers, which count as matching n-grams - confirm whether they should be stripped
    actual.append([caption.split() for caption in actual_captions])
    predicted.append(generated_caption.split())

  return corpus_bleu(actual,predicted,weights=(0.5, 0.5, 0, 0))

In [None]:
# BLEU-2 of the same model on the training, cross-validation and testing image lists
# (every call generates a caption for each listed image)
training_bleu_score  = evaluation_func(captioning_model,vectorize_layer,features_extractor,r"E:\Khaled\Data\Projects\Image Caption Generator - GPU\flickr30k_images",r"E:\Khaled\Data\Projects\Image Caption Generator - GPU\training_images_list.txt")
validation_bleu_score = evaluation_func(captioning_model,vectorize_layer,features_extractor,r"E:\Khaled\Data\Projects\Image Caption Generator - GPU\flickr30k_images",r"E:\Khaled\Data\Projects\Image Caption Generator - GPU\cross_validation_images_list.txt")
testing_bleu_score  = evaluation_func(captioning_model,vectorize_layer,features_extractor,r"E:\Khaled\Data\Projects\Image Caption Generator - GPU\flickr30k_images",r"E:\Khaled\Data\Projects\Image Caption Generator - GPU\testing_images_list.txt")

In [None]:
# Report the three BLEU-2 scores computed in the previous cell
print(f"Training BLEU Score : {training_bleu_score}")
print(f"Cross Validation BLEU Score : {validation_bleu_score}")
print(f"Testing BLEU Score : {testing_bleu_score}")

It's important to note that the BLEU score is not the only evaluation metric used for image captioning models, and that other metrics such as METEOR, ROUGE, and CIDEr may also be used to evaluate a model's performance.
It's also worth mentioning that BLEU is a controversial evaluation metric for image captioning, as it only measures the overlap between the generated captions and the reference captions, and does not take into account other factors such as the fluency, coherence, and overall quality of the captions.