In [6]:
# Basic libraries
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import warnings
warnings.filterwarnings('ignore')
from math import ceil
from collections import defaultdict
from tqdm.notebook import tqdm        # Progress bar library for Jupyter Notebook

# Deep learning framework for building and training models
import tensorflow as tf
## Pre-trained model for image feature extraction
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array

## Tokenizer class for captions tokenization
from tensorflow.keras.preprocessing.text import Tokenizer

## Function for padding sequences to a specific length
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Class for defining Keras models
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, concatenate, Bidirectional, Dot, Activation, RepeatVector, Multiply, Lambda

# For checking score
from nltk.translate.bleu_score import corpus_bleu

In [16]:
# Setting the input and output directory
INPUT_DIR = './'
OUTPUT_DIR = './image_name'

In [11]:
# We are going to use pretraind vgg model
# Load the vgg16 model
model = VGG16()

# Restructuring the model to remove the last classification layer, this will give us access to the output features of the model
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

# Printing the model summary
print(model.summary())

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0   

In [12]:
# Initialize an empty dictionary to store image features
image_features = {}

# Define the directory path where images are located
img_dir = os.path.join(INPUT_DIR, 'Images')

# Loop through each image in the directory
for img_name in tqdm(os.listdir(img_dir)):
    # Load the image from file
    img_path = os.path.join(img_dir, img_name)
    image = load_img(img_path, target_size=(224, 224))
    # Convert image pixels to a numpy array
    image = img_to_array(image)
    # Reshape the data for the model
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    # Preprocess the image for ResNet50
    image = preprocess_input(image)
    # Extract features using the pre-trained ResNet50 model
    image_feature = model.predict(image, verbose=0)
    # Get the image ID by removing the file extension
    image_id = img_name.split('.')[0]
    # Store the extracted feature in the dictionary with the image ID as the key
    image_features[image_id] = image_feature

  0%|          | 0/8091 [00:00<?, ?it/s]

In [17]:
# Store the image features in pickle
pickle.dump(image_features, open(os.path.join(OUTPUT_DIR, 'image_feature.pkl'), 'wb'))

In [18]:
# Load features from pickle file
pickle_file_path = os.path.join(OUTPUT_DIR, 'image_feature.pkl')
with open(pickle_file_path, 'rb') as file:
    loaded_features = pickle.load(file)

In [19]:
with open(os.path.join(INPUT_DIR, 'captions.txt'), 'r') as file:
    next(file)
    captions_doc = file.read()

In [20]:
# Create mapping of image to captions
image_to_captions_mapping = defaultdict(list)

# Process lines from captions_doc
for line in tqdm(captions_doc.split('\n')):
    # Split the line by comma(,)
    tokens = line.split(',')
    if len(tokens) < 2:
        continue
    image_id, *captions = tokens
    # Remove extension from image ID
    image_id = image_id.split('.')[0]
    # Convert captions list to string
    caption = " ".join(captions)
    # Store the caption using defaultdict
    image_to_captions_mapping[image_id].append(caption)

# Print the total number of captions
total_captions = sum(len(captions) for captions in image_to_captions_mapping.values())
print("Total number of captions:", total_captions)

  0%|          | 0/40460 [00:00<?, ?it/s]

Total number of captions: 2447


In [63]:
# Function for processing the captions
def clean(mapping):
    for key, captions in mapping.items():
        for i in range(len(captions)):
            # Take one caption at a time
            caption = captions[i]
            # Preprocessing steps
            # Convert to lowercase
            caption = caption.lower()
            # Remove non-alphabetical characters
            caption = ''.join(char for char in caption if char.isalpha() or char.isspace())
            # Remove extra spaces
            caption = caption.replace('\s+', ' ')
            # Add unique start and end tokens to the caption
            caption = 'startseq ' + ' '.join([word for word in caption.split() if len(word) > 1]) + ' endseq'
            captions[i] = caption

In [64]:
# Creating a List of All Captions
all_captions = [caption for captions in image_to_captions_mapping.values() for caption in captions]

In [65]:
all_captions[:10]


[' a red ball next to it .',
 ' stands on shore shaking off water',
 ' facing the water and the city skyline .',
 ' being held by the male   sitting next to a pond with a near by stroller .',
 ' with stick .',
 ' standing on an ice   looking into something covered with a blue tarp .',
 ' man in background watches .',
 ' as seen from behind her .',
 ' one with a yellow and orange ball   play in some water in front of a field .',
 ' as seen from behind .']

In [66]:
# Tokenizing the Text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)

In [67]:
# Save the tokenizer
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Load the tokenizer
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [68]:
# Calculate maximum caption length
max_caption_length = max(len(tokenizer.texts_to_sequences([caption])[0]) for caption in all_captions)
vocab_size = len(tokenizer.word_index) + 1

# Print the results
print("Vocabulary Size:", vocab_size)
print("Maximum Caption Length:", max_caption_length)

Vocabulary Size: 2102
Maximum Caption Length: 24


In [69]:
# Creating a List of Image IDs
image_ids = list(image_to_captions_mapping.keys())
# Splitting into Training and Test Sets
split = int(len(image_ids) * 0.90)
train = image_ids[:split]
test = image_ids[split:]

In [70]:
# Data generator function
def data_generator(data_keys, image_to_captions_mapping, features, tokenizer, max_caption_length, vocab_size, batch_size):
    # Lists to store batch data
    X1_batch, X2_batch, y_batch = [], [], []
    # Counter for the current batch size
    batch_count = 0

    while True:
        # Loop through each image in the current batch
        for image_id in data_keys: 
            # Get the captions associated with the current image
            captions = image_to_captions_mapping[image_id]

            # Loopyi through each caption for the current image
            for caption in captions:
                # Convert the caption to a sequence of token IDs
                caption_seq = tokenizer.texts_to_sequences([caption])[0]

                # Loop through the tokens in the caption sequence
                for i in range(1, len(caption_seq)):
                    # Split the sequence into input and output pairs
                    in_seq, out_seq = caption_seq[:i], caption_seq[i]

                    # Pad the input sequence to the specified maximum caption length
                    in_seq = pad_sequences([in_seq], maxlen=max_caption_length)[0]

                    # Convert the output sequence to one-hot encoded format
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                    # Append data to batch lists
                    X1_batch.append(features[image_id][0])  # Image features
                    X2_batch.append(in_seq)  # Input sequence
                    y_batch.append(out_seq)  # Output sequence

                    # Increase the batch counter
                    batch_count += 1

                    # If the batch is complete, yield the batch and reset lists and counter
                    if batch_count == batch_size:
                        X1_batch, X2_batch, y_batch = np.array(X1_batch), np.array(X2_batch), np.array(y_batch)
                        yield [X1_batch, X2_batch], y_batch
                        X1_batch, X2_batch, y_batch = [], [], []
                        batch_count = 0

In [90]:
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import Input, Dense, Dropout, LSTM, Embedding, RepeatVector, Bidirectional, Lambda, concatenate, Activation, Dot
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model

# Load VGG16 as the encoder
vgg_model = VGG16(weights='imagenet', include_top=False)  # Exclude top FC layers
for layer in vgg_model.layers:
    layer.trainable = False  # Freeze VGG16 weights

# Encoder Model
inputs1 = Input(shape=(224, 224, 3))  # Image input size
fe1 = vgg_model(inputs1)  # Extract features
fe1 = tf.keras.layers.GlobalAveragePooling2D()(fe1)  # Convert to (None, 512)
fe2 = Dense(50, activation='relu')(fe1)  # Reduce to 50 hidden units
fe2_projected = RepeatVector(max_caption_length)(fe2)  # Expand for sequence processing
fe2_projected = Bidirectional(LSTM(50, return_sequences=True))(fe2_projected)

# Sequence feature layers (Decoder)
inputs2 = Input(shape=(max_caption_length,))
se1 = Embedding(vocab_size, 50, mask_zero=True)(inputs2)  # Use hidden size 50
se2 = Dropout(0.5)(se1)
se3 = Bidirectional(LSTM(50, return_sequences=True))(se2)  # Reduce LSTM size to 50

# Apply attention mechanism using Dot product
attention = Dot(axes=[2, 2])([fe2_projected, se3])  # Calculate attention scores
attention_scores = Activation('softmax')(attention)  # Softmax attention

# Apply attention scores to sequence embeddings
attention_context = Lambda(lambda x: tf.einsum('ijk,ijl->ikl', x[0], x[1]))([attention_scores, se3])

# Sum the attended sequence embeddings along the time axis
context_vector = tf.reduce_sum(attention_context, axis=1)

# Decoder model
decoder_input = concatenate([context_vector, fe2], axis=-1)
decoder1 = Dense(50, activation='relu')(decoder_input)
outputs = Dense(vocab_size, activation='softmax')(decoder1)

# Create the model
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Visualize the model
plot_model(model, show_shapes=True)


2025-04-04 11:37:46.029703: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2025-04-04 11:37:46.030833: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2025-04-04 11:37:46.031739: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [83]:
from tensorflow.keras.layers import Input, Dropout, Dense, RepeatVector, LSTM, Embedding, Bidirectional, Dot, Activation, Lambda, concatenate
from tensorflow.keras.models import Model
import tensorflow as tf

# Encoder model
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(50, activation='relu')(fe1)  # Use 50 units instead of 256
fe2_projected = RepeatVector(max_caption_length)(fe2)
fe2_projected = Bidirectional(LSTM(50, return_sequences=True))(fe2_projected)  # Adjusted to 50

# Sequence feature layers
inputs2 = Input(shape=(max_caption_length,))
se1 = Embedding(vocab_size, 50, mask_zero=True)(inputs2)  # Embedding should match
se2 = Dropout(0.5)(se1)
se3 = Bidirectional(LSTM(50, return_sequences=True))(se2)  # Adjusted to 50

# Apply attention mechanism using Dot product
attention = Dot(axes=[2, 2])([fe2_projected, se3])  # Calculate attention scores

# Softmax attention scores
attention_scores = Activation('softmax')(attention)

# Apply attention scores to sequence embeddings
attention_context = Lambda(lambda x: tf.einsum('ijk,ijl->ikl', x[0], x[1]))([attention_scores, se3])

# Sum the attended sequence embeddings along the time axis
context_vector = tf.reduce_sum(attention_context, axis=1)  # Shape: (batch_size, 100)

# Decoder model
decoder_input = concatenate([context_vector, fe2], axis=-1)  # Ensure matching shapes
decoder1 = Dense(50, activation='relu')(decoder_input)  # Adjusted to 50
outputs = Dense(vocab_size, activation='softmax')(decoder1)

# Create the model
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Visualize the model
from tensorflow.keras.utils import plot_model
plot_model(model, show_shapes=True)


2025-04-04 11:32:04.687580: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2025-04-04 11:32:04.690993: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2025-04-04 11:32:04.692684: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [71]:
# Encoder model
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
fe2_projected = RepeatVector(max_caption_length)(fe2)
fe2_projected = Bidirectional(LSTM(256, return_sequences=True))(fe2_projected)

# Sequence feature layers
inputs2 = Input(shape=(max_caption_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = Bidirectional(LSTM(256, return_sequences=True))(se2)

# Apply attention mechanism using Dot product
attention = Dot(axes=[2, 2])([fe2_projected, se3])  # Calculate attention scores

# Softmax attention scores
attention_scores = Activation('softmax')(attention)

# Apply attention scores to sequence embeddings
attention_context = Lambda(lambda x: tf.einsum('ijk,ijl->ikl', x[0], x[1]))([attention_scores, se3])

# Sum the attended sequence embeddings along the time axis
context_vector = tf.reduce_sum(attention_context, axis=1)

# Decoder model
decoder_input = concatenate([context_vector, fe2], axis=-1)
decoder1 = Dense(256, activation='relu')(decoder_input)
outputs = Dense(vocab_size, activation='softmax')(decoder1)

# Create the model
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Visualize the model
plot_model(model, show_shapes=True)

2025-04-03 22:13:44.344507: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2025-04-03 22:13:44.345329: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2025-04-03 22:13:44.346152: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [99]:
# Set the number of epochs, batch size
epochs = 50
batch_size = 64

# Calculate the steps_per_epoch based on the number of batches in one epoch
steps_per_epoch = ceil(len(train) / batch_size)
validation_steps = ceil(len(test) / batch_size)  # Calculate the steps for validation data

def data_generator(dataset, image_to_captions_mapping, loaded_features, tokenizer, max_caption_length, vocab_size, batch_size):
    while True:  # Infinite loop for generator
        for img_id, captions in enumerate(dataset):  # Assuming dataset is a list
            feature = loaded_features.get(str(img_id), None)  # Convert img_id to string if necessary
  # Extract image features
            feature = np.expand_dims(feature, axis=0)  # Add batch dimension

            for caption in captions:
                # Convert caption into sequence
                seq = tokenizer.texts_to_sequences([caption])[0]
                
                for i in range(1, len(seq)):
                    input_seq, output_word = seq[:i], seq[i]
                    
                    # Pad input sequence
                    input_seq = pad_sequences([input_seq], maxlen=max_caption_length, padding='post')[0]
                    
                    # Convert output word to one-hot vector
                    output_word = to_categorical([output_word], num_classes=vocab_size)[0]
                    
                    yield ([feature, input_seq], output_word)  # Return as tuple

       
# Loop through the epochs for training
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    
    # Set up data generators
    train_generator = data_generator(train, image_to_captions_mapping, loaded_features, tokenizer, max_caption_length, vocab_size, batch_size)
    test_generator = data_generator(test, image_to_captions_mapping, loaded_features, tokenizer, max_caption_length, vocab_size, batch_size)
    
    model.fit(train_generator, epochs=1, steps_per_epoch=steps_per_epoch,
          validation_data=test_generator, validation_steps=validation_steps,
          verbose=1)

Epoch 1/50



KeyboardInterrupt



In [97]:
print("Available keys in loaded_features:", list(loaded_features.keys())[:10])  # Print first 10 keys


Available keys in loaded_features: ['439037721_cdf1fc7358', '3571193625_835da90c5e', '542269487_5d77b363eb', '3057497487_57ecc60ff1', '441398149_297146e38d', '3430526230_234b3550f6', '2181724497_dbb7fcb0a9', '162743064_bb242faa31', '2754271176_4a2cda8c15', '3282897060_8c584e2ce8']


In [77]:
# Save the model
model.save(OUTPUT_DIR+'/mymodel.h5')

In [78]:
def get_word_from_index(index, tokenizer):
    return next((word for word, idx in tokenizer.word_index.items() if idx == index), None)

In [79]:
def predict_caption(model, image_features, tokenizer, max_caption_length):
    # Initialize the caption sequence
    caption = 'startseq'
    
    # Generate the caption
    for _ in range(max_caption_length):
        # Convert the current caption to a sequence of token indices
        sequence = tokenizer.texts_to_sequences([caption])[0]
        # Pad the sequence to match the maximum caption length
        sequence = pad_sequences([sequence], maxlen=max_caption_length)
        # Predict the next word's probability distribution
        yhat = model.predict([image_features, sequence], verbose=0)
        # Get the index with the highest probability
        predicted_index = np.argmax(yhat)
        # Convert the index to a word
        predicted_word = get_word_from_index(predicted_index, tokenizer)
        
        # Append the predicted word to the caption
        caption += " " + predicted_word
        
        # Stop if the word is None or if the end sequence tag is encountered
        if predicted_word is None or predicted_word == 'endseq':
            break
    
    return caption

In [80]:
# Initialize lists to store actual and predicted captions
actual_captions_list = []
predicted_captions_list = []

# Loop through the test data
for key in tqdm(test):
    # Get actual captions for the current image
    actual_captions = image_to_captions_mapping[key]
    # Predict the caption for the image using the model
    predicted_caption = predict_caption(model, loaded_features[key], tokenizer, max_caption_length)
    
    # Split actual captions into words
    actual_captions_words = [caption.split() for caption in actual_captions]
    # Split predicted caption into words
    predicted_caption_words = predicted_caption.split()
    
    # Append to the lists
    actual_captions_list.append(actual_captions_words)
    predicted_captions_list.append(predicted_caption_words)

  0%|          | 0/209 [00:00<?, ?it/s]

In [81]:
from nltk.translate.bleu_score import sentence_bleu

bleu_scores = []

for ref, hyp in zip(actual_captions_list, predicted_captions_list):
    try:
        score = sentence_bleu(ref, hyp, weights=(1.0, 0, 0, 0))  # BLEU-1
        bleu_scores.append(score)
    except Exception:
        continue

# Compute and print only the mean BLEU-1 score
mean_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
print(mean_bleu)


0.0830769230769229


In [82]:
from nltk.translate.bleu_score import sentence_bleu

bleu1_scores, bleu2_scores, bleu3_scores, bleu4_scores = [], [], [], []

for ref, hyp in zip(actual_captions_list, predicted_captions_list):
    try:
        bleu1_scores.append(sentence_bleu(ref, hyp, weights=(1.0, 0, 0, 0)))  # BLEU-1
        bleu2_scores.append(sentence_bleu(ref, hyp, weights=(0.5, 0.5, 0, 0)))  # BLEU-2
        bleu3_scores.append(sentence_bleu(ref, hyp, weights=(0.33, 0.33, 0.33, 0)))  # BLEU-3
        bleu4_scores.append(sentence_bleu(ref, hyp, weights=(0.25, 0.25, 0.25, 0.25)))  # BLEU-4
    except Exception:
        continue

# Compute mean BLEU scores
mean_bleu1 = sum(bleu1_scores) / len(bleu1_scores) if bleu1_scores else 0
mean_bleu2 = sum(bleu2_scores) / len(bleu2_scores) if bleu2_scores else 0
mean_bleu3 = sum(bleu3_scores) / len(bleu3_scores) if bleu3_scores else 0
mean_bleu4 = sum(bleu4_scores) / len(bleu4_scores) if bleu4_scores else 0

# Print results
print(f"BLEU-1: {mean_bleu1}")
print(f"BLEU-2: {mean_bleu2}")
print(f"BLEU-3: {mean_bleu3}")
print(f"BLEU-4: {mean_bleu4}")


BLEU-1: 0.0830769230769229
BLEU-2: 0.0167485232540874
BLEU-3: 0.00041095688632418085
BLEU-4: 9.111198281682361e-81


In [2]:
from tqdm.notebook import tqdm


# Initialize lists to store actual and predicted captions
actual_captions_list = []
predicted_captions_list = []

# Loop through the test data
for key in tqdm(test):
    # Get actual captions for the current image
    actual_captions = image_to_captions_mapping[key]
    # Predict the caption for the image using the model
    predicted_caption = predict_caption(model, loaded_features[key], tokenizer, max_caption_length)
    
    # Split actual captions into words
    actual_captions_words = [caption.split() for caption in actual_captions]
    # Split predicted caption into words
    predicted_caption_words = predicted_caption.split()
    
    # Append to the lists
    actual_captions_list.append(actual_captions_words)
    predicted_captions_list.append(predicted_caption_words)

print("Actual vs Predicted Captions:\n")

for idx, (actual, predicted) in enumerate(zip(actual_captions_list, predicted_captions_list)):
    # Actual is a list of lists (multiple ground truths)
    actual_sentences = [' '.join(words) for words in actual]
    predicted_sentence = ' '.join(predicted)
    print(f"Image {idx+1}:")
    print(f"  Actual: {actual_sentences}")
    print(f"  Predicted: {predicted_sentence}\n")


NameError: name 'test' is not defined