In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.efficientnet_v2 import EfficientNetV2B0
from tensorflow.keras.layers import Input, LSTM, Dense, GlobalAveragePooling2D, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
import numpy as np
import os
import pandas as pd
import cv2

print("Module imports complete")

2023-12-15 10:35:33.019839: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-15 10:35:33.051948: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-15 10:35:33.051986: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-15 10:35:33.053270: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-15 10:35:33.059191: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-15 10:35:33.060034: I tensorflow/core/platform/cpu_feature_guard.cc:1

Module imports complete


In [3]:
# Load the Flickr8k dataset
data_dir = 'Dataset/'
image_dir = os.path.join(data_dir, 'Images/')
caption_file = os.path.join(data_dir, 'captions.txt')
df = pd.read_csv(caption_file)
# Define the maximum length of the captions
max_length = 50

# Define the tokenizer for the captions
tokenizer = Tokenizer()

# Load the captions and preprocess them
with open(caption_file, 'r') as f:
    captions = f.readlines()

captions = df['caption'].tolist()
tokenizer.fit_on_texts(captions)


# Tokenize and pad sequences
sequences = tokenizer.texts_to_sequences(captions)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')


# Define the vocabulary size
vocab_size = len(tokenizer.word_index) + 1

print("Data preprocessed complete")

Data preprocessed complete


In [4]:


train_datagen = ImageDataGenerator(rescale=1. / 255)
valid_datagen = ImageDataGenerator(rescale=1. / 255)

# Flow from dataframe
# Flow from dataframe with custom preprocessing function
def preprocess_caption(caption):
    sequence = tokenizer.texts_to_sequences([caption])[0]
    return sequence

class CaptionDataGenerator(tf.keras.utils.Sequence):
    def __init__(self, dataframe, directory, tokenizer, target_size, batch_size=32, shuffle=False):
        self.dataframe = dataframe
        self.directory = directory
        self.tokenizer = tokenizer
        self.target_size = target_size
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indexes = np.arange(len(self.dataframe))
        self.on_epoch_end()

    def __len__(self):
        return int(np.ceil(len(self.dataframe) / self.batch_size))

    def __getitem__(self, index):
        start = index * self.batch_size
        end = (index + 1) * self.batch_size
        batch_indexes = self.indexes[start:end]

        batch_images = []
        batch_captions = []

        max_caption_length = max([len(self.tokenizer.texts_to_sequences([self.dataframe['caption'].iloc[i]])[0]) for i in batch_indexes])

        for i in batch_indexes:
            image_path = os.path.join(self.directory, self.dataframe['image'].iloc[i])
            image = cv2.imread(image_path)
            image = cv2.resize(image, self.target_size)
            image = image / 255.0  # Normalize the image
            batch_images.append(image)

            caption = self.dataframe['caption'].iloc[i]
            sequence = self.tokenizer.texts_to_sequences([caption])[0]
            padded_sequence = pad_sequences([sequence], maxlen=max_caption_length, padding='post')[0]
            batch_captions.append(padded_sequence)

        return np.array(batch_images), np.array(batch_captions)


    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indexes)

# Create custom generators
train_generator = CaptionDataGenerator(df, os.path.join(data_dir, 'Images'), tokenizer, (224, 224), batch_size=32, shuffle=True)
valid_generator = CaptionDataGenerator(df, os.path.join(data_dir, 'Images'), tokenizer, (224, 224), batch_size=32, shuffle=False)


print("Image preprocessing complete")
for batch in train_generator:
    images, captions = batch  # Assuming your generator yields images and captions
    print("Image Shape:", images.shape)
    print("Caption Type:", type(captions))  # captions is a list, not a numpy array
    print("First Caption:", captions[0])  # Accessing the first caption in the batch
    break  # Print only the first batch

print("Done")

Image preprocessing complete
Image Shape: (32, 224, 224, 3)
Caption Type: <class 'numpy.ndarray'>
First Caption: [  1   9   2 212   4   1 220 256 409   3 246   0   0   0   0   0   0   0
   0   0   0]
Done


In [5]:
# Load the EfficientNetV2B0 model
image_model = EfficientNetV2B0(weights='imagenet', input_shape=(224, 224, 3), include_top=False)

# Freeze the layers of the EfficientNetV2B0 model
for layer in image_model.layers:
    layer.trainable = False

# Build the encoder-decoder model
inputs = Input(shape=(224, 224, 3))  # Adjusted input shape

x = image_model(inputs, training=False)
x = GlobalAveragePooling2D()(x)
print("Encoder-decoder model complete")

Encoder-decoder model complete


In [6]:
# Reshape the output of GlobalAveragePooling2D to match the LSTM output shape
# Reshape the output of GlobalAveragePooling2D to match the LSTM output shape
x_reshaped = tf.keras.layers.Reshape((1, -1))(x)



# Add an embedding layer for captions
embedding_layer = Embedding(input_dim=vocab_size, output_dim=256, input_length=max_length)(padded_sequences)

# Use LSTM for caption processing
lstm = LSTM(256)(embedding_layer)

# Reshape the output of LSTM to match the GlobalAveragePooling2D output shape
lstm_reshaped = tf.keras.layers.Reshape((1, -1))(lstm)

# Concatenate image and caption features
merged = tf.keras.layers.concatenate([x_reshaped, lstm_reshaped], axis=-1)

outputs = Dense(vocab_size, activation='softmax')(merged)

model = Model(inputs=inputs, outputs=outputs)  # Added inputs parameter

print("here")


2023-12-15 10:35:51.765674: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2071296000 exceeds 10% of free system memory.
2023-12-15 10:35:51.942807: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2071296000 exceeds 10% of free system memory.


here


In [11]:
def loss_fn(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    final_loss = tf.keras.losses.sparse_categorical_crossentropy(real, pred)

    mask = tf.cast(mask, dtype=final_loss.dtype)
    final_loss *= mask

    return tf.reduce_sum(final_loss)


# Compile the model
# model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.compile(loss=loss_fn, optimizer='adam', metrics=['accuracy'])
# Define the callbacks
filepath = 'weights.best.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

print("Model compiled")
print("Caption Data Type:", type(padded_sequences))


Model compiled
Caption Data Type: <class 'numpy.ndarray'>


In [12]:
# Train the model
model.fit(train_generator, epochs=100, validation_data=valid_generator, validation_steps=20, callbacks=[checkpoint, reduce_lr])
print("Model trained")

Epoch 1/100


InvalidArgumentError: Graph execution error:

Detected at node loss_fn/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert defined at (most recent call last):
  File "/home/codespace/.python/current/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/home/codespace/.python/current/lib/python3.10/runpy.py", line 86, in _run_code

  File "/home/codespace/.local/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>

  File "/home/codespace/.local/lib/python3.10/site-packages/traitlets/config/application.py", line 1077, in launch_instance

  File "/home/codespace/.local/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/home/codespace/.local/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/home/codespace/.python/current/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/home/codespace/.python/current/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/home/codespace/.python/current/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/home/codespace/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 529, in dispatch_queue

  File "/home/codespace/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 518, in process_one

  File "/home/codespace/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 424, in dispatch_shell

  File "/home/codespace/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 766, in execute_request

  File "/home/codespace/.local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 429, in do_execute

  File "/home/codespace/.local/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/home/codespace/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3048, in run_cell

  File "/home/codespace/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3103, in _run_cell

  File "/home/codespace/.local/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/home/codespace/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3308, in run_cell_async

  File "/home/codespace/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3490, in run_ast_nodes

  File "/home/codespace/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3550, in run_code

  File "/tmp/ipykernel_50712/847388464.py", line 2, in <module>

  File "/home/codespace/.python/current/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/codespace/.python/current/lib/python3.10/site-packages/keras/src/engine/training.py", line 1807, in fit

  File "/home/codespace/.python/current/lib/python3.10/site-packages/keras/src/engine/training.py", line 1401, in train_function

  File "/home/codespace/.python/current/lib/python3.10/site-packages/keras/src/engine/training.py", line 1384, in step_function

  File "/home/codespace/.python/current/lib/python3.10/site-packages/keras/src/engine/training.py", line 1373, in run_step

  File "/home/codespace/.python/current/lib/python3.10/site-packages/keras/src/engine/training.py", line 1151, in train_step

  File "/home/codespace/.python/current/lib/python3.10/site-packages/keras/src/engine/training.py", line 1209, in compute_loss

  File "/home/codespace/.python/current/lib/python3.10/site-packages/keras/src/engine/compile_utils.py", line 277, in __call__

  File "/home/codespace/.python/current/lib/python3.10/site-packages/keras/src/losses.py", line 143, in __call__

  File "/home/codespace/.python/current/lib/python3.10/site-packages/keras/src/losses.py", line 270, in call

  File "/tmp/ipykernel_50712/4044029749.py", line 3, in loss_fn

  File "/home/codespace/.python/current/lib/python3.10/site-packages/keras/src/losses.py", line 2454, in sparse_categorical_crossentropy

  File "/home/codespace/.python/current/lib/python3.10/site-packages/keras/src/backend.py", line 5775, in sparse_categorical_crossentropy

assertion failed: [Condition x == y did not hold element-wise:] [x (loss_fn/SparseSoftmaxCrossEntropyWithLogits/Shape_1:0) = ] [32 18] [y (loss_fn/SparseSoftmaxCrossEntropyWithLogits/strided_slice:0) = ] [40455 1]
	 [[{{node loss_fn/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert}}]] [Op:__inference_train_function_29628]

In [None]:
# Save the model
model.save('image_captioning_model.h5')

print("done")