In [None]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Add, LayerNormalization, MultiHeadAttention
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import numpy as np
import os
from PIL import Image

# Dataset placeholders: replace both lists with real data before running.
image_paths = [...]  # List of image file paths
captions = [...]     # List of corresponding captions

# Character-level tokenizer: every character of a (lower-cased) caption is a
# token and no characters are filtered out.
tokenizer = Tokenizer(lower=True, filters='', char_level=True)
tokenizer.fit_on_texts(captions)
# One extra slot on top of the learned vocabulary; pad_sequences below pads
# with its default value 0.
vocab_size = 1 + len(tokenizer.word_index)
# Longest caption, measured in characters.
max_length = max(map(len, captions))

# Encode every caption and right-pad it to the common length.
sequences = tokenizer.texts_to_sequences(captions)
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_length)

# Hold out 20% of the (image path, caption) pairs for validation.
(train_image_paths, val_image_paths,
 train_captions, val_captions) = train_test_split(
    image_paths,
    padded_sequences,
    random_state=42,
    test_size=0.2,
)

# Load and preprocess images
def preprocess_image(image_path):
    """Load an image file as a (224, 224, 3) float32 array scaled to [0, 1].

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        ``np.ndarray`` of shape (224, 224, 3), dtype float32, values in [0, 1].

    Raises:
        Whatever ``PIL.Image.open`` raises for a missing or unreadable file.
    """
    # The context manager closes the underlying file as soon as the pixels
    # have been copied out by convert()/resize().
    with Image.open(image_path) as img:
        # Force three channels so grayscale, palette and RGBA files all yield
        # the same array shape and can be stacked into one batch.
        rgb = img.convert('RGB').resize((224, 224))
    # float32 is enough precision for [0, 1] pixels and halves the memory of
    # the float64 array that dividing a uint8 array would produce.
    # NOTE(review): pretrained ResNet50 weights are normally paired with
    # keras.applications.resnet50.preprocess_input rather than plain [0, 1]
    # scaling -- confirm which input range the downstream model expects.
    return np.asarray(rgb, dtype=np.float32) / 255.0

# Eagerly decode every image of each split into one stacked array.
train_images = np.array(list(map(preprocess_image, train_image_paths)))
val_images = np.array(list(map(preprocess_image, val_image_paths)))


In [None]:
# CNN Model for feature extraction
# ResNet50 without its classification head, initialised with ImageNet weights.
base_model = ResNet50(weights='imagenet', include_top=False)
# Average over the spatial grid so each image becomes a single feature vector.
cnn_output = tf.keras.layers.GlobalAveragePooling2D()(base_model.output)
cnn_model = Model(outputs=cnn_output, inputs=base_model.input)

# Transformer Decoder with Attention
class TransformerDecoder(tf.keras.layers.Layer):
    """Stack of self-attention layers mapping embedded tokens to vocabulary logits.

    The layer scales its input by sqrt(d_model), adds sinusoidal positional
    encodings, applies dropout, runs ``num_layers`` MultiHeadAttention
    self-attention layers in sequence and projects the result to
    ``vocab_size`` logits.

    Args:
        vocab_size: Size of the output vocabulary (width of the final Dense).
        d_model: Width of the embedded inputs; also passed to
            MultiHeadAttention as its second positional argument (``key_dim``).
        num_heads: Number of attention heads per layer.
        num_layers: Number of stacked self-attention layers.
        max_length: Longest sequence the positional encoding table covers.
        **kwargs: Standard ``tf.keras.layers.Layer`` arguments (e.g. ``name``).
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, max_length, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model
        # Kept as an attribute for backward compatibility; call() expects
        # already-embedded inputs and does not apply this layer.
        self.embedding = Embedding(vocab_size, d_model)
        self.pos_encoding = self.positional_encoding(max_length, d_model)
        self.dec_layers = [tf.keras.layers.MultiHeadAttention(num_heads, d_model) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(0.1)
        self.final_layer = tf.keras.layers.Dense(vocab_size)

    def positional_encoding(self, max_length, d_model):
        """Return the sinusoidal position table as a (1, max_length, d_model) float32 tensor."""
        angle_rads = self.get_angles(np.arange(max_length)[:, np.newaxis], np.arange(d_model)[np.newaxis, :], d_model)
        # Even feature indices carry the sine, odd indices the cosine.
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        pos_encoding = angle_rads[np.newaxis, ...]
        return tf.cast(pos_encoding, dtype=tf.float32)

    def get_angles(self, pos, i, d_model):
        """Return pos / 10000^(2*(i//2)/d_model) for the given position and feature indices."""
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        return pos * angle_rates

    def call(self, x, training=None, look_ahead_mask=None):
        """Run the decoder stack.

        Args:
            x: Already-embedded float inputs; the positional table of shape
                (1, max_length, d_model) is added to it, so it must be
                (batch, seq_len, d_model) with seq_len <= max_length.
            training: Forwarded to the dropout layer. ``None`` lets Keras
                decide from the calling context.
            look_ahead_mask: Forwarded as ``attention_mask`` to every
                MultiHeadAttention layer. When ``None`` a causal
                (lower-triangular) mask is built so that each position only
                attends to itself and earlier positions.

        Returns:
            Tuple ``(logits, attention_weights)`` where ``logits`` has a last
            dimension of ``vocab_size`` and ``attention_weights`` maps
            ``'decoder_layer{i}_block'`` to that layer's attention scores.
        """
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        if look_ahead_mask is None:
            # Without a mask the decoder would see the very token it is being
            # trained to predict under teacher forcing. True = may attend.
            lower_triangle = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
            look_ahead_mask = tf.cast(lower_triangle, tf.bool)[tf.newaxis, :, :]

        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(len(self.dec_layers)):
            # Self-attention: query, value and key are all the running state.
            x, block = self.dec_layers[i](x, x, x, attention_mask=look_ahead_mask, return_attention_scores=True)
            attention_weights[f'decoder_layer{i+1}_block'] = block

        output = self.final_layer(x)

        return output, attention_weights

# Combine the models
# The decoder is fed the caption without its last token (the data generator
# yields captions[:, :-1]), so its sequence length is max_length - 1.
decoder_length = max_length - 1
embedding_dim = 256

image_input = Input(shape=(224, 224, 3))
image_features = cnn_model(image_input)

# Condition the decoder on the image: project the pooled CNN features to the
# embedding width and add the same vector to every caption position.
image_context = Dense(embedding_dim, activation='relu')(image_features)
image_context = tf.keras.layers.RepeatVector(decoder_length)(image_context)

caption_input = Input(shape=(decoder_length,))
caption_embedding = Embedding(vocab_size, embedding_dim)(caption_input)
decoder_input = Add()([caption_embedding, image_context])

# Lower-triangular mask (True = may attend) so position t only sees
# positions <= t; the leading axis of size 1 broadcasts over the batch.
causal_mask = tf.cast(
    tf.linalg.band_part(tf.ones((1, decoder_length, decoder_length)), -1, 0),
    tf.bool,
)

# training=None lets Keras pick the mode from the surrounding fit/predict call.
decoder_output, _ = TransformerDecoder(vocab_size, embedding_dim, 8, 6, max_length)(
    decoder_input, training=None, look_ahead_mask=causal_mask
)

# The decoder already ends in a Dense(vocab_size) projection, so only the
# softmax normalisation is needed here (the loss expects probabilities).
outputs = tf.keras.layers.Softmax()(decoder_output)

model = Model(inputs=[image_input, caption_input], outputs=outputs)


In [None]:
# Compile the model
# Integer targets with per-class probabilities as predictions, hence the
# sparse categorical cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=1e-3),
    metrics=['accuracy'],
)

# Prepare data generators
def data_generator(image_paths, captions, batch_size):
    """Return an endless generator of teacher-forcing batches.

    Each item is ``([images, decoder_inputs], targets)`` where ``images`` is
    the stacked output of ``preprocess_image`` for the batch,
    ``decoder_inputs`` is ``captions[:, :-1]`` and ``targets`` is
    ``captions[:, 1:]``. The final batch of a pass may be smaller than
    ``batch_size``; after it the generator starts over from the first sample.

    Args:
        image_paths: Sequence of image paths, aligned with ``captions``.
        captions: 2-D array-like of token ids, one row per image.
        batch_size: Positive number of samples per batch.

    Raises:
        ValueError: If ``batch_size`` is not positive, there are no samples,
            ``captions`` is not 2-D, or the two inputs differ in length.
            Validation happens at call time, before any image is read.
    """
    captions = np.asarray(captions)  # allow plain lists; the slicing below needs 2-D indexing
    num_samples = len(image_paths)

    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if num_samples == 0:
        # An empty dataset would otherwise make the endless loop spin forever
        # without ever yielding.
        raise ValueError("image_paths is empty")
    if captions.ndim != 2:
        raise ValueError(f"captions must be 2-D (samples, tokens), got ndim={captions.ndim}")
    if len(captions) != num_samples:
        raise ValueError(
            f"image_paths and captions differ in length: {num_samples} vs {len(captions)}"
        )

    return _iter_caption_batches(image_paths, captions, batch_size)


def _iter_caption_batches(image_paths, captions, batch_size):
    """Loop forever over the data, yielding one batch per step."""
    num_samples = len(image_paths)
    while True:
        for offset in range(0, num_samples, batch_size):
            batch_images = np.array([preprocess_image(path) for path in image_paths[offset:offset+batch_size]])
            batch_captions = captions[offset:offset+batch_size]
            # Inputs drop the last token, targets drop the first: position t
            # of the input is trained to predict token t + 1.
            yield [batch_images, batch_captions[:, :-1]], batch_captions[:, 1:]

# Train the model
batch_size = 32
# Ceiling division: the generator also yields the final, smaller batch of each
# pass, so counting it makes one epoch exactly one pass over the data and
# avoids 0 steps when a split holds fewer than batch_size samples.
train_steps = -(-len(train_image_paths) // batch_size)
val_steps = -(-len(val_image_paths) // batch_size)

# Keep the History object so loss/accuracy curves can be inspected afterwards.
history = model.fit(
    data_generator(train_image_paths, train_captions, batch_size),
    steps_per_epoch=train_steps,
    validation_data=data_generator(val_image_paths, val_captions, batch_size),
    validation_steps=val_steps,
    epochs=20
)
