In [None]:
import os
import json
import pickle
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
!unzip -q "/content/annotations_trainval2017.zip" -d /content/
ann_path = "/content/annotations/captions_train2017.json"
if not os.path.exists(ann_path):
    raise FileNotFoundError("captions_train2017.json not found!")

with open(ann_path, 'r') as f:
    annotations = json.load(f)

# Lookup table: image id -> image file name.
id_to_filename = {}
for img in annotations['images']:
    id_to_filename[img['id']] = img['file_name']

# Collect every caption under its image file name, lower-cased and wrapped in
# the startseq/endseq markers.
descriptions = {}
for ann in annotations['annotations']:
    img_id, caption = ann['image_id'], ann['caption']
    img_filename = id_to_filename[img_id]
    descriptions.setdefault(img_filename, []).append(
        f'startseq {caption.lower()} endseq'
    )

print("Total captions loaded:", sum(map(len, descriptions.values())))

# NOTE(security): pickle.load can execute arbitrary code while unpickling.
# Only load a features.pkl that was produced by a trusted process.
with open("/content/features.pkl", "rb") as f:
    features = pickle.load(f)

# Keep only the images for which features are available.
descriptions = {k: v for k, v in descriptions.items() if k in features}
if not descriptions:
    # Without this check the failure would surface later as an opaque
    # "max() arg is an empty sequence" error.
    raise ValueError(
        "No caption file name matches a key in features.pkl; "
        "check that both use the same file-name format."
    )

# Flatten the per-image caption lists into one corpus for the tokenizer.
all_captions = [cap for caps in descriptions.values() for cap in caps]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
# +1 because the tokenizer never assigns index 0 (it is used for padding).
vocab_size = len(tokenizer.word_index) + 1

# Measure caption length in *tokenizer* tokens, not whitespace-separated words:
# the tokenizer also splits on (and drops) punctuation, so str.split() can
# under- or over-count and let pad_sequences truncate the longest captions.
max_length = max(len(seq) for seq in tokenizer.texts_to_sequences(all_captions))

print("Vocab size:", vocab_size)
print("Max length:", max_length)


Total captions loaded: 591753
Vocab size: 27551
Max length: 51


In [None]:
from tensorflow.keras.utils import to_categorical

def data_generator(descriptions, features, tokenizer, max_length, vocab_size):
    """Yield single (unbatched) next-word training samples forever.

    Each caption is expanded into one sample per prediction step: the input is
    the caption prefix and the target is the token that follows it.

    NOTE: a batched function with the same name is defined in a later cell of
    this notebook and replaces this one as soon as that cell runs.

    Args:
        descriptions: mapping of image key -> list of caption strings.
        features: mapping of image key -> feature vector, stored either flat
            (dim,) or with a leading singleton axis (1, dim).
        tokenizer: object exposing ``texts_to_sequences``.
        max_length: length every input prefix is padded to.
        vocab_size: number of classes for the one-hot target.

    Yields:
        ``([feature, in_seq], out_seq)`` where ``feature`` is the flat image
        vector, ``in_seq`` the padded prefix and ``out_seq`` the one-hot target.

    Raises:
        ValueError: if a full pass over ``descriptions`` produces no sample
            (previously this case looped forever without yielding).
    """
    while True:
        produced = False
        for fname, caps in descriptions.items():
            feature = np.asarray(features[fname])
            # Indexing [0] unconditionally turned flat (dim,) features into a
            # scalar; only strip the leading axis when there really is one.
            if feature.ndim == 2:
                feature = feature[0]
            for cap in caps:
                seq = tokenizer.texts_to_sequences([cap])[0]
                for i in range(1, len(seq)):
                    in_seq = pad_sequences([seq[:i]], maxlen=max_length)[0]
                    out_seq = to_categorical([seq[i]], num_classes=vocab_size)[0]
                    produced = True
                    yield ([feature, in_seq], out_seq)
        if not produced:
            raise ValueError(
                "data_generator: no training samples could be built from `descriptions`."
            )

In [None]:
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, Add
from tensorflow.keras.models import Model

# Image branch: dropout on the 2048-wide feature input, then a 256-unit projection.
inputs1 = Input(shape=(2048,))
fe1 = Dropout(rate=0.5)(inputs1)
fe2 = Dense(units=256, activation='relu')(fe1)

# Text branch: embed the padded token ids (index 0 is masked) and run an LSTM.
inputs2 = Input(shape=(max_length,))
se1 = Embedding(input_dim=vocab_size, output_dim=256, mask_zero=True)(inputs2)
se2 = Dropout(rate=0.5)(se1)
se3 = LSTM(units=256)(se2)

# Decoder: add both 256-wide branches and predict a distribution over the vocabulary.
decoder1 = Add()([fe2, se3])
decoder2 = Dense(units=256, activation='relu')(decoder1)
outputs = Dense(units=vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.summary()

In [None]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def data_generator(descriptions, features, tokenizer, max_length, vocab_size, batch_size=32):
    """Yield batches of next-word training samples forever.

    Each caption is expanded into one sample per prediction step (caption
    prefix -> following token). Samples are accumulated until ``batch_size``
    of them are available; a partial batch left over at the end of a pass over
    ``descriptions`` is discarded before the next pass starts.

    Args:
        descriptions: mapping of image key -> list of caption strings.
        features: mapping of image key -> feature vector, stored either flat
            (dim,) or with a leading singleton axis (1, dim).
        tokenizer: object exposing ``texts_to_sequences``.
        max_length: length every input prefix is padded to.
        vocab_size: number of classes for the one-hot targets.
        batch_size: number of samples per yielded batch.

    Yields:
        ``((X1, X2), y)`` with ``X1`` float32 of shape (batch, dim), ``X2``
        int32 of shape (batch, max_length) and ``y`` float32 of shape
        (batch, vocab_size).

    Raises:
        ValueError: if ``batch_size`` is smaller than 1, or if a full pass
            cannot fill even one batch (both cases previously looped forever).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    while True:
        X1, X2, y = [], [], []
        produced = False
        for key, desc_list in descriptions.items():
            feature = np.asarray(features[key], dtype='float32')
            # Drop a leading singleton axis so stacking always gives (batch, dim).
            if feature.ndim == 2:
                feature = feature[0]
            for desc in desc_list:
                seq = tokenizer.texts_to_sequences([desc])[0]
                for i in range(1, len(seq)):
                    X1.append(feature)
                    X2.append(pad_sequences([seq[:i]], maxlen=max_length)[0])
                    y.append(to_categorical([seq[i]], num_classes=vocab_size)[0])

                    if len(X1) == batch_size:
                        X1_batch = np.stack(X1, axis=0).astype('float32')
                        X2_batch = np.array(X2, dtype='int32')
                        y_batch = np.array(y, dtype='float32')

                        produced = True
                        yield (X1_batch, X2_batch), y_batch

                        X1, X2, y = [], [], []
        if not produced:
            raise ValueError(
                "data_generator: `descriptions` holds fewer samples than batch_size; "
                "no batch can be produced."
            )


In [None]:
# Smoke test: pull exactly one batch from the generator and report its shapes.
gen = data_generator(descriptions, features, tokenizer, max_length, vocab_size, 32)
(img_f, seq), target = next(gen)
print("FINAL SHAPES:", img_f.shape, seq.shape, target.shape)

DEBUG BATCH SHAPE: (32, 2048) (32, 51) (32, 27551)
FINAL SHAPES: (32, 2048) (32, 51) (32, 27551)


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import mixed_precision
from tensorflow.keras.callbacks import ModelCheckpoint

# Set the global Keras dtype policy to 'mixed_float16'.
# NOTE(review): `model` was constructed in an earlier cell, before this call.
# Layers presumably keep the policy that was active when they were created, so
# confirm the policy actually applies to the model trained below.
mixed_precision.set_global_policy('mixed_float16')
# Enable XLA JIT compilation in TensorFlow's optimizer settings.
tf.config.optimizer.set_jit(True)

class CaptionDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves next-word training batches.

    On construction every caption is expanded into (feature, prefix, next
    token) triples held in ``self.pairs``. Batches are consecutive slices of
    that list; prefixes are zero-padded on the right (``padding='post'``) and
    targets are integer class ids, i.e. meant for a sparse categorical loss.

    Args:
        descriptions: mapping of image key -> list of caption strings.
        features: mapping of image key -> feature vector, stored either flat
            (dim,) or with a leading singleton axis (1, dim).
        tokenizer: object exposing ``texts_to_sequences``.
        max_length: length every prefix is padded to.
        batch_size: number of samples per batch.
        **kwargs: forwarded to the Keras base class.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """

    def __init__(self, descriptions, features, tokenizer, max_length, batch_size=256, **kwargs):
        # The base class must be initialised; skipping this call is what made
        # Keras emit the `_warn_if_super_not_called` warning during fit().
        super().__init__(**kwargs)

        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        self.batch_size = batch_size
        self.features = features
        self.tokenizer = tokenizer
        self.max_length = max_length

        self.pairs = []
        for key, desc_list in descriptions.items():
            feature = features[key]
            # np.ndim inspects the rank without building a throw-away array copy.
            if np.ndim(feature) == 2:
                feature = feature[0]
            for seq in tokenizer.texts_to_sequences(desc_list):
                for i in range(1, len(seq)):
                    # All samples of one image share the same feature object.
                    self.pairs.append((feature, seq[:i], seq[i]))

        print("Total training samples:", len(self.pairs))

    def __len__(self):
        """Return the number of full batches; a trailing partial batch is dropped."""
        return len(self.pairs) // self.batch_size

    def __getitem__(self, index):
        """Return batch ``index`` as ``((X1, X2), y)``.

        ``X1`` is float32 (batch, dim), ``X2`` the post-padded prefixes of
        shape (batch, max_length) and ``y`` the int32 target token ids.
        An out-of-range index yields correctly shaped empty arrays.
        """
        start = index * self.batch_size
        batch = self.pairs[start:start + self.batch_size]

        if not batch:
            # Read the feature width from the first stored feature only,
            # instead of materialising a list of every feature.
            feature_dim = np.shape(next(iter(self.features.values())))[-1]
            return (np.zeros((0, feature_dim), dtype=np.float32),
                    np.zeros((0, self.max_length), dtype=np.int32)), np.zeros((0,), dtype=np.int32)

        X1 = np.array([b[0] for b in batch], dtype=np.float32)
        X2 = pad_sequences([b[1] for b in batch], maxlen=self.max_length, padding='post')
        y = np.array([b[2] for b in batch], dtype=np.int32)

        return (X1, X2), y

# Training data: one Sequence over every (image, caption prefix) sample.
batch_size = 256
vocab_size = 1 + len(tokenizer.word_index)
train_gen = CaptionDataGenerator(
    descriptions=descriptions,
    features=features,
    tokenizer=tokenizer,
    max_length=max_length,
    batch_size=batch_size,
)

# The generator emits integer token ids, so the model is re-compiled with the
# sparse variant of cross-entropy (the model outputs probabilities, not logits).
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=loss_fn,
    metrics=['accuracy'],
)

# Write a full-model checkpoint whenever the training loss improves.
checkpoint = ModelCheckpoint(
    filepath="checkpoint_epoch_{epoch:02d}_loss_{loss:.3f}.keras",
    monitor="loss",
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
)

model.fit(x=train_gen, epochs=20, verbose=1, callbacks=[checkpoint])

model.save("image_caption_coco_fast.h5")
print("Training complete! Model saved.")

Total training samples: 6785091
Epoch 1/20


  self._warn_if_super_not_called()


[1m26501/26504[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 14ms/step - accuracy: 0.3214 - loss: 4.0592
Epoch 1: loss improved from inf to 3.47638, saving model to checkpoint_epoch_01_loss_3.476.keras
[1m26504/26504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m372s[0m 14ms/step - accuracy: 0.3214 - loss: 4.0591
Epoch 2/20
[1m26502/26504[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 14ms/step - accuracy: 0.4303 - loss: 2.9089
Epoch 2: loss improved from 3.47638 to 2.87275, saving model to checkpoint_epoch_02_loss_2.873.keras
[1m26504/26504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m381s[0m 14ms/step - accuracy: 0.4303 - loss: 2.9089
Epoch 3/20
[1m26501/26504[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 14ms/step - accuracy: 0.4493 - loss: 2.7432
Epoch 3: loss improved from 2.87275 to 2.73296, saving model to checkpoint_epoch_03_loss_2.733.keras
[1m26504/26504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m361s[0m 14ms/step - accuracy: 0.4493 - lo



Training complete! Model saved.


In [None]:
import pickle

# Store the trained model in the .keras format.
model.save("image_caption_model.keras")

# Pickle the fitted tokenizer next to the model.
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
print('saved the model')

saved the model


In [None]:
from google.colab import files

# Offer each exported artifact as a browser download, in the same order as before.
for artifact in ("image_caption_coco_fast.h5", "image_caption_model.keras", "tokenizer.pkl"):
    files.download(artifact)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>