# Imports

In [None]:
# Tensorflow 
# Importing necessary library or function
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, Add
from tensorflow.keras.models import Model

# Code Cell

In [None]:
# Importing necessary library or function
import numpy as np
import pandas as pd
import re
import numpy as np
import os

# Preprocessing

In [10]:
def clean_caption(caption):
    """Normalize a raw caption and wrap it in sequence markers.

    Steps, in order:
      1. lowercase the text;
      2. drop every character that is not a lowercase letter, a digit,
         whitespace or one of ``. , ! ?``;
      3. collapse each whitespace run into a single space;
      4. remove isolated single letters that have whitespace on both sides
         (a letter at the very start or end of the text, with no whitespace
         on that side, is kept);
      5. strip the result and wrap it as ``startseq <text> endseq``.

    Args:
        caption: Raw caption text.

    Returns:
        The cleaned caption string, including the start/end markers.
    """
    caption = caption.lower()

    # Keep letters, digits, whitespace and basic punctuation only
    caption = re.sub(r'[^a-z0-9\s.,!?]', '', caption)

    # Normalize spaces
    caption = re.sub(r'\s+', ' ', caption)

    # Remove isolated single letters. The trailing whitespace is only
    # asserted with a lookahead (not consumed), so it can still serve as the
    # leading whitespace of the next isolated letter; consuming it would leave
    # every second letter of a run such as "x a b y" in place.
    caption = re.sub(r'\s+[a-z](?=\s)', '', caption)

    # Add start and end tokens
    return 'startseq ' + caption.strip() + ' endseq'

# Vocabulary Construction
def create_vocabulary(captions):
    """Fit a Keras ``Tokenizer`` on the captions and describe its vocabulary.

    Args:
        captions: Iterable of caption strings to fit the tokenizer on.

    Returns:
        A ``(tokenizer, vocab, vocab_size)`` tuple where ``vocab`` is the
        tokenizer's ``word_index`` mapping and ``vocab_size`` is its length
        plus one (the extra slot is for the padding token).
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(captions)

    word_to_index = fitted.word_index
    # One extra entry for the padding token
    return fitted, word_to_index, len(word_to_index) + 1

# Model Training

In [11]:
# 1. Load and organize data
def load_caption_data(filename, image_dir):
    """Read a caption file and group the cleaned captions by image path.

    Every non-blank line must consist of an image name, a tab character and
    the caption. Only the first tab splits the line, so any further tabs stay
    part of the caption text. Blank lines are skipped.

    Args:
        filename: Path of the tab-separated caption file.
        image_dir: Directory that is joined in front of every image name to
            form the dictionary key.

    Returns:
        dict mapping ``os.path.join(image_dir, image_name)`` to the list of
        captions for that image, each passed through ``clean_caption``.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    image_caption_dict = {}
    # Explicit encoding so the result does not depend on the platform default
    with open(filename, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Tolerate empty lines (e.g. a trailing newline at end of file)
                continue
            img_name, sep, caption = line.partition('\t')
            if not sep:
                raise ValueError(
                    f"{filename}:{line_no}: expected '<image>\\t<caption>', got {line!r}"
                )
            img_path = os.path.join(image_dir, img_name)
            image_caption_dict.setdefault(img_path, []).append(clean_caption(caption))
    return image_caption_dict

# CNN Image Feature Extraction
def extract_image_features(image_paths, folder, img_size=(224, 224)):
    """Run every image through ResNet50 (no top) and collect the feature maps.

    Args:
        image_paths: Sized iterable of image file paths.
        folder: Folder name joined in front of each image's base name to build
            the key under which its features are stored.
        img_size: ``(height, width)`` the images are resized to; also used as
            the spatial part of the network's input shape.

    Returns:
        dict mapping ``os.path.join(folder, basename(path))`` to a NumPy array
        of shape ``(1, positions, channels)``, i.e. the network output with
        its two spatial axes flattened into one. Images that fail to load or
        process are reported on stdout and left out of the result.
    """
    # Use ResNet50 without top layers for feature extraction
    print("Initializing ResNet50 model...")
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(*img_size, 3))
    model = Model(inputs=base_model.input, outputs=base_model.output)
    print("Model initialized successfully.")

    features = {}
    total_images = len(image_paths)
    print(f"Starting feature extraction for {total_images} images...")

    for i, img_path in enumerate(image_paths):
        # Report progress every 10 images; flush so it shows up immediately
        if i % 10 == 0:
            print(
                f"Processing image {i}/{total_images} ({(i/total_images)*100:.2f}%)",
                flush=True,
            )
        try:
            img = tf.keras.preprocessing.image.load_img(img_path, target_size=img_size)
            img_array = tf.keras.preprocessing.image.img_to_array(img)
            img_array = np.expand_dims(img_array, axis=0)
            img_array = tf.keras.applications.resnet50.preprocess_input(img_array)
            feature = model.predict(img_array, verbose=0)
            # Flatten the spatial grid: (batch, h, w, c) -> (batch, h*w, c)
            feature = np.reshape(feature, (feature.shape[0], -1, feature.shape[3]))
            # os.path.join, not os.join: the latter does not exist, and the
            # resulting AttributeError was swallowed by the handler below,
            # leaving the returned dictionary empty.
            img_id = os.path.join(folder, os.path.basename(img_path))
            features[img_id] = feature
            # Print a clear success message every 100 images
            if i % 100 == 0 and i > 0:
                print(f"✓ Successfully processed {i} images")
        except Exception as e:
            # Best effort: one unreadable image must not abort the whole run
            print(f"Error processing {img_path}: {e}")
    print(f"Feature extraction completed. Processed {len(features)} images successfully.")
    return features

# Data generator for training
def data_generator(image_features, captions_dict, tokenizer, max_length, batch_size, vocab_size):
    """Endlessly yield training batches of (image, partial caption) -> next word.

    On every pass the usable image ids are shuffled, one caption per image is
    drawn at random, and each caption is unrolled into one sample per
    predicted token. Full batches are yielded as soon as ``batch_size``
    samples have accumulated; whatever is left at the end of a pass is yielded
    as a smaller final batch.

    Args:
        image_features: Mapping from image id to a feature array; element
            ``[0]`` of that array is used as the image input.
        captions_dict: Mapping from image id to a list of caption strings.
        tokenizer: Object providing ``texts_to_sequences``.
        max_length: Length the partial captions are post-padded to.
        batch_size: Number of samples per full batch.
        vocab_size: Number of classes for the one-hot targets.

    Yields:
        ``([image_batch, sequence_batch], target_batch)`` as NumPy arrays.
    """
    # Only images that have both captions and extracted features
    usable_ids = [key for key in captions_dict.keys() if key in image_features]

    while True:
        # New order for every pass over the data
        np.random.shuffle(usable_ids)
        img_batch, seq_batch, target_batch = [], [], []

        for key in usable_ids:
            img_feature = image_features[key]
            # One randomly chosen caption per image and pass
            chosen = np.random.choice(captions_dict[key])
            token_ids = tokenizer.texts_to_sequences([chosen])[0]

            # Every prefix of the caption predicts the token that follows it
            for split in range(1, len(token_ids)):
                prefix = pad_sequences(
                    [token_ids[:split]], maxlen=max_length, padding='post'
                )[0]
                target = tf.keras.utils.to_categorical(
                    [token_ids[split]], num_classes=vocab_size
                )[0]

                img_batch.append(img_feature[0])
                seq_batch.append(prefix)
                target_batch.append(target)

                if len(target_batch) == batch_size:
                    yield [np.array(img_batch), np.array(seq_batch)], np.array(target_batch)
                    img_batch, seq_batch, target_batch = [], [], []

        # Flush the incomplete batch left over from this pass
        if target_batch:
            yield [np.array(img_batch), np.array(seq_batch)], np.array(target_batch)

# Modify model architecture parameters in create_model()
def create_model(vocab_size, max_length, embedding_dim=512, units=512):
    """Build and compile the two-input caption model.

    The image branch applies dropout and a ReLU dense projection to a
    2048-dimensional feature vector. The text branch embeds a token sequence
    of length ``max_length`` (index 0 is masked), applies dropout and runs two
    stacked LSTMs. Both branches are summed, passed through a dense layer of
    ``units * 2`` with dropout, and finished with a softmax over the
    vocabulary.

    Args:
        vocab_size: Size of the embedding input and of the softmax output.
        max_length: Length of the token sequence input.
        embedding_dim: Size of the word embeddings.
        units: Size of both LSTM layers and of the image projection.

    Returns:
        A compiled Keras ``Model`` taking ``[image_features, token_sequence]``,
        using categorical cross-entropy, Adam (learning rate 0.001) and an
        accuracy metric.
    """
    # Image feature input
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.3)(inputs1)
    # Project to `units` (the LSTM output size) rather than `embedding_dim`:
    # Add() needs both operands to have the same shape, so projecting to
    # `embedding_dim` only worked when the two values happened to be equal.
    fe2 = Dense(units, activation='relu')(fe1)

    # Sequence input
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
    se2 = Dropout(0.3)(se1)
    se3 = LSTM(units, return_sequences=True)(se2)  # full sequence for the next LSTM
    se4 = LSTM(units)(se3)  # final state only

    # Decoder: merge both branches and predict the next word
    decoder1 = Add()([fe2, se4])
    decoder2 = Dense(units * 2, activation='relu')(decoder1)
    decoder3 = Dropout(0.3)(decoder2)
    outputs = Dense(vocab_size, activation='softmax')(decoder3)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return model

# Modify the generate_caption function to handle batch dimensions correctly
def generate_caption(model, image_feature, tokenizer, max_length):
    """Greedily decode a caption for one image.

    Starting from ``'startseq'``, the current text is tokenized, post-padded
    to ``max_length`` and fed to the model together with the image feature.
    The highest-scoring index is turned back into a word and appended. This
    repeats until ``'endseq'`` is predicted, the predicted index has no word,
    or ``max_length`` steps have been taken.

    Args:
        model: Object whose ``predict([image_feature, sequence], verbose=0)``
            returns the next-word scores.
        image_feature: Image feature array; a 1-D array gets a leading batch
            axis added.
        tokenizer: Object providing ``texts_to_sequences`` and ``word_index``.
        max_length: Padding length and maximum number of decoding steps.

    Returns:
        The generated caption with ``'startseq'`` removed and surrounding
        whitespace stripped.
    """
    # Start with the start sequence token
    in_text = 'startseq'

    # Add batch dimension to image_feature if not present
    if len(image_feature.shape) == 1:
        image_feature = np.expand_dims(image_feature, axis=0)

    # Build the reverse vocabulary once instead of scanning word_index
    # linearly on every decoding step.
    index_to_word = {idx: w for w, idx in tokenizer.word_index.items()}

    for _ in range(max_length):
        # Encode and pad the text generated so far
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length, padding='post')

        # Predict the next word and take the most likely index
        yhat = model.predict([image_feature, sequence], verbose=0)
        word = index_to_word.get(int(np.argmax(yhat)))

        # Stop if the index has no word (e.g. padding) or the end token is reached
        if word is None or word == 'endseq':
            break

        in_text += ' ' + word

    # Remove the start token
    caption = in_text.replace('startseq', '')

    return caption.strip()


# Code Cell

In [12]:
# Improved parameters for better model performance
# Hyperparameters shared by the dataset builders, the model and training
max_length = 38  # partial captions are padded to this many tokens
embedding_dim = 256  # word embedding size
units = 256  # LSTM size
batch_size = 32
epochs = 20

# Caption files (adjust the paths as needed)
train_captions_file = 'train.txt'
validation_captions_file = 'val.txt'

# Cleaned captions grouped by image path
train_captions = load_caption_data(train_captions_file, 'train/train')
validation_captions = load_caption_data(validation_captions_file, 'val/val')

# Image ids of each split (views on the dictionaries loaded above)
train_ids = train_captions.keys()
val_ids = validation_captions.keys()

# Per-split copies of the caption dictionaries
train_data = dict(train_captions)
val_data = dict(validation_captions)

# The vocabulary is built from the training captions only;
# note that train_captions becomes a flat list of strings from here on
train_captions = [text for texts in train_data.values() for text in texts]
tokenizer, vocab, vocab_size = create_vocabulary(train_captions)

# Test images have no caption file: take every entry of the test folder
test_ids = [os.path.join('test/test', name) for name in os.listdir('test/test')]

# Code Cell

In [13]:
# Checkpoint the extracted features
# One checkpoint file per split. The extraction cells write the feature
# dictionaries to these paths with np.save, and the loading cell reads them
# back with np.load(..., allow_pickle=True).item().
train_image_features_file = 'train_image_features.npy'
val_image_features_file = 'val_image_features.npy'
test_image_features_file = 'test_image_features.npy'

# Code Cell

In [None]:
# Extract the features of all test images and checkpoint them to disk.
# extract_image_features returns a dict, so np.save stores it as a pickled
# object; it has to be read back with allow_pickle=True and .item().
print("Extracting image features...")
test_image_features = extract_image_features(test_ids,'test/test')
print("Image features extracted!")

np.save(test_image_features_file, test_image_features)
print("Test image features saved to disk.")

Extracting image features...
Initializing ResNet50 model...


I0000 00:00:1745870102.508238    1749 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3586 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


# Code Cell

In [None]:
# Extract the features of all validation images and checkpoint them to disk.
# extract_image_features returns a dict, so np.save stores it as a pickled
# object; it has to be read back with allow_pickle=True and .item().
print("Extracting image features...")
val_image_features = extract_image_features(val_ids,'val/val')
print("Image features extracted!")

np.save(val_image_features_file, val_image_features)
print("Validation image features saved to disk.")

# Code Cell

In [None]:
# Extract the features of all training images and checkpoint them to disk.
# extract_image_features returns a dict, so np.save stores it as a pickled
# object; it has to be read back with allow_pickle=True and .item().
print("Extracting image features...")
train_image_features = extract_image_features(train_ids,'train/train')
print("Image features extracted!")

np.save(train_image_features_file, train_image_features)
print("Training image features saved to disk.")

# Code Cell

In [None]:
def _with_folder_prefix(features, folder):
    """Return a copy of ``features`` whose keys all start with ``folder``.

    Keys that already begin with ``folder`` plus a path separator are kept
    unchanged; every other key gets ``folder`` joined in front. This makes the
    step safe for checkpoints keyed by bare file name as well as for ones
    already keyed by ``folder/name``, which would otherwise end up as
    ``folder/folder/name`` and match no caption id.
    """
    prefix = os.path.join(folder, '')  # folder followed by the path separator
    return {
        (key if key.startswith(prefix) else os.path.join(folder, key)): value
        for key, value in features.items()
    }


# Load the checkpointed feature dictionaries.
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
# feature files that were produced by this notebook.
train_image_features = np.load(train_image_features_file, allow_pickle=True).item()
val_image_features = np.load(val_image_features_file, allow_pickle=True).item()
test_image_features = np.load(test_image_features_file, allow_pickle=True).item()

# Make the feature keys match the caption ids (folder + file name)
train_image_features = _with_folder_prefix(train_image_features, 'train/train')
val_image_features = _with_folder_prefix(val_image_features, 'val/val')
test_image_features = _with_folder_prefix(test_image_features, 'test/test')

# Check if the image features are loaded correctly
print(f"Train image features: {len(train_image_features)} images")
print(f"Validation image features: {len(val_image_features)} images")
print(f"Test image features: {len(test_image_features)} images")

Train image features: 6472 images
Validation image features: 809 images
Test image features: 810 images


# Code Cell

In [None]:
def create_tf_dataset(image_features, captions_dict, tokenizer, max_length, batch_size, vocab_size):
    """Wrap the sample generator in a batched, prefetching ``tf.data.Dataset``.

    Each element is ``((image_vector, padded_prefix), one_hot_next_word)``.
    The image vector is the mean over the first axis of ``feature[0]``; the
    declared signature expects it to have 2048 entries. The underlying
    generator never terminates: it reshuffles the usable image ids and draws
    one random caption per image on every pass.

    Args:
        image_features: Mapping from image id to a feature array.
        captions_dict: Mapping from image id to a list of caption strings.
        tokenizer: Object providing ``texts_to_sequences``.
        max_length: Length the caption prefixes are post-padded to.
        batch_size: Batch size of the returned dataset.
        vocab_size: Number of classes of the one-hot targets.

    Returns:
        The dataset, batched with ``batch_size`` and prefetched with
        ``tf.data.AUTOTUNE``.
    """
    def generator():
        # Only images that have both captions and extracted features
        usable_ids = [key for key in captions_dict.keys() if key in image_features]
        while True:
            np.random.shuffle(usable_ids)
            for key in usable_ids:
                # Drop the batch axis and average over the remaining first
                # axis, leaving one vector per image
                pooled = np.mean(image_features[key][0], axis=0)

                chosen = np.random.choice(captions_dict[key])
                token_ids = tokenizer.texts_to_sequences([chosen])[0]

                # Every prefix of the caption predicts the token after it
                for split in range(1, len(token_ids)):
                    prefix = pad_sequences(
                        [token_ids[:split]], maxlen=max_length, padding='post'
                    )[0]
                    target = tf.keras.utils.to_categorical(
                        [token_ids[split]], num_classes=vocab_size
                    )[0]
                    yield (pooled, prefix.astype(np.float32)), target

    # Element structure declared to tf.data
    image_spec = tf.TensorSpec(shape=(2048,), dtype=tf.float32)
    sequence_spec = tf.TensorSpec(shape=(max_length,), dtype=tf.float32)
    target_spec = tf.TensorSpec(shape=(vocab_size,), dtype=tf.float32)

    samples = tf.data.Dataset.from_generator(
        generator,
        output_signature=((image_spec, sequence_spec), target_spec),
    )

    batched = samples.batch(batch_size)
    return batched.prefetch(tf.data.AUTOTUNE)

# Now update the training code
# Build the training and validation datasets from the loaded features,
# the per-split caption dictionaries and the fitted tokenizer.
print("Creating datasets...")
train_dataset = create_tf_dataset(
    train_image_features, train_data, 
    tokenizer, max_length, batch_size, vocab_size
)
val_dataset = create_tf_dataset(
    val_image_features, val_data, 
    tokenizer, max_length, batch_size, vocab_size
)

# Sanity check: pull a single batch and print the shape of each component
print("\nVerifying data shapes...")
for (img_features, text_seq), label in train_dataset.take(1):
    print(f"Image features shape: {img_features.shape}")
    print(f"Text sequence shape: {text_seq.shape}")
    print(f"Label shape: {label.shape}")
    # Value range of the image features in this batch
    print(f"\nFeature stats:")
    print(f"Min: {tf.reduce_min(img_features)}")
    print(f"Max: {tf.reduce_max(img_features)}")
    print(f"Mean: {tf.reduce_mean(img_features)}")

Creating datasets...

Verifying data shapes...
Image features shape: (64, 2048)
Text sequence shape: (64, 38)
Label shape: (64, 2692)

Feature stats:
Min: 0.0
Max: 15.335591316223145
Mean: 0.6120313405990601


# Code Cell

In [None]:
# Build the captioning model from the hyperparameters defined earlier
model = create_model(vocab_size, max_length, embedding_dim, units)

# Training callbacks; all three monitor the validation loss
callbacks = [
    # Write the model to model_checkpoint.h5 whenever val_loss improves
    tf.keras.callbacks.ModelCheckpoint(
        "model_checkpoint.h5",
        save_best_only=True,
        monitor='val_loss',
        mode='min'
    ),
    # Stop training when val_loss stops improving and roll back to the best weights
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,  # epochs without improvement before stopping
        min_delta=0.005,  # smaller changes do not count as an improvement
        restore_best_weights=True
    ),
    # Shrink the learning rate when val_loss plateaus
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,  # new_lr = lr * 0.2
        patience=3,
        min_lr=1e-6,  # lower bound for the learning rate
        verbose=1
    )
]

# First, let's calculate exact steps needed per epoch
def calculate_training_parameters():
    """Derive the step counts for ``model.fit`` from the caption data.

    For each split, every caption of every image that has extracted features
    contributes ``token_count - 1`` samples. The totals are integer-divided by
    the global ``batch_size``. Reads the module-level ``tokenizer``,
    ``train_data``, ``val_data``, ``train_image_features``,
    ``val_image_features``, ``batch_size`` and ``epochs``, and prints a
    summary of the configuration.

    Returns:
        ``(steps_per_epoch, val_steps)``.
    """
    def _count_samples(captions_by_image, features):
        # Total number of (prefix -> next word) pairs over all captions of
        # the images that have features
        total = 0
        for img_id, captions in captions_by_image.items():
            if img_id not in features:
                continue
            for caption in captions:
                total += len(tokenizer.texts_to_sequences([caption])[0]) - 1
        return total

    train_samples = _count_samples(train_data, train_image_features)
    val_samples = _count_samples(val_data, val_image_features)

    # Incomplete trailing batches are not counted
    steps_per_epoch = train_samples // batch_size
    val_steps = val_samples // batch_size

    print("\nTraining Configuration:")
    print(f"Training samples: {train_samples:,}")
    print(f"Validation samples: {val_samples:,}")
    print(f"Steps per epoch: {steps_per_epoch:,}")
    print(f"Validation steps: {val_steps:,}")
    print(f"Batch size: {batch_size}")
    print(f"Epochs: {epochs}")

    return steps_per_epoch, val_steps

# Get proper step counts
# Step counts derived from the number of caption samples per split
steps_per_epoch, validation_steps = calculate_training_parameters()

# Train; an interrupt saves the current model, any other failure is reported
try:
    print("\nStarting training...")
    history = model.fit(
        train_dataset.repeat(),  # repeat() guards against dataset exhaustion
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        validation_data=val_dataset.repeat(),
        validation_steps=validation_steps,
        callbacks=callbacks,
        verbose=1
    )

except KeyboardInterrupt:
    print("\nTraining interrupted. Saving model...")
    model.save('model_interrupted.h5')

except Exception as e:
    # traceback is not imported anywhere above this cell in the visible file;
    # without this local import the handler itself raised NameError and hid
    # the real training error.
    import traceback
    print(f"\nTraining failed with error: {str(e)}")
    traceback.print_exc()


Training Configuration:
Training samples: 1,015,749
Validation samples: 126,503
Steps per epoch: 15,871
Validation steps: 1,976
Batch size: 64
Epochs: 30

Starting training...
Epoch 1/30


I0000 00:00:1745723879.350810  801601 cuda_dnn.cc:529] Loaded cuDNN version 90800


[1m15871/15871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.2380 - loss: 4.3234

2025-04-27 06:27:50.235971: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 330792960 exceeds 10% of free system memory.
2025-04-27 06:27:50.431907: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 251658240 exceeds 10% of free system memory.
2025-04-27 06:27:50.485628: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 251658240 exceeds 10% of free system memory.
2025-04-27 06:27:50.545264: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 251658240 exceeds 10% of free system memory.
2025-04-27 06:27:50.601469: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 251658240 exceeds 10% of free system memory.


[1m15871/15871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m595s[0m 37ms/step - accuracy: 0.2380 - loss: 4.3233 - val_accuracy: 0.3117 - val_loss: 3.5882 - learning_rate: 0.0010
Epoch 2/30
[1m15870/15871[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 33ms/step - accuracy: 0.3131 - loss: 3.4853



[1m15871/15871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m585s[0m 37ms/step - accuracy: 0.3131 - loss: 3.4853 - val_accuracy: 0.3244 - val_loss: 3.5397 - learning_rate: 0.0010
Epoch 3/30
[1m15871/15871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m581s[0m 37ms/step - accuracy: 0.3233 - loss: 3.3763 - val_accuracy: 0.3285 - val_loss: 3.5748 - learning_rate: 0.0010
Epoch 4/30
[1m15871/15871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m583s[0m 37ms/step - accuracy: 0.3277 - loss: 3.3449 - val_accuracy: 0.3313 - val_loss: 3.6282 - learning_rate: 0.0010
Epoch 5/30
[1m15870/15871[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 33ms/step - accuracy: 0.3294 - loss: 3.3350
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
[1m15871/15871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m594s[0m 37ms/step - accuracy: 0.3294 - loss: 3.3350 - val_accuracy: 0.3275 - val_loss: 3.6507 - learning_rate: 0.0010
Epoch 6/30
[1m15871/15871[0m [32m━━━━━━━━━━━━

# Code Cell

In [None]:
#  Update the prediction function with proper feature handling
def predict_captions_for_test_images(model, test_image_features, tokenizer, max_length):
    """Generate one caption per test image.

    Each feature array is reduced to a single vector (mean over the first
    axis of ``feature[0]``), given a batch axis and passed to
    ``generate_caption``. Failures are reported on stdout and recorded with
    the placeholder caption ``"Error generating caption"``.

    Args:
        model: Trained caption model, forwarded to ``generate_caption``.
        test_image_features: Mapping from image id to its feature array.
        tokenizer: Fitted tokenizer, forwarded to ``generate_caption``.
        max_length: Maximum caption length, forwarded to ``generate_caption``.

    Returns:
        dict mapping the image id with every ``'test/test/'`` occurrence
        removed to the generated caption or the error placeholder.
    """
    predictions = {}
    length = len(test_image_features)

    for i, (img_id, feature) in enumerate(test_image_features.items()):
        # Compute the output key before anything can fail, so successful and
        # failed images are keyed the same way. Previously an early failure
        # stored the placeholder under the unstripped 'test/test/...' id.
        out_id = str(img_id).replace('test/test/', '')
        try:
            # Average over the first axis of feature[0], then add a batch axis
            pooled = np.mean(feature[0], axis=0)
            pooled = np.expand_dims(pooled, axis=0)

            predictions[out_id] = generate_caption(model, pooled, tokenizer, max_length)

            # Progress reporting
            if i % 10 == 0:
                print(f"Processed {i}/{length} images ({(i/length)*100:.1f}%)")

        except Exception as e:
            # Best effort: keep going and mark this image as failed
            print(f"Error processing image {img_id}: {str(e)}")
            predictions[out_id] = "Error generating caption"

    return predictions

# Code Cell

In [None]:
# First, create a custom layer scope
# Importing necessary library or function
import tensorflow as tf
# Importing necessary library or function
from tensorflow.keras.layers import Layer

def load_trained_model(model_path, vocab_size, max_length, embedding_dim=256, units=256):
    """Rebuild the caption model and load saved weights into it.

    The architecture is recreated with ``create_model`` and the weights are
    read from ``model_path``. If loading fails for any reason, the error is
    printed and a freshly created model with the same settings is returned
    instead.

    Args:
        model_path: File passed to ``model.load_weights``.
        vocab_size: Forwarded to ``create_model``.
        max_length: Forwarded to ``create_model``.
        embedding_dim: Forwarded to ``create_model``.
        units: Forwarded to ``create_model``.

    Returns:
        The model with loaded weights, or a new model if loading failed.
    """
    # The weights need a model of the same architecture to be loaded into
    restored = create_model(vocab_size, max_length, embedding_dim, units)

    try:
        restored.load_weights(model_path)
        print("Model weights loaded successfully!")
    except Exception as e:
        print(f"Error loading model: {e}")
        # Fall back to a new model
        print("Creating new model instead...")
        return create_model(vocab_size, max_length, embedding_dim, units)

    return restored


# Code Cell

In [None]:
# Test the prediction
# Load the checkpointed weights, caption every test image and write the
# results to a CSV file. Any failure is reported together with its traceback.
try:
    print("Loading model and generating predictions...")
    model_path = 'model_checkpoint.h5'
    model = load_trained_model(model_path, vocab_size, max_length, embedding_dim, units)
    
    test_predictions = predict_captions_for_test_images(
        model, test_image_features, tokenizer, max_length
    )
    
    # Save predictions: one row per image (image_id, caption)
    predictions_df = pd.DataFrame(test_predictions.items(), columns=['image_id', 'caption'])
    predictions_df.to_csv('test_predictions2.csv', index=False)
    print("Predictions saved to test_predictions2.csv")
    
    # Display first few predictions
    print("\nSample predictions:")
    print(predictions_df.head())
    
except Exception as e:
    print(f"Error during prediction: {str(e)}")
    # Imported here because it is only needed on the failure path
    import traceback
    traceback.print_exc()

Loading model and generating predictions...
Model weights loaded successfully!


I0000 00:00:1745760083.545105 1026245 cuda_dnn.cc:529] Loaded cuDNN version 90800


Processed 0/810 images (0.0%)
Processed 10/810 images (1.2%)
Processed 20/810 images (2.5%)
Processed 30/810 images (3.7%)
Processed 40/810 images (4.9%)
Processed 50/810 images (6.2%)
Processed 60/810 images (7.4%)
Processed 70/810 images (8.6%)
Processed 80/810 images (9.9%)
Processed 90/810 images (11.1%)
Processed 100/810 images (12.3%)
Processed 110/810 images (13.6%)
Processed 120/810 images (14.8%)
Processed 130/810 images (16.0%)
Processed 140/810 images (17.3%)
Processed 150/810 images (18.5%)
Processed 160/810 images (19.8%)
Processed 170/810 images (21.0%)
Processed 180/810 images (22.2%)
Processed 190/810 images (23.5%)
Processed 200/810 images (24.7%)
Processed 210/810 images (25.9%)
Processed 220/810 images (27.2%)
Processed 230/810 images (28.4%)
Processed 240/810 images (29.6%)
Processed 250/810 images (30.9%)
Processed 260/810 images (32.1%)
Processed 270/810 images (33.3%)
Processed 280/810 images (34.6%)
Processed 290/810 images (35.8%)
Processed 300/810 images (37.0