In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split, KFold
import pickle
import os
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from tqdm import tqdm

# Kaggle dataset paths
BASE_DIR = '../input/deeplearning/'
IMAGES_DIR = os.path.join(BASE_DIR, 'Images/Images/')
CAPTIONS_PATH = os.path.join(BASE_DIR, 'captions.txt')

# Create output directories for models and data
os.makedirs('model', exist_ok=True)
os.makedirs('data', exist_ok=True)

# Load captions
print("Loading captions...")
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale default.
with open(CAPTIONS_PATH, "r", encoding="utf-8") as f:
    captions_data = f.readlines()

# Process captions: every usable line has the form "<image file name>,<caption>".
cleaned_captions = []
image_names = []

for line_number, line in enumerate(captions_data):
    line = line.strip()
    if not line:  # Skip empty lines
        continue

    # Split only at the first comma, because the caption text may itself
    # contain commas.
    parts = line.split(',', 1)
    if len(parts) < 2:  # No comma at all: not an "image,caption" record
        continue

    image_name = parts[0].strip()
    caption = parts[1].strip()

    # The file starts with a CSV header row ("image,caption"). Without this
    # check the header is ingested as a real training caption for a
    # non-existent image called "image".
    if line_number == 0 and (image_name.lower(), caption.lower()) == ('image', 'caption'):
        continue

    # Add start and end tokens
    image_names.append(image_name)
    cleaned_captions.append('<start> ' + caption + ' <end>')

# Fail with an explicit message instead of an IndexError on the sample prints.
if not cleaned_captions:
    raise ValueError(f"No captions could be parsed from {CAPTIONS_PATH}")

print(f"Loaded {len(cleaned_captions)} captions")
print(f"Sample caption: {cleaned_captions[0]}")
print(f"Sample image name: {image_names[0]}")

# Configure parameters
vocab_size = 5000  # Define vocabulary size
max_length = 30    # Max caption length
embedding_dim = 256
units = 256
batch_size = 32
features_shape = 4096  # VGG16 FC layer output shape
dropout_rate = 0.5

# Helper: load a single image file as a float32 RGB array sized for VGG16
def _load_rgb_image(img_path, target_size=(224, 224)):
    """Open ``img_path`` and return its pixels as a float32 RGB array.

    ``convert('RGB')`` normalises every PIL mode (grayscale, palette, RGBA,
    CMYK, ...) to three channels, so no per-mode channel juggling is needed.

    Args:
        img_path: Path of the image file to read.
        target_size: Size passed to ``Image.resize``.

    Returns:
        A NumPy float32 array holding the resized RGB pixels.
    """
    # The context manager closes the file handle as soon as the pixel data has
    # been copied into the NumPy array.
    with Image.open(img_path) as img:
        rgb = img.convert('RGB').resize(target_size)
        return np.asarray(rgb, dtype=np.float32)


# Function to extract features from images using VGG16
def extract_features(predict_batch_size=32):
    """Compute a VGG16 feature vector for every ``.jpg`` file in ``IMAGES_DIR``.

    The feature is the output of the second-to-last layer of the
    ImageNet-pretrained VGG16 network. Images are run through the network in
    batches instead of one ``predict`` call per image.

    Args:
        predict_batch_size: Number of images sent to the network per
            ``predict`` call. Must be at least 1.

    Returns:
        dict mapping image file name to its flattened feature vector. Images
        that cannot be read are reported on stdout and left out.

    Raises:
        ValueError: If ``predict_batch_size`` is smaller than 1.
    """
    if predict_batch_size < 1:
        raise ValueError("predict_batch_size must be at least 1")

    # Load VGG16 and cut it just before the classification layer
    base_model = VGG16(weights='imagenet')
    model = Model(inputs=base_model.input, outputs=base_model.layers[-2].output)

    features = {}
    # Sorted so the processing order is the same on every run
    image_files = sorted(f for f in os.listdir(IMAGES_DIR) if f.endswith('.jpg'))

    print(f"Extracting features from {len(image_files)} images...")

    with tqdm(total=len(image_files)) as progress:
        for start in range(0, len(image_files), predict_batch_size):
            chunk = image_files[start:start + predict_batch_size]

            loaded_names = []
            loaded_images = []
            for img_name in chunk:
                try:
                    loaded_images.append(_load_rgb_image(os.path.join(IMAGES_DIR, img_name)))
                    loaded_names.append(img_name)
                except Exception as e:
                    # Best effort: one unreadable file must not abort the run.
                    print(f"Error processing {img_name}: {e}")

            if loaded_names:
                # Preprocess for VGG16, then extract features for the whole batch
                batch = tf.keras.applications.vgg16.preprocess_input(np.stack(loaded_images))
                batch_features = model.predict(batch, verbose=0)
                for img_name, feature in zip(loaded_names, batch_features):
                    features[img_name] = feature.flatten()

            progress.update(len(chunk))

    return features

# Extract features from all images, reusing the cached pickle when it exists.
# Extraction takes several minutes; delete the file to force a fresh run.
FEATURES_PATH = 'data/image_features.pkl'
if os.path.exists(FEATURES_PATH):
    print(f"Loading cached image features from {FEATURES_PATH}...")
    # NOTE: unpickling is acceptable here only because this notebook wrote the
    # file itself; never point this at an untrusted pickle.
    with open(FEATURES_PATH, 'rb') as f:
        image_features = pickle.load(f)
else:
    print("Extracting features from images...")
    image_features = extract_features()
    # Save the features
    with open(FEATURES_PATH, 'wb') as f:
        pickle.dump(image_features, f)
print(f"Features extracted for {len(image_features)} images")

# Tokenize captions
print("Tokenizing captions...")
# Keras' default filter string also strips '<' and '>', which would turn the
# '<start>' / '<end>' markers into the ordinary words 'start' / 'end'. Use the
# default filter set minus those two characters so the markers stay distinct.
TOKENIZER_FILTERS = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<unk>', filters=TOKENIZER_FILTERS)
tokenizer.fit_on_texts(cleaned_captions)
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'
assert '<start>' in tokenizer.word_index and '<end>' in tokenizer.word_index, \
    "caption boundary markers were lost during tokenization"

# Save the tokenizer
with open('data/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

# Create sequences
captions_sequences = tokenizer.texts_to_sequences(cleaned_captions)
# truncating='post' keeps the leading '<start>' token of captions longer than
# max_length; the default ('pre') would cut it off.
captions_padded = pad_sequences(
    captions_sequences, maxlen=max_length, padding='post', truncating='post'
)

# Prepare training data: one (image feature, padded caption) pair per caption
# whose image has extracted features.
X_data = []  # Features
y_data = []  # Target captions

for img_name, padded_caption in zip(image_names, captions_padded):
    feature = image_features.get(img_name)
    if feature is not None:
        X_data.append(feature)
        y_data.append(padded_caption)

if not X_data:
    raise ValueError("No caption refers to an image with extracted features")

X_data = np.array(X_data)
y_data = np.array(y_data)

print(f"Training data shape: {X_data.shape}")
print(f"Target data shape: {y_data.shape}")

# Split dataset
# NOTE(review): the split is per caption row, and each image has several
# captions, so the same image can land in both train and validation.
# Consider a group-wise split on the image name.
X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=0.2, random_state=42)

2025-04-20 08:43:24.285077: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745138604.514149      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745138604.583706      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading captions...
Loaded 40456 captions
Sample caption: <start> caption <end>
Sample image name: image
Extracting features from images...


I0000 00:00:1745138618.189967      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1745138618.190678      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5
[1m553467096/553467096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step
Extracting features from 8090 images...


I0000 00:00:1745138625.038853      95 service.cc:148] XLA service 0x7d81d0004a30 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1745138625.039771      95 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1745138625.039793      95 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1745138625.188466      95 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1745138627.765589      95 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
 74%|███████▍  | 5999/8090 [08:46<03:01, 11.53it/s]

In [None]:
# CNN-LSTM captioning model: per-timestep next-word prediction
def build_model(vocab_size, max_length, dropout=0.5, lstm_units=256):
    """Build and compile the image-conditioned LSTM caption decoder.

    The model takes two inputs: an image feature vector of length
    ``features_shape`` and a caption of ``max_length`` token ids. The image
    vector is projected to ``embedding_dim``, repeated for every timestep and
    concatenated with the embedded caption tokens before the LSTM.

    Args:
        vocab_size: Size of the embedding table and of the softmax output.
        max_length: Number of timesteps in the caption input.
        dropout: Dropout rate applied to the LSTM outputs.
        lstm_units: Number of units in the LSTM layer.

    Returns:
        A compiled Keras ``Model`` (Adam optimizer, sparse categorical
        cross-entropy loss) emitting a softmax over the vocabulary at every
        timestep.
    """
    keras_layers = tf.keras.layers

    # Image branch: project the feature vector and tile it across timesteps
    image_input = Input(shape=(features_shape,))
    image_branch = Dense(embedding_dim, activation="relu")(image_input)
    image_branch = Dropout(0.3)(image_branch)
    image_branch = keras_layers.RepeatVector(max_length)(image_branch)

    # Text branch: embed token ids; id 0 is treated as padding and masked
    text_input = Input(shape=(max_length,))
    text_branch = Embedding(vocab_size, embedding_dim, mask_zero=True)(text_input)
    text_branch = Dropout(0.3)(text_branch)

    # Join both branches along the feature axis and decode
    merged = keras_layers.Concatenate()([image_branch, text_branch])
    hidden = LSTM(lstm_units, return_sequences=True)(merged)
    hidden = Dropout(dropout)(hidden)
    word_probabilities = Dense(vocab_size, activation="softmax")(hidden)

    captioner = Model(inputs=[image_input, text_input], outputs=word_probabilities)
    captioner.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

    return captioner

In [None]:
import numpy as np
from sklearn.model_selection import KFold

# Train Model with K-Fold Cross Validation
#
# The whole loop lives in a single cell: splitting the loop body across cells
# leaves the second half as stray indented code that cannot run on its own.
print("Training model with K-Fold cross validation...")
N_SPLITS = 5
EPOCHS = 10
BEST_MODEL_PATH = "model/best_caption_generator.h5"

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
best_loss = float("inf")


def _shifted_targets(captions):
    """Return next-token targets for a 2-D array of padded caption ids.

    Position ``t`` of the target holds the token at position ``t + 1`` of the
    input; the last position stays 0. A trailing axis of size 1 is added.
    """
    targets = np.zeros_like(captions)
    targets[:, :-1] = captions[:, 1:]
    return targets[..., np.newaxis]


for fold, (train_idx, val_idx) in enumerate(kf.split(X_train), start=1):
    print(f"\nTraining Fold {fold}/{N_SPLITS}")

    # Build a fresh model for every fold
    model = build_model(vocab_size, max_length)

    # Prepare input sequences
    train_img_features, train_captions = X_train[train_idx], y_train[train_idx]
    val_img_features, val_captions = X_train[val_idx], y_train[val_idx]

    # Create target sequences (one word shifted)
    train_targets = _shifted_targets(train_captions)
    val_targets = _shifted_targets(val_captions)

    # Save the model at the epoch where val_loss beats the best value seen so
    # far across all folds. Saving after fit() would store the final-epoch
    # weights, which need not be the ones that reached the minimum val_loss.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        BEST_MODEL_PATH,
        monitor="val_loss",
        mode="min",
        save_best_only=True,
        initial_value_threshold=best_loss,
    )

    # Train the model
    history = model.fit(
        [train_img_features, train_captions],
        train_targets,
        validation_data=([val_img_features, val_captions], val_targets),
        epochs=EPOCHS, batch_size=batch_size, verbose=1,
        callbacks=[checkpoint],
    )

    # Check if this fold improved on the best loss (the checkpoint callback
    # has already written the matching weights to disk)
    val_loss = min(history.history['val_loss'])
    if val_loss < best_loss:
        best_loss = val_loss
        print(f"New best model saved with validation loss: {best_loss:.4f}")

print("Training complete. Best model saved.")