In [2]:
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Set file paths
IMAGE_FOLDER = "/content/drive/MyDrive/data/loadedimages"
CAPTIONS_FILE = "/content/drive/MyDrive/data/filtered_captions.tsv"
OUTPUT_FILE = "/content/drive/MyDrive/data/filtered_captions_cleaned.tsv"


Mounted at /content/drive


In [None]:
import os
import pandas as pd

# Load captions file
df = pd.read_csv(CAPTIONS_FILE, sep="\t", names=["image", "caption"])
df = df.dropna()
df["image"] = df["image"].astype(str).str.strip()

# Ensure .jpg extension
df["image"] = df["image"].apply(lambda x: x if x.endswith(".jpg") else x + ".jpg")

# List of images in the folder
image_files = set([f.strip() for f in os.listdir(IMAGE_FOLDER)])

# Filter rows where image exists
df_filtered = df[df["image"].isin(image_files)].reset_index(drop=True)

# Save cleaned captions
OUTPUT_FILE = "/content/drive/MyDrive/data/filtered_captions_cleaned.tsv"
df_filtered.to_csv(OUTPUT_FILE, sep="\t", index=False, header=False)

# Show result
print(f" Filtered captions: {len(df_filtered)}")
print(" Saved to:", OUTPUT_FILE)


 Filtered captions: 53367
 Saved to: /content/drive/MyDrive/data/filtered_captions_cleaned.tsv


In [4]:
# Set new cleaned file path
CAPTIONS_FILE = "/content/drive/MyDrive/data/filtered_captions_cleaned.tsv"

# Load the cleaned TSV
df = pd.read_csv(CAPTIONS_FILE, sep="\t", names=["image", "caption"])
df = df.dropna()
df["image"] = df["image"].astype(str).str.strip()

# Build dictionary: image -> [captions]
captions_dict = {}
for _, row in df.iterrows():
    img_name = row["image"]
    caption = row["caption"]
    if img_name not in captions_dict:
        captions_dict[img_name] = []
    captions_dict[img_name].append(caption)

print(f" Captions dictionary created for {len(captions_dict)} images")


 Captions dictionary created for 53367 images


In [5]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
from tqdm import tqdm

# Load ResNet50 and remove final classification layer
resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
resnet = torch.nn.Sequential(*list(resnet.children())[:-1])
resnet.eval().cuda()

# Image transform
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Output path
FEATURES_PATH = "/content/drive/MyDrive/data/image_features.pt"
features_dict = {}

print(" Extracting image features...")
with torch.no_grad():
    for img_name in tqdm(captions_dict.keys()):
        img_path = os.path.join(IMAGE_FOLDER, img_name)
        try:
            image = Image.open(img_path).convert("RGB")
            img_tensor = transform(image).unsqueeze(0).cuda()
            feature = resnet(img_tensor).view(-1).cpu()  # Flatten to 2048
            features_dict[img_name] = feature
        except Exception as e:
            print(f" Error with {img_name}: {e}")

# Save features
torch.save(features_dict, FEATURES_PATH)
print(f" Saved features for {len(features_dict)} images to {FEATURES_PATH}")



 Extracting image features...


100%|██████████| 53367/53367 [3:51:58<00:00,  3.83it/s]


 Saved features for 53367 images to /content/drive/MyDrive/data/image_features.pt


In [6]:
# Load precomputed features
features_dict = torch.load("/content/drive/MyDrive/data/image_features.pt")

from torch.utils.data import Dataset
import torch

class CaptionFeatureDataset(Dataset):
    def __init__(self, features_dict, captions_dict, word2idx, max_len=22):
        self.image_names = list(features_dict.keys())
        self.features_dict = features_dict
        self.captions_dict = captions_dict
        self.word2idx = word2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.image_names)

    def __getitem__(self, idx):
        img_name = self.image_names[idx]
        feature = self.features_dict[img_name]
        caption = random.choice(self.captions_dict[img_name]).split()

        # Add <SOS> and <EOS>
        caption = ["<SOS>"] + caption + ["<EOS>"]
        tokens = [self.word2idx.get(w, self.word2idx["<PAD>"]) for w in caption]
        tokens = tokens[:self.max_len]
        tokens += [self.word2idx["<PAD>"]] * (self.max_len - len(tokens))

        return feature, torch.tensor(tokens, dtype=torch.long)


In [7]:
from collections import Counter

# Get all captions from the cleaned dict
all_captions = sum(captions_dict.values(), [])

# Split and flatten all words
words = [word for caption in all_captions for word in caption.split()]
most_common = Counter(words).most_common(4900)

# Special tokens first
vocab = ["<PAD>", "<SOS>", "<EOS>"] + [w for w, _ in most_common]
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}
VOCAB_SIZE = len(vocab)

print(f" Vocabulary size: {VOCAB_SIZE}")


 Vocabulary size: 4903


In [8]:
from torch.utils.data import DataLoader

dataset = CaptionFeatureDataset(features_dict, captions_dict, word2idx, max_len=22)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)



In [53]:
import torch
import torch.nn as nn

class Transformer_Decoder(nn.Module):
    def __init__(self, embed_size, vocab_size, hidden_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.pos_encoding = nn.Parameter(torch.zeros(1, 22, embed_size))  # Max_len=22
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=embed_size,
            nhead=8,
            dim_feedforward=hidden_size,
            dropout=0.1,
            batch_first=True
        )
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers)
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, features, captions, mask=None):
        batch_size = captions.size(0)
        seq_len = captions.size(1)

        # Embed captions and add positional encoding
        embedded = self.embedding(captions) + self.pos_encoding[:, :seq_len, :]

        # Create causal mask for autoregressive decoding
        if mask is None:
            mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(features.device)

        # Features as memory (B, 1, embed_size) for cross-attention
        memory = features.unsqueeze(1)  # No need to repeat across time

        # Decode
        output = self.decoder(tgt=embedded, memory=memory, tgt_mask=mask)
        return self.fc(output)

# Update initialization
decoder = Transformer_Decoder(embed_size=256, vocab_size=VOCAB_SIZE, hidden_size=512, num_layers=3).to(device)

In [54]:
device = "cuda" if torch.cuda.is_available() else "cpu"

decoder = Transformer_Decoder(embed_size=256, vocab_size=VOCAB_SIZE, hidden_size=512, num_layers=3).to(device)
project_features = nn.Linear(2048, 256).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=word2idx["<PAD>"])
optimizer = torch.optim.Adam(list(decoder.parameters()) + list(project_features.parameters()), lr=0.001)


In [None]:
from tqdm import tqdm
import torch.nn as nn

NUM_EPOCHS = 20  
best_loss = float('inf')

print("Training Started...")
for epoch in range(NUM_EPOCHS):
    decoder.train()
    project_features.train()
    total_train_loss = 0

    for features, captions in tqdm(train_loader):  
        features, captions = features.to(device), captions.to(device)

        optimizer.zero_grad()
        projected = project_features(features)  # Shape: [batch_size, 256]
        output = decoder(projected, captions[:, :-1])  # Input excludes <EOS>, Shape: [batch_size, seq_len-1, VOCAB_SIZE]

        # Compute loss (target excludes <SOS>)
        # Use contiguous() to ensure memory layout is compatible with view()
        loss = criterion(output.view(-1, VOCAB_SIZE), captions[:, 1:].contiguous().view(-1))
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    # Validation
    decoder.eval()
    project_features.eval()
    total_val_loss = 0
    with torch.no_grad():
        for features, captions in val_loader:
            features, captions = features.to(device), captions.to(device)
            projected = project_features(features)
            output = decoder(projected, captions[:, :-1])
            loss = criterion(output.view(-1, VOCAB_SIZE), captions[:, 1:].contiguous().view(-1))
            total_val_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    avg_val_loss = total_val_loss / len(val_loader)
    print(f"Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    # Save best model
    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        torch.save({'decoder': decoder.state_dict(), 'project_features': project_features.state_dict()}, 'best_model.pt')

print("Training Finished!")

Training Started...


100%|██████████| 1501/1501 [00:25<00:00, 59.26it/s]


Epoch 1 - Train Loss: 4.0981, Val Loss: 4.1179


100%|██████████| 1501/1501 [00:24<00:00, 60.31it/s]


Epoch 2 - Train Loss: 3.8434, Val Loss: 4.0562


100%|██████████| 1501/1501 [00:25<00:00, 59.88it/s]


Epoch 3 - Train Loss: 3.6636, Val Loss: 4.0197


100%|██████████| 1501/1501 [00:24<00:00, 60.23it/s]


Epoch 4 - Train Loss: 3.5217, Val Loss: 4.0213


100%|██████████| 1501/1501 [00:25<00:00, 59.64it/s]


Epoch 5 - Train Loss: 3.3979, Val Loss: 4.0502


100%|██████████| 1501/1501 [00:24<00:00, 60.67it/s]


Epoch 6 - Train Loss: 3.2905, Val Loss: 4.0473


100%|██████████| 1501/1501 [00:25<00:00, 59.87it/s]


Epoch 7 - Train Loss: 3.1889, Val Loss: 4.0934


100%|██████████| 1501/1501 [00:24<00:00, 60.06it/s]


Epoch 8 - Train Loss: 3.1000, Val Loss: 4.1343


100%|██████████| 1501/1501 [00:24<00:00, 60.21it/s]


Epoch 9 - Train Loss: 3.0208, Val Loss: 4.1620


100%|██████████| 1501/1501 [00:24<00:00, 60.18it/s]


Epoch 10 - Train Loss: 2.9478, Val Loss: 4.2022


100%|██████████| 1501/1501 [00:25<00:00, 59.54it/s]


Epoch 11 - Train Loss: 2.8807, Val Loss: 4.2324


100%|██████████| 1501/1501 [00:24<00:00, 60.25it/s]


Epoch 12 - Train Loss: 2.8177, Val Loss: 4.2887


100%|██████████| 1501/1501 [00:25<00:00, 59.76it/s]


Epoch 13 - Train Loss: 2.7635, Val Loss: 4.3124


100%|██████████| 1501/1501 [00:24<00:00, 60.31it/s]


Epoch 14 - Train Loss: 2.7081, Val Loss: 4.3546


100%|██████████| 1501/1501 [00:24<00:00, 60.21it/s]


Epoch 15 - Train Loss: 2.6588, Val Loss: 4.3732


100%|██████████| 1501/1501 [00:25<00:00, 59.48it/s]


Epoch 16 - Train Loss: 2.6138, Val Loss: 4.4039


100%|██████████| 1501/1501 [00:25<00:00, 59.23it/s]


Epoch 17 - Train Loss: 2.5705, Val Loss: 4.4414


100%|██████████| 1501/1501 [00:25<00:00, 59.72it/s]


Epoch 18 - Train Loss: 2.5318, Val Loss: 4.4865


100%|██████████| 1501/1501 [00:25<00:00, 59.87it/s]


Epoch 19 - Train Loss: 2.4923, Val Loss: 4.5142


100%|██████████| 1501/1501 [00:25<00:00, 59.44it/s]


Epoch 20 - Train Loss: 2.4566, Val Loss: 4.5443
Training Finished!


In [None]:
def beam_search_caption(image_path, resnet, transform, decoder, project_features, word2idx, idx2word, beam_width=5, max_len=22):
    decoder.eval()
    project_features.eval()

    # Load and preprocess image
    image = Image.open(image_path).convert("RGB")
    image_tensor = transform(image).unsqueeze(0).cuda()

    with torch.no_grad():
        cnn_feat = resnet(image_tensor).view(1, -1)
        img_embed = project_features(cnn_feat)

    # Beam search
    sequences = [[ [word2idx["<SOS>"]], 0.0 ]]  # (sequence, log_prob)
    completed = []

    for _ in range(max_len):
        candidates = []
        for seq, score in sequences:
            if seq[-1] == word2idx["<EOS>"]:
                completed.append((seq, score))
                continue

            input_seq = torch.tensor([seq], dtype=torch.long).to(device)
            with torch.no_grad():
                output = decoder(img_embed, input_seq)
                probs = torch.softmax(output[:, -1, :], dim=-1)
                topk = torch.topk(probs, beam_width)

            for i in range(beam_width):
                token = topk.indices[0, i].item()
                token_prob = topk.values[0, i].item()
                new_seq = seq + [token]
                new_score = score + torch.log(torch.tensor(token_prob + 1e-10)).item()
                candidates.append((new_seq, new_score))

        # Select top beam_width candidates
        sequences = sorted(candidates, key=lambda x: x[1], reverse=True)[:beam_width]
        if len(sequences) == 0:
            break

    # Include completed sequences
    all_sequences = sequences + completed
    best_seq = sorted(all_sequences, key=lambda x: x[1], reverse=True)[0][0]

    caption = [idx2word[t] for t in best_seq if t not in [word2idx["<PAD>"], word2idx["<SOS>"], word2idx["<EOS>"]]]
    return " ".join(caption)

# Test it 
test_image_path = "/content/drive/MyDrive/data/loadedimages/14.jpg"
caption = beam_search_caption(test_image_path, resnet, transform, decoder, project_features, word2idx, idx2word)
print("Generated Caption:", caption)


Generated Caption: decorating christmas tree with hugs a christmas tree
