In [55]:
# Environment check: report the installed torch / torchvision versions first,
# then confirm that the Inception-v3 entry points used later can be imported.
import torch, torchvision
print(f"torch: {torch.__version__}")
print(f"torchvision: {torchvision.__version__}")

# Imported only as a probe here; a failure on this line still leaves the
# version numbers above visible for debugging.
from torchvision.models import inception_v3, Inception_V3_Weights
print("torchvision import successful")

torch: 2.7.0+cpu
torchvision: 0.22.0+cpu
torchvision import successful


In [56]:
# Path setup
import sys, os
from pathlib import Path


def find_repo_root(start, marker="utils"):
    """Return the nearest directory at or above `start` that contains `marker`.

    The notebook imports `utils.dataloader`, so the repository root is taken to
    be the directory that holds the `utils` package. Searching for that marker
    (instead of blindly taking the parent of the working directory) makes this
    cell idempotent: re-running it after `os.chdir` no longer climbs one more
    directory level each time.

    Args:
        start: Directory to start searching from.
        marker: Name of a sub-directory that identifies the repository root.

    Returns:
        The resolved root directory. If no ancestor contains `marker`, falls
        back to the parent of `start` (the previous behaviour of this cell).
    """
    start = Path(start).resolve()
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    return start.parent


repo_root = find_repo_root(Path())
# Guard the insert so repeated runs do not stack duplicate sys.path entries.
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))
os.chdir(repo_root)

In [57]:
# Imports
import math, json, pickle
import numpy as np
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchvision.models import inception_v3, Inception_V3_Weights
from utils.dataloader import get_transforms, load_split_ids, build_caption_dataset
from utils.caption_dataset import CaptionDataset

In [58]:
from pathlib import Path
from utils.dataloader import get_transforms, load_split_ids, build_caption_dataset


def find_first(name, root=Path()):
    """Return the first path under `root` (searched recursively) matching `name`.

    Matches are sorted so the choice is deterministic when several copies of
    the dataset exist, and a missing file produces a descriptive
    FileNotFoundError instead of a bare IndexError.
    """
    matches = sorted(Path(root).rglob(name))
    if not matches:
        raise FileNotFoundError(
            f"Could not find '{name}' anywhere under {Path(root).resolve()}; "
            "check the working directory and that the Flickr8k files are extracted there."
        )
    return matches[0]


# 1) Locate the split files (dev/test lists are expected next to the train list)
train_split = find_first("Flickr_8k.trainImages.txt")
val_split   = train_split.with_name("Flickr_8k.devImages.txt")
test_split  = train_split.with_name("Flickr_8k.testImages.txt")
split_dir   = train_split.parent
for split_file in (val_split, test_split):
    if not split_file.is_file():
        raise FileNotFoundError(f"Expected split file is missing: {split_file}")

# 2) Load split IDs
train_ids = load_split_ids(train_split)
val_ids   = load_split_ids(val_split)
test_ids  = load_split_ids(test_split)
print(f"Split sizes → train: {len(train_ids)}, val: {len(val_ids)}, test: {len(test_ids)}")

# 3) Locate the image folder
image_folder = find_first("Flicker8k_Dataset")
print("Images folder:", image_folder, "→", sum(1 for _ in image_folder.iterdir()), "files found")

# 4) Build datasets
# NOTE(review): `image_caption_seqs` and `word2idx` are not defined in any cell
# visible here; they must come from an earlier cell — confirm this notebook
# still survives Restart & Run All.
transform_train = get_transforms("train")
transform_val   = get_transforms("val")  # validation and test share this transform

train_dataset = build_caption_dataset(train_ids, image_caption_seqs, word2idx, image_folder, transform_train)
val_dataset   = build_caption_dataset(val_ids,   image_caption_seqs, word2idx, image_folder, transform_val)
test_dataset  = build_caption_dataset(test_ids,  image_caption_seqs, word2idx, image_folder, transform_val)

print(f"Dataset sizes → train: {len(train_dataset)}, val: {len(val_dataset)}, test: {len(test_dataset)}")


Split sizes → train: 6000, val: 1000, test: 1000
Images folder: Downloads\Flickr8k_Dataset\Flicker8k_Dataset → 8091 files found
Dataset sizes → train: 30000, val: 5000, test: 5000


In [59]:
# 6) EncoderInception
class EncoderInception(nn.Module):
    """Image encoder built from an ImageNet-pretrained Inception-v3.

    The classifier-side children (`AuxLogits`, `avgpool`, `dropout`, `fc`) are
    dropped and the remaining children are chained in registration order. The
    resulting feature map is adaptively average-pooled to
    `encoded_size x encoded_size` and flattened into a sequence.

    NOTE(review): chaining the children in `nn.Sequential` bypasses
    `Inception3.forward`, so any input re-normalisation torchvision performs
    there (`transform_input`) is skipped — confirm `get_transforms` produces
    the normalisation these weights expect.

    Args:
        encoded_size: Side length of the pooled spatial grid.
        fine_tune: If True, the last two backbone children stay trainable;
            all other backbone parameters are frozen either way.
    """

    def __init__(self, encoded_size=14, fine_tune=True):
        super().__init__()
        pretrained = inception_v3(
            weights=Inception_V3_Weights.IMAGENET1K_V1,
            aux_logits=True,
        )
        skipped = {"AuxLogits", "avgpool", "dropout", "fc"}
        kept = [child for name, child in pretrained.named_children()
                if name not in skipped]
        self.backbone = nn.Sequential(*kept)
        self.adaptive_pool = nn.AdaptiveAvgPool2d((encoded_size, encoded_size))
        self.fine_tune(fine_tune)

    def forward(self, x):
        """Encode images into a (B, H*W, C) sequence of spatial feature vectors."""
        feats = self.adaptive_pool(self.backbone(x))
        batch, channels = feats.size(0), feats.size(1)
        # (B, C, H, W) -> (B, C, H*W) -> (B, H*W, C)
        return feats.view(batch, channels, -1).transpose(1, 2)

    def fine_tune(self, fine_tune):
        """Freeze the backbone, optionally re-enabling its last two children."""
        self.backbone.requires_grad_(False)
        if not fine_tune:
            return
        for block in list(self.backbone.children())[-2:]:
            block.requires_grad_(True)

In [60]:
# 7–8) PositionalEncoding, DecoderTransformer, CaptioningModel
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a batch-first sequence.

    A table of shape (1, max_len, d_model) is precomputed and stored as the
    buffer `pe`; even feature indices hold sines and odd indices hold cosines
    of position-dependent angles.

    Args:
        d_model: Feature dimension of the inputs (odd values are supported).
        max_len: Longest sequence length the table can serve.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1).float()
        div = torch.exp(torch.arange(0, d_model, 2).float() *
                        -(math.log(10000.0)/d_model))
        angles = pos * div                      # (max_len, ceil(d_model/2))
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine columns,
        # so trim the angles to match the width of the odd-index slice.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return `x + pe[:, :T]` for an input of shape (B, T, d_model).

        Raises:
            RuntimeError: If T exceeds the `max_len` the table was built with
                (otherwise the addition fails with an opaque size mismatch or,
                for a one-row table, silently broadcasts position 0).
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise RuntimeError(
                f"Sequence length {seq_len} exceeds positional-encoding "
                f"max_len {self.pe.size(1)}; construct the module with a larger max_len."
            )
        return x + self.pe[:, :seq_len]

class DecoderTransformer(nn.Module):
    """Transformer decoder that predicts caption tokens from image features.

    Encoder features are linearly projected to `d_model` and used as the
    cross-attention memory; target tokens are embedded, scaled by
    sqrt(d_model), given positional encodings, and decoded under a causal
    mask. No key-padding mask is passed to the decoder.

    Args:
        vocab_size: Number of tokens in the vocabulary (embedding rows and
            output logits).
        d_model: Width of the decoder.
        nhead: Number of attention heads per layer.
        num_layers: Number of stacked decoder layers.
        dim_feedforward: Hidden width of each layer's feed-forward block.
        dropout: Dropout probability inside the decoder layers.
        max_len: Size of the positional-encoding table, i.e. the longest
            target sequence that can be decoded.
        encoder_dim: Feature width of the incoming encoder sequence.
    """

    def __init__(self, vocab_size, d_model=512, nhead=8,
                 num_layers=6, dim_feedforward=2048,
                 dropout=0.1, max_len=50, encoder_dim=2048):
        super().__init__()
        self.d_model = d_model
        self.enc_proj = nn.Linear(encoder_dim, d_model)
        self.token_embed = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model, max_len)
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.transformer_decoder = nn.TransformerDecoder(
            decoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, encoder_seq, tgt_seq):
        """Return vocabulary logits of shape (B, T, V).

        Args:
            encoder_seq: Encoder output, (B, N, encoder_dim).
            tgt_seq: Target token indices, (B, T).
        """
        # The decoder stack is sequence-first, so swap the first two axes of
        # both the memory and the target before decoding.
        memory = self.enc_proj(encoder_seq).transpose(0, 1)       # (N, B, d_model)
        embedded = self.token_embed(tgt_seq) * math.sqrt(self.d_model)
        queries = self.pos_enc(embedded).transpose(0, 1)          # (T, B, d_model)
        causal_mask = nn.Transformer.generate_square_subsequent_mask(
            queries.size(0)).to(queries.device)
        decoded = self.transformer_decoder(queries, memory, tgt_mask=causal_mask)
        return self.fc_out(decoded.transpose(0, 1))               # (B, T, V)

class CaptioningModel(nn.Module):
    """Thin wrapper that chains an image encoder and a caption decoder.

    Args:
        encoder: Module mapping images to an encoded feature sequence.
        decoder: Module called as `decoder(encoded, captions)`.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder

    def forward(self, images, captions):
        """Encode `images`, then decode them together with `captions`."""
        return self.decoder(self.encoder(images), captions)

In [61]:
# 9) train_model with AdamW
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm
def train_model(model, train_dataset, val_dataset, word2idx,
                device='cuda', batch_size=32, epochs=20,
                patience=3, lr=1e-4):
    """Train `model` with AdamW, LR-on-plateau scheduling and early stopping.

    Each batch is unpacked as `(images, captions, _)`. The model is fed
    `captions[:, :-1]` and scored with cross-entropy against `captions[:, 1:]`,
    ignoring positions equal to `word2idx['<pad>']`.

    After every epoch the mean validation loss is computed without gradients;
    it drives `ReduceLROnPlateau`, early stopping, and best-weight tracking.
    BLEU scoring is not computed here (token decoding helpers are not
    available in this function).

    Args:
        model: Module called as `model(images, caption_inputs)` returning
            logits whose last dimension is the vocabulary size. It is expected
            to already live on `device`.
        train_dataset: Dataset for training (shuffled, last partial batch dropped).
        val_dataset: Dataset for validation.
        word2idx: Mapping that contains the '<pad>' token index.
        device: Device the batches are moved to.
        batch_size: Batch size for both loaders.
        epochs: Maximum number of epochs.
        patience: Number of consecutive epochs without validation-loss
            improvement after which training stops.
        lr: Initial AdamW learning rate.

    Returns:
        The same `model` object, with the weights of the best validation epoch
        restored when at least one validation pass was run.
    """
    pad_idx = word2idx['<pad>']
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                    optimizer, mode='min', patience=2, factor=0.5)
    train_loader = DataLoader(train_dataset, batch_size=batch_size,
                              shuffle=True, drop_last=True)
    val_loader   = DataLoader(val_dataset,   batch_size=batch_size,
                              shuffle=False)

    def batch_loss(imgs, caps):
        # Teacher forcing: predict token t+1 from tokens up to t.
        imgs, caps = imgs.to(device), caps.to(device)
        outs = model(imgs, caps[:, :-1])
        return criterion(outs.reshape(-1, outs.size(-1)),
                         caps[:, 1:].reshape(-1))

    best_val_loss = float('inf')
    best_state = None
    patience_ctr = 0
    for epoch in range(epochs):
        model.train()
        train_losses = []
        for imgs, caps, _ in tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]"):
            optimizer.zero_grad()
            loss = batch_loss(imgs, caps)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
        train_loss = (sum(train_losses) / len(train_losses)
                      if train_losses else float('nan'))

        model.eval()
        val_losses = []
        with torch.no_grad():
            for imgs, caps, _ in tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]"):
                val_losses.append(batch_loss(imgs, caps).item())

        if not val_losses:
            # Nothing to validate on: no scheduling or early stopping possible.
            print(f"Epoch {epoch+1}: train loss {train_loss:.4f} (no validation batches)")
            continue

        val_loss = sum(val_losses) / len(val_losses)
        print(f"Epoch {epoch+1}: train loss {train_loss:.4f}, val loss {val_loss:.4f}")
        scheduler.step(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_ctr = 0
            # Keep the snapshot on CPU so it does not double accelerator memory.
            best_state = {name: tensor.detach().cpu().clone()
                          for name, tensor in model.state_dict().items()}
        else:
            patience_ctr += 1
            if patience_ctr >= patience:
                print(f"Early stopping after epoch {epoch+1} "
                      f"(best val loss {best_val_loss:.4f})")
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    return model

In [62]:
# 10) device — use the first available backend, checked in the order
# Apple MPS, then CUDA, then CPU as the fallback.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [65]:
# 11–12) Build the encoder/decoder pair and launch training.
# `max_len` sizes the decoder's positional-encoding table, so it has to be at
# least as long as the longest caption fed to the decoder.
encoder = EncoderInception(encoded_size=14, fine_tune=True)

decoder_kwargs = dict(
    vocab_size=len(word2idx),
    d_model=512,
    nhead=8,
    num_layers=3,
    dim_feedforward=2048,
    dropout=0.1,
    max_len=40,          # increased from 20 to 40
    encoder_dim=2048,
)
decoder = DecoderTransformer(**decoder_kwargs)

model = CaptioningModel(encoder, decoder).to(device)

train_kwargs = dict(
    device=device,
    batch_size=32,
    epochs=15,
    patience=3,
    lr=1e-4,
)
trained_model = train_model(model, train_dataset, val_dataset, word2idx,
                            **train_kwargs)

Epoch 1 [Train]:   5%|▌         | 51/937 [11:04<3:12:28, 13.03s/it] 


KeyboardInterrupt: 