In [2]:
# Flickr30 path="/mnt/media/eleonora/flickr30k"
# Contains Images folder and captions.txt file and flickr_annotations_30k.csv file

# For now let's open the text file and check the first few lines to understand the format of the captions
# Stream the file rather than loading every line into memory just to preview a few,
# and read it with the same explicit encoding the caption loader uses.
with open("/mnt/media/eleonora/flickr30k/captions.txt", "r", encoding="utf-8") as f:
    for line_idx, line in enumerate(f):
        if line_idx >= 5:  # Print the first 5 lines
            break
        print(line.strip())

image,caption
1000092795.jpg, Two young guys with shaggy hair look at their hands while hanging out in the yard .
1000092795.jpg," Two young , White males are outside near many bushes ."
1000092795.jpg, Two men in green shirts are standing in a yard .
1000092795.jpg, A man in a blue shirt standing in a garden .


In [3]:
# let's open the csv file and check the first few lines to understand the format of the annotations
import csv
# newline="" hands line-ending handling to the csv module (recommended by its docs so
# that newlines embedded in quoted fields are parsed correctly); the encoding is made
# explicit instead of depending on the machine's locale.
with open(
    "/mnt/media/eleonora/flickr30k/flickr_annotations_30k.csv",
    "r",
    encoding="utf-8",
    newline="",
) as f:
    reader = csv.reader(f)
    for i, row in enumerate(reader):
        if i >= 5:  # Print the first 5 rows
            break
        print(row)

['raw', 'sentids', 'split', 'filename', 'img_id']
['["Two young guys with shaggy hair look at their hands while hanging out in the yard.", "Two young, White males are outside near many bushes.", "Two men in green shirts are standing in a yard.", "A man in a blue shirt standing in a garden.", "Two friends enjoy time spent together."]', '[0, 1, 2, 3, 4]', 'train', '1000092795.jpg', '0']
['["Several men in hard hats are operating a giant pulley system.", "Workers look down from up above on a piece of equipment.", "Two men working on a machine wearing hard hats.", "Four men on top of a tall structure.", "Three men on a large rig."]', '[5, 6, 7, 8, 9]', 'train', '10002456.jpg', '1']
['["A child in a pink dress is climbing up a set of stairs in an entry way.", "A little girl in a pink dress going into a wooden cabin.", "A little girl climbing the stairs to her playhouse.", "A little girl climbing into a wooden playhouse.", "A girl going into a wooden building."]', '[10, 11, 12, 13, 14]', 'tr

In [24]:
import os
import random
from collections import defaultdict
from typing import Dict, List, Tuple

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import transforms


In [None]:
# ---------------------------
# 1) Reproducibility (SEED)
# ---------------------------
def seed_everything(seed: int = 42):
    """Seed every RNG this notebook uses and set the cuDNN reproducibility flags.

    Args:
        seed: value passed to Python's `random`, NumPy, and PyTorch (CPU and CUDA).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # deterministic-ish cuDNN settings (can slow things down)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def seed_worker(worker_id: int):
    """Seed `random` and NumPy from torch's current seed.

    Intended to be passed as `worker_init_fn` to a DataLoader; `worker_id` is
    accepted for that signature but not used.
    """
    # Reduce torch's seed modulo 2**32 so it stays within 32 bits.
    seed_32bit = torch.initial_seed() % (1 << 32)
    random.seed(seed_32bit)
    np.random.seed(seed_32bit)

In [None]:
import os
import random
from collections import defaultdict
from typing import Dict, List, Tuple

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image

# ---------------------------
# 1) Reproducibility (SEED)
# ---------------------------
def seed_everything(seed: int = 42):
    """Seed Python's `random`, NumPy and PyTorch, then set the cuDNN flags.

    NOTE(review): this notebook defines `seed_everything` in two cells; this later
    definition is the one in effect after Run All. Consider keeping a single copy.

    Args:
        seed: value used for every RNG (Python, NumPy, torch CPU and CUDA).
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    for torch_seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all):
        torch_seed_fn(seed)

    # deterministic-ish (can slow down)
    backend = torch.backends.cudnn
    backend.deterministic = True
    backend.benchmark = False

def seed_worker(worker_id: int):
    """DataLoader `worker_init_fn` hook: align `random` and NumPy with torch's seed.

    NOTE(review): this notebook defines `seed_worker` in two cells; this later
    definition is the one in effect after Run All. Consider keeping a single copy.

    `worker_id` is part of the hook signature and is not used here.
    """
    # torch's seed reduced modulo 2**32, shared by both libraries below
    reduced_seed = torch.initial_seed() % (2 ** 32)
    for reseed in (np.random.seed, random.seed):
        reseed(reduced_seed)


In [13]:
# ---------------------------
# 2) Parse captions (5 per image)
# ---------------------------
def load_flickr30k_captions(captions_txt_path: str) -> Dict[str, List[str]]:
    """
    Read captions.txt and group the captions by image filename.

    The file is CSV: an optional ``image,caption`` header row followed by one
    ``filename.jpg, caption text`` row per caption. Captions that contain commas
    are wrapped in double quotes, so the rows are parsed with the ``csv`` module;
    a plain ``split(",")`` would leave the surrounding quote characters inside the
    caption text.

    Args:
        captions_txt_path: path to the captions file (read as UTF-8).

    Returns:
        A plain dict mapping filename -> list of captions, in file order.
        Filenames and captions are stripped of surrounding whitespace.

    Raises:
        ValueError: if a non-blank row has no comma-separated caption field.
    """
    import csv  # local import so this function does not depend on another cell

    cap_dict = defaultdict(list)

    # newline="" lets the csv module handle line endings itself.
    with open(captions_txt_path, "r", encoding="utf-8", newline="") as f:
        for row_idx, row in enumerate(csv.reader(f)):
            # skip blank / whitespace-only lines
            if not any(field.strip() for field in row):
                continue

            # skip header if present (only ever on the first line)
            if (
                row_idx == 0
                and len(row) >= 2
                and row[0].strip().lower() == "image"
                and row[1].strip().lower().startswith("caption")
            ):
                continue

            if len(row) < 2:
                raise ValueError(
                    f"{captions_txt_path}: row {row_idx + 1} has no caption field: {row!r}"
                )

            fname = row[0].strip()
            # An unquoted caption containing commas is split into several fields;
            # re-join them so the caption text is kept intact.
            caption = ",".join(row[1:]).strip()
            cap_dict[fname].append(caption)

    # Entries are only created when a caption is appended, so every key has >= 1 caption.
    return dict(cap_dict)

# ---------------------------
# 3) Split by IMAGE (80/10/10)
# ---------------------------
def split_filenames(
    filenames: List[str],
    seed: int = 42,
    train_ratio: float = 0.8,
    val_ratio: float = 0.1,
    test_ratio: float = 0.1,
) -> Tuple[List[str], List[str], List[str]]:
    """Shuffle filenames with a seeded RNG and cut them into train/val/test lists.

    Args:
        filenames: image filenames to split; the input list is not modified.
        seed: seed for the private `random.Random` used for shuffling.
        train_ratio, val_ratio, test_ratio: fractions that must sum to 1.

    Returns:
        (train_files, val_files, test_files). Train and val sizes are the ratios
        truncated to an int; test receives whatever is left over.
    """
    ratio_total = train_ratio + val_ratio + test_ratio
    assert abs(ratio_total - 1.0) < 1e-9

    shuffled = list(filenames)  # copy, so the caller's list keeps its order
    random.Random(seed).shuffle(shuffled)

    total = len(shuffled)
    train_end = int(total * train_ratio)
    val_end = train_end + int(total * val_ratio)

    train_files = shuffled[:train_end]
    val_files = shuffled[train_end:val_end]
    test_files = shuffled[val_end:]  # remainder goes to test (handles rounding)

    # the three slices must partition the shuffled list
    assert len(train_files) + len(val_files) + len(test_files) == total

    return train_files, val_files, test_files

# ---------------------------
# 4) Dataset: (image, random caption) without repeats for same image
#    Implementation trick:
#    Expand each image into 5 distinct (image, caption_i) pairs.
#    Shuffle pairs in DataLoader => caption "chosen at random"
#    and cannot repeat for same image because each caption index appears once.
# ---------------------------
class Flickr30kNoRepeatCaptionDataset(Dataset):
    """Dataset of (image, caption) pairs in which each pair appears exactly once.

    Every kept image is expanded into one entry per selected caption index, so
    iterating the dataset (in any order) never repeats a caption for an image.
    """

    def __init__(
        self,
        images_dir: str,
        captions_by_file: Dict[str, List[str]],
        filenames: List[str],
        seed: int = 42,
        transform=None,
        require_n_captions: int = 5,
    ):
        """
        Args:
            images_dir: directory that contains the image files.
            captions_by_file: dict filename -> list of captions.
            filenames: image filenames included in this split.
            seed: seed of the private RNG that picks/orders caption indices.
            transform: optional callable applied to the RGB PIL image.
            require_n_captions: images with fewer captions than this are skipped,
                and at most this many (randomly chosen) captions are kept per image.
                Pass None to keep every caption of every image.

        Images that have no entry in `captions_by_file` or whose file does not
        exist under `images_dir` are skipped as well.
        """
        self.images_dir = images_dir
        self.transform = transform
        self.seed = seed

        # filter to files that exist and have enough captions
        kept = []
        for fn in filenames:
            if fn not in captions_by_file:
                continue
            if require_n_captions is not None and len(captions_by_file[fn]) < require_n_captions:
                continue
            if os.path.isfile(os.path.join(images_dir, fn)):
                kept.append(fn)

        self.filenames = kept
        self.captions_by_file = captions_by_file

        # Build expanded index list: each (filename, caption_idx) appears once
        rng = random.Random(seed)
        self.pairs: List[Tuple[str, int]] = []
        for fn in self.filenames:
            # shuffle the caption indices, then keep the first N (random choice
            # without repetition)
            idxs = list(range(len(captions_by_file[fn])))
            rng.shuffle(idxs)
            if require_n_captions is not None:
                idxs = idxs[:require_n_captions]
            self.pairs.extend((fn, ci) for ci in idxs)

        # Note: global shuffle will be handled by DataLoader(shuffle=True)

    def __len__(self):
        """Number of (image, caption) pairs."""
        return len(self.pairs)

    def __getitem__(self, idx: int):
        """Return (image, caption, filename, caption_idx) for pair `idx`."""
        fn, cap_idx = self.pairs[idx]
        img_path = os.path.join(self.images_dir, fn)

        # Open inside a context manager so the file handle is closed right away;
        # convert() returns a separate, fully loaded image that outlives the handle.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        if self.transform is not None:
            image = self.transform(image)

        caption = self.captions_by_file[fn][cap_idx]
        return image, caption, fn, cap_idx  # fn/cap_idx are handy for debugging


In [27]:
# ---------------------------
# 5) Put everything together
# ---------------------------
SEED = 123
seed_everything(SEED)

flickr_root = "/mnt/media/eleonora/flickr30k"
captions_txt = os.path.join(flickr_root, "captions.txt")
images_dir = os.path.join(flickr_root, "Images")  # adjust if your folder name differs

captions_by_file = load_flickr30k_captions(captions_txt)

# Sort before splitting so the split depends only on SEED, not on dict order.
all_files = sorted(list(captions_by_file.keys()))
train_files, val_files, test_files = split_filenames(all_files, seed=SEED)

# Normalization statistics shared by the training and evaluation pipelines.
normalize = transforms.Normalize(
    mean=(0.485, 0.456, 0.406),
    std=(0.229, 0.224, 0.225)
)

# Training transform: a random 224x224 crop of the resized image.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(224),
    transforms.ToTensor(),
    normalize,
])

# Evaluation transform: same preprocessing with a fixed center crop, so val/test
# inputs are identical on every pass (a random crop would make metrics vary).
eval_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])

train_ds = Flickr30kNoRepeatCaptionDataset(
    images_dir=images_dir,
    captions_by_file=captions_by_file,
    filenames=train_files,
    seed=SEED,
    transform=transform,
    require_n_captions=5,
)
val_ds = Flickr30kNoRepeatCaptionDataset(
    images_dir=images_dir,
    captions_by_file=captions_by_file,
    filenames=val_files,
    seed=SEED + 1,   # different but deterministic
    transform=eval_transform,
    require_n_captions=5,
)
test_ds = Flickr30kNoRepeatCaptionDataset(
    images_dir=images_dir,
    captions_by_file=captions_by_file,
    filenames=test_files,
    seed=SEED + 2,   # different but deterministic
    transform=eval_transform,
    require_n_captions=5,
)

# DataLoaders
# Dedicated generator so the training shuffle order is driven by SEED.
g = torch.Generator()
g.manual_seed(SEED)

train_loader = DataLoader(
    train_ds,
    batch_size=32,
    shuffle=True,              # shuffle pairs => "caption chosen at random"
    num_workers=4,
    pin_memory=True,
    worker_init_fn=seed_worker,
    generator=g,
)
val_loader = DataLoader(
    val_ds,
    batch_size=32,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
    worker_init_fn=seed_worker,
)
test_loader = DataLoader(
    test_ds,
    batch_size=32,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
    worker_init_fn=seed_worker,
)

print("Images (unique) in splits:")
print("  train:", len(train_files), "val:", len(val_files), "test:", len(test_files))
print("Pairs (image,caption) in splits (x5 per image):")
print("  train:", len(train_ds), "val:", len(val_ds), "test:", len(test_ds))


Images (unique) in splits:
  train: 25426 val: 3178 test: 3179
Pairs (image,caption) in splits (x5 per image):
  train: 127130 val: 15890 test: 15895
Example batch filenames: ['75893484.jpg', '5661996549.jpg', '6188883048.jpg', '4152974865.jpg', '2914022011.jpg', '3368379417.jpg', '7570232540.jpg', '14887980.jpg', '4242032129.jpg', '56012054.jpg', '2694426634.jpg', '2512682478.jpg', '2360159351.jpg', '3741664817.jpg', '3631839768.jpg', '1083240835.jpg', '7735272386.jpg', '230269706.jpg', '584726817.jpg', '2561212119.jpg', '132966111.jpg', '2103568100.jpg', '4725183955.jpg', '165764051.jpg', '101362650.jpg', '6617071067.jpg', '486720042.jpg', '4717627685.jpg', '2350400382.jpg', '4299244891.jpg', '320779082.jpg', '7558058046.jpg']
Example batch captions: ['A blond woman on a cellphone looking at an advertisement .', '" A young man wearing protective eye wear , works on a tire ."', 'A girl in a white top and black shorts is touching a volleyball with both of her arms together .', 'A man w

In [28]:
# Quick sanity check: same image won't reuse same caption index inside the dataset
# (each (fn, cap_idx) appears once). Assert it on the full index list rather than
# only eyeballing one printed batch.
assert len(set(train_ds.pairs)) == len(train_ds.pairs), \
    "train_ds contains a repeated (filename, caption_idx) pair"

# Pull one batch to inspect what the loader yields.
batch = next(iter(train_loader))
images, captions, fns, cap_idxs = batch
print("Example batch filenames:", list(fns))
print("Example batch captions:", list(captions))


Example batch filenames: ['4021561862.jpg', '4755772591.jpg', '2250479700.jpg', '4368364278.jpg', '6864033895.jpg', '4843297992.jpg', '4496738245.jpg', '5851818256.jpg', '155210731.jpg', '254901702.jpg', '189100641.jpg', '287967163.jpg', '2759879165.jpg', '480858814.jpg', '3446299757.jpg', '67110478.jpg', '3476381830.jpg', '1474240647.jpg', '2287938451.jpg', '4948224114.jpg', '4731305489.jpg', '4653315864.jpg', '3030953639.jpg', '3303787342.jpg', '3701226275.jpg', '3758747095.jpg', '4400684369.jpg', '316577571.jpg', '2909875716.jpg', '7725132206.jpg', '3439390288.jpg', '469386480.jpg']
Example batch captions: ['The football player in the white jersey is running towards the football player in the light blue jersey .', 'A few people are staring at something .', 'Man and woman sitting with yarn and knitting needles .', 'A man and woman with name tags on chat with others in a hotel lobby .', 'A woman is juggling three oranges outside .', 'A group of bushy haired people are walking down a r