In [22]:
import numpy as np
from pathlib import Path

# -------- parameters --------
total_images = 60_000
train_size   = total_images // 2          # 30 000
outfile      = Path("celeba_mia_halves.npz")
rng_seed     = 42                         # change/omit for a new random split
# ----------------------------

# 1. shuffle all indices reproducibly
rng     = np.random.default_rng(rng_seed)
indices = np.arange(total_images)
rng.shuffle(indices)

# 2. split 50 / 50
mia_train_idxs = indices[:train_size]     # shape (30_000,)
mia_eval_idxs  = indices[train_size:]     # shape (30_000,)

# 3. save both arrays to one NPZ
np.savez(outfile,
         mia_train_idxs=mia_train_idxs,
         mia_eval_idxs=mia_eval_idxs)

print(f"Saved to {outfile.resolve()}")


Saved to /home/ethanrao/MIA_test/celeba_mia_halves.npz


In [1]:
#!/usr/bin/env python3
# split_pokemon_blip.py
from pathlib import Path
from datasets import load_dataset, DatasetDict, disable_caching

# ----------------------------------------------------------------------
# CONFIG – adjust these three lines if you need different paths / seed
# ----------------------------------------------------------------------
# CACHE_DIR      = "./hf_cache"          # where the raw download is cached
SAVE_DIR       = Path("/banana/ethan/MIA_data/POKEMON/pokemon_blip_splits")  # final dataset on disk
RANDOM_SEED    = 42                    # makes the 416/417 split reproducible
# ----------------------------------------------------------------------

def main() -> None:
    # 1. download the original (single-split) dataset
    full_ds = load_dataset(
        "reach-vb/pokemon-blip-captions",
        split="train",
        # cache_dir=CACHE_DIR,
    )                                    # → Dataset with 833 rows

    # 2. shuffle & split 416 / 417
    split_ds: DatasetDict = full_ds.train_test_split(
        test_size=417,                   # the remainder (416) becomes train
        shuffle=True,
        seed=RANDOM_SEED,
    )
    assert len(split_ds["train"]) == 416
    assert len(split_ds["test"])  == 417

    # 3. save to disk in Arrow format (incl. images)
    SAVE_DIR.mkdir(parents=True, exist_ok=True)
    split_ds.save_to_disk(str(SAVE_DIR))
    print(f"✅  Saved dataset with splits {list(split_ds.keys())} to {SAVE_DIR}")

if __name__ == "__main__":
    disable_caching()  # optional: avoids filling ~/.cache/huggingface/datasets
    main()


Generating train split:   0%|          | 0/833 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/416 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/417 [00:00<?, ? examples/s]

✅  Saved dataset with splits ['train', 'test'] to /banana/ethan/MIA_data/POKEMON/pokemon_blip_splits


In [7]:

from datasets import load_from_disk
dataset = load_from_disk("/banana/ethan/MIA_data/POKEMON/pokemon_blip_splits")                            # returns a DatasetDict with "train" & "test"


In [10]:
next(iter(dataset['train']))
# len(dataset['test'])

{'image': <PIL.Image.Image image mode=RGB size=1280x1280>,
 'text': 'a cartoon animal with a pink ball on its head'}

In [12]:
#!/usr/bin/env python3
# build_coco2017_val_train_test.py
#
# Requires: pip install "datasets>=2.19.0" requests
# ---------------------------------------------------------------------
from pathlib import Path
import requests
from datasets import load_dataset, DatasetDict, Image, disable_caching

# ----------- configuration ---------------------------------------------------
# CACHE_DIR   = "./hf_cache"                    # raw HF cache
SAVE_DIR    = Path("/banana/ethan/MIA_data/MSCOCO/coco2017_val_splits")   # final dataset directory
IMG_DIR     = SAVE_DIR / "images"             # where JPEGs will live
RANDOM_SEED = 2025
NUM_PROC    = 8                              # parallel download workers
TIMEOUT_S   = 10
# -----------------------------------------------------------------------------

def download_to_local(example):
    """Download one COCO image and write its local path back into the row."""
    url = example["coco_url"]
    local_path = IMG_DIR / example["file_name"]       # e.g. images/val2017/000000397133.jpg
    local_path.parent.mkdir(parents=True, exist_ok=True)

    if not local_path.exists():                       # skip if already present
        resp = requests.get(url, timeout=TIMEOUT_S)
        resp.raise_for_status()
        local_path.write_bytes(resp.content)

    example["image"] = str(local_path)                # new column
    return example

def main() -> None:
    # 1. Load the 5 000-row validation split metadata
    ds = load_dataset(
        "phiyodr/coco2017",
        split="validation",
        # cache_dir=CACHE_DIR,
    )

    # 2. Download every JPEG locally & add an 'image' column with the file path
    ds = ds.map(download_to_local, num_proc=NUM_PROC)

    # 3. Keep only 'image' + 'captions', rename 'captions' -> 'text'
    ds = ds.remove_columns(
        [c for c in ds.column_names if c not in {"image", "captions"}]
    )

    # 4. Tell 🤗 Datasets that 'image' is an Image feature (decodes on access)
    ds = ds.cast_column("image", Image())

    # 5. Shuffle once, slice 2 500 / 2 500 into train / test
    ds = ds.shuffle(seed=RANDOM_SEED)
    split_ds = DatasetDict({
        "train": ds.select(range(2_500)),
        "test" : ds.select(range(2_500, 5_000)),
    })

    # 6. Persist to disk in Arrow format
    SAVE_DIR.mkdir(parents=True, exist_ok=True)
    split_ds.save_to_disk(str(SAVE_DIR))
    print(f"✅  Saved to {SAVE_DIR} (train {len(split_ds['train'])}, "
          f"test {len(split_ds['test'])})")

if __name__ == "__main__":
    disable_caching()     # optional – keeps ~/.cache/huggingface tidy
    main()


Generating train split:   0%|          | 0/118287 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2500 [00:00<?, ? examples/s]

✅  Saved to /banana/ethan/MIA_data/MSCOCO/coco2017_val_splits (train 2500, test 2500)


In [13]:

from datasets import load_from_disk
dataset = load_from_disk("/banana/ethan/MIA_data/MSCOCO/coco2017_val_splits")                            # returns a DatasetDict with "train" & "test"


In [18]:
# len(dataset['test'len])
len(dataset['train'])

2500

In [2]:
#!/usr/bin/env python3
# build_flickr30k_20k_train_test.py
#
# pip install "datasets>=2.19.0"
# ----------------------------------------------------------------------
from pathlib import Path
from datasets import load_dataset, DatasetDict, Image, disable_caching

# -------------------- CONFIG (edit if you like) -----------------------
# CACHE_DIR   = "./hf_cache"
SAVE_DIR    = Path("/banana/ethan/MIA_data/FLICKR/flickr30k_splits")
RANDOM_SEED = 2025
SAMPLE_SIZE = 20_000                         # 20 k from the train split
# ----------------------------------------------------------------------

def main() -> None:
    # 1. Load the official Flickr-30k *train* split
    ds = load_dataset(
        "nlphuji/flickr30k",
        split="test",                       # 145 k rows
        # cache_dir=CACHE_DIR,
    )

    # 2. Shuffle and down-sample to 20 k
    ds = ds.shuffle(seed=RANDOM_SEED).select(range(SAMPLE_SIZE))

    # 3. Keep only 'image' and 'sentence', rename → 'text'
    ds = (
        ds.remove_columns([c for c in ds.column_names if c not in {"image", "caption"}]).cast_column("image", Image())     # ensures PIL decoding on access
    )

    # 4. Slice 10 k / 10 k into train / test
    split_ds = DatasetDict({
        "train": ds.select(range(10_000)),
        "test" : ds.select(range(10_000, SAMPLE_SIZE)),
    })

    # 5. Persist to disk (Arrow format)
    SAVE_DIR.mkdir(parents=True, exist_ok=True)
    split_ds.save_to_disk(str(SAVE_DIR))
    print(f"✅  Saved Flickr-30k sample to {SAVE_DIR}"
          f" (train={len(split_ds['train'])}, test={len(split_ds['test'])})")

if __name__ == "__main__":
    disable_caching()         # optional – keeps ~/.cache/huggingface tidy
    main()


Saving the dataset (0/3 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

✅  Saved Flickr-30k sample to /banana/ethan/MIA_data/FLICKR/flickr30k_splits (train=10000, test=10000)


In [3]:
from datasets import load_from_disk
dataset = load_from_disk("/banana/ethan/MIA_data/FLICKR/flickr30k_splits")    

In [8]:
print(len(dataset['train']))
dataset.column_names
dataset['train'][0]

10000


{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=399x500>,
 'caption': ['Male players wearing red and blue striped jerseys with yellow numbers hug each other.',
  'Three soccer players in red and blue striped shirts hugging on a soccer field.',
  'Two soccer players are embracing while 2 more approach to also hug them.',
  'Members of a soccer team hugging on the field.',
  'Three men celebrating a goal in the game.']}

In [None]:
#!/usr/bin/env python3
# build_laion_aesthetic_v2_5plus_5k.py
#
#   pip install "datasets>=2.19.0" requests tqdm
# ----------------------------------------------------------------------
from pathlib import Path
from typing import Dict
import hashlib, os, random, urllib.parse, requests, tqdm
from datasets import (
    load_dataset,
    Features,
    Image,
    Value,
    Dataset,
    disable_caching,
)

# ---------------- CONFIG -------------------------------------------------
SAVE_DIR    = Path("/banana/ethan/MIA_data/LAION5k/laion_aesthetic_v2_5plus_5k")
IMG_DIR     = SAVE_DIR / "images"
SAMPLE_SIZE = 2700                   # final rows you want
RANDOM_SEED = 2025
TIMEOUT_S   = 10
HEADERS     = {"User-Agent": "laion-5k-downloader"}
# ------------------------------------------------------------------------


def is_v2_5plus(row: Dict) -> bool:
    score = (
        row.get("AESTHETIC_SCORE")
        or row.get("aesthetic_score")
        or row.get("aesthetic")
    )
    p_wm = (
        row.get("p_watermark")
        or row.get("pwatermark")
        or row.get("P_WATERMARK")
    )
    return score is not None and p_wm is not None and score > 5.0 and p_wm < 0.5


def short_filename(row: Dict) -> str:
    """<hash>.ext   – keeps filenames < 40 chars."""
    base = row.get("hash") or hashlib.sha1(row["URL"].encode()).hexdigest()
    ext = os.path.splitext(urllib.parse.urlparse(row["URL"]).path)[1].lower()
    if ext not in {".jpg", ".jpeg", ".png", ".webp"}:
        ext = ".jpg"
    return f"{base}{ext}"


def download_image(url: str, local: Path) -> bool:
    try:
        r = requests.get(url, timeout=TIMEOUT_S, headers=HEADERS)
        r.raise_for_status()
        local.parent.mkdir(parents=True, exist_ok=True)
        local.write_bytes(r.content)
        return local.stat().st_size > 10_000
    except Exception:
        return False


def main() -> None:
    random.seed(RANDOM_SEED)
    disable_caching()
    SAVE_DIR.mkdir(parents=True, exist_ok=True)

    print("▶ Streaming LAION-2B-en-aesthetic, collecting v2 5+ rows…")
    stream = load_dataset(
        "laion/laion2B-en-aesthetic",
        split="train",
        streaming=True,
    ).shuffle(buffer_size=250_000, seed=RANDOM_SEED)

    good_rows = []
    for row in tqdm.tqdm(stream, desc="Sampling", unit="rows"):
        if not is_v2_5plus(row):
            continue

        fname = short_filename(row)
        local_path = IMG_DIR / fname[:2] / fname
        if local_path.exists() and local_path.stat().st_size > 10_000:
            ok = True
        else:
            ok = download_image(row["URL"], local_path)

        if ok:
            good_rows.append({"image": str(local_path), "text": row.get("TEXT", "")})
            print(len(good_rows))

        if len(good_rows) == SAMPLE_SIZE:
            break

    print(f"✔ Collected {len(good_rows)} images; building Dataset…")

    features = Features({"image": Image(), "text": Value("string")})
    ds = Dataset.from_list(good_rows, features=features)
    ds.save_to_disk(str(SAVE_DIR))
    print(f"✅  Saved train-only split to {SAVE_DIR}")


if __name__ == "__main__":
    main()


In [None]:
#!/usr/bin/env python3
# build_laion2B_multi_ascii_v25_2500.py
#
#   pip install "datasets>=2.19.0" requests tqdm
# ----------------------------------------------------------------------
import hashlib, os, random, re, urllib.parse, requests, tqdm
from pathlib import Path
from typing import Dict
from datasets import (
    load_dataset,
    disable_caching,
    Dataset,
    Features,
    Image,
    Value,
)

# ---------------- CONFIG ------------------------------------------------
SAVE_DIR    = Path("/banana/ethan/MIA_data/LAION5k/laion2B_multi_ascii_v25_2500")
IMG_DIR     = SAVE_DIR / "images"
TARGET_ROWS = 2_700
RNG_SEED    = 2025
TIMEOUT_S   = 10
HEADERS     = {"User-Agent": "laion-2500-downloader"}
ASCII_RE    = re.compile(r"^[\x00-\x7F]+$")      # only ASCII chars
# -----------------------------------------------------------------------


def ascii_only(text: str) -> bool:
    return bool(ASCII_RE.fullmatch(text))


def v25_pass(row: Dict) -> bool:
    score = row.get("AESTHETIC_SCORE") or row.get("aesthetic_score") or row.get("prediction")
    p_wm  = row.get("p_watermark") or row.get("pwatermark") or row.get("P_WATERMARK")
    sim  = row.get("similarity")
    try:
        return score is not None and p_wm is not None and sim is not None and score > 5.0 and p_wm < 0.5 and sim > 0.3
    except TypeError:
        return False


def short_name(row: Dict) -> str:
    base = row.get("hash") or hashlib.sha1(row["URL"].encode()).hexdigest()
    ext  = os.path.splitext(urllib.parse.urlparse(row["URL"]).path)[1].lower()
    if ext not in {".jpg", ".jpeg", ".png", ".webp"}:
        ext = ".jpg"
    return f"{base}{ext}"


def fetch(url: str, dest: Path) -> bool:
    try:
        r = requests.get(url, timeout=TIMEOUT_S, headers=HEADERS)
        r.raise_for_status()
        dest.parent.mkdir(parents=True, exist_ok=True)
        dest.write_bytes(r.content)
        return dest.stat().st_size > 10_000
    except Exception:
        return False


def main() -> None:
    random.seed(RNG_SEED)
    disable_caching()
    SAVE_DIR.mkdir(parents=True, exist_ok=True)

    print("▶ Streaming laion2B-multi-translated, applying ASCII + v2 5+ filters …")
    stream = load_dataset(
        "laion/laion2B-multi-joined-translated-to-en",
        split="train",
        streaming=True,
    ).shuffle(buffer_size=100_000, seed=RNG_SEED)

    rows = []
    for row in tqdm.tqdm(stream, desc="Sampling", unit="rows"):
        text = row.get("ENG TEXT", "") or row.get("eng_text", "")
        if not text or not ascii_only(text):
            continue
        if not v25_pass(row):
            continue

        fname = short_name(row)
        local = IMG_DIR / fname[:2] / fname
        ok = local.exists() and local.stat().st_size > 10_000
        if not ok:
            ok = fetch(row["URL"], local)
        if not ok:
            continue

        rows.append({"image": str(local), "text": text})
        print(len(rows))
        if len(rows) == TARGET_ROWS:
            break

    print(f"✔ Collected {len(rows)} images; saving dataset …")

    features = Features({"image": Image(), "text": Value("string")})
    ds = Dataset.from_list(rows, features=features)
    ds.save_to_disk(str(SAVE_DIR))
    print(f"✅  Saved train-only split to {SAVE_DIR} ({len(ds)} rows)")


if __name__ == "__main__":
    main()


In [5]:
from datasets import load_from_disk
from pathlib import Path
from datasets import load_from_disk, Image, disable_caching
from PIL import Image as PILImage, UnidentifiedImageError

ds = load_from_disk('/banana/ethan/MIA_data/LAION5k/laion2B_multi_ascii_v25_2500')
disable_caching()
# ds = ds.cast_column("image", Image(decode=True))

cnt = 0
idx = []
for i in range(2700):
    try:
        print(type(ds[i]))
        # ds[i]
        idx.append(i)
        cnt += 1
        if cnt == 2500:
            break
    except:
        print(f'xxxx{i}')
        # idx.append(i)
print(len(idx))
clean = ds.select(idx)
clean.save_to_disk('/banana/ethan/MIA_data/LAION5k/laion2B_multi_ascii_v25_2500_clean')
    

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x400 at 0x7F858613A2D0>, 'text': '200 Victorian Sunroom Design Ideas & Remodel Pictures | Houzz'}
{'image': <PIL.Image.Image image mode=RGB size=590x393 at 0x7F8586174850>, 'text': 'Father Teaching Son How To Play Basketball On Driveway At Home - Stock Photo - Images'}
{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=300x300 at 0x7F8586174FD0>, 'text': 'Melissa Sweet Wedding Gown'}
{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=672x840 at 0x7F8586176890>, 'text': 'View of Montage Deer Valley with Deer Valley Resort in the background'}
{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x667 at 0x7F8586175910>, 'text': 'Grilled fillet steak with charred tomato and feta salad'}
{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=474x266 at 0x7F8586176D90>, 'text': 'Colourful Autumn Background by Autumn Backgrounds Wallpapers Wallpaper Cave'}
{'image': <PIL.Im

Saving the dataset (0/1 shards):   0%|          | 0/2500 [00:00<?, ? examples/s]