<a href="https://colab.research.google.com/github/lobral2728/ucb_ml_capstone/blob/colab/LoadDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Capstone Dataset Builder (Balanced Adults from FairFace + Avatars + Animal Faces)

OUTPUT: data/final/{train,val,test}/{human,avatar,animal}/*.jpg (224x224 JPEG)
TARGET: ResNet50 fine-tuning (PyTorch/TF compatible ImageFolder)

------------------------------------
REMINDER (future enhancement option):
------------------------------------
If needed later, we can add a face-detection step (e.g., MTCNN or RetinaFace)
to crop tighter face regions before resizing. This is *not* enabled now.

-------------------------
DATASET SOURCES + CITATIONS:
-------------------------
• FairFace (padding = 1.25 images + train/val labels)
  Repo: https://github.com/joojs/fairface
  Images (padding=1.25): https://drive.google.com/file/d/1g7qNOZz9wC7OfOhcPqH1EZ5bk1UFGmlL/view
  Labels (train):        https://drive.google.com/file/d/1i1L3Yqwaio7YSOCj7ftgk8ZZchPG7dmH/view
  Labels (val):          https://drive.google.com/file/d/1wOdja-ezstMEp81tX1a-EYkFebev4h7D/view
  Paper (cite if you publish): Karkkainen & Joo (WACV 2021)
    @inproceedings{karkkainenfairface,
      title={FairFace: Face Attribute Dataset for Balanced Race, Gender, and Age for Bias Measurement and Mitigation},
      author={Karkkainen, Kimmo and Joo, Jungseock},
      booktitle={WACV},
      year={2021}, pages={1548--1558}
    }

• Avatars (Google’s Cartoon Set via Kaggle):
  https://www.kaggle.com/datasets/brendanartley/cartoon-faces-googles-cartoon-set

• Animal faces (Dogs vs Cats via Kaggle):
  https://www.kaggle.com/datasets/salader/dogs-vs-cats

NOTE: You need a working Kaggle API (place kaggle.json under ~/.kaggle/ with chmod 600).

In [None]:
!python3 -m pip install --upgrade pip
!pip install pandas
!pip install tqdm
!pip install kaggle
# Jupyter-friendly quiet installs (--upgrade fetches newer releases when available)
!pip -q install gdown imagehash kaggle --upgrade

In [None]:
import time

# Wall-clock start of the run; the timing cell at the end subtracts this to report elapsed time.
start_time = time.time()

In [None]:
import os, io, sys, json, math, shutil, random, zipfile, tarfile, hashlib, gdown, imagehash
from pathlib import Path
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
from PIL import Image, ImageOps
from tqdm.auto import tqdm
# Detect the Colab runtime once; later cells branch on this flag.
RUNNING_IN_COLAB = 'google.colab' in sys.modules
if RUNNING_IN_COLAB:
  # Colab-only helpers used further down (secrets lookup and Drive mounting).
  from google.colab import userdata
  from google.colab import drive
  import shutil
  from pathlib import Path

## Reproducibility

In [None]:
SEED = 42
# Seed both RNGs used in this notebook (stdlib `random` and NumPy).
random.seed(SEED)
np.random.seed(SEED)

## USER TUNABLES (easy knobs)

In [None]:
TARGET_PER_CLASS = 10_000  # number of images to select per class; change this to scale the dataset
# Split fractions. split_train_val_test reads only "train" and "val"; "test" receives whatever remains.
SPLIT = {"train": 0.80, "val": 0.10, "test": 0.10}
IMG_SIZE = 224      # side length (px) passed to center_crop_resize when writing images
JPEG_QUALITY = 95   # quality setting used by safe_save_jpeg

# Switchable image-quality gates (applied by check_image_rules)
APPLY_IMAGE_FILTERS = True          # turn the size/aspect filters on/off globally
MIN_SHORT_SIDE_PX = 96              # reject if min(width, height) < this
MIN_ASPECT_RATIO   = 0.50           # reject if (w/h) < this
MAX_ASPECT_RATIO   = 2.00           # reject if (w/h) > this
PRINT_REJECTION_MESSAGES = True     # log every rejected image (can be verbose)

# Kaggle dataset slugs (owner/dataset) for the avatar and animal sources
KAGGLE_AVATARS_DATASET = "brendanartley/cartoon-faces-googles-cartoon-set"
KAGGLE_ANIMALFACES_DATASET = "salader/dogs-vs-cats"

# Optional: restrict to specific subfolders inside those Kaggle zips (usually not needed)
AVATAR_INCLUDE_DIRS = None
ANIMAL_INCLUDE_DIRS = None

# Safety check: each class must offer at least this many usable images, otherwise the build stops
MIN_REQUIRED_PER_CLASS = TARGET_PER_CLASS

# Direct Google Drive file IDs for FairFace (padding=1.25) + labels (from joojs/fairface README)
FAIRFACE_IMG_PAD125_ID   = "1g7qNOZz9wC7OfOhcPqH1EZ5bk1UFGmlL"
FAIRFACE_LABEL_TRAIN_ID  = "1i1L3Yqwaio7YSOCj7ftgk8ZZchPG7dmH"
FAIRFACE_LABEL_VAL_ID    = "1wOdja-ezstMEp81tX1a-EYkFebev4h7D"

## Path management

In [None]:
# Everything lives under <current working directory>/data.
BASE_DIR = Path.cwd()
WORK_DIR = BASE_DIR / "data"
RAW_DIR  = WORK_DIR / "raw"    # downloaded archives and their extracted contents
OUT_DIR  = WORK_DIR / "final"  # generated ImageFolder tree (wiped and rebuilt by the write step)
TMP_DIR  = WORK_DIR / "tmp"    # created here; only removed again by cleanup_all_datasets
FAIRFACE_DIR = RAW_DIR / "fairface"  # will hold pad=1.25 images & label CSVs
# Create the folders up front so later cells can write into them without checking.
for p in [RAW_DIR, OUT_DIR, TMP_DIR, FAIRFACE_DIR]:
    p.mkdir(parents=True, exist_ok=True)

## Dependencies / environment

In [None]:
if RUNNING_IN_COLAB:
  # Copy the Kaggle credentials from Colab's `userdata` store into environment variables.
  # NOTE(review): assigning None to os.environ raises TypeError; confirm how userdata.get
  # behaves when a secret is missing.
  os.environ['KAGGLE_USERNAME'] = userdata.get('KAGGLE_USERNAME')
  os.environ['KAGGLE_KEY'] = userdata.get('KAGGLE_KEY')
  # Locations used by the copy_data_to_drive / copy_data_from_drive helpers.
  # These three names exist only when running in Colab.
  COLAB_DATA_DIR = BASE_DIR / "data"
  DRIVE_DATA_PARENT_DIR = Path("/content/drive/My Drive/Colab Data") # Change if you want a different parent dir in Drive
  DRIVE_DATA_DIR = DRIVE_DATA_PARENT_DIR / "Capstone" # Change "Capstone" to use a different dataset folder name
else:
  # Outside Colab the Kaggle CLI credentials are expected at ~/.kaggle/kaggle.json.
  # `kaggle_creds` is read again by ensure_kaggle_downloaded before it attempts a download.
  kaggle_creds = Path.home() / ".kaggle" / "kaggle.json"
  if not kaggle_creds.exists():
      print("Kaggle credentials not found at ~/.kaggle/kaggle.json — Kaggle downloads will fail until you add them.")

## Helper utils

### Idempotent download helpers

In [None]:
def _any_images_in_dir(root: Path) -> bool:
    exts = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}
    return any(p.suffix.lower() in exts for p in root.rglob("*") if p.is_file())

def _any_archives_in_dir(root: Path) -> bool:
    return any(root.rglob("*.zip")) or any(root.rglob("*.tar")) or any(root.rglob("*.tar.gz")) or any(root.rglob("*.tgz"))

def extract_all_archives(root: Path):
    """Extract the archives sitting directly in *root*, rescanning until a pass extracts nothing new.

    Only the top level of *root* is scanned (zip first, then tar / tar.gz / tgz);
    subdirectories are not searched. Each archive is unpacked into its own parent
    directory with ``extract_archive``. A failure is printed and does not stop the
    remaining archives; an archive that extracted successfully is never extracted twice.
    """
    patterns = ("*.zip", "*.tar", "*.tar.gz", "*.tgz")
    finished = set()
    progressed = True
    while progressed:
        progressed = False
        # Snapshot the directory first so this pass only handles archives already on disk.
        pending = [found for pattern in patterns for found in root.glob(pattern)]
        for archive in pending:
            if archive in finished:
                continue
            try:
                extract_archive(archive, archive.parent)
            except Exception as e:
                print(f"Failed to extract {archive.name}: {e}")
            else:
                progressed = True
                finished.add(archive)


def is_fairface_present(ff_dir: Path) -> bool:
    """Return True when *ff_dir* holds extracted FairFace images and both label CSVs.

    Images are looked for in ``train``/``val`` either directly under *ff_dir* or
    under ``fairface-img-margin125-trainval``; one image in any of them is enough.
    """
    nested = ff_dir / "fairface-img-margin125-trainval"
    image_dirs = (ff_dir / "train", ff_dir / "val", nested / "train", nested / "val")
    images_found = False
    for folder in image_dirs:
        if folder.exists() and _any_images_in_dir(folder):
            images_found = True
            break
    label_names = ("fairface_label_train.csv", "fairface_label_val.csv")
    return images_found and all((ff_dir / name).exists() for name in label_names)

def ensure_fairface_downloaded():
    """
    Make sure the FairFace (padding=1.25) image pack and both label CSVs are on disk and unpacked.

    Returns right away when ``is_fairface_present(FAIRFACE_DIR)`` already holds.
    Otherwise every missing file is fetched from Google Drive with ``gdown``, the
    image zip is extracted into ``FAIRFACE_DIR`` if images are still missing, and a
    status line is printed. Extraction problems are printed, not raised.
    """
    if is_fairface_present(FAIRFACE_DIR):
        print(f"Found existing FairFace (padding=1.25) in {FAIRFACE_DIR}; skipping download.")
        return

    print("FairFace not fully present; checking files…")
    ff_zip = FAIRFACE_DIR / "fairface-img-margin125-trainval.zip"
    # (target file, Drive file id, message when downloading, message when already on disk)
    wanted = (
        (
            ff_zip,
            FAIRFACE_IMG_PAD125_ID,
            "Downloading FairFace (padding=1.25) image pack…",
            "Found FairFace image zip; skipping re-download.",
        ),
        (
            FAIRFACE_DIR / "fairface_label_train.csv",
            FAIRFACE_LABEL_TRAIN_ID,
            "Downloading FairFace label train CSV…",
            "Found FairFace label train CSV; skipping re-download.",
        ),
        (
            FAIRFACE_DIR / "fairface_label_val.csv",
            FAIRFACE_LABEL_VAL_ID,
            "Downloading FairFace label val CSV…",
            "Found FairFace label val CSV; skipping re-download.",
        ),
    )
    for target, drive_id, fetch_msg, skip_msg in wanted:
        if target.exists():
            print(skip_msg)
            continue
        print(fetch_msg)
        gdown.download(id=drive_id, output=str(target), quiet=False)

    # If images were already unpacked and only labels were missing, there is nothing to extract.
    if is_fairface_present(FAIRFACE_DIR):
        return

    print("Extracting FairFace archives…")
    try:
        extract_archive(ff_zip, FAIRFACE_DIR)
    except Exception as e:
        print(f"Extraction issue for FairFace: {e}")
    # Re-check after extraction: "present" means images plus both label files.
    if is_fairface_present(FAIRFACE_DIR):
        print("FairFace ready.")
    else:
        print("FairFace may still be incomplete; verify files under:", FAIRFACE_DIR)

def ensure_kaggle_downloaded(dataset_slug: str, dest: Path, dataset_name_for_print: str):
    """
    Ensure a Kaggle dataset is present in *dest* with extracted images.

    Steps, stopping at the first one that yields images:
      1. reuse images already under *dest*;
      2. extract archives left in *dest* by an earlier (partial) run;
      3. download with the ``kaggle`` CLI and extract.

    Args:
        dataset_slug: Kaggle ``owner/dataset`` identifier.
        dest: directory that receives the archive and the extracted files (created if missing).
        dataset_name_for_print: human-readable name used in status messages.

    Nothing is raised for missing credentials or a failed download; a message is
    printed and the function returns, leaving it to later cells to notice that
    images are missing.
    """
    # Function-scope import so this cell does not depend on the notebook's import cell.
    import subprocess

    dest.mkdir(parents=True, exist_ok=True)

    if _any_images_in_dir(dest):
        print(f"Found existing {dataset_name_for_print} images in {dest}; skipping download.")
        return

    # If archives are present (e.g., from a previous partial download), try extracting them first
    if _any_archives_in_dir(dest):
        print(f"Found archives for {dataset_name_for_print} in {dest}; extracting…")
        extract_all_archives(dest)
        if _any_images_in_dir(dest):
            print(f"{dataset_name_for_print} ready after extraction.")
            return

    if RUNNING_IN_COLAB:
        print("Running in Google Colab")
        # Confirm both Kaggle secrets can be read from Colab's userdata store
        kaggle_username = userdata.get('KAGGLE_USERNAME')
        kaggle_key = userdata.get('KAGGLE_KEY')
        if kaggle_username and kaggle_key:
            print("KAGGLE_USERNAME and KAGGLE_KEY environment variables are defined.")
        else:
            print("KAGGLE_USERNAME or KAGGLE_KEY environment variables are NOT defined.")
            if not kaggle_username:
                print("  - KAGGLE_USERNAME is not defined.")
            if not kaggle_key:
                print("  - KAGGLE_KEY is not defined.")
            return
    else:
        print("Not running in Google Colab")
        # `kaggle_creds` is the module-level path to ~/.kaggle/kaggle.json set in the environment cell
        if not kaggle_creds.exists():
            print(f"Kaggle credentials missing; cannot download {dataset_name_for_print}.")
            return

    print(f"Downloading {dataset_name_for_print} via Kaggle: {dataset_slug}")
    # Pass an argument list instead of a shell string: no quoting problems and no shell
    # interpretation when the destination path contains spaces, quotes, `$` or backticks.
    cmd = ["kaggle", "datasets", "download", "-d", dataset_slug, "-p", str(dest)]
    try:
        completed = subprocess.run(cmd, check=False)
    except OSError as e:
        # Typically the `kaggle` executable is not installed or not on PATH.
        print(f"Kaggle download failed for {dataset_slug}: {e}")
        return
    if completed.returncode != 0:
        print(f"Kaggle download failed for {dataset_slug}.")
        return

    print(f"Extracting {dataset_name_for_print} archives…")
    extract_all_archives(dest)

    if _any_images_in_dir(dest):
        print(f"{dataset_name_for_print} ready.")
    else:
        print(f"No images detected for {dataset_name_for_print} after extraction. Check contents in {dest}.")

### Other helper utils

In [None]:
def extract_archive(archive_path: Path, dest_dir: Path):
    """Extract a zip or tar archive into *dest_dir* (created if missing).

    Args:
        archive_path: archive to unpack; the format is chosen from the file name
            (case-insensitive): ``.zip``, ``.tar``, ``.tar.gz``, ``.tgz``, ``.tar.bz2``, ``.tbz``.
        dest_dir: directory that receives the contents.

    Raises:
        ValueError: if the file name has none of the supported extensions.
        Errors from ``zipfile`` / ``tarfile`` propagate to the caller; with the
        tar "data" filter this includes members that try to escape *dest_dir*.
    """
    import tarfile

    dest_dir.mkdir(parents=True, exist_ok=True)
    lowered = str(archive_path).lower()
    print(f"Extracting {archive_path} to {dest_dir}…")
    if lowered.endswith(".zip"):
        with zipfile.ZipFile(archive_path, 'r') as zf:
            zf.extractall(dest_dir)
    elif lowered.endswith((".tar", ".tar.gz", ".tgz", ".tar.bz2", ".tbz")):
        with tarfile.open(archive_path, 'r:*') as tf:
            # Archives come from the network: use the "data" extraction filter where the
            # interpreter provides it, so absolute paths, "../" members and links that
            # point outside dest_dir are refused instead of written.
            if hasattr(tarfile, "data_filter"):
                tf.extractall(dest_dir, filter="data")
            else:
                tf.extractall(dest_dir)
    else:
        raise ValueError(f"Unknown archive format: {archive_path}")
    print("Done extracting files.")

def find_images(root: Path, include_dirs=None):
    """Collect image files (by extension) found recursively under *root*.

    When *include_dirs* is given, only those subfolders of *root* that exist are
    searched; if none of them exists the whole of *root* is searched instead.
    Returns a list of paths in directory-traversal order.
    """
    image_suffixes = frozenset((".jpg", ".jpeg", ".png", ".bmp", ".webp"))
    search_roots = [root]
    if include_dirs:
        existing = [root / name for name in include_dirs if (root / name).exists()]
        if existing:
            search_roots = existing
    return [
        candidate
        for base in search_roots
        for candidate in base.rglob("*")
        if candidate.suffix.lower() in image_suffixes and candidate.is_file()
    ]

def to_rgb(img: Image.Image) -> Image.Image:
    """Return *img* as an RGB image; RGBA/LA images are composited onto a white background first."""
    if img.mode == "RGB":
        return img
    if img.mode not in ("RGBA", "LA"):
        return img.convert("RGB")
    # The last band of RGBA/LA is the alpha channel; use it as the paste mask.
    canvas = Image.new("RGB", img.size, (255, 255, 255))
    alpha = img.split()[-1]
    canvas.paste(img, mask=alpha)
    return canvas

def center_crop_resize(img: Image.Image, size=224) -> Image.Image:
    """Convert *img* to RGB, then crop around the centre and resize it to a *size* x *size* square."""
    rgb = to_rgb(img)
    target = (size, size)
    return ImageOps.fit(rgb, target, method=Image.BILINEAR, bleed=0.0, centering=(0.5, 0.5))

def safe_save_jpeg(img: Image.Image, path: Path):
    """Write *img* to *path* as an optimized JPEG at JPEG_QUALITY, creating parent folders as needed."""
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    img.save(path, format="JPEG", quality=JPEG_QUALITY, optimize=True)

def phash_file(path: Path):
    """Return the perceptual hash of the image at *path*, or None if it cannot be opened or hashed."""
    try:
        with Image.open(path) as im:
            rgb = to_rgb(im)
            return imagehash.phash(rgb)
    except Exception:
        return None

def deduplicate_paths(paths, verbose_name="items"):
    """Return *paths* with perceptual-hash duplicates removed, keeping the first path per hash.

    Paths whose hash cannot be computed are dropped silently. The number of
    duplicates removed is printed when it is non-zero.
    """
    seen_hashes = set()
    kept = []
    n_dupes = 0
    for candidate in tqdm(paths, desc=f"Dedup {verbose_name}", leave=False):
        digest = phash_file(candidate)
        if digest is None:
            continue
        if digest in seen_hashes:
            n_dupes += 1
        else:
            seen_hashes.add(digest)
            kept.append(candidate)
    if n_dupes:
        print(f"• Removed {n_dupes} duplicates (pHash) from {verbose_name}.")
    return kept

def check_image_rules(path: Path):
    """Return ``(ok, reason)`` for the image at *path*.

    An image that cannot be opened is always rejected. The size and aspect-ratio
    limits are enforced only when APPLY_IMAGE_FILTERS is true. *reason* is an
    empty string when the image is accepted.
    """
    try:
        with Image.open(path) as im:
            width, height = im.size
            shortest = min(width, height)
            # A zero height yields ratio 0, which the minimum-ratio rule rejects.
            ratio = (width / height) if height else 0
    except Exception as e:
        return False, f"unreadable ({e.__class__.__name__})"

    if APPLY_IMAGE_FILTERS:
        if shortest < MIN_SHORT_SIDE_PX:
            return False, f"min-dim too small (min={shortest}px < {MIN_SHORT_SIDE_PX}px)"
        if not (MIN_ASPECT_RATIO <= ratio <= MAX_ASPECT_RATIO):
            return False, f"aspect ratio {ratio:.2f} outside [{MIN_ASPECT_RATIO:.2f},{MAX_ASPECT_RATIO:.2f}]"
    return True, ""

def filter_paths_with_logging(paths, class_name: str):
    """Return the subset of *paths* accepted by check_image_rules, in the original order.

    Each rejection is printed with its reason when PRINT_REJECTION_MESSAGES is true.
    """
    accepted = []
    for candidate in tqdm(paths, desc=f"Filter images ({class_name})", leave=False):
        ok, reason = check_image_rules(candidate)
        if not ok:
            if PRINT_REJECTION_MESSAGES:
                print(f"Reject [{class_name}]: {candidate} → {reason}")
            continue
        accepted.append(candidate)
    return accepted

def sample_n_stratified(df, group_cols, n_total, seed=SEED):
    """Draw up to *n_total* distinct rows from *df*, spread as evenly as possible over strata.

    Every group of ``df.groupby(group_cols)`` first contributes
    ``n_total // n_groups`` rows (or all of its rows if it has fewer). The rows
    still missing -- the division remainder plus the shortfall of small groups --
    are then taken one at a time, round-robin, from the groups with the most
    unused rows. Rows that belong to no group (NaN keys) are used only as a last resort.

    Args:
        df: source DataFrame; its index is ignored.
        group_cols: column name(s) passed to ``groupby``.
        n_total: number of rows wanted.
        seed: base random seed; the result is deterministic for a given input.

    Returns:
        A shuffled DataFrame with a fresh 0..n-1 index holding
        ``min(n_total, len(df))`` rows, none of them selected twice.
    """
    # Positional row ids make the bookkeeping independent of the caller's index.
    pool = df.reset_index(drop=True)
    if n_total <= 0 or pool.empty:
        return pool.iloc[[]]

    strata = [frame for _, frame in pool.groupby(group_cols)]
    base_n = n_total // len(strata) if strata else 0

    selected = []  # row ids chosen so far
    spare = []     # per-stratum unused rows, pre-shuffled so they can be consumed in order
    for i, frame in enumerate(strata):
        take = min(base_n, len(frame))
        chosen = frame.sample(n=take, random_state=seed) if take > 0 else frame.iloc[[]]
        selected.extend(chosen.index)
        unused = frame.drop(chosen.index)
        if len(unused):
            spare.append(unused.sample(frac=1.0, random_state=seed + 1 + i))

    # Round-robin top-up, largest reserves first. Because `spare` is sorted by size,
    # the strata that still have a row at a given depth always form a prefix of the list.
    shortfall = n_total - len(selected)
    spare.sort(key=len, reverse=True)
    depth = 0
    while shortfall > 0 and spare and depth < len(spare[0]):
        for unused in spare:
            if shortfall == 0 or depth >= len(unused):
                break
            selected.append(unused.index[depth])
            shortfall -= 1
        depth += 1

    # Last resort: rows that groupby left out (e.g. NaN keys).
    if shortfall > 0:
        rest = pool.drop(selected)
        if len(rest):
            extra = rest.sample(n=min(shortfall, len(rest)), random_state=seed + 999)
            selected.extend(extra.index)

    result = pool.loc[selected]
    return result.sample(frac=1.0, random_state=seed).reset_index(drop=True)

def split_train_val_test(paths, split=SPLIT, seed=SEED):
    """Shuffle *paths* with a private RNG seeded by *seed* and cut them into train/val/test lists.

    The train and val sizes are ``int(n * fraction)``; test receives every
    remaining item, so ``split["test"]`` itself is not read.
    """
    shuffled = list(paths)
    random.Random(seed).shuffle(shuffled)
    total = len(shuffled)
    train_end = int(total * split["train"])
    val_end = train_end + int(total * split["val"])
    return {
        "train": shuffled[:train_end],
        "val": shuffled[train_end:val_end],
        "test": shuffled[val_end:],
    }

def copy_and_process(paths_by_split, class_name):
    """Resize every image and write it to ``OUT_DIR/<split>/<class_name>/<hash>.jpg``.

    *paths_by_split* maps a split name to a list of source paths. The output
    file name is the first 16 hex digits of the MD5 of the source path string.
    Images that fail check_image_rules or raise during processing are skipped;
    a message is printed for each when PRINT_REJECTION_MESSAGES is true.
    """
    for split_name in paths_by_split:
        sources = paths_by_split[split_name]
        for src in tqdm(sources, desc=f"Write {class_name}/{split_name}", leave=False):
            try:
                with Image.open(src) as im:
                    # Safety: re-check rules in case something slipped through
                    ok, reason = check_image_rules(src)
                    resized = center_crop_resize(im, IMG_SIZE) if ok else None
                if not ok:
                    if PRINT_REJECTION_MESSAGES:
                        print(f"Reject at write [{class_name}]: {src} → {reason}")
                    continue
                digest = hashlib.md5(str(src).encode()).hexdigest()
                destination = OUT_DIR / split_name / class_name / f"{digest[:16]}.jpg"
                safe_save_jpeg(resized, destination)
            except Exception as e:
                if PRINT_REJECTION_MESSAGES:
                    print(f"Reject at write [{class_name}]: {src} → unreadable ({e.__class__.__name__})")

## 1) Download FairFace (padding=1.25) + labels

In [None]:
# Download (or reuse) the FairFace image pack and label CSVs before any later cell reads them.
print("== Ensuring FairFace availability ==")
ensure_fairface_downloaded()

## 2) Parse labels, adult-only filter, build strata

In [None]:
def load_fairface_df(ff_dir: Path):
    """Load the FairFace label CSVs from *ff_dir* and return the adult-only rows with resolved image paths.

    The first CSV matching ``*label_train*.csv`` and the first matching
    ``*label_val*.csv`` are concatenated. Columns are normalized to
    ``file``, ``age``, ``gender``, ``race``; a ``path`` column (resolved image
    location) and a ``strata`` column (``race | gender | age``) are added.
    Rows whose image cannot be found and rows in under-age bins are dropped.

    Raises:
        StopIteration: if either label CSV is missing (``next`` on an empty glob).
        ValueError: if one of the four required columns cannot be identified.
    """
    df_train = pd.read_csv(next(ff_dir.glob("*label_train*.csv")))
    df_val   = pd.read_csv(next(ff_dir.glob("*label_val*.csv")))
    df = pd.concat([df_train, df_val], ignore_index=True)

    # Map lower-cased, stripped column names back to the names actually present in the CSV.
    cols = {c.lower().strip(): c for c in df.columns}
    def map_col(*cands):
        # Pass 1: exact (case-insensitive) match, in candidate order.
        for c in cands:
            lc = c.lower().strip()
            if lc in cols: return cols[lc]
        # Pass 2: substring match against any column name.
        # NOTE(review): substring matching is loose (e.g. "age" is contained in "image");
        # it is only reached when no exact match exists.
        for k in cols:
            if any(cc in k for cc in cands): return cols[k]
        return None
    file_col   = map_col("file", "image", "path")
    age_col    = map_col("age")
    gender_col = map_col("gender", "sex")
    race_col   = map_col("race", "ethnicity", "race_7", "race_8", "race_4")

    if any(x is None for x in [file_col, age_col, gender_col, race_col]):
        raise ValueError(f"Could not map needed columns. Found {list(df.columns)}")

    df = df[[file_col, age_col, gender_col, race_col]].rename(
        columns={file_col:"file", age_col:"age", gender_col:"gender", race_col:"race"}
    )

    # Resolve image paths inside extracted pad=1.25 pack (usually under train/ and val/)
    img_root = ff_dir
    def resolve_path(rel):
        # Strip any leading '.', '/' and '\' characters so the value joins cleanly onto img_root.
        rel = str(rel).lstrip("./\\")
        cand = img_root / rel
        if cand.exists(): return cand
        for sub in ["train", "val", "fairface-img-margin125-trainval/train", "fairface-img-margin125-trainval/val"]:
            c = img_root / sub / rel
            if c.exists(): return c
        # Last resort: recursive search by bare file name, first hit wins.
        # NOTE(review): if train/ and val/ contain files with the same name this can
        # return the wrong image, and it rescans the tree for every unresolved row — confirm.
        hits = list(img_root.rglob(Path(rel).name))
        return hits[0] if hits else None

    df["path"] = df["file"].apply(resolve_path)
    # Keep only rows whose image was found on disk.
    df = df[df["path"].notna()].reset_index(drop=True)

    # Adult-only: drop under-18 bins (exclude '0-2','3-9','10-19', any 'under <18' style)
    # The whole '10-19' bin is excluded, so 18- and 19-year-olds are dropped as well.
    def is_adult(age_lab: str):
        s = str(age_lab).lower()
        if "10-19" in s or "0-2" in s or "3-9" in s: return False
        if "under" in s or "<" in s: return False
        return True

    # Normalize label text so strata keys do not differ by stray whitespace.
    for col in ["age", "gender", "race"]:
        df[col] = df[col].astype(str).str.strip()

    df = df[df["age"].apply(is_adult)].reset_index(drop=True)
    # (race × gender × adult-age-bin) strata to preserve FairFace balance in adult subset
    df["strata"] = df["race"] + " | " + df["gender"] + " | " + df["age"]
    return df

fairface_df = load_fairface_df(FAIRFACE_DIR)
print("FairFace (adult) candidates (pre-filter):", len(fairface_df), "  strata:", len(fairface_df["strata"].unique()))

# Run the image-quality rules on FairFace *before* stratified sampling, so the strata
# are balanced within the pool of images that are actually usable.
ff_keep_mask = []
for img_path in tqdm(fairface_df["path"], total=len(fairface_df), desc="Filter images (human/FairFace)", leave=False):
    ok, reason = check_image_rules(img_path)
    ff_keep_mask.append(ok)
    if PRINT_REJECTION_MESSAGES and not ok:
        print(f"Reject [human]: {img_path} → {reason}")
# Boolean-mask the frame with the per-row verdicts collected above.
fairface_df = fairface_df[np.array(ff_keep_mask, dtype=bool)].reset_index(drop=True)
print("FairFace (adult) candidates (post-filter):", len(fairface_df))

## 3) Stratified sample N=TARGET_PER_CLASS humans

In [None]:
# Stop early when the filtered adult pool cannot supply the requested number of images.
if len(fairface_df) < MIN_REQUIRED_PER_CLASS:
    raise RuntimeError(f"Not enough adult FairFace images ({len(fairface_df)}) to sample {TARGET_PER_CLASS}.")
human_sample = sample_n_stratified(fairface_df, ["strata"], TARGET_PER_CLASS, seed=SEED)
human_paths = deduplicate_paths(human_sample["path"].tolist(), "human")
if len(human_paths) < MIN_REQUIRED_PER_CLASS:
    print("Top-up from remaining adult pool (post-dedup).")
    chosen = set(map(str, human_paths))
    remaining = [p for p in fairface_df["path"].tolist() if str(p) not in chosen]
    # De-duplicate the already chosen images together with the candidates. The chosen
    # images come first and are already unique, so all of them are kept; a candidate is
    # added only if it duplicates neither a chosen image nor an earlier candidate.
    human_paths = deduplicate_paths(human_paths + remaining, "human-topup-candidates")
    human_paths = human_paths[:MIN_REQUIRED_PER_CLASS]
print("Human final:", len(human_paths))

## 4) Kaggle downloads: Avatars + Animal faces

In [None]:
# Raw download locations for the two Kaggle sources; later cells read from these.
AVATAR_DIR = RAW_DIR / "avatars"
ANIMAL_DIR = RAW_DIR / "animal_faces"
print("\n== Ensuring Kaggle datasets availability ==")
# Each call is a no-op when images are already present in the target directory.
ensure_kaggle_downloaded(
    KAGGLE_AVATARS_DATASET,
    AVATAR_DIR,
    "Avatars (Google Cartoon Set)",
)
ensure_kaggle_downloaded(
    KAGGLE_ANIMALFACES_DATASET,
    ANIMAL_DIR,
    "Animal Faces (Dogs vs Cats)",
)

## 5) Collect & sample Avatars (with filters + dedup)

In [None]:
# Sort the candidates: directory traversal order is filesystem-dependent, and both the
# dedup step (first occurrence wins) and the sample below depend on list order.
avatar_all = sorted(find_images(AVATAR_DIR, include_dirs=AVATAR_INCLUDE_DIRS))
avatar_all = filter_paths_with_logging(avatar_all, "avatar")
avatar_all = deduplicate_paths(avatar_all, "avatar")
if len(avatar_all) < MIN_REQUIRED_PER_CLASS:
    raise RuntimeError(f"Not enough avatar images ({len(avatar_all)}) for {TARGET_PER_CLASS}.")
# Use a dedicated RNG seeded with SEED so re-running this cell (or running cells in a
# different order) yields the same sample instead of depending on global RNG state.
avatar_paths = random.Random(SEED).sample(avatar_all, TARGET_PER_CLASS)

## 6) Collect & sample Animal faces (with filters + dedup)

In [None]:
# Honor the ANIMAL_INCLUDE_DIRS knob when it is set; otherwise default to the "test" and
# "train" folders. Sort the candidates because directory traversal order is
# filesystem-dependent and both dedup and sampling depend on list order.
animal_all = sorted(find_images(ANIMAL_DIR, include_dirs=ANIMAL_INCLUDE_DIRS or ["test", "train"]))
animal_all = filter_paths_with_logging(animal_all, "animal")
animal_all = deduplicate_paths(animal_all, "animal")
if len(animal_all) < MIN_REQUIRED_PER_CLASS:
    raise RuntimeError(f"Not enough animal images ({len(animal_all)}) for {TARGET_PER_CLASS}.")
# Dedicated seeded RNG: the sample does not depend on global RNG state or cell order.
animal_paths = random.Random(SEED).sample(animal_all, TARGET_PER_CLASS)
print(len(animal_paths))

## 7) Split & write ImageFolder with resize/crop

In [None]:
# Start from an empty output tree so files from a previous run cannot linger.
if OUT_DIR.exists():
    print("Clearing old output:", OUT_DIR); shutil.rmtree(OUT_DIR)
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Shuffle each class list with SEED and cut it into train/val/test according to SPLIT.
spl_human  = split_train_val_test(human_paths,  SPLIT, SEED)
spl_avatar = split_train_val_test(avatar_paths, SPLIT, SEED)
spl_animal = split_train_val_test(animal_paths, SPLIT, SEED)

# Resize/crop every selected image and write it under OUT_DIR/<split>/<class>/.
copy_and_process(spl_human,  "human")
copy_and_process(spl_avatar, "avatar")
copy_and_process(spl_animal, "animal")

# Report how many JPEGs actually landed in each folder; this can be lower than the
# number selected because copy_and_process skips images it rejects or cannot process.
def count_images(root): return sum(1 for _ in root.rglob("*.jpg"))
for split_name in ["train", "val", "test"]:
    for cls in ["human", "avatar", "animal"]:
        print(f"{split_name:5s}/{cls:7s}: {count_images(OUT_DIR / split_name / cls)}")

## 8) Manifest for traceability

In [None]:
# Where the data came from.
# NOTE(review): "animal_include_dirs" records the ANIMAL_INCLUDE_DIRS constant; confirm it
# matches the include_dirs actually passed to find_images in the animal sampling step.
_manifest_sources = {
    "fairface_padding": "1.25",
    "fairface_readme": "https://github.com/joojs/fairface",
    "fairface_img_id": FAIRFACE_IMG_PAD125_ID,
    "fairface_label_train_id": FAIRFACE_LABEL_TRAIN_ID,
    "fairface_label_val_id": FAIRFACE_LABEL_VAL_ID,
    "kaggle_avatars": KAGGLE_AVATARS_DATASET,
    "kaggle_animals":  KAGGLE_ANIMALFACES_DATASET,
    "avatar_include_dirs": AVATAR_INCLUDE_DIRS,
    "animal_include_dirs": ANIMAL_INCLUDE_DIRS,
}
# Image-quality settings in force for this build.
_manifest_filters = {
    "apply": APPLY_IMAGE_FILTERS,
    "min_short_side_px": MIN_SHORT_SIDE_PX,
    "min_ar": MIN_ASPECT_RATIO,
    "max_ar": MAX_ASPECT_RATIO,
    "print_rejections": PRINT_REJECTION_MESSAGES,
}
# "classes" holds the number of paths selected per class (before the write step).
manifest = {
    "seed": SEED,
    "img_size": IMG_SIZE,
    "classes": {"human": len(human_paths), "avatar": len(avatar_paths), "animal": len(animal_paths)},
    "split": SPLIT,
    "sources": _manifest_sources,
    "target_per_class": TARGET_PER_CLASS,
    "filters": _manifest_filters,
}
_manifest_text = json.dumps(manifest, indent=2)
(OUT_DIR / "MANIFEST.json").write_text(_manifest_text)
print("\nDataset build complete.")
print("Output (ImageFolder):", OUT_DIR.resolve())

In [None]:
# Report wall-clock duration measured from `start_time`, which the first timing cell recorded.
end_time = time.time()
elapsed_time_seconds = end_time - start_time
elapsed_time_minutes = elapsed_time_seconds / 60
for _label, _stamp in (("started", start_time), ("ended", end_time)):
    print(f"Run {_label} at: {time.ctime(_stamp)}")
print(f"Total elapsed time: {elapsed_time_minutes:.2f} minutes")

# Tools

In [None]:
# try:
#   drive.mount('/content/drive')
# except Exception as e:
#   print(f"Error mounting Google Drive: {e}")

In [None]:
def copy_data_to_drive():
    """Replace the dataset folder on Google Drive with a copy of the local data directory.

    Uses the module-level COLAB_DATA_DIR, DRIVE_DATA_PARENT_DIR and DRIVE_DATA_DIR.
    Any existing DRIVE_DATA_DIR is deleted first; copy errors are printed, not raised.
    """
    if not COLAB_DATA_DIR.exists():
        print(f"Source directory not found: {COLAB_DATA_DIR}")
        return

    DRIVE_DATA_PARENT_DIR.mkdir(parents=True, exist_ok=True)

    # copytree requires that the destination does not exist yet.
    if DRIVE_DATA_DIR.exists():
        print(f"Removing existing data directory in Drive: {DRIVE_DATA_DIR}")
        shutil.rmtree(DRIVE_DATA_DIR)

    print(f"Copying data from {COLAB_DATA_DIR} to {DRIVE_DATA_DIR}...")
    try:
        shutil.copytree(COLAB_DATA_DIR, DRIVE_DATA_DIR)
    except Exception as e:
        print(f"Error copying data to Drive: {e}")
    else:
        print("Data successfully copied to Google Drive.")

def copy_data_from_drive():
    """Replace the local data directory with a copy of the dataset folder on Google Drive.

    Uses the module-level DRIVE_DATA_DIR and COLAB_DATA_DIR. Any existing
    COLAB_DATA_DIR is deleted first; copy errors are printed, not raised.
    """
    if not DRIVE_DATA_DIR.exists():
        print(f"Source directory not found in Drive: {DRIVE_DATA_DIR}")
        print("Please ensure Google Drive is mounted and the data exists at the specified path.")
        return

    # copytree requires that the destination does not exist yet.
    if COLAB_DATA_DIR.exists():
        print(f"Removing existing data directory in Colab: {COLAB_DATA_DIR}")
        shutil.rmtree(COLAB_DATA_DIR)

    print(f"Copying data from {DRIVE_DATA_DIR} to {COLAB_DATA_DIR}...")
    try:
        shutil.copytree(DRIVE_DATA_DIR, COLAB_DATA_DIR)
    except Exception as e:
        print(f"Error copying data from Drive: {e}")
    else:
        print("Data successfully copied to Colab environment.")

In [None]:
# copy_data_to_drive()
# copy_data_from_drive()

In [None]:
import shutil
import os

def zip_data_directory():
    """Pack the contents of ``./data`` (relative to the current directory) into ``./data.zip``.

    Prints an error and returns if ``data`` does not exist; archive errors are printed, not raised.
    """
    source = Path("data")
    if not source.exists():
        print(f"Error: Directory '{source}' not found.")
        return
    archive = source.with_suffix(".zip")
    print(f"Creating zip archive: {archive} from {source}")
    try:
        # base name "data" + format "zip" -> "data.zip"; entries are stored relative to `source`
        shutil.make_archive(str(source), 'zip', source)
    except Exception as e:
        print(f"Error zipping directory: {e}")
    else:
        print("Zip archive created successfully.")

def unzip_data_directory():
    """Unpack ``./data.zip`` (relative to the current directory) into ``./data``.

    Prints an error and returns if the zip is missing, warns when ``data`` already
    exists, and prints (rather than raises) extraction errors.
    """
    archive = Path("data.zip")
    if not archive.exists():
        print(f"Error: Zip file '{archive}' not found.")
        return
    target = Path("data")
    if target.exists():
        print(f"Warning: Directory '{target}' already exists. Contents may be overwritten.")
    print(f"Extracting zip archive: {archive} to {target}")
    try:
        shutil.unpack_archive(str(archive), str(target), 'zip')
    except Exception as e:
        print(f"Error unzipping archive: {e}")
    else:
        print("Zip archive extracted successfully.")

In [None]:
# import shutil
# from pathlib import Path

# ZIP_NAME = "data.zip"

# # Define the source and destination paths
# source_zip = DRIVE_DATA_DIR / ZIP_NAME
# # Ensure the parent directory in Drive exists
# DRIVE_DATA_PARENT_DIR.mkdir(parents=True, exist_ok=True)
# # Define the destination path in Drive for the zip file
# drive_zip_destination = Path(ZIP_NAME)

# if not source_zip.exists():
#     print(f"Error: Source zip file '{source_zip}' not found.")
# else:
#     print(f"Copying '{source_zip}' to '{drive_zip_destination}'...")
#     try:
#         # Use shutil.copy2 to preserve metadata (optional, but good practice)
#         shutil.copy2(source_zip, drive_zip_destination)
#         unzip_data_directory()
#         print("Zip file successfully copied from Google Drive and unzipped.")
#     except Exception as e:
#         print(f"Error copying zip file to Drive: {e}")

In [None]:
# import shutil
# from pathlib import Path

# # zip_data_directory()

# # Define the source and destination paths
# source_zip = Path("data.zip")
# # Ensure the parent directory in Drive exists
# DRIVE_DATA_PARENT_DIR.mkdir(parents=True, exist_ok=True)
# # Define the destination path in Drive for the zip file
# drive_zip_destination = DRIVE_DATA_DIR / source_zip.name

# # Ensure the destination directory in Drive exists
# drive_zip_destination.parent.mkdir(parents=True, exist_ok=True)


# if not source_zip.exists():
#     print(f"Error: Source zip file '{source_zip}' not found.")
# else:
#     print(f"Copying '{source_zip}' to '{drive_zip_destination}'...")
#     try:
#         # Use shutil.copy2 to preserve metadata (optional, but good practice)
#         shutil.copy2(source_zip, drive_zip_destination)
#         print("Zip file successfully copied to Google Drive.")
#     except Exception as e:
#         print(f"Error copying zip file to Drive: {e}")

## Cleanup Utilities

In [None]:
def _is_subpath(child: Path, parent: Path) -> bool:
    try:
        child.resolve().relative_to(parent.resolve())
        return True
    except Exception:
        return False

def _safe_rmtree(p: Path):
    if p.exists():
        if _is_subpath(p, WORK_DIR):
            shutil.rmtree(p)
            print(f"Removed: {p}")
        else:
            print(f"Skip delete (outside WORK_DIR): {p}")

def cleanup_generated_dataset() -> None:
    """Remove only the generated ImageFolder dataset (data/final)."""
    # _safe_rmtree does nothing when OUT_DIR is missing and refuses paths outside WORK_DIR.
    _safe_rmtree(OUT_DIR)

def cleanup_all_datasets():
    """Remove the generated dataset (data/final), the raw downloads (data/raw) and data/tmp."""
    for target in (OUT_DIR, RAW_DIR, TMP_DIR):
        _safe_rmtree(target)

def cleanup_datasets(mode: str = "generated"):
    """
    Delete dataset folders according to *mode* (case-insensitive, surrounding whitespace ignored).

      - "generated": remove only data/final
      - "all":       remove data/raw, data/final, data/tmp

    Raises ValueError for any other mode.
    """
    normalized = mode.lower().strip()
    if normalized not in ("generated", "all"):
        raise ValueError("mode must be 'generated' or 'all'")
    if normalized == "all":
        cleanup_all_datasets()
    else:
        cleanup_generated_dataset()

In [None]:
# cleanup_all_datasets()

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# with open('/content/drive/My Drive/foo.txt', 'w') as f:
#   f.write('Hello Google Drive!')
# !cat /content/drive/My\ Drive/baa.txt
# drive.flush_and_unmount()
# print('All changes made in this colab session should now be visible in Drive.')

### Checking for Duplicates
Every animal image appears twice in the raw download. The cell below was used to verify this before the image-loading code above was changed.

In [None]:
from collections import Counter
import os

def find_all_image_filenames(root_dir):
    """Return the base names of all image files below *root_dir*, one entry per file (repeats kept)."""
    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.webp'}
    return [
        name
        for _, _, names in os.walk(root_dir)
        for name in names
        if os.path.splitext(name)[1].lower() in image_extensions
    ]

# Collect every image base name under the animal download directory
all_animal_image_filenames = find_all_image_filenames(ANIMAL_DIR)

# Tally how often each base name occurs
filename_counts = Counter(all_animal_image_filenames)

# Keep only the names that occur more than once
duplicate_filenames = {name: n for name, n in filename_counts.items() if n > 1}

print(f"Total number of animal image files found: {len(all_animal_image_filenames)}")
_duplicate_report = (
    f"Number of filenames appearing more than once: {len(duplicate_filenames)}"
    if duplicate_filenames
    else "No duplicate filenames found in the animal_faces directory structure."
)
print(_duplicate_report)