# In [3]:
# Cell 1 — helpers

import json
from collections import OrderedDict
from pathlib import Path

def load_imagenet_class_index(path):
    """
    Read an ImageNet-1k class index JSON file and build lookups both ways.

    The file is expected to map string indices to [wnid, label] pairs,
    e.g. {"0": ["n01440764", "tench"], ...}.

    Returns a 2-tuple:
      - idx_to_pair: dict[int] -> (wnid, label)
      - wnid_to_idx_label: dict[wnid] -> (idx, label)
    """
    with Path(path).open("r", encoding="utf-8") as handle:
        entries = json.load(handle)

    # Forward map: integer class index -> (wnid, label).
    idx_to_pair = {}
    for key, entry in entries.items():
        idx_to_pair[int(key)] = (entry[0], entry[1])

    # Reverse map is derived from the forward map (not from the raw JSON),
    # so it only ever reflects entries that survived in idx_to_pair.
    wnid_to_idx_label = {}
    for class_idx, pair in idx_to_pair.items():
        wnid, label = pair
        wnid_to_idx_label[wnid] = (class_idx, label)

    return idx_to_pair, wnid_to_idx_label

def load_words(path):
    """
    Parse a words.txt-style file into a WNID -> label mapping.

    Each non-blank line holds a WNID followed by its label. The two are
    separated by a tab, by spaces, or by any mix of the two, e.g.:
      n02124075 Egyptian cat
      n02124075<TAB>Egyptian cat

    Args:
      path: location of the file; a falsy value (None, "") means "no
        words file" and yields an empty mapping.

    Returns:
      dict mapping wnid -> full label string (everything after the first
      run of whitespace, stripped). A line that contains only a WNID maps
      to "". If a WNID occurs more than once, the last line wins.
    """
    if not path:
        return {}

    wnid_to_words = {}
    with Path(path).open("r", encoding="utf-8") as f:
        for line in f:
            # split(None, 1) skips leading whitespace and cuts once at the
            # first whitespace run. This keeps the WNID clean when the
            # separator is "space + tab" and keeps the whole label even if
            # it contains further tabs.
            fields = line.split(None, 1)
            if not fields:
                continue  # blank / whitespace-only line
            wnid = fields[0]
            label = fields[1].strip() if len(fields) > 1 else ""
            wnid_to_words[wnid] = label
    return wnid_to_words

def load_wnids(path):
    """
    Read a Tiny-ImageNet wnids file and return its WNIDs as a list.

    Lines are stripped of surrounding whitespace and blank lines are
    dropped; file order is preserved. The position of a WNID in the list
    is not used as a class index here.
    """
    wnids = []
    with Path(path).open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                wnids.append(cleaned)
    return wnids

def build_sparse_tiny_index_by_imagenet_idx(
    tiny_wnids, idx_to_pair, wnid_to_idx_label, wnid_to_words=None, fail_on_missing=False
):
    """
    Map Tiny-ImageNet classes onto their original ImageNet-1k indices.

    For every WNID in `tiny_wnids` that also appears in `wnid_to_idx_label`,
    the result gets an entry keyed by the 1k index (as a string) whose value
    follows the imagenet_class_index.json schema: [wnid, label].

    WNIDs without a 1k index are never added to the mapping. They are
    collected as (wnid, label) pairs, where the label is the first
    comma-separated term from `wnid_to_words` when available and
    "__UNKNOWN_LABEL__" otherwise. If any were collected, either a
    ValueError is raised (`fail_on_missing=True`) or a warning is printed.

    `idx_to_pair` is accepted for call-site symmetry but is not read.

    Returns:
      (ordered, missing): an OrderedDict sorted by 1k index, and the list
      of unmatched (wnid, label) pairs in input order.
    """
    matched = {}
    missing = []
    for wnid in tiny_wnids:
        if wnid not in wnid_to_idx_label:
            # No 1k index: fall back to words.txt for a human-readable label.
            if wnid_to_words:
                full_label = wnid_to_words.get(wnid, "")
                fallback = full_label.split(",")[0].strip() if full_label else ""
            else:
                fallback = None
            missing.append((wnid, fallback or "__UNKNOWN_LABEL__"))
            continue
        idx, label_1k = wnid_to_idx_label[wnid]
        matched[idx] = [wnid, label_1k]

    if missing:
        msg = f"{len(missing)} Tiny-ImageNet WNIDs are not present in ImageNet-1k index"
        if fail_on_missing:
            # Show at most five offenders so the message stays readable.
            preview = [entry[0] for entry in missing[:5]]
            suffix = " ..." if len(missing) > 5 else ""
            raise ValueError(msg + f": {preview}{suffix}")
        print("Warning:", msg)

    # Emit entries in ascending 1k-index order, with string keys as in JSON.
    ordered = OrderedDict()
    for idx in sorted(matched):
        ordered[str(idx)] = matched[idx]
    return ordered, missing

def build_dense_1000(tiny_sparse, idx_to_pair):
    """
    Expand a sparse Tiny-ImageNet mapping to all 1000 ImageNet-1k slots.

    The result is an OrderedDict with keys '0'..'999' in numeric order.
    Each value is whatever `tiny_sparse` holds under that key (normally
    [wnid, label]) or None when the class is absent from Tiny-ImageNet.

    `idx_to_pair` is accepted but not read.
    """
    slot_keys = (str(slot) for slot in range(1000))
    return OrderedDict((key, tiny_sparse.get(key, None)) for key in slot_keys)

def save_json(obj, path):
    """
    Serialize `obj` as pretty-printed JSON to `path` (UTF-8).

    Output uses a 2-space indent and writes non-ASCII characters as-is
    rather than as \\uXXXX escapes. An existing file is overwritten.
    """
    target = Path(path)
    with target.open(mode="w", encoding="utf-8") as sink:
        json.dump(obj, sink, indent=2, ensure_ascii=False)


# Cell 2 — set your paths
# All paths below are plain strings; the loader/saver helpers wrap them in
# pathlib.Path. Relative paths resolve against the current working directory.

IMAGENET_JSON = "../data/imagenet_class_index.json"  # input: ImageNet-1k index -> [wnid, label] mapping
WNIDS_TXT     = "../data/tiny-imagenet-200/wnids.txt"                  # input: Tiny-ImageNet WNIDs, one per line
WORDS_TXT     = "../data/tiny-imagenet-200/words.txt"                  # input: optional label fallback; set to "" or None to skip loading

# Output: only the Tiny-ImageNet classes, keyed by their original 1k index.
OUT_SPARSE_JSON = "../data/tiny-imagenet-200/tiny_imagenet_class_index_sparse_by_1k_idx.json"
# Output (only when WRITE_DENSE_1000 is True): keys '0'..'999', null for
# classes that are not part of Tiny-ImageNet.
WRITE_DENSE_1000 = False
OUT_DENSE_JSON   = "../data/tiny-imagenet-200/tiny_imagenet_class_index_dense_1000.json"

FAIL_ON_MISSING = False  # True -> raise ValueError if any Tiny WNID isn't in 1k; False -> print a warning

# In [4]:
# Cell 3 — build and save
# Loads the three input files, intersects Tiny-ImageNet with ImageNet-1k by
# WNID, and writes the resulting mapping(s) to disk.

# Inputs: the 1k index (both lookup directions), the Tiny WNID list, and the
# words.txt labels used only for WNIDs that have no 1k entry.
idx_to_pair, wnid_to_idx_label = load_imagenet_class_index(IMAGENET_JSON)
tiny_wnids = load_wnids(WNIDS_TXT)
wnid_to_words = load_words(WORDS_TXT)

# tiny_sparse: OrderedDict keyed by 1k index (as str), sorted ascending.
# missing: list of (wnid, label) for Tiny WNIDs that have no 1k index.
tiny_sparse, missing = build_sparse_tiny_index_by_imagenet_idx(
    tiny_wnids,
    idx_to_pair,
    wnid_to_idx_label,
    wnid_to_words=wnid_to_words,
    fail_on_missing=FAIL_ON_MISSING
)

# The sparse mapping is always written; the summary reports matched vs. unmatched counts.
save_json(tiny_sparse, OUT_SPARSE_JSON)
print(f"Wrote sparse Tiny-ImageNet mapping (by 1k indices) to {OUT_SPARSE_JSON}")
print(f"Matched: {len(tiny_sparse)} classes; Missing from 1k: {len(missing)}")

# Optional dense variant: one slot per 1k class, None where Tiny-ImageNet has no such class.
if WRITE_DENSE_1000:
    dense = build_dense_1000(tiny_sparse, idx_to_pair)
    save_json(dense, OUT_DENSE_JSON)
    print(f"Wrote dense 1000-length JSON to {OUT_DENSE_JSON}")


# Output:
# Wrote sparse Tiny-ImageNet mapping (by 1k indices) to ../data/tiny-imagenet-200/tiny_imagenet_class_index_sparse_by_1k_idx.json
# Matched: 200 classes; Missing from 1k: 0
