In [3]:
import os
import re
import pandas as pd

In [4]:
def extract_paren_index(fname: str) -> int:
    """
    Return the number written in parentheses at the very end of a file stem.

    The extension is dropped first, so '... (3).png' gives 3. Whitespace
    after the closing parenthesis is tolerated. When the stem does not end
    in '(<digits>)', 999999 is returned so that such names sort after every
    numbered one when this function is used as a sort key.
    """
    name_without_ext = os.path.splitext(fname)[0]
    found = re.search(r"\((\d+)\)\s*$", name_without_ext)
    return 999999 if found is None else int(found.group(1))

In [5]:
BASE_DIR = "objects"
IMAGE_EXTS = (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".gif")

# Folders to ignore at the top level of objects/
IGNORE_FOLDERS = {"small", "thumb", "thumbs", ".DS_Store"}

# Where the metadata table is written, and the column order of that file.
OUTPUT_CSV = "image_metadata_test.csv"
METADATA_COLUMNS = [
    "object_id",
    "article_id",
    "image_display_template",
    "image_parent_id",
    "image_file",
    "image_object_location",
]

# Trailing " (n)" on a file stem, e.g. "page (3)".
_PAREN_SUFFIX = re.compile(r"\s*\(\d+\)\s*$")
# Trailing "_n" on a file stem, in exactly the form this cell generates.
_INDEX_SUFFIX = re.compile(r"_([1-9]\d*)$")


def _assigned_index(fname):
    """Return n if the stem of `fname` ends in '_n' and carries no '(n)' suffix, else None."""
    stem, _ = os.path.splitext(fname)
    if _PAREN_SUFFIX.search(stem):
        return None
    match = _INDEX_SUFFIX.search(stem)
    return int(match.group(1)) if match else None


def _plan_renames(files):
    """
    Decide the final name of every image in one folder.

    Returns a list of (old_name, new_name) pairs; the position in the list
    (starting at 1) is the image index used in the metadata rows.

    If no file has a '(n)' suffix and the '_n' suffixes are exactly
    1..len(files), the folder is treated as the result of an earlier run and
    every name is kept. Without this, re-running the cell would turn
    'x_1.png' into 'x_1_1.png'.
    """
    assigned = [_assigned_index(f) for f in files]
    if None not in assigned and sorted(assigned) == list(range(1, len(files) + 1)):
        return [(f, f) for _, f in sorted(zip(assigned, files))]

    # Sort by (n) index so (1) comes before (2), etc. The file name is the
    # tie-breaker because os.listdir() returns entries in arbitrary order.
    ordered = sorted(files, key=lambda f: (extract_paren_index(f), f))

    plan = []
    for idx, fname in enumerate(ordered, start=1):
        stem, ext = os.path.splitext(fname)
        # Remove trailing " (n)" from stem, then append the underscore index.
        base_stem = _PAREN_SUFFIX.sub("", stem)
        plan.append((fname, f"{base_stem}_{idx}{ext}"))
    return plan


def _apply_renames(folder_path, plan):
    """
    Rename files on disk according to `plan`.

    Raises FileExistsError before touching anything in the folder if a
    target name is already taken: os.rename() silently replaces an existing
    destination on POSIX systems, which would destroy that image.
    """
    pending = [(old, new) for old, new in plan if old != new]

    for old, new in pending:
        target = os.path.join(folder_path, new)
        if os.path.exists(target):
            raise FileExistsError(
                f"Cannot rename {os.path.join(folder_path, old)!r}: "
                f"{target!r} already exists"
            )

    for old, new in pending:
        old_path = os.path.join(folder_path, old)
        new_path = os.path.join(folder_path, new)
        print(f"Renaming: {old_path} -> {new_path}")
        os.rename(old_path, new_path)


def _rows_for_folder(folder, new_names):
    """Build one parent (compound object) row plus one child row per image."""
    # Use folder name as the "article id"
    article_id = folder
    parent_id = f"{article_id}_Original"

    # First image is used on the parent row
    first_new_name = new_names[0]

    folder_rows = [{
        "object_id": parent_id,
        "article_id": article_id,
        "image_display_template": "compound_object",
        "image_parent_id": "",
        "image_file": first_new_name,
        "image_object_location": f"{BASE_DIR}/{folder}/{first_new_name}",
    }]

    for idx, new_name in enumerate(new_names, start=1):
        folder_rows.append({
            "object_id": f"{article_id}_img{idx}",
            "article_id": article_id,
            "image_display_template": "image",
            "image_parent_id": parent_id,
            "image_file": new_name,
            "image_object_location": f"{BASE_DIR}/{folder}/{new_name}",
        })
    return folder_rows


# Fail early with an actionable message: a relative BASE_DIR is resolved
# against the kernel's working directory, which is easy to get wrong.
if not os.path.isdir(BASE_DIR):
    raise FileNotFoundError(
        f"Image directory {os.path.abspath(BASE_DIR)!r} does not exist. "
        "Start the notebook from the folder that contains it or update BASE_DIR."
    )

rows = []

for folder in sorted(os.listdir(BASE_DIR)):
    if folder in IGNORE_FOLDERS:
        continue

    folder_path = os.path.join(BASE_DIR, folder)
    if not os.path.isdir(folder_path):
        continue

    # Only image files in this article folder
    files = [
        f for f in os.listdir(folder_path)
        if f.lower().endswith(IMAGE_EXTS)
    ]

    if not files:
        continue

    plan = _plan_renames(files)
    _apply_renames(folder_path, plan)
    rows.extend(_rows_for_folder(folder, [new for _, new in plan]))

# Build DataFrame and save
df = pd.DataFrame(rows, columns=METADATA_COLUMNS)

df.to_csv(OUTPUT_CSV, index=False)
print(f"Wrote image metadata to {OUTPUT_CSV}")


FileNotFoundError: [Errno 2] No such file or directory: 'objects'