From 0054550c8a9cd7a0af73f67eb17e5695e2e70ed5 Mon Sep 17 00:00:00 2001 From: Vaibhav Srivastav Date: Tue, 21 Apr 2026 21:14:51 +0530 Subject: [PATCH 1/5] Update imagegen system skill for gpt-image-2 --- .../src/assets/samples/imagegen/SKILL.md | 55 +++++++--- .../samples/imagegen/agents/openai.yaml | 2 +- .../assets/samples/imagegen/references/cli.md | 91 ++++++++++++++-- .../imagegen/references/codex-network.md | 2 +- .../samples/imagegen/references/image-api.md | 51 ++++++++- .../samples/imagegen/references/prompting.md | 14 ++- .../imagegen/references/sample-prompts.md | 60 +++++++++- .../samples/imagegen/scripts/image_gen.py | 103 +++++++++++++++--- 8 files changed, 331 insertions(+), 47 deletions(-) diff --git a/codex-rs/skills/src/assets/samples/imagegen/SKILL.md b/codex-rs/skills/src/assets/samples/imagegen/SKILL.md index 88195882efde..c51c5c6465ca 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/SKILL.md +++ b/codex-rs/skills/src/assets/samples/imagegen/SKILL.md @@ -12,17 +12,19 @@ Generates or edits images for the current project (for example website assets, g This skill has exactly two top-level modes: - **Default built-in tool mode (preferred):** built-in `image_gen` tool for normal image generation and editing. Does not require `OPENAI_API_KEY`. -- **Fallback CLI mode (explicit-only):** `scripts/image_gen.py` CLI. Use only when the user explicitly asks for the CLI path. Requires `OPENAI_API_KEY`. +- **Fallback CLI mode:** `scripts/image_gen.py` CLI. Use when the user explicitly asks for the CLI/API/model path, or when they explicitly require a transparent background that needs `gpt-image-1.5`. Requires `OPENAI_API_KEY`. -Within the explicit CLI fallback only, the CLI exposes three subcommands: +Within CLI fallback, the CLI exposes three subcommands: - `generate` - `edit` - `generate-batch` Rules: -- Use the built-in `image_gen` tool by default for all normal image generation and editing requests. -- Never switch to CLI fallback automatically. 
+- Use the built-in `image_gen` tool by default for normal image generation and editing requests. +- Do not switch to CLI fallback for ordinary quality, size, or file-path control. +- If the user explicitly asks for a transparent image/background, use CLI fallback with `gpt-image-1.5` and explain briefly that transparent backgrounds are not supported in `gpt-image-2`, the latest model. +- The word `batch` by itself does not mean CLI fallback. If the user asks for many assets or says to batch-generate assets without explicitly asking for CLI/API/model controls, stay on the built-in path and issue one built-in call per requested asset or variant. - If the built-in tool fails or is unavailable, tell the user the CLI fallback exists and that it requires `OPENAI_API_KEY`. Proceed only if the user explicitly asks for that fallback. - If the user explicitly asks for CLI mode, use the bundled `scripts/image_gen.py` workflow. Do not create one-off SDK runners. - Never modify `scripts/image_gen.py`. If something is missing, ask the user before doing anything else. @@ -79,12 +81,13 @@ Built-in edit semantics: Execution strategy: - In the built-in default path, produce many assets or variants by issuing one `image_gen` call per requested asset or variant. -- In the explicit CLI fallback path, use the CLI `generate-batch` subcommand only when the user explicitly chose CLI mode and needs many prompts/assets. +- In the CLI fallback path, use the CLI `generate-batch` subcommand only when the user explicitly chose CLI mode and needs many prompts/assets. +- For many distinct assets, do not use `n` as a substitute for separate prompts. `n` is for variants of one prompt; distinct assets need distinct built-in calls or distinct CLI `generate-batch` jobs. Assume the user wants a new image unless they clearly ask to change an existing one. ## Workflow -1. Decide the top-level mode: built-in by default, fallback CLI only if explicitly requested. +1. 
Decide the top-level mode: built-in by default; fallback CLI if explicitly requested or if the user explicitly needs transparent output. 2. Decide the intent: `generate` or `edit`. 3. Decide whether the output is preview-only or meant to be consumed by the current project. 4. Decide the execution strategy: single asset vs repeated built-in calls vs CLI `generate-batch`. @@ -99,13 +102,13 @@ Assume the user wants a new image unless they clearly ask to change an existing - If the user's prompt is already specific and detailed, normalize it into a clear spec without adding creative requirements. - If the user's prompt is generic, add tasteful augmentation only when it materially improves output quality. 10. Use the built-in `image_gen` tool by default. -11. If the user explicitly chooses the CLI fallback, then and only then use the fallback-only docs for quality, `input_fidelity`, masks, output format, output paths, and network setup. +11. If the user explicitly chooses the CLI fallback, or explicitly asks for transparent output, then use the fallback-only docs for model, quality, size, `input_fidelity`, masks, output format, output paths, and network setup. 12. Inspect outputs and validate: subject, style, composition, text accuracy, and invariants/avoid items. 13. Iterate with a single targeted change, then re-check. 14. For preview-only work, render the image inline; the underlying file may remain at the default `$CODEX_HOME/generated_images/...` path. 15. For project-bound work, move or copy the selected artifact into the workspace and update any consuming code or references. Never leave a project-referenced asset only at the default `$CODEX_HOME/generated_images/...` path. -16. For batches, persist only the selected finals in the workspace unless the user explicitly asked to keep discarded variants. -17. Always report the final saved path for any workspace-bound asset, plus the final prompt and whether the built-in tool or fallback CLI mode was used. +16. 
For batches or multi-asset requests, persist every requested deliverable final in the workspace unless the user explicitly asked to keep outputs preview-only. Discarded variants do not need to be kept unless requested. +17. Always report the final saved path(s) for any workspace-bound asset(s), plus the final prompt or prompt set and whether the built-in tool or fallback CLI mode was used. ## Prompt augmentation @@ -140,6 +143,9 @@ Generate: - product-mockup — product/packaging shots, catalog imagery, merch concepts. - ui-mockup — app/web interface mockups and wireframes; specify the desired fidelity. - infographic-diagram — diagrams/infographics with structured layout and text. +- scientific-educational — classroom explainers, scientific diagrams, and learning visuals with required labels and accuracy constraints. +- ads-marketing — campaign concepts and ad creatives with audience, brand position, scene, and exact tagline/copy. +- productivity-visual — slide, chart, workflow, and data-heavy business visuals. - logo-brand — logo/mark exploration, vector-friendly. - illustration-story — comics, children’s book art, narrative scenes. - stylized-concept — style-driven concept art, 3D/stylized renders. @@ -179,7 +185,7 @@ Avoid: Notes: - `Asset type` and `Input images` are prompt scaffolding, not dedicated CLI flags. - `Scene/backdrop` refers to the visual setting. It is not the same as the fallback CLI `background` parameter, which controls output transparency behavior. -- Fallback-only execution notes such as `Quality:`, `Input fidelity:`, masks, output format, and output paths belong in the explicit CLI path only. Do not treat them as built-in `image_gen` tool arguments. +- Fallback-only execution notes such as `Quality:`, `Input fidelity:`, masks, output format, and output paths belong in the CLI path only. Do not treat them as built-in `image_gen` tool arguments. Augmentation rules: - Keep it short. 
@@ -220,7 +226,7 @@ Constraints: change only the background; keep the product and its edges unchange - Iterate with single-change follow-ups. - If the prompt is generic, add only the extra detail that will materially help. - If the prompt is already detailed, normalize it instead of expanding it. -- For explicit CLI fallback only, see `references/cli.md` and `references/image-api.md` for `quality`, `input_fidelity`, masks, output format, and output-path guidance. +- For CLI fallback only, see `references/cli.md` and `references/image-api.md` for model, `quality`, `input_fidelity`, masks, output format, and output-path guidance. More principles shared by both modes: `references/prompting.md`. Copy/paste specs shared by both modes: `references/sample-prompts.md`. @@ -228,10 +234,33 @@ Copy/paste specs shared by both modes: `references/sample-prompts.md`. ## Guidance by asset type Asset-type templates (website assets, game assets, wireframes, logo) are consolidated in `references/sample-prompts.md`. +## gpt-image-2 guidance for CLI fallback + +The fallback CLI defaults to `gpt-image-2`. + +- Use `gpt-image-2` for new CLI/API workflows unless the request needs transparent output. +- If the user explicitly asks for transparent output, use `gpt-image-1.5` and explain that transparent backgrounds are not supported in `gpt-image-2`, the latest model. +- `gpt-image-2` always uses high fidelity for image inputs; do not set `input_fidelity` with this model. +- `gpt-image-2` supports `quality` values `low`, `medium`, `high`, and `auto`. +- Use `quality low` for fast drafts, thumbnails, and quick iterations. Use `medium`, `high`, or `auto` for final assets, dense text, diagrams, identity-sensitive edits, or high-resolution outputs. +- Square images are typically fastest to generate. Use `1024x1024` for fast square drafts. +- If the user asks for 4K-style output, use `3824x2160` for landscape or `2160x3824` for portrait. 
Do not use `3840x2160`, because the maximum edge length must be less than `3840px`. +- `gpt-image-2` size may be `auto` or `WIDTHxHEIGHT` if all constraints hold: max edge `< 3840px`, both edges multiples of `16px`, long-to-short ratio `<= 3:1`, total pixels between `655,360` and `8,294,400`. + +Popular `gpt-image-2` sizes: +- `1024x1024` square +- `1536x1024` landscape +- `1024x1536` portrait +- `2048x2048` 2K square +- `2048x1152` 2K landscape +- `3824x2160` near-4K landscape +- `2160x3824` near-4K portrait +- `auto` + ## Fallback CLI mode only ### Temp and output conventions -These conventions apply only to the explicit CLI fallback. They do not describe built-in `image_gen` output behavior. +These conventions apply only to the CLI fallback. They do not describe built-in `image_gen` output behavior. - Use `tmp/imagegen/` for intermediate files (for example JSONL batches); delete them when done. - Write final artifacts under `output/imagegen/`. - Use `--out` or `--out-dir` to control output paths; keep filenames stable and descriptive. @@ -276,4 +305,4 @@ If installation is not possible in this environment, tell the user which depende - `references/cli.md`: fallback-only CLI usage via `scripts/image_gen.py`. - `references/image-api.md`: fallback-only API/CLI parameter reference. - `references/codex-network.md`: fallback-only network/sandbox troubleshooting for CLI mode. -- `scripts/image_gen.py`: fallback-only CLI implementation. Do not load or use it unless the user explicitly chooses CLI mode. +- `scripts/image_gen.py`: fallback-only CLI implementation. Do not load or use it unless the user explicitly chooses CLI mode or explicitly asks for transparent output. 
diff --git a/codex-rs/skills/src/assets/samples/imagegen/agents/openai.yaml b/codex-rs/skills/src/assets/samples/imagegen/agents/openai.yaml index c9cfddb14423..6a4b75901311 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/agents/openai.yaml +++ b/codex-rs/skills/src/assets/samples/imagegen/agents/openai.yaml @@ -3,4 +3,4 @@ interface: short_description: "Generate or edit images for websites, games, and more" icon_small: "./assets/imagegen-small.svg" icon_large: "./assets/imagegen.png" - default_prompt: "Generate or edit the visual assets for this task with the built-in `image_gen` tool by default. First confirm that the task actually calls for a raster image; if the project already has SVG/vector/code-native assets and the user wants to extend or match those, do not use this skill. If the task includes reference images, treat them as references unless the user clearly wants an existing image modified. For multi-asset requests, loop built-in calls rather than treating batch as a separate top-level mode. Only use the fallback CLI if the user explicitly asks for it, and keep CLI-only controls such as `generate-batch`, `quality`, `input_fidelity`, masks, and output paths on that fallback path." + default_prompt: "Generate or edit the visual assets for this task with the built-in `image_gen` tool by default. First confirm that the task actually calls for a raster image; if the project already has SVG/vector/code-native assets and the user wants to extend or match those, do not use this skill. If the task includes reference images, treat them as references unless the user clearly wants an existing image modified. For multi-asset requests, loop built-in calls; the word `batch` alone is not CLI opt-in. Use the fallback CLI only if the user explicitly asks for CLI/API/model controls or explicitly needs transparent output; for transparent output use `gpt-image-1.5` and explain that transparent backgrounds are not supported in `gpt-image-2`, the latest model. 
Keep CLI-only controls such as `generate-batch`, `quality`, `input_fidelity`, masks, and output paths on that fallback path." diff --git a/codex-rs/skills/src/assets/samples/imagegen/references/cli.md b/codex-rs/skills/src/assets/samples/imagegen/references/cli.md index 8cb0663c5744..053fc9e5d4d7 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/references/cli.md +++ b/codex-rs/skills/src/assets/samples/imagegen/references/cli.md @@ -1,13 +1,14 @@ # CLI reference (`scripts/image_gen.py`) -This file is for the fallback CLI mode only. Read it only after the user explicitly asks to use `scripts/image_gen.py` instead of the built-in `image_gen` tool. +This file is for the fallback CLI mode only. Read it when the user explicitly asks to use `scripts/image_gen.py` / CLI / API / model controls, or when the user explicitly asks for transparent output that requires the `gpt-image-1.5` fallback path. `generate-batch` is a CLI subcommand in this fallback path. It is not a top-level mode of the skill. +The word `batch` in a user request is not CLI opt-in by itself. ## What this CLI does - `generate`: generate a new image from a prompt - `edit`: edit one or more existing images -- `generate-batch`: run many generation jobs from a JSONL file +- `generate-batch`: run many generation jobs from a JSONL file after the user explicitly chooses CLI/API/model controls Real API calls require **network access** + `OPENAI_API_KEY`. `--dry-run` does not. @@ -16,7 +17,7 @@ Set a stable path to the skill CLI (default `CODEX_HOME` is `~/.codex`): ``` export CODEX_HOME="${CODEX_HOME:-$HOME/.codex}" -export IMAGE_GEN="$CODEX_HOME/skills/imagegen/scripts/image_gen.py" +export IMAGE_GEN="$CODEX_HOME/skills/.system/imagegen/scripts/image_gen.py" ``` Install dependencies into that environment with its package manager. In uv-managed environments, `uv pip install ...` remains the preferred path. @@ -60,25 +61,96 @@ python "$IMAGE_GEN" edit \ - **Never modify** `scripts/image_gen.py`. 
If something is missing, ask the user before doing anything else. ## Defaults -- Model: `gpt-image-1.5` +- Model: `gpt-image-2` - Supported model family for this CLI: GPT Image models (`gpt-image-*`) -- Size: `1024x1024` -- Quality: `auto` +- Size: `auto` +- Quality: `medium` - Output format: `png` - Default one-off output path: `output/imagegen/output.png` - Background: unspecified unless `--background` is set +## gpt-image-2 size and model guidance + +`gpt-image-2` is the default model for new CLI fallback work. + +- Use `--quality low` for fast drafts, thumbnails, and quick iterations. +- Use `--quality medium`, `--quality high`, or `--quality auto` for final assets, dense text, diagrams, identity-sensitive edits, and high-resolution outputs. +- Square images are typically fastest. Use `--size 1024x1024` for quick square drafts. +- If the user asks for 4K-style output, use `--size 3824x2160` for landscape or `--size 2160x3824` for portrait. +- Do not pass `--input-fidelity` with `gpt-image-2`; this model always uses high fidelity for image inputs. +- Do not use `--background transparent` with `gpt-image-2`; use `gpt-image-1.5` for transparent output. 
+ +Popular `gpt-image-2` sizes: +- `1024x1024` +- `1536x1024` +- `1024x1536` +- `2048x2048` +- `2048x1152` +- `3824x2160` +- `2160x3824` +- `auto` + +`gpt-image-2` size constraints: +- max edge `< 3840px` +- both edges multiples of `16px` +- long edge to short edge ratio `<= 3:1` +- total pixels between `655,360` and `8,294,400` + +Fast draft: + +```bash +python "$IMAGE_GEN" generate \ + --prompt "A product thumbnail of a matte ceramic mug on a stone surface" \ + --quality low \ + --size 1024x1024 \ + --out output/imagegen/mug-draft.png +``` + +Final 2K landscape: + +```bash +python "$IMAGE_GEN" generate \ + --prompt "A polished landing-page hero image of a matte ceramic mug on a stone surface" \ + --quality high \ + --size 2048x1152 \ + --out output/imagegen/mug-hero.png +``` + +Near-4K landscape: + +```bash +python "$IMAGE_GEN" generate \ + --prompt "A detailed architectural visualization at golden hour" \ + --size 3824x2160 \ + --quality high \ + --out output/imagegen/architecture-near-4k.png +``` + +Transparent background request: + +```bash +python "$IMAGE_GEN" generate \ + --model gpt-image-1.5 \ + --prompt "A clean product cutout on a transparent background" \ + --background transparent \ + --output-format png \ + --out output/imagegen/product-cutout.png +``` + +When using this path, explain briefly that transparent backgrounds are not supported in `gpt-image-2`, the latest model, so `gpt-image-1.5` is required. + ## Quality, input fidelity, and masks (CLI fallback only) These are explicit CLI controls. They are not built-in `image_gen` tool arguments. 
- `--quality` works for `generate`, `edit`, and `generate-batch`: `low|medium|high|auto` -- `--input-fidelity` is **edit-only** and validated as `low|high` +- `--input-fidelity` is **edit-only** and validated as `low|high`; it is not supported for `gpt-image-2` - `--mask` is **edit-only** Example: ```bash python "$IMAGE_GEN" edit \ + --model gpt-image-1.5 \ --image input.png \ --prompt "Change only the background" \ --quality high \ @@ -147,10 +219,11 @@ Notes: - Per-job overrides are supported in JSONL (for example `size`, `quality`, `background`, `output_format`, `output_compression`, `moderation`, `n`, `model`, `out`, and prompt-augmentation fields). - `--n` generates multiple variants for a single prompt; `generate-batch` is for many different prompts. - In batch mode, per-job `out` is treated as a filename under `--out-dir`. +- For many requested deliverable assets, provide one prompt/job per distinct asset and use semantic filenames when possible. ## CLI notes -- Supported sizes: `1024x1024`, `1536x1024`, `1024x1536`, or `auto`. -- Transparent backgrounds require `output_format` to be `png` or `webp`. +- Supported sizes depend on the model. `gpt-image-2` supports flexible constrained sizes; older GPT Image models support `1024x1024`, `1536x1024`, `1024x1536`, or `auto`. +- Transparent backgrounds require `output_format` to be `png` or `webp` and are not supported by `gpt-image-2`. - `--prompt-file`, `--output-compression`, `--moderation`, `--max-attempts`, `--fail-fast`, `--force`, and `--no-augment` are supported. - This CLI is intended for GPT Image models. Do not assume older non-GPT image-model behavior applies here. 
diff --git a/codex-rs/skills/src/assets/samples/imagegen/references/codex-network.md b/codex-rs/skills/src/assets/samples/imagegen/references/codex-network.md index 249d62826253..54c0db2c3b44 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/references/codex-network.md +++ b/codex-rs/skills/src/assets/samples/imagegen/references/codex-network.md @@ -1,6 +1,6 @@ # Codex network approvals / sandbox notes -This file is for the fallback CLI mode only. Read it only after the user explicitly asks to use `scripts/image_gen.py`. +This file is for the fallback CLI mode only. Read it when the user explicitly asks to use `scripts/image_gen.py` / CLI / API / model controls, or when the user explicitly asks for transparent output that requires the `gpt-image-1.5` fallback path. This guidance is intentionally isolated from `SKILL.md` because it can vary by environment and may become stale. Prefer the defaults in your environment when in doubt. diff --git a/codex-rs/skills/src/assets/samples/imagegen/references/image-api.md b/codex-rs/skills/src/assets/samples/imagegen/references/image-api.md index a2750c15c638..959262b9a0d3 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/references/image-api.md +++ b/codex-rs/skills/src/assets/samples/imagegen/references/image-api.md @@ -1,13 +1,46 @@ # Image API quick reference -This file is for the fallback CLI mode only. Use it only after the user explicitly asks to use `scripts/image_gen.py` instead of the built-in `image_gen` tool. +This file is for the fallback CLI mode only. Use it when the user explicitly asks to use `scripts/image_gen.py` / CLI / API / model controls, or when the user explicitly asks for transparent output that requires the `gpt-image-1.5` fallback path. These parameters describe the Image API and bundled CLI fallback surface. Do not assume they are normal arguments on the built-in `image_gen` tool. 
## Scope -- This fallback CLI is intended for GPT Image models (`gpt-image-1.5`, `gpt-image-1`, and `gpt-image-1-mini`). +- This fallback CLI is intended for GPT Image models (`gpt-image-2`, `gpt-image-1.5`, `gpt-image-1`, and `gpt-image-1-mini`). - The built-in `image_gen` tool and the fallback CLI do not expose the same controls. +## Model summary + +| Model | Quality | Input fidelity | Resolutions | Recommended use | +| --- | --- | --- | --- | --- | +| `gpt-image-2` | `low`, `medium`, `high`, `auto` | Always high fidelity for image inputs; do not set `input_fidelity` | `auto` or flexible sizes that satisfy the constraints below | Default for new CLI/API workflows: high-quality generation and editing, text-heavy images, photorealism, compositing, identity-sensitive edits, and workflows where fewer retries matter | +| `gpt-image-1.5` | `low`, `medium`, `high`, `auto` | `low`, `high` | `1024x1024`, `1024x1536`, `1536x1024`, `auto` | Transparent backgrounds and backward-compatible workflows | +| `gpt-image-1` | `low`, `medium`, `high`, `auto` | `low`, `high` | `1024x1024`, `1024x1536`, `1536x1024`, `auto` | Legacy compatibility | +| `gpt-image-1-mini` | `low`, `medium`, `high`, `auto` | `low`, `high` | `1024x1024`, `1024x1536`, `1536x1024`, `auto` | Cost-sensitive draft batches and lower-stakes previews | + +## gpt-image-2 sizes + +`gpt-image-2` accepts `auto` or any `WIDTHxHEIGHT` size that satisfies all constraints: + +- Maximum edge length must be less than `3840px`. +- Both edges must be multiples of `16px`. +- Long edge to short edge ratio must not exceed `3:1`. +- Total pixels must be at least `655,360` and no more than `8,294,400`. 
+ +Popular sizes: + +| Label | Size | Notes | +| --- | --- | --- | +| Square | `1024x1024` | Typical fast default | +| Landscape | `1536x1024` | Standard landscape | +| Portrait | `1024x1536` | Standard portrait | +| 2K square | `2048x2048` | Larger square output | +| 2K landscape | `2048x1152` | Widescreen output | +| Near-4K landscape | `3824x2160` | Use instead of `3840x2160` | +| Near-4K portrait | `2160x3824` | Use instead of `2160x3840` | +| Auto | `auto` | Default size | + +Square images are typically fastest to generate. For 4K-style output, use `3824x2160` or `2160x3824`, not `3840x2160`, because the maximum edge length must be less than `3840px`. + ## Endpoints - Generate: `POST /v1/images/generations` (`client.images.generate(...)`) - Edit: `POST /v1/images/edits` (`client.images.edit(...)`) @@ -16,7 +49,7 @@ These parameters describe the Image API and bundled CLI fallback surface. Do not - `prompt`: text prompt - `model`: image model - `n`: number of images (1-10) -- `size`: `1024x1024`, `1536x1024`, `1024x1536`, or `auto` +- `size`: `auto` by default for `gpt-image-2`; flexible `WIDTHxHEIGHT` sizes are allowed only for `gpt-image-2`; older GPT Image models use `1024x1024`, `1536x1024`, `1024x1536`, or `auto` - `quality`: `low`, `medium`, `high`, or `auto` - `background`: output transparency behavior (`transparent`, `opaque`, or `auto`) for generated output; this is not the same thing as the prompt's visual scene/backdrop - `output_format`: `png` (default), `jpeg`, `webp` @@ -26,12 +59,17 @@ These parameters describe the Image API and bundled CLI fallback surface. Do not ## Edit-specific parameters - `image`: one or more input images. For GPT Image models, you can provide up to 16 images. 
- `mask`: optional mask image -- `input_fidelity`: `low` (default) or `high` +- `input_fidelity`: `low` or `high` only for models that support it; do not set this for `gpt-image-2` Model-specific note for `input_fidelity`: +- `gpt-image-2` always uses high fidelity for image inputs and does not support setting `input_fidelity`. - `gpt-image-1` and `gpt-image-1-mini` preserve all input images, but the first image gets richer textures and finer details. - `gpt-image-1.5` preserves the first 5 input images with higher fidelity. +## Transparent backgrounds + +`gpt-image-2` does not currently support transparent backgrounds. If the user explicitly asks for a transparent image or transparent background, use `gpt-image-1.5` with `background=transparent` and a transparent-capable output format such as `png` or `webp`. + ## Output - `data[]` list with `b64_json` per image - The bundled `scripts/image_gen.py` CLI decodes `b64_json` and writes output files for you. @@ -41,8 +79,9 @@ Model-specific note for `input_fidelity`: - Use the edits endpoint when the user requests changes to an existing image. - Masking is prompt-guided; exact shapes are not guaranteed. - Large sizes and high quality increase latency and cost. -- High `input_fidelity` can materially increase input token usage. -- If a request fails because a specific option is unsupported by the selected GPT Image model, retry manually without that option. +- Use `quality=low` for fast drafts, thumbnails, and quick iterations. Use `medium` or `high` for final assets, dense text, diagrams, identity-sensitive edits, or high-resolution outputs. +- High `input_fidelity` can materially increase input token usage on models that support it. +- If a request fails because a specific option is unsupported by the selected GPT Image model, retry manually without that option only when the option is not required by the user. If transparent output is required, switch to `gpt-image-1.5` instead of dropping `background=transparent`. 
## Important boundary - `quality`, `input_fidelity`, explicit masks, `background`, `output_format`, and related parameters are fallback-only execution controls. diff --git a/codex-rs/skills/src/assets/samples/imagegen/references/prompting.md b/codex-rs/skills/src/assets/samples/imagegen/references/prompting.md index 8b6c684eef66..26606db9cf77 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/references/prompting.md +++ b/codex-rs/skills/src/assets/samples/imagegen/references/prompting.md @@ -28,6 +28,7 @@ This file is about prompt structure, specificity, and iteration. Fallback-only e - If the user prompt is already specific and detailed, normalize it into a clean spec without adding creative requirements. - If the prompt is generic, you may add tasteful detail when it materially improves the output. - Treat examples in `sample-prompts.md` as fully-authored recipes, not as the default amount of augmentation to add to every request. +- For photorealism, include `photorealistic` directly when that is the goal, plus concrete real-world texture such as pores, wrinkles, fabric wear, material grain, or imperfect everyday detail. ## Allowed and disallowed augmentation @@ -46,6 +47,7 @@ Do not add: - Specify framing and viewpoint (close-up, wide, top-down) and placement only when it materially helps. - Call out negative space if the asset clearly needs room for UI or copy. - Avoid making left/right layout decisions unless the user or surrounding layout supports them. +- For people, describe body framing, scale, gaze, and object interactions when they matter (`full body visible`, `looking down at the book`, `hands naturally gripping the handlebars`). ## Constraints and invariants - State what must not change (`keep background unchanged`). @@ -55,6 +57,7 @@ Do not add: - Put literal text in quotes or ALL CAPS and specify typography (font style, size, color, placement). - Spell uncommon words letter-by-letter if accuracy matters. 
- For in-image copy, require verbatim rendering and no extra characters. +- In CLI fallback mode, use `medium` or `high` quality for small text, dense infographics, data-heavy slides, multi-font layouts, legends, axes, and footnotes. ## Input images and references - Do not assume that every provided image is an edit target. @@ -71,15 +74,22 @@ Do not add: ## Fallback-only execution controls - `quality`, `input_fidelity`, explicit masks, output format, and output paths are fallback-only execution controls. - Do not assume they are built-in `image_gen` tool arguments. -- If the user explicitly chooses CLI fallback, see `references/cli.md` and `references/image-api.md` for those controls. +- If the user explicitly chooses CLI fallback or explicitly asks for transparent output, see `references/cli.md` and `references/image-api.md` for those controls. +- In CLI fallback mode, `gpt-image-2` is the default. It supports `quality=low|medium|high|auto`; use `low` for fast drafts and thumbnails, and move to `medium`, `high`, or `auto` for final assets. +- `gpt-image-2` always uses high fidelity for image inputs, so do not set `input_fidelity` with that model. +- If the user explicitly asks for transparent output, use `gpt-image-1.5` and explain that transparent backgrounds are not supported in `gpt-image-2`, the latest model. +- If the user asks for 4K-style output with `gpt-image-2`, use `3824x2160` for landscape or `2160x3824` for portrait. ## Use-case tips Generate: - photorealistic-natural: Prompt as if a real photo is captured in the moment; use photography language (lens, lighting, framing); call for real texture; avoid over-stylized polish unless requested. - product-mockup: Describe the product/packaging and materials; ensure clean silhouette and label clarity; if in-image text is needed, require verbatim rendering and specify typography. 
- ui-mockup: Describe the target fidelity first (shippable mockup or low-fi wireframe), then focus on layout, hierarchy, and practical UI elements; avoid concept-art language. -- infographic-diagram: Define the audience and layout flow; label parts explicitly; require verbatim text. +- infographic-diagram: Define the audience and layout flow; label parts explicitly; require verbatim text; prefer higher quality in CLI mode for dense labels. - logo-brand: Keep it simple and scalable; ask for a strong silhouette and balanced negative space; avoid decorative flourishes unless requested. +- ads-marketing: Write like a creative brief; include brand positioning, audience, desired vibe, scene, and exact tagline if text must appear. +- productivity-visual: Name the exact artifact (slide, chart, workflow diagram), define the canvas and hierarchy, provide real labels/data, and ask for readable typography and polished spacing. +- scientific-educational: Define audience, lesson objective, required labels, scientific constraints, arrows, and scan-friendly whitespace. - illustration-story: Define panels or scene beats; keep each action concrete. - stylized-concept: Specify style cues, material finish, and rendering approach (3D, painterly, clay) without inventing new story elements. - historical-scene: State the location/date and required period accuracy; constrain clothing, props, and environment to match the era. 
diff --git a/codex-rs/skills/src/assets/samples/imagegen/references/sample-prompts.md b/codex-rs/skills/src/assets/samples/imagegen/references/sample-prompts.md index e4b2b01da0c0..79d619c49021 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/references/sample-prompts.md +++ b/codex-rs/skills/src/assets/samples/imagegen/references/sample-prompts.md @@ -2,7 +2,7 @@ These prompt recipes are shared across both top-level modes of the skill: - built-in `image_gen` tool (default) -- explicit `scripts/image_gen.py` CLI fallback +- `scripts/image_gen.py` CLI fallback for explicit CLI/API/model requests or explicit transparent-output requests Use these as starting points. They are intentionally complete prompt recipes, not the default amount of augmentation to add to every user request. @@ -13,7 +13,14 @@ When adapting a user's prompt: The labeled lines are prompt scaffolding, not a closed schema. `Asset type` and `Input images` are prompt-only scaffolding; the CLI does not expose them as dedicated flags. -Execution details such as explicit CLI flags, `quality`, `input_fidelity`, masks, output formats, and local output paths depend on mode. Use the built-in tool by default; only apply CLI-specific controls after the user explicitly opts into fallback mode. +Execution details such as explicit CLI flags, `quality`, `input_fidelity`, masks, output formats, and local output paths depend on mode. Use the built-in tool by default; only apply CLI-specific controls when the user explicitly opts into fallback mode or explicitly asks for transparent output. + +CLI model notes: +- `gpt-image-2` is the fallback CLI default for new workflows. +- `gpt-image-2` supports `quality` values `low`, `medium`, `high`, and `auto`. +- For 4K-style `gpt-image-2` output, use `3824x2160` or `2160x3824` instead of `3840x2160`. +- If transparent output is explicitly required, use `gpt-image-1.5` and explain that transparent backgrounds are not supported in `gpt-image-2`, the latest model. 
+- Do not set `input_fidelity` with `gpt-image-2`; image inputs already use high fidelity. For prompting principles (structure, specificity, invariants, iteration), see `references/prompting.md`. @@ -68,6 +75,18 @@ Text (verbatim): "Bean Hopper", "Grinder", "Brew Group", "Boiler", "Water Tank", Constraints: clear labels, strong contrast, no logos or trademarks, no watermark ``` +### scientific-educational +``` +Use case: scientific-educational +Primary request: biology diagram titled "Cellular Respiration at a Glance" for high school students +Scene/backdrop: clean white classroom handout background +Subject: glucose turns into energy inside a cell; include glycolysis, Krebs cycle, and electron transport chain +Style/medium: flat scientific diagram with consistent icons, arrows, and readable labels +Composition/framing: landscape slide-style layout with clear hierarchy and generous whitespace +Text (verbatim): "Cellular Respiration at a Glance", "Glucose", "Pyruvate", "ATP", "NADH", "FADH2", "CO2", "O2", "H2O" +Constraints: scientifically plausible; avoid tiny text; no extra decoration; no watermark +``` + ### logo-brand ``` Use case: logo-brand @@ -100,6 +119,30 @@ Lighting/mood: volumetric light rays cutting through fog Constraints: no logos or trademarks; no watermark ``` +### ads-marketing +``` +Use case: ads-marketing +Primary request: campaign image for a streetwear brand called Thread +Subject: group of friends hanging out together in a stylish urban setting +Style/medium: polished youth streetwear campaign photography +Composition/framing: vertical ad layout with natural poses and integrated headline space +Lighting/mood: contemporary, energetic, tasteful +Text (verbatim): "Yours to Create." 
+Constraints: render the tagline exactly once; clean legible typography; no extra text; no watermarks; no unrelated logos +``` + +### productivity-visual +``` +Use case: productivity-visual +Primary request: one pitch-deck slide titled "Market Opportunity" +Asset type: fundraising slide image +Style/medium: clean modern deck slide, white background, crisp sans-serif typography +Subject: TAM/SAM/SOM concentric-circle diagram plus a small growth bar chart from 2021 to 2026 +Composition/framing: 16:9 landscape slide, clear data hierarchy, polished spacing +Text (verbatim): "Market Opportunity", "TAM: $42B", "SAM: $8.7B", "SOM: $340M", "AGI Research, 2024", "Internal analysis" +Constraints: readable labels, no clip art, no stock photography, no decorative clutter, no watermark +``` + ### historical-scene ``` Use case: historical-scene @@ -351,6 +394,8 @@ Primary request: isolate the product on a clean transparent background Constraints: crisp silhouette; no halos or fringing; preserve label text exactly; no restyling ``` +CLI note: if transparent output is explicitly required, use `gpt-image-1.5` because `gpt-image-2` does not currently support transparent backgrounds. 
+ ### style-transfer ``` Use case: style-transfer @@ -367,6 +412,17 @@ Primary request: place the subject from Image 2 next to the person in Image 1 Constraints: match lighting, perspective, and scale; keep the base framing unchanged; no extra elements ``` +### character consistency workflow +``` +Use case: identity-preserve +Input images: Image 1: previous character anchor illustration +Primary request: continue the story with the same character in a new scene and action +Scene/backdrop: snowy forest after a winter storm +Subject: same young forest hero gently helping a frightened squirrel out of a fallen tree +Style/medium: same children's book watercolor illustration style as Image 1 +Constraints: do not redesign the character; preserve facial features, proportions, outfit, color palette, and personality; no text; no watermark +``` + ### sketch-to-render ``` Use case: sketch-to-render diff --git a/codex-rs/skills/src/assets/samples/imagegen/scripts/image_gen.py b/codex-rs/skills/src/assets/samples/imagegen/scripts/image_gen.py index 57cab20e05ce..338f261715c7 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/scripts/image_gen.py +++ b/codex-rs/skills/src/assets/samples/imagegen/scripts/image_gen.py @@ -1,9 +1,10 @@ #!/usr/bin/env python3 """Fallback CLI for explicit image generation or editing with GPT Image models. -Used only when the user explicitly opts into CLI fallback mode. +Used only when the user explicitly opts into CLI fallback mode, or when explicit +transparent output requires the `gpt-image-1.5` fallback path. -Defaults to gpt-image-1.5 and a structured prompt augmentation workflow. +Defaults to gpt-image-2 and a structured prompt augmentation workflow. 
""" from __future__ import annotations @@ -21,20 +22,28 @@ from io import BytesIO -DEFAULT_MODEL = "gpt-image-1.5" -DEFAULT_SIZE = "1024x1024" -DEFAULT_QUALITY = "auto" +DEFAULT_MODEL = "gpt-image-2" +DEFAULT_SIZE = "auto" +DEFAULT_QUALITY = "medium" DEFAULT_OUTPUT_FORMAT = "png" DEFAULT_CONCURRENCY = 5 DEFAULT_DOWNSCALE_SUFFIX = "-web" DEFAULT_OUTPUT_PATH = "output/imagegen/output.png" GPT_IMAGE_MODEL_PREFIX = "gpt-image-" -ALLOWED_SIZES = {"1024x1024", "1536x1024", "1024x1536", "auto"} +ALLOWED_LEGACY_SIZES = {"1024x1024", "1536x1024", "1024x1536", "auto"} ALLOWED_QUALITIES = {"low", "medium", "high", "auto"} ALLOWED_BACKGROUNDS = {"transparent", "opaque", "auto", None} ALLOWED_INPUT_FIDELITIES = {"low", "high", None} +GPT_IMAGE_2_MODEL = "gpt-image-2" +GPT_IMAGE_2_MIN_PIXELS = 655_360 +GPT_IMAGE_2_MAX_PIXELS = 8_294_400 +GPT_IMAGE_2_MAX_EDGE_EXCLUSIVE = 3840 +GPT_IMAGE_2_MAX_RATIO = 3.0 +GPT_IMAGE_2_NEAR_4K_LANDSCAPE = "3824x2160" +GPT_IMAGE_2_NEAR_4K_PORTRAIT = "2160x3824" + MAX_IMAGE_BYTES = 50 * 1024 * 1024 MAX_BATCH_JOBS = 500 @@ -104,10 +113,52 @@ def _normalize_output_format(fmt: Optional[str]) -> str: return "jpeg" if fmt == "jpg" else fmt -def _validate_size(size: str) -> None: - if size not in ALLOWED_SIZES: +def _parse_size(size: str) -> Optional[Tuple[int, int]]: + match = re.fullmatch(r"([1-9][0-9]*)x([1-9][0-9]*)", size) + if not match: + return None + return int(match.group(1)), int(match.group(2)) + + +def _validate_gpt_image_2_size(size: str) -> None: + if size == "auto": + return + + parsed = _parse_size(size) + if parsed is None: + _die("size must be auto or WIDTHxHEIGHT, for example 1024x1024.") + + width, height = parsed + max_edge = max(width, height) + min_edge = min(width, height) + total_pixels = width * height + + if max_edge >= GPT_IMAGE_2_MAX_EDGE_EXCLUSIVE: + hint = GPT_IMAGE_2_NEAR_4K_LANDSCAPE + if height > width: + hint = GPT_IMAGE_2_NEAR_4K_PORTRAIT + _die( + "gpt-image-2 size maximum edge length must be less than 3840px. 
" + f"For 4K-style output, use {hint} instead of {size}." + ) + if width % 16 != 0 or height % 16 != 0: + _die("gpt-image-2 size width and height must be multiples of 16px.") + if max_edge / min_edge > GPT_IMAGE_2_MAX_RATIO: + _die("gpt-image-2 size long edge to short edge ratio must not exceed 3:1.") + if total_pixels < GPT_IMAGE_2_MIN_PIXELS or total_pixels > GPT_IMAGE_2_MAX_PIXELS: + _die( + "gpt-image-2 size total pixels must be at least 655,360 and no more than 8,294,400." + ) + + +def _validate_size(size: str, model: str) -> None: + if model == GPT_IMAGE_2_MODEL: + _validate_gpt_image_2_size(size) + return + + if size not in ALLOWED_LEGACY_SIZES: _die( - "size must be one of 1024x1024, 1536x1024, 1024x1536, or auto for GPT image models." + "size must be one of 1024x1024, 1536x1024, 1024x1536, or auto for this GPT Image model." ) @@ -138,17 +189,38 @@ def _validate_transparency(background: Optional[str], output_format: str) -> Non _die("transparent background requires output-format png or webp.") +def _validate_model_specific_options( + *, + model: str, + background: Optional[str], + input_fidelity: Optional[str] = None, +) -> None: + if model != GPT_IMAGE_2_MODEL: + return + if background == "transparent": + _die( + "transparent backgrounds are not supported in gpt-image-2, the latest model. " + "Use --model gpt-image-1.5 --background transparent --output-format png instead." + ) + if input_fidelity is not None: + _die( + "input_fidelity is not supported in gpt-image-2 because image inputs always use high fidelity for this model." 
+ ) + + def _validate_generate_payload(payload: Dict[str, Any]) -> None: - _validate_model(str(payload.get("model", DEFAULT_MODEL))) + model = str(payload.get("model", DEFAULT_MODEL)) + _validate_model(model) n = int(payload.get("n", 1)) if n < 1 or n > 10: _die("n must be between 1 and 10") size = str(payload.get("size", DEFAULT_SIZE)) quality = str(payload.get("quality", DEFAULT_QUALITY)) background = payload.get("background") - _validate_size(size) + _validate_size(size, model) _validate_quality(quality) _validate_background(background) + _validate_model_specific_options(model=model, background=background) oc = payload.get("output_compression") if oc is not None and not (0 <= int(oc) <= 100): _die("output_compression must be between 0 and 100") @@ -912,10 +984,15 @@ def main() -> int: if getattr(args, "downscale_max_dim", None) is not None and args.downscale_max_dim < 1: _die("--downscale-max-dim must be >= 1") - _validate_size(args.size) + _validate_model(args.model) + _validate_size(args.size, args.model) _validate_quality(args.quality) _validate_background(args.background) - _validate_model(args.model) + _validate_model_specific_options( + model=args.model, + background=args.background, + input_fidelity=getattr(args, "input_fidelity", None), + ) _ensure_api_key(args.dry_run) args.func(args) From a8e3155424f98eef46d56dd62cdd0eedfa4cf634 Mon Sep 17 00:00:00 2001 From: won Date: Tue, 21 Apr 2026 12:40:00 -0700 Subject: [PATCH 2/5] fix(imagegen): use auto CLI quality default --- .codespellignore | 1 + codex-rs/skills/src/assets/samples/imagegen/SKILL.md | 2 +- codex-rs/skills/src/assets/samples/imagegen/references/cli.md | 4 ++-- .../src/assets/samples/imagegen/references/image-api.md | 2 +- .../src/assets/samples/imagegen/references/prompting.md | 2 +- .../skills/src/assets/samples/imagegen/scripts/image_gen.py | 2 +- 6 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.codespellignore b/.codespellignore index 947511b0c31a..23924fe083cb 100644 --- 
a/.codespellignore +++ b/.codespellignore @@ -1,5 +1,6 @@ iTerm iTerm2 psuedo +SOM te TE diff --git a/codex-rs/skills/src/assets/samples/imagegen/SKILL.md b/codex-rs/skills/src/assets/samples/imagegen/SKILL.md index c51c5c6465ca..5c76b81db1af 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/SKILL.md +++ b/codex-rs/skills/src/assets/samples/imagegen/SKILL.md @@ -242,7 +242,7 @@ The fallback CLI defaults to `gpt-image-2`. - If the user explicitly asks for transparent output, use `gpt-image-1.5` and explain that transparent backgrounds are not supported in `gpt-image-2`, the latest model. - `gpt-image-2` always uses high fidelity for image inputs; do not set `input_fidelity` with this model. - `gpt-image-2` supports `quality` values `low`, `medium`, `high`, and `auto`. -- Use `quality low` for fast drafts, thumbnails, and quick iterations. Use `medium`, `high`, or `auto` for final assets, dense text, diagrams, identity-sensitive edits, or high-resolution outputs. +- Use `quality low` for fast drafts, thumbnails, and quick iterations. Use `auto`, `medium`, or `high` for final assets, dense text, diagrams, identity-sensitive edits, or high-resolution outputs. - Square images are typically fastest to generate. Use `1024x1024` for fast square drafts. - If the user asks for 4K-style output, use `3824x2160` for landscape or `2160x3824` for portrait. Do not use `3840x2160`, because the maximum edge length must be less than `3840px`. - `gpt-image-2` size may be `auto` or `WIDTHxHEIGHT` if all constraints hold: max edge `< 3840px`, both edges multiples of `16px`, long-to-short ratio `<= 3:1`, total pixels between `655,360` and `8,294,400`. 
diff --git a/codex-rs/skills/src/assets/samples/imagegen/references/cli.md b/codex-rs/skills/src/assets/samples/imagegen/references/cli.md index 053fc9e5d4d7..b58903ae2783 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/references/cli.md +++ b/codex-rs/skills/src/assets/samples/imagegen/references/cli.md @@ -64,7 +64,7 @@ python "$IMAGE_GEN" edit \ - Model: `gpt-image-2` - Supported model family for this CLI: GPT Image models (`gpt-image-*`) - Size: `auto` -- Quality: `medium` +- Quality: `auto` - Output format: `png` - Default one-off output path: `output/imagegen/output.png` - Background: unspecified unless `--background` is set @@ -74,7 +74,7 @@ python "$IMAGE_GEN" edit \ `gpt-image-2` is the default model for new CLI fallback work. - Use `--quality low` for fast drafts, thumbnails, and quick iterations. -- Use `--quality medium`, `--quality high`, or `--quality auto` for final assets, dense text, diagrams, identity-sensitive edits, and high-resolution outputs. +- Use `--quality auto`, `--quality medium`, or `--quality high` for final assets, dense text, diagrams, identity-sensitive edits, and high-resolution outputs. - Square images are typically fastest. Use `--size 1024x1024` for quick square drafts. - If the user asks for 4K-style output, use `--size 3824x2160` for landscape or `--size 2160x3824` for portrait. - Do not pass `--input-fidelity` with `gpt-image-2`; this model always uses high fidelity for image inputs. diff --git a/codex-rs/skills/src/assets/samples/imagegen/references/image-api.md b/codex-rs/skills/src/assets/samples/imagegen/references/image-api.md index 959262b9a0d3..0d56d431bc5f 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/references/image-api.md +++ b/codex-rs/skills/src/assets/samples/imagegen/references/image-api.md @@ -79,7 +79,7 @@ Model-specific note for `input_fidelity`: - Use the edits endpoint when the user requests changes to an existing image. - Masking is prompt-guided; exact shapes are not guaranteed. 
- Large sizes and high quality increase latency and cost. -- Use `quality=low` for fast drafts, thumbnails, and quick iterations. Use `medium` or `high` for final assets, dense text, diagrams, identity-sensitive edits, or high-resolution outputs. +- Use `quality=low` for fast drafts, thumbnails, and quick iterations. Use `auto`, `medium`, or `high` for final assets, dense text, diagrams, identity-sensitive edits, or high-resolution outputs. - High `input_fidelity` can materially increase input token usage on models that support it. - If a request fails because a specific option is unsupported by the selected GPT Image model, retry manually without that option only when the option is not required by the user. If transparent output is required, switch to `gpt-image-1.5` instead of dropping `background=transparent`. diff --git a/codex-rs/skills/src/assets/samples/imagegen/references/prompting.md b/codex-rs/skills/src/assets/samples/imagegen/references/prompting.md index 26606db9cf77..73538ba32b88 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/references/prompting.md +++ b/codex-rs/skills/src/assets/samples/imagegen/references/prompting.md @@ -75,7 +75,7 @@ Do not add: - `quality`, `input_fidelity`, explicit masks, output format, and output paths are fallback-only execution controls. - Do not assume they are built-in `image_gen` tool arguments. - If the user explicitly chooses CLI fallback or explicitly asks for transparent output, see `references/cli.md` and `references/image-api.md` for those controls. -- In CLI fallback mode, `gpt-image-2` is the default. It supports `quality=low|medium|high|auto`; use `low` for fast drafts and thumbnails, and move to `medium`, `high`, or `auto` for final assets. +- In CLI fallback mode, `gpt-image-2` is the default. It supports `quality=low|medium|high|auto`; use `low` for fast drafts and thumbnails, and move to `auto`, `medium`, or `high` for final assets. 
- `gpt-image-2` always uses high fidelity for image inputs, so do not set `input_fidelity` with that model. - If the user explicitly asks for transparent output, use `gpt-image-1.5` and explain that transparent backgrounds are not supported in `gpt-image-2`, the latest model. - If the user asks for 4K-style output with `gpt-image-2`, use `3824x2160` for landscape or `2160x3824` for portrait. diff --git a/codex-rs/skills/src/assets/samples/imagegen/scripts/image_gen.py b/codex-rs/skills/src/assets/samples/imagegen/scripts/image_gen.py index 338f261715c7..602dc9df0069 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/scripts/image_gen.py +++ b/codex-rs/skills/src/assets/samples/imagegen/scripts/image_gen.py @@ -24,7 +24,7 @@ DEFAULT_MODEL = "gpt-image-2" DEFAULT_SIZE = "auto" -DEFAULT_QUALITY = "medium" +DEFAULT_QUALITY = "auto" DEFAULT_OUTPUT_FORMAT = "png" DEFAULT_CONCURRENCY = 5 DEFAULT_DOWNSCALE_SUFFIX = "-web" From 13dba916a127a0a988a1f8b4671ee3d8a721c273 Mon Sep 17 00:00:00 2001 From: Gabriel C Date: Wed, 22 Apr 2026 22:20:54 +0800 Subject: [PATCH 3/5] update approach for transparent images in gpt-image-2 --- .../src/assets/samples/imagegen/SKILL.md | 70 ++- .../samples/imagegen/agents/openai.yaml | 2 +- .../assets/samples/imagegen/references/cli.md | 37 +- .../imagegen/references/codex-network.md | 2 +- .../samples/imagegen/references/image-api.md | 20 +- .../samples/imagegen/references/prompting.md | 20 +- .../imagegen/references/sample-prompts.md | 13 +- .../samples/imagegen/scripts/image_gen.py | 16 +- .../imagegen/scripts/remove_chroma_key.py | 422 ++++++++++++++++++ 9 files changed, 538 insertions(+), 64 deletions(-) create mode 100644 codex-rs/skills/src/assets/samples/imagegen/scripts/remove_chroma_key.py diff --git a/codex-rs/skills/src/assets/samples/imagegen/SKILL.md b/codex-rs/skills/src/assets/samples/imagegen/SKILL.md index 5c76b81db1af..28f44a810be7 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/SKILL.md +++ 
b/codex-rs/skills/src/assets/samples/imagegen/SKILL.md @@ -11,8 +11,8 @@ Generates or edits images for the current project (for example website assets, g This skill has exactly two top-level modes: -- **Default built-in tool mode (preferred):** built-in `image_gen` tool for normal image generation and editing. Does not require `OPENAI_API_KEY`. -- **Fallback CLI mode:** `scripts/image_gen.py` CLI. Use when the user explicitly asks for the CLI/API/model path, or when they explicitly require a transparent background that needs `gpt-image-1.5`. Requires `OPENAI_API_KEY`. +- **Default built-in tool mode (preferred):** built-in `image_gen` tool for normal image generation, editing, and simple transparent-image requests. Does not require `OPENAI_API_KEY`. +- **Fallback CLI mode:** `scripts/image_gen.py` CLI. Use when the user explicitly asks for the CLI/API/model path, or after the user explicitly confirms a true model-native transparency fallback with `gpt-image-1.5`. Requires `OPENAI_API_KEY`. Within CLI fallback, the CLI exposes three subcommands: @@ -23,7 +23,9 @@ Within CLI fallback, the CLI exposes three subcommands: Rules: - Use the built-in `image_gen` tool by default for normal image generation and editing requests. - Do not switch to CLI fallback for ordinary quality, size, or file-path control. -- If the user explicitly asks for a transparent image/background, use CLI fallback with `gpt-image-1.5` and explain briefly that transparent backgrounds are not supported in `gpt-image-2`, the latest model. +- If the user explicitly asks for a transparent image/background, stay on built-in `image_gen` first: prompt for a flat removable chroma-key background, then remove it locally with `python scripts/remove_chroma_key.py`. +- Never silently switch from built-in `image_gen` or CLI `gpt-image-2` to CLI `gpt-image-1.5`. 
Treat this as a model/path downgrade and ask the user before doing it, unless the user has already explicitly requested `gpt-image-1.5`, `scripts/image_gen.py`, or CLI fallback. +- If a transparent request appears too complex for clean chroma-key removal, asks for true/native transparency, or local removal fails validation, explain that true transparency requires CLI `gpt-image-1.5 --background transparent --output-format png` because `gpt-image-2` does not support `background=transparent`, then ask whether to proceed. Run the CLI fallback only after the user confirms. - The word `batch` by itself does not mean CLI fallback. If the user asks for many assets or says to batch-generate assets without explicitly asking for CLI/API/model controls, stay on the built-in path and issue one built-in call per requested asset or variant. - If the built-in tool fails or is unavailable, tell the user the CLI fallback exists and that it requires `OPENAI_API_KEY`. Proceed only if the user explicitly asks for that fallback. - If the user explicitly asks for CLI mode, use the bundled `scripts/image_gen.py` workflow. Do not create one-off SDK runners. @@ -48,6 +50,9 @@ Fallback-only docs/resources for CLI mode: - `references/codex-network.md` - `scripts/image_gen.py` +Local post-processing helper: +- `scripts/remove_chroma_key.py`: removes a flat chroma-key background from a generated image and writes a PNG/WebP with alpha. Prefer auto-key sampling, soft matte, and despill for antialiased edges. + ## When to use - Generate a new image (concept art, product shot, cover, website hero) - Generate a new image using one or more reference images for style, composition, or mood @@ -87,7 +92,7 @@ Execution strategy: Assume the user wants a new image unless they clearly ask to change an existing one. ## Workflow -1. Decide the top-level mode: built-in by default; fallback CLI if explicitly requested or if the user explicitly needs transparent output. +1. 
Decide the top-level mode: built-in by default, including simple transparent-output requests; fallback CLI only if explicitly requested or after the user explicitly confirms a transparent-output fallback. 2. Decide the intent: `generate` or `edit`. 3. Decide whether the output is preview-only or meant to be consumed by the current project. 4. Decide the execution strategy: single asset vs repeated built-in calls vs CLI `generate-batch`. @@ -102,13 +107,44 @@ Assume the user wants a new image unless they clearly ask to change an existing - If the user's prompt is already specific and detailed, normalize it into a clear spec without adding creative requirements. - If the user's prompt is generic, add tasteful augmentation only when it materially improves output quality. 10. Use the built-in `image_gen` tool by default. -11. If the user explicitly chooses the CLI fallback, or explicitly asks for transparent output, then use the fallback-only docs for model, quality, size, `input_fidelity`, masks, output format, output paths, and network setup. +11. For transparent-output requests, follow the transparent image guidance below: generate with built-in `image_gen` on a flat chroma-key background, copy the selected output into the workspace or `tmp/imagegen/`, run `python scripts/remove_chroma_key.py`, and validate the alpha result before using it. If this path looks unsuitable or fails, ask before switching to CLI `gpt-image-1.5`. 12. Inspect outputs and validate: subject, style, composition, text accuracy, and invariants/avoid items. 13. Iterate with a single targeted change, then re-check. 14. For preview-only work, render the image inline; the underlying file may remain at the default `$CODEX_HOME/generated_images/...` path. 15. For project-bound work, move or copy the selected artifact into the workspace and update any consuming code or references. Never leave a project-referenced asset only at the default `$CODEX_HOME/generated_images/...` path. 16. 
For batches or multi-asset requests, persist every requested deliverable final in the workspace unless the user explicitly asked to keep outputs preview-only. Discarded variants do not need to be kept unless requested.
-17. Always report the final saved path(s) for any workspace-bound asset(s), plus the final prompt or prompt set and whether the built-in tool or fallback CLI mode was used.
+17. If the user explicitly chooses or confirms the CLI fallback, then use the fallback-only docs for model, quality, size, `input_fidelity`, masks, output format, output paths, and network setup.
+18. Always report the final saved path(s) for any workspace-bound asset(s), plus the final prompt or prompt set and whether the built-in tool or fallback CLI mode was used.
+
+## Transparent image requests
+
+Transparent-image requests still use built-in `image_gen` first. Because the built-in tool does not expose a true transparent-background control, create a removable chroma-key source image and then convert the key color to alpha locally.
+
+Default sequence:
+1. Use built-in `image_gen` to generate the requested subject on a perfectly flat solid chroma-key background.
+2. Choose a key color that is unlikely to appear in the subject: default `#00ff00`, use `#ff00ff` for green subjects, and avoid `#0000ff` for blue subjects.
+3. After generation, move or copy the selected source image from `$CODEX_HOME/generated_images/...` into the workspace or `tmp/imagegen/`.
+4. Run `python scripts/remove_chroma_key.py --input SOURCE_IMAGE --out OUTPUT_IMAGE --auto-key border --soft-matte --transparent-threshold 12 --opaque-threshold 220 --despill`.
+5. Validate that the output has an alpha channel, transparent corners, plausible subject coverage, and no obvious key-color fringe. If a thin fringe remains, retry once with `--edge-contract 1`; use `--edge-feather 0.25` only when the edge is visibly stair-stepped and the subject is not shiny or reflective.
+6. 
Save the final alpha PNG/WebP in the project if the asset is project-bound. Never leave a project-referenced transparent asset only under `$CODEX_HOME/*`. + +Prompt transparent requests like this: + +```text +Create the requested subject on a perfectly flat solid #00ff00 chroma-key background for background removal. +The background must be one uniform color with no shadows, gradients, texture, reflections, floor plane, or lighting variation. +Keep the subject fully separated from the background with crisp edges and generous padding. +Do not use #00ff00 anywhere in the subject. +No cast shadow, no contact shadow, no reflection, no watermark, and no text unless explicitly requested. +``` + +Do not automatically use CLI `gpt-image-1.5 --background transparent --output-format png` instead of chroma keying. Ask the user first when the user asks for true/native transparency, when local removal fails validation, or when the requested image is complex: hair, fur, feathers, smoke, glass, liquids, translucent materials, reflective objects, soft shadows, realistic product grounding, or subject colors that conflict with all practical key colors. + +Use a concise confirmation like: + +```text +This likely needs true native transparency. The default built-in path uses a chroma-key background plus local removal, but true transparency requires the CLI fallback with gpt-image-1.5 because gpt-image-2 does not support background=transparent. It also requires OPENAI_API_KEY. Should I proceed with that CLI fallback? +``` ## Prompt augmentation @@ -156,7 +192,7 @@ Edit: - identity-preserve — try-on, person-in-scene; lock face/body/pose. - precise-object-edit — remove/replace a specific element (including interior swaps). - lighting-weather — time-of-day/season/atmosphere changes only. -- background-extraction — transparent background / clean cutout. +- background-extraction — transparent background / clean cutout. 
Use built-in `image_gen` with chroma-key removal first for simple opaque subjects; ask before using CLI true transparency for complex subjects. - style-transfer — apply reference style while changing subject/scene. - compositing — multi-image insert/merge with matched lighting/perspective. - sketch-to-render — drawing/line art to photoreal render. @@ -227,6 +263,7 @@ Constraints: change only the background; keep the product and its edges unchange - If the prompt is generic, add only the extra detail that will materially help. - If the prompt is already detailed, normalize it instead of expanding it. - For CLI fallback only, see `references/cli.md` and `references/image-api.md` for model, `quality`, `input_fidelity`, masks, output format, and output-path guidance. +- For transparent images, use the built-in-first chroma-key workflow unless the request is complex enough to need true CLI transparency; ask before switching to CLI `gpt-image-1.5`. More principles shared by both modes: `references/prompting.md`. Copy/paste specs shared by both modes: `references/sample-prompts.md`. @@ -238,14 +275,14 @@ Asset-type templates (website assets, game assets, wireframes, logo) are consoli The fallback CLI defaults to `gpt-image-2`. -- Use `gpt-image-2` for new CLI/API workflows unless the request needs transparent output. -- If the user explicitly asks for transparent output, use `gpt-image-1.5` and explain that transparent backgrounds are not supported in `gpt-image-2`, the latest model. +- Use `gpt-image-2` for new CLI/API workflows unless the request needs true model-native transparent output. +- If a transparent request may need CLI fallback, ask before using `gpt-image-1.5` unless the user already explicitly requested `gpt-image-1.5`, `scripts/image_gen.py`, or CLI fallback. Explain that the built-in chroma-key path is the default, but true transparency requires `gpt-image-1.5` because `gpt-image-2` does not support `background=transparent`. 
- `gpt-image-2` always uses high fidelity for image inputs; do not set `input_fidelity` with this model. - `gpt-image-2` supports `quality` values `low`, `medium`, `high`, and `auto`. -- Use `quality low` for fast drafts, thumbnails, and quick iterations. Use `auto`, `medium`, or `high` for final assets, dense text, diagrams, identity-sensitive edits, or high-resolution outputs. +- Use `quality low` for fast drafts, thumbnails, and quick iterations. Use `medium`, `high`, or `auto` for final assets, dense text, diagrams, identity-sensitive edits, or high-resolution outputs. - Square images are typically fastest to generate. Use `1024x1024` for fast square drafts. -- If the user asks for 4K-style output, use `3824x2160` for landscape or `2160x3824` for portrait. Do not use `3840x2160`, because the maximum edge length must be less than `3840px`. -- `gpt-image-2` size may be `auto` or `WIDTHxHEIGHT` if all constraints hold: max edge `< 3840px`, both edges multiples of `16px`, long-to-short ratio `<= 3:1`, total pixels between `655,360` and `8,294,400`. +- If the user asks for 4K-style output, use `3840x2160` for landscape or `2160x3840` for portrait. +- `gpt-image-2` size may be `auto` or `WIDTHxHEIGHT` if all constraints hold: max edge `<= 3840px`, both edges multiples of `16px`, long-to-short ratio `<= 3:1`, total pixels between `655,360` and `8,294,400`. 
Popular `gpt-image-2` sizes: - `1024x1024` square @@ -253,8 +290,8 @@ Popular `gpt-image-2` sizes: - `1024x1536` portrait - `2048x2048` 2K square - `2048x1152` 2K landscape -- `3824x2160` near-4K landscape -- `2160x3824` near-4K portrait +- `3840x2160` 4K landscape +- `2160x3840` 4K portrait - `auto` ## Fallback CLI mode only @@ -273,7 +310,7 @@ Required Python package: uv pip install openai ``` -Optional for downscaling only: +Required for local chroma-key removal and optional downscaling: ```bash uv pip install pillow ``` @@ -305,4 +342,5 @@ If installation is not possible in this environment, tell the user which depende - `references/cli.md`: fallback-only CLI usage via `scripts/image_gen.py`. - `references/image-api.md`: fallback-only API/CLI parameter reference. - `references/codex-network.md`: fallback-only network/sandbox troubleshooting for CLI mode. -- `scripts/image_gen.py`: fallback-only CLI implementation. Do not load or use it unless the user explicitly chooses CLI mode or explicitly asks for transparent output. +- `scripts/image_gen.py`: fallback-only CLI implementation. Do not load or use it unless the user explicitly chooses CLI mode or explicitly confirms a transparent request's true CLI transparency fallback. +- `scripts/remove_chroma_key.py`: local post-processing helper for built-in transparent-image requests. diff --git a/codex-rs/skills/src/assets/samples/imagegen/agents/openai.yaml b/codex-rs/skills/src/assets/samples/imagegen/agents/openai.yaml index 6a4b75901311..6caf48e74c4e 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/agents/openai.yaml +++ b/codex-rs/skills/src/assets/samples/imagegen/agents/openai.yaml @@ -3,4 +3,4 @@ interface: short_description: "Generate or edit images for websites, games, and more" icon_small: "./assets/imagegen-small.svg" icon_large: "./assets/imagegen.png" - default_prompt: "Generate or edit the visual assets for this task with the built-in `image_gen` tool by default. 
First confirm that the task actually calls for a raster image; if the project already has SVG/vector/code-native assets and the user wants to extend or match those, do not use this skill. If the task includes reference images, treat them as references unless the user clearly wants an existing image modified. For multi-asset requests, loop built-in calls; the word `batch` alone is not CLI opt-in. Use the fallback CLI only if the user explicitly asks for CLI/API/model controls or explicitly needs transparent output; for transparent output use `gpt-image-1.5` and explain that transparent backgrounds are not supported in `gpt-image-2`, the latest model. Keep CLI-only controls such as `generate-batch`, `quality`, `input_fidelity`, masks, and output paths on that fallback path."
+  default_prompt: "Generate or edit the visual assets for this task with the built-in `image_gen` tool by default. First confirm that the task actually calls for a raster image; if the project already has SVG/vector/code-native assets and the user wants to extend or match those, do not use this skill. If the task includes reference images, treat them as references unless the user clearly wants an existing image modified. For multi-asset requests, loop built-in calls; the word `batch` alone is not CLI opt-in. For transparent-image requests, still use built-in `image_gen` first by prompting for a flat chroma-key background, then remove that background locally with `python scripts/remove_chroma_key.py --input <generated-image> --out <alpha-output.png> --auto-key border --soft-matte --transparent-threshold 12 --opaque-threshold 220 --despill` and save the alpha PNG/WebP. Do not silently downgrade from built-in `image_gen` or CLI `gpt-image-2` to CLI `gpt-image-1.5`. 
If true/native transparency, a complex transparent subject, or failed chroma-key validation may require `gpt-image-1.5 --background transparent --output-format png`, explain that `gpt-image-2` does not support `background=transparent` and ask the user before proceeding, unless the user already explicitly requested `gpt-image-1.5`, `scripts/image_gen.py`, or CLI fallback. Keep CLI-only controls such as `generate-batch`, `quality`, `input_fidelity`, masks, and output paths on that fallback path." diff --git a/codex-rs/skills/src/assets/samples/imagegen/references/cli.md b/codex-rs/skills/src/assets/samples/imagegen/references/cli.md index b58903ae2783..3b57c8e91159 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/references/cli.md +++ b/codex-rs/skills/src/assets/samples/imagegen/references/cli.md @@ -1,6 +1,6 @@ # CLI reference (`scripts/image_gen.py`) -This file is for the fallback CLI mode only. Read it when the user explicitly asks to use `scripts/image_gen.py` / CLI / API / model controls, or when the user explicitly asks for transparent output that requires the `gpt-image-1.5` fallback path. +This file is for the fallback CLI mode only. Read it when the user explicitly asks to use `scripts/image_gen.py` / CLI / API / model controls, or after the user explicitly confirms that a transparent-output request should use the `gpt-image-1.5` true-transparency fallback path. `generate-batch` is a CLI subcommand in this fallback path. It is not a top-level mode of the skill. The word `batch` in a user request is not CLI opt-in by itself. @@ -59,12 +59,13 @@ python "$IMAGE_GEN" edit \ - Use the bundled CLI directly (`python "$IMAGE_GEN" ...`) after activating the correct environment. - Do **not** create one-off runners (for example `gen_images.py`) unless the user explicitly asks for a custom wrapper. - **Never modify** `scripts/image_gen.py`. If something is missing, ask the user before doing anything else. 
+- Do not silently downgrade from CLI `gpt-image-2` or built-in `image_gen` to CLI `gpt-image-1.5`; ask first unless the user already explicitly requested `gpt-image-1.5`, `scripts/image_gen.py`, or CLI fallback. ## Defaults - Model: `gpt-image-2` - Supported model family for this CLI: GPT Image models (`gpt-image-*`) - Size: `auto` -- Quality: `auto` +- Quality: `medium` - Output format: `png` - Default one-off output path: `output/imagegen/output.png` - Background: unspecified unless `--background` is set @@ -74,11 +75,11 @@ python "$IMAGE_GEN" edit \ `gpt-image-2` is the default model for new CLI fallback work. - Use `--quality low` for fast drafts, thumbnails, and quick iterations. -- Use `--quality auto`, `--quality medium`, or `--quality high` for final assets, dense text, diagrams, identity-sensitive edits, and high-resolution outputs. +- Use `--quality medium`, `--quality high`, or `--quality auto` for final assets, dense text, diagrams, identity-sensitive edits, and high-resolution outputs. - Square images are typically fastest. Use `--size 1024x1024` for quick square drafts. -- If the user asks for 4K-style output, use `--size 3824x2160` for landscape or `--size 2160x3824` for portrait. +- If the user asks for 4K-style output, use `--size 3840x2160` for landscape or `--size 2160x3840` for portrait. - Do not pass `--input-fidelity` with `gpt-image-2`; this model always uses high fidelity for image inputs. -- Do not use `--background transparent` with `gpt-image-2`; use `gpt-image-1.5` for transparent output. +- Do not use `--background transparent` with `gpt-image-2`; the default transparent-image workflow uses built-in `image_gen` on a flat chroma-key background plus local removal. Use `gpt-image-1.5` only after the user explicitly confirms the true-transparent CLI fallback, unless they already requested `gpt-image-1.5`, `scripts/image_gen.py`, or CLI fallback. 
Popular `gpt-image-2` sizes: - `1024x1024` @@ -86,15 +87,16 @@ Popular `gpt-image-2` sizes: - `1024x1536` - `2048x2048` - `2048x1152` -- `3824x2160` -- `2160x3824` +- `3840x2160` +- `2160x3840` - `auto` `gpt-image-2` size constraints: -- max edge `< 3840px` +- max edge `<= 3840px` - both edges multiples of `16px` - long edge to short edge ratio `<= 3:1` - total pixels between `655,360` and `8,294,400` +- outputs above `2560x1440` total pixels are experimental Fast draft: @@ -116,17 +118,19 @@ python "$IMAGE_GEN" generate \ --out output/imagegen/mug-hero.png ``` -Near-4K landscape: +4K landscape: ```bash python "$IMAGE_GEN" generate \ --prompt "A detailed architectural visualization at golden hour" \ - --size 3824x2160 \ + --size 3840x2160 \ --quality high \ - --out output/imagegen/architecture-near-4k.png + --out output/imagegen/architecture-4k.png ``` -Transparent background request: +True transparent fallback request: + +Ask for confirmation before using this command unless the user already explicitly requested `gpt-image-1.5`, `scripts/image_gen.py`, or CLI fallback. ```bash python "$IMAGE_GEN" generate \ @@ -137,7 +141,7 @@ python "$IMAGE_GEN" generate \ --out output/imagegen/product-cutout.png ``` -When using this path, explain briefly that transparent backgrounds are not supported in `gpt-image-2`, the latest model, so `gpt-image-1.5` is required. +When using this path, explain briefly that built-in `image_gen` plus chroma-key removal is the default transparent-image path, but this request needs true model-native transparency. `gpt-image-2` does not support `background=transparent`, so `gpt-image-1.5` is required for this confirmed fallback. ## Quality, input fidelity, and masks (CLI fallback only) These are explicit CLI controls. They are not built-in `image_gen` tool arguments. @@ -161,6 +165,10 @@ python "$IMAGE_GEN" edit \ Mask notes: - For multi-image edits, pass repeated `--image` flags. 
Their order is meaningful, so describe each image by index and role in the prompt. - The CLI accepts a single `--mask`. +- Image and mask must be the same size and format and each under 50MB. +- Masks must include an alpha channel. +- If multiple input images are provided, the mask applies to the first image. +- Masking is prompt-guided; do not promise exact pixel-perfect mask boundaries. - Use a PNG mask when possible; the script treats mask handling as best-effort and does not perform full preflight validation beyond file checks/warnings. - In the edit prompt, repeat invariants (`change only the background; keep the subject unchanged`) to reduce drift. @@ -223,7 +231,7 @@ Notes: ## CLI notes - Supported sizes depend on the model. `gpt-image-2` supports flexible constrained sizes; older GPT Image models support `1024x1024`, `1536x1024`, `1024x1536`, or `auto`. -- Transparent backgrounds require `output_format` to be `png` or `webp` and are not supported by `gpt-image-2`. +- True transparent CLI outputs require `output_format` to be `png` or `webp` and are not supported by `gpt-image-2`. - `--prompt-file`, `--output-compression`, `--moderation`, `--max-attempts`, `--fail-fast`, `--force`, and `--no-augment` are supported. - This CLI is intended for GPT Image models. Do not assume older non-GPT image-model behavior applies here. 
@@ -231,3 +239,4 @@ Notes: - API parameter quick reference for fallback CLI mode: `references/image-api.md` - Prompt examples shared across both top-level modes: `references/sample-prompts.md` - Network/sandbox notes for fallback CLI mode: `references/codex-network.md` +- Built-in-first transparent image workflow: `SKILL.md` and `scripts/remove_chroma_key.py` diff --git a/codex-rs/skills/src/assets/samples/imagegen/references/codex-network.md b/codex-rs/skills/src/assets/samples/imagegen/references/codex-network.md index 54c0db2c3b44..5ce1fbc748bd 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/references/codex-network.md +++ b/codex-rs/skills/src/assets/samples/imagegen/references/codex-network.md @@ -1,6 +1,6 @@ # Codex network approvals / sandbox notes -This file is for the fallback CLI mode only. Read it when the user explicitly asks to use `scripts/image_gen.py` / CLI / API / model controls, or when the user explicitly asks for transparent output that requires the `gpt-image-1.5` fallback path. +This file is for the fallback CLI mode only. Read it when the user explicitly asks to use `scripts/image_gen.py` / CLI / API / model controls, or after the user explicitly confirms that a transparent-output request should use the `gpt-image-1.5` true-transparency fallback path. This guidance is intentionally isolated from `SKILL.md` because it can vary by environment and may become stale. Prefer the defaults in your environment when in doubt. diff --git a/codex-rs/skills/src/assets/samples/imagegen/references/image-api.md b/codex-rs/skills/src/assets/samples/imagegen/references/image-api.md index 0d56d431bc5f..5e32e7c9fba3 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/references/image-api.md +++ b/codex-rs/skills/src/assets/samples/imagegen/references/image-api.md @@ -1,6 +1,6 @@ # Image API quick reference -This file is for the fallback CLI mode only. 
Use it when the user explicitly asks to use `scripts/image_gen.py` / CLI / API / model controls, or when the user explicitly asks for transparent output that requires the `gpt-image-1.5` fallback path. +This file is for the fallback CLI mode only. Use it when the user explicitly asks to use `scripts/image_gen.py` / CLI / API / model controls, or after the user explicitly confirms that a transparent-output request should use the `gpt-image-1.5` true-transparency fallback path. These parameters describe the Image API and bundled CLI fallback surface. Do not assume they are normal arguments on the built-in `image_gen` tool. @@ -13,7 +13,7 @@ These parameters describe the Image API and bundled CLI fallback surface. Do not | Model | Quality | Input fidelity | Resolutions | Recommended use | | --- | --- | --- | --- | --- | | `gpt-image-2` | `low`, `medium`, `high`, `auto` | Always high fidelity for image inputs; do not set `input_fidelity` | `auto` or flexible sizes that satisfy the constraints below | Default for new CLI/API workflows: high-quality generation and editing, text-heavy images, photorealism, compositing, identity-sensitive edits, and workflows where fewer retries matter | -| `gpt-image-1.5` | `low`, `medium`, `high`, `auto` | `low`, `high` | `1024x1024`, `1024x1536`, `1536x1024`, `auto` | Transparent backgrounds and backward-compatible workflows | +| `gpt-image-1.5` | `low`, `medium`, `high`, `auto` | `low`, `high` | `1024x1024`, `1024x1536`, `1536x1024`, `auto` | True transparent-background fallback and backward-compatible workflows | | `gpt-image-1` | `low`, `medium`, `high`, `auto` | `low`, `high` | `1024x1024`, `1024x1536`, `1536x1024`, `auto` | Legacy compatibility | | `gpt-image-1-mini` | `low`, `medium`, `high`, `auto` | `low`, `high` | `1024x1024`, `1024x1536`, `1536x1024`, `auto` | Cost-sensitive draft batches and lower-stakes previews | @@ -21,7 +21,7 @@ These parameters describe the Image API and bundled CLI fallback surface. 
Do not `gpt-image-2` accepts `auto` or any `WIDTHxHEIGHT` size that satisfies all constraints: -- Maximum edge length must be less than `3840px`. +- Maximum edge length must be less than or equal to `3840px`. - Both edges must be multiples of `16px`. - Long edge to short edge ratio must not exceed `3:1`. - Total pixels must be at least `655,360` and no more than `8,294,400`. @@ -35,11 +35,11 @@ Popular sizes: | Portrait | `1024x1536` | Standard portrait | | 2K square | `2048x2048` | Larger square output | | 2K landscape | `2048x1152` | Widescreen output | -| Near-4K landscape | `3824x2160` | Use instead of `3840x2160` | -| Near-4K portrait | `2160x3824` | Use instead of `2160x3840` | +| 4K landscape | `3840x2160` | Widescreen 4K output | +| 4K portrait | `2160x3840` | Vertical 4K output | | Auto | `auto` | Default size | -Square images are typically fastest to generate. For 4K-style output, use `3824x2160` or `2160x3824`, not `3840x2160`, because the maximum edge length must be less than `3840px`. +Square images are typically fastest to generate. For 4K-style output, use `3840x2160` or `2160x3840`. ## Endpoints - Generate: `POST /v1/images/generations` (`client.images.generate(...)`) @@ -68,7 +68,9 @@ Model-specific note for `input_fidelity`: ## Transparent backgrounds -`gpt-image-2` does not currently support transparent backgrounds. If the user explicitly asks for a transparent image or transparent background, use `gpt-image-1.5` with `background=transparent` and a transparent-capable output format such as `png` or `webp`. +`gpt-image-2` does not currently support the Image API `background=transparent` parameter. The skill's default transparent-image path is built-in `image_gen` with a flat chroma-key background, followed by local alpha extraction with `python scripts/remove_chroma_key.py`. 
+ +Use CLI `gpt-image-1.5` with `background=transparent` and a transparent-capable output format such as `png` or `webp` only after the user explicitly confirms that fallback, unless they already requested `gpt-image-1.5`, `scripts/image_gen.py`, or CLI fallback. If the user asks for true/native transparency, the subject is too complex for clean chroma-key removal, or local background removal fails validation, explain the tradeoff and ask before switching. ## Output - `data[]` list with `b64_json` per image @@ -79,9 +81,9 @@ Model-specific note for `input_fidelity`: - Use the edits endpoint when the user requests changes to an existing image. - Masking is prompt-guided; exact shapes are not guaranteed. - Large sizes and high quality increase latency and cost. -- Use `quality=low` for fast drafts, thumbnails, and quick iterations. Use `auto`, `medium`, or `high` for final assets, dense text, diagrams, identity-sensitive edits, or high-resolution outputs. +- Use `quality=low` for fast drafts, thumbnails, and quick iterations. Use `medium` or `high` for final assets, dense text, diagrams, identity-sensitive edits, or high-resolution outputs. - High `input_fidelity` can materially increase input token usage on models that support it. -- If a request fails because a specific option is unsupported by the selected GPT Image model, retry manually without that option only when the option is not required by the user. If transparent output is required, switch to `gpt-image-1.5` instead of dropping `background=transparent`. +- If a request fails because a specific option is unsupported by the selected GPT Image model, retry manually without that option only when the option is not required by the user. If true transparent CLI output is required, ask before switching to `gpt-image-1.5` instead of dropping `background=transparent`, unless the user already explicitly chose that fallback. 
## Important boundary
- `quality`, `input_fidelity`, explicit masks, `background`, `output_format`, and related parameters are fallback-only execution controls.
diff --git a/codex-rs/skills/src/assets/samples/imagegen/references/prompting.md b/codex-rs/skills/src/assets/samples/imagegen/references/prompting.md
index 73538ba32b88..45b92b37353c 100644
--- a/codex-rs/skills/src/assets/samples/imagegen/references/prompting.md
+++ b/codex-rs/skills/src/assets/samples/imagegen/references/prompting.md
@@ -15,6 +15,7 @@ This file is about prompt structure, specificity, and iteration. Fallback-only e
- [Text in images](#text-in-images)
- [Input images and references](#input-images-and-references)
- [Iterate deliberately](#iterate-deliberately)
+- [Transparent images](#transparent-images)
- [Fallback-only execution controls](#fallback-only-execution-controls)
- [Use-case tips](#use-case-tips)
- [Where to find copy/paste recipes](#where-to-find-copypaste-recipes)
@@ -71,14 +72,23 @@ Do not add:
- Re-specify critical constraints when you iterate.
- Prefer one targeted follow-up at a time over rewriting the whole prompt.
+## Transparent images
+- Use built-in `image_gen` first for transparent-image requests. If the subject is clearly too complex for chroma-key removal, explain the fallback and ask before switching to CLI.
+- Prompt for a perfectly flat solid chroma-key background, usually `#00ff00`; use `#ff00ff` when the subject is green, and avoid key colors that appear in the subject.
+- Explicitly prohibit shadows, gradients, floor planes, reflections, texture, and lighting variation in the background.
+- Ask for crisp edges, generous padding, and no use of the key color inside the subject.
+- After generation, remove the background locally with `python scripts/remove_chroma_key.py --input <generated-image> --out <alpha-output.png> --auto-key border --soft-matte --transparent-threshold 12 --opaque-threshold 220 --despill` and validate the alpha result before shipping it. 
+- Use soft matte and despill for antialiased edges; hard tolerance-only removal is mainly for flat pixel-art or exact-color fixtures. +- Use CLI `gpt-image-1.5 --background transparent --output-format png` only after the user explicitly confirms the fallback, or when the user already explicitly requested `gpt-image-1.5`, `scripts/image_gen.py`, or CLI fallback. Ask first for true/native transparency requests, failed chroma-key validation, or complex transparent subjects such as hair, fur, glass, smoke, liquids, translucent materials, reflective objects, or soft shadows. + ## Fallback-only execution controls - `quality`, `input_fidelity`, explicit masks, output format, and output paths are fallback-only execution controls. - Do not assume they are built-in `image_gen` tool arguments. -- If the user explicitly chooses CLI fallback or explicitly asks for transparent output, see `references/cli.md` and `references/image-api.md` for those controls. -- In CLI fallback mode, `gpt-image-2` is the default. It supports `quality=low|medium|high|auto`; use `low` for fast drafts and thumbnails, and move to `auto`, `medium`, or `high` for final assets. +- If the user explicitly chooses CLI fallback, see `references/cli.md` and `references/image-api.md` for those controls. +- In CLI fallback mode, `gpt-image-2` is the default. It supports `quality=low|medium|high|auto`; use `low` for fast drafts and thumbnails, and move to `medium`, `high`, or `auto` for final assets. - `gpt-image-2` always uses high fidelity for image inputs, so do not set `input_fidelity` with that model. -- If the user explicitly asks for transparent output, use `gpt-image-1.5` and explain that transparent backgrounds are not supported in `gpt-image-2`, the latest model. -- If the user asks for 4K-style output with `gpt-image-2`, use `3824x2160` for landscape or `2160x3824` for portrait. 
+- If a transparent request needs true CLI transparency, ask before using `gpt-image-1.5` unless the user already explicitly chose it. Explain that built-in chroma-key removal is the default path, but `gpt-image-2` does not support `background=transparent`. +- If the user asks for 4K-style output with `gpt-image-2`, use `3840x2160` for landscape or `2160x3840` for portrait. ## Use-case tips Generate: @@ -99,7 +109,7 @@ Edit: - identity-preserve: Lock identity (face, body, pose, hair, expression); change only the specified elements; match lighting and shadows. - precise-object-edit: Specify exactly what to remove/replace; preserve surrounding texture and lighting; keep everything else unchanged. - lighting-weather: Change only environmental conditions (light, shadows, atmosphere, precipitation); keep geometry, framing, and subject identity. -- background-extraction: Request a clean cutout; crisp silhouette; no halos; preserve label text exactly; no restyling. +- background-extraction: For simple opaque subjects, request a clean cutout on a perfectly flat chroma-key background; crisp silhouette; generous padding; no shadows; no halos; preserve label text exactly; no restyling. Ask before using true CLI transparency for complex subjects. - style-transfer: Specify style cues to preserve (palette, texture, brushwork) and what must change; add `no extra elements` to prevent drift. - compositing: Reference inputs by index; specify what moves where; match lighting, perspective, and scale; keep the base framing unchanged. - sketch-to-render: Preserve layout, proportions, and perspective; choose materials and lighting that support the supplied sketch without adding new elements. 
diff --git a/codex-rs/skills/src/assets/samples/imagegen/references/sample-prompts.md b/codex-rs/skills/src/assets/samples/imagegen/references/sample-prompts.md index 79d619c49021..f8284e897fac 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/references/sample-prompts.md +++ b/codex-rs/skills/src/assets/samples/imagegen/references/sample-prompts.md @@ -2,7 +2,7 @@ These prompt recipes are shared across both top-level modes of the skill: - built-in `image_gen` tool (default) -- `scripts/image_gen.py` CLI fallback for explicit CLI/API/model requests or explicit transparent-output requests +- `scripts/image_gen.py` CLI fallback for explicit CLI/API/model requests or user-confirmed true-transparent-output fallback requests Use these as starting points. They are intentionally complete prompt recipes, not the default amount of augmentation to add to every user request. @@ -13,13 +13,13 @@ When adapting a user's prompt: The labeled lines are prompt scaffolding, not a closed schema. `Asset type` and `Input images` are prompt-only scaffolding; the CLI does not expose them as dedicated flags. -Execution details such as explicit CLI flags, `quality`, `input_fidelity`, masks, output formats, and local output paths depend on mode. Use the built-in tool by default; only apply CLI-specific controls when the user explicitly opts into fallback mode or explicitly asks for transparent output. +Execution details such as explicit CLI flags, `quality`, `input_fidelity`, masks, output formats, and local output paths depend on mode. Use the built-in tool by default, including simple transparent-image requests. For transparent images, prompt for a flat chroma-key background and remove it locally with `python scripts/remove_chroma_key.py`; only apply CLI-specific controls when the user explicitly opts into fallback mode or explicitly confirms that the transparent request should use true CLI transparency. CLI model notes: - `gpt-image-2` is the fallback CLI default for new workflows. 
- `gpt-image-2` supports `quality` values `low`, `medium`, `high`, and `auto`.
-- For 4K-style `gpt-image-2` output, use `3824x2160` or `2160x3824` instead of `3840x2160`.
-- If transparent output is explicitly required, use `gpt-image-1.5` and explain that transparent backgrounds are not supported in `gpt-image-2`, the latest model.
+- For 4K-style `gpt-image-2` output, use `3840x2160` or `2160x3840`.
+- If transparent output needs true CLI fallback, ask before using `gpt-image-1.5` unless the user already explicitly requested `gpt-image-1.5`, `scripts/image_gen.py`, or CLI fallback. Explain that built-in chroma-key removal is the default path, but `gpt-image-2` does not support `background=transparent`.
- Do not set `input_fidelity` with `gpt-image-2`; image inputs already use high fidelity.

For prompting principles (structure, specificity, invariants, iteration), see `references/prompting.md`.
@@ -391,10 +391,11 @@ Constraints: preserve subject identity, geometry, camera angle, and composition;
Use case: background-extraction
Input images: Image 1: product photo
Primary request: isolate the product on a clean transparent background
-Constraints: crisp silhouette; no halos or fringing; preserve label text exactly; no restyling
+Scene/backdrop: perfectly flat solid #00ff00 chroma-key background for local background removal
+Constraints: background must be one uniform color with no shadows, gradients, texture, reflections, floor plane, or lighting variation; crisp silhouette; generous padding; no halos or fringing; preserve label text exactly; no restyling; do not use #00ff00 anywhere in the subject
```

-CLI note: if transparent output is explicitly required, use `gpt-image-1.5` because `gpt-image-2` does not currently support transparent backgrounds.
+Post-process note: after built-in generation, run `python scripts/remove_chroma_key.py --input <generated-image> --out <alpha-output.png> --auto-key border --soft-matte --transparent-threshold 12 --opaque-threshold 220 --despill`. 
Ask before using CLI `gpt-image-1.5 --background transparent --output-format png` for true/native transparency, failed chroma-key validation, or complex subjects such as hair, fur, glass, smoke, liquids, translucent materials, reflections, or soft shadows, unless the user already explicitly requested `gpt-image-1.5`, `scripts/image_gen.py`, or CLI fallback. ### style-transfer ``` diff --git a/codex-rs/skills/src/assets/samples/imagegen/scripts/image_gen.py b/codex-rs/skills/src/assets/samples/imagegen/scripts/image_gen.py index 602dc9df0069..9e0ea6717c55 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/scripts/image_gen.py +++ b/codex-rs/skills/src/assets/samples/imagegen/scripts/image_gen.py @@ -24,7 +24,7 @@ DEFAULT_MODEL = "gpt-image-2" DEFAULT_SIZE = "auto" -DEFAULT_QUALITY = "auto" +DEFAULT_QUALITY = "medium" DEFAULT_OUTPUT_FORMAT = "png" DEFAULT_CONCURRENCY = 5 DEFAULT_DOWNSCALE_SUFFIX = "-web" @@ -39,10 +39,8 @@ GPT_IMAGE_2_MODEL = "gpt-image-2" GPT_IMAGE_2_MIN_PIXELS = 655_360 GPT_IMAGE_2_MAX_PIXELS = 8_294_400 -GPT_IMAGE_2_MAX_EDGE_EXCLUSIVE = 3840 +GPT_IMAGE_2_MAX_EDGE = 3840 GPT_IMAGE_2_MAX_RATIO = 3.0 -GPT_IMAGE_2_NEAR_4K_LANDSCAPE = "3824x2160" -GPT_IMAGE_2_NEAR_4K_PORTRAIT = "2160x3824" MAX_IMAGE_BYTES = 50 * 1024 * 1024 MAX_BATCH_JOBS = 500 @@ -133,14 +131,8 @@ def _validate_gpt_image_2_size(size: str) -> None: min_edge = min(width, height) total_pixels = width * height - if max_edge >= GPT_IMAGE_2_MAX_EDGE_EXCLUSIVE: - hint = GPT_IMAGE_2_NEAR_4K_LANDSCAPE - if height > width: - hint = GPT_IMAGE_2_NEAR_4K_PORTRAIT - _die( - "gpt-image-2 size maximum edge length must be less than 3840px. " - f"For 4K-style output, use {hint} instead of {size}." 
- ) + if max_edge > GPT_IMAGE_2_MAX_EDGE: + _die("gpt-image-2 size maximum edge length must be less than or equal to 3840px.") if width % 16 != 0 or height % 16 != 0: _die("gpt-image-2 size width and height must be multiples of 16px.") if max_edge / min_edge > GPT_IMAGE_2_MAX_RATIO: diff --git a/codex-rs/skills/src/assets/samples/imagegen/scripts/remove_chroma_key.py b/codex-rs/skills/src/assets/samples/imagegen/scripts/remove_chroma_key.py new file mode 100644 index 000000000000..b9757c7d1892 --- /dev/null +++ b/codex-rs/skills/src/assets/samples/imagegen/scripts/remove_chroma_key.py @@ -0,0 +1,422 @@ +#!/usr/bin/env python3 +"""Remove a solid chroma-key background from an image. + +This helper supports the imagegen skill's built-in-first transparent workflow: +generate an image on a flat key color, then convert that key color to alpha. +""" + +from __future__ import annotations + +import argparse +from io import BytesIO +from pathlib import Path +import re +from statistics import median +import sys +from typing import Iterable, Tuple + + +Color = Tuple[int, int, int] +KEY_DOMINANCE_THRESHOLD = 16.0 +ALPHA_NOISE_FLOOR = 8 + + +def _die(message: str, code: int = 1) -> None: + print(f"Error: {message}", file=sys.stderr) + raise SystemExit(code) + + +def _dependency_hint(package: str) -> str: + return ( + "Activate the repo-selected environment first, then install it with " + f"`uv pip install {package}`. If this repo uses a local virtualenv, start with " + "`source .venv/bin/activate`; otherwise use this repo's configured shared fallback " + "environment." + ) + + +def _load_pillow(): + try: + from PIL import Image, ImageFilter + except ImportError: + _die(f"Pillow is required for chroma-key removal. 
{_dependency_hint('pillow')}") + return Image, ImageFilter + + +def _parse_key_color(raw: str) -> Color: + value = raw.strip() + match = re.fullmatch(r"#?([0-9a-fA-F]{6})", value) + if not match: + _die("key color must be a hex RGB value like #00ff00.") + hex_value = match.group(1) + return ( + int(hex_value[0:2], 16), + int(hex_value[2:4], 16), + int(hex_value[4:6], 16), + ) + + +def _validate_args(args: argparse.Namespace) -> None: + if args.tolerance < 0 or args.tolerance > 255: + _die("--tolerance must be between 0 and 255.") + if args.transparent_threshold < 0 or args.transparent_threshold > 255: + _die("--transparent-threshold must be between 0 and 255.") + if args.opaque_threshold < 0 or args.opaque_threshold > 255: + _die("--opaque-threshold must be between 0 and 255.") + if args.soft_matte and args.transparent_threshold >= args.opaque_threshold: + _die("--transparent-threshold must be lower than --opaque-threshold.") + if args.edge_feather < 0 or args.edge_feather > 64: + _die("--edge-feather must be between 0 and 64.") + if args.edge_contract < 0 or args.edge_contract > 16: + _die("--edge-contract must be between 0 and 16.") + + src = Path(args.input) + if not src.exists(): + _die(f"Input image not found: {src}") + + out = Path(args.out) + if out.exists() and not args.force: + _die(f"Output already exists: {out} (use --force to overwrite)") + + if out.suffix.lower() not in {".png", ".webp"}: + _die("--out must end in .png or .webp so the alpha channel is preserved.") + + +def _channel_distance(a: Color, b: Color) -> int: + return max(abs(a[0] - b[0]), abs(a[1] - b[1]), abs(a[2] - b[2])) + + +def _clamp_channel(value: float) -> int: + return max(0, min(255, int(round(value)))) + + +def _smoothstep(value: float) -> float: + value = max(0.0, min(1.0, value)) + return value * value * (3.0 - 2.0 * value) + + +def _soft_alpha(distance: int, transparent_threshold: float, opaque_threshold: float) -> int: + if distance <= transparent_threshold: + return 0 + if 
distance >= opaque_threshold: + return 255 + ratio = (float(distance) - transparent_threshold) / ( + opaque_threshold - transparent_threshold + ) + return _clamp_channel(255.0 * _smoothstep(ratio)) + + +def _dominance_alpha(rgb: Color, key: Color) -> int: + spill_channels = _spill_channels(key) + if not spill_channels: + return 255 + + channels = [float(value) for value in rgb] + non_spill = [idx for idx in range(3) if idx not in spill_channels] + key_strength = ( + min(channels[idx] for idx in spill_channels) + if len(spill_channels) > 1 + else channels[spill_channels[0]] + ) + non_key_strength = max((channels[idx] for idx in non_spill), default=0.0) + dominance = key_strength - non_key_strength + if dominance <= 0: + return 255 + + denominator = max(1.0, float(max(key)) - non_key_strength) + alpha = 1.0 - min(1.0, dominance / denominator) + return _clamp_channel(alpha * 255.0) + + +def _spill_channels(key: Color) -> list[int]: + key_max = max(key) + if key_max < 128: + return [] + return [idx for idx, value in enumerate(key) if value >= key_max - 16 and value >= 128] + + +def _key_channel_dominance(rgb: Color, key: Color) -> float: + spill_channels = _spill_channels(key) + if not spill_channels: + return 0.0 + + channels = [float(value) for value in rgb] + non_spill = [idx for idx in range(3) if idx not in spill_channels] + key_strength = ( + min(channels[idx] for idx in spill_channels) + if len(spill_channels) > 1 + else channels[spill_channels[0]] + ) + non_key_strength = max((channels[idx] for idx in non_spill), default=0.0) + return key_strength - non_key_strength + + +def _looks_key_colored(rgb: Color, key: Color, distance: int) -> bool: + if distance <= 32: + return True + + spill_channels = _spill_channels(key) + if not spill_channels: + return True + + return _key_channel_dominance(rgb, key) >= KEY_DOMINANCE_THRESHOLD + + +def _cleanup_spill(rgb: Color, key: Color, alpha: int = 255) -> Color: + if alpha >= 252: + return rgb + + spill_channels = 
_spill_channels(key) + if not spill_channels: + return rgb + + channels = [float(value) for value in rgb] + non_spill = [idx for idx in range(3) if idx not in spill_channels] + if non_spill: + anchor = max(channels[idx] for idx in non_spill) + cap = max(0.0, anchor - 1.0) + for idx in spill_channels: + if channels[idx] > cap: + channels[idx] = cap + + return ( + _clamp_channel(channels[0]), + _clamp_channel(channels[1]), + _clamp_channel(channels[2]), + ) + + +def _alpha_for_pixels( + pixels: Iterable[tuple[int, int, int, int]], + *, + key: Color, + tolerance: int, + spill_cleanup: bool, + soft_matte: bool, + transparent_threshold: float, + opaque_threshold: float, +) -> tuple[list[tuple[int, int, int, int]], int]: + output: list[tuple[int, int, int, int]] = [] + transparent = 0 + + for red, green, blue, alpha in pixels: + rgb = (red, green, blue) + distance = _channel_distance(rgb, key) + key_like = _looks_key_colored(rgb, key, distance) + output_alpha = ( + min( + _soft_alpha(distance, transparent_threshold, opaque_threshold), + _dominance_alpha(rgb, key), + ) + if soft_matte and key_like + else (0 if distance <= tolerance else 255) + ) + output_alpha = int(round(output_alpha * (alpha / 255.0))) + if 0 < output_alpha <= ALPHA_NOISE_FLOOR: + output_alpha = 0 + + if output_alpha == 0: + output.append((0, 0, 0, 0)) + transparent += 1 + continue + + if spill_cleanup and key_like: + red, green, blue = _cleanup_spill(rgb, key, output_alpha) + output.append((red, green, blue, output_alpha)) + + return output, transparent + + +def _contract_alpha(image, pixels: int): + if pixels == 0: + return image + + _, ImageFilter = _load_pillow() + alpha = image.split()[-1] + for _ in range(pixels): + alpha = alpha.filter(ImageFilter.MinFilter(3)) + image.putalpha(alpha) + return image + + +def _apply_edge_feather(image, radius: float): + if radius == 0: + return image + + _, ImageFilter = _load_pillow() + alpha = image.split()[-1] + alpha = 
alpha.filter(ImageFilter.GaussianBlur(radius=radius)) + image.putalpha(alpha) + return image + + +def _encode_image(image, output_format: str) -> bytes: + out = BytesIO() + image.save(out, format=output_format.upper()) + return out.getvalue() + + +def _get_pixels(image): + getter = getattr(image, "get_flattened_data", None) + return getter() if getter else image.getdata() + + +def _sample_border_key(image, mode: str) -> Color: + rgb = image.convert("RGB") + width, height = rgb.size + samples: list[Color] = [] + + if mode == "corners": + patch = max(1, min(width, height, 12)) + boxes = [ + (0, 0, patch, patch), + (width - patch, 0, width, patch), + (0, height - patch, patch, height), + (width - patch, height - patch, width, height), + ] + for left, top, right, bottom in boxes: + for y in range(top, bottom): + for x in range(left, right): + samples.append(rgb.getpixel((x, y))) + else: + band = max(1, min(width, height, 6)) + step = max(1, min(width, height) // 256) + for x in range(0, width, step): + for y in range(band): + samples.append(rgb.getpixel((x, y))) + samples.append(rgb.getpixel((x, height - 1 - y))) + for y in range(0, height, step): + for x in range(band): + samples.append(rgb.getpixel((x, y))) + samples.append(rgb.getpixel((width - 1 - x, y))) + + if not samples: + _die("Could not sample background key color from image border.") + + return ( + int(round(median(sample[0] for sample in samples))), + int(round(median(sample[1] for sample in samples))), + int(round(median(sample[2] for sample in samples))), + ) + + +def _remove_chroma_key(args: argparse.Namespace) -> None: + Image, _ = _load_pillow() + src = Path(args.input) + out = Path(args.out) + + with Image.open(src) as image: + rgba = image.convert("RGBA") + key = ( + _sample_border_key(rgba, args.auto_key) + if args.auto_key != "none" + else _parse_key_color(args.key_color) + ) + + pixels, transparent = _alpha_for_pixels( + _get_pixels(rgba), + key=key, + tolerance=args.tolerance, + 
spill_cleanup=args.spill_cleanup, + soft_matte=args.soft_matte, + transparent_threshold=args.transparent_threshold, + opaque_threshold=args.opaque_threshold, + ) + rgba.putdata(pixels) + rgba = _contract_alpha(rgba, args.edge_contract) + rgba = _apply_edge_feather(rgba, args.edge_feather) + + alpha_values = [pixel[3] for pixel in _get_pixels(rgba)] + total = len(alpha_values) + transparent_after = sum(1 for alpha in alpha_values if alpha == 0) + partial_after = sum(1 for alpha in alpha_values if 0 < alpha < 255) + + out.parent.mkdir(parents=True, exist_ok=True) + output_format = "PNG" if out.suffix.lower() == ".png" else "WEBP" + out.write_bytes(_encode_image(rgba, output_format)) + + print(f"Wrote {out}") + print(f"Key color: #{key[0]:02x}{key[1]:02x}{key[2]:02x}") + print(f"Transparent pixels: {transparent_after}/{total}") + print(f"Partially transparent pixels: {partial_after}/{total}") + if transparent == 0: + print("Warning: no pixels matched the key color before feathering.", file=sys.stderr) + + +def _build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Remove a solid chroma-key background and write an image with alpha." 
+ ) + parser.add_argument("--input", required=True, help="Input image path.") + parser.add_argument("--out", required=True, help="Output .png or .webp path.") + parser.add_argument( + "--key-color", + default="#00ff00", + help="Hex RGB key color to remove, for example #00ff00.", + ) + parser.add_argument( + "--tolerance", + type=int, + default=12, + help="Hard-key per-channel tolerance for matching the key color, 0-255.", + ) + parser.add_argument( + "--auto-key", + choices=["none", "corners", "border"], + default="none", + help="Sample the key color from image corners or border instead of --key-color.", + ) + parser.add_argument( + "--soft-matte", + action="store_true", + help="Use a smooth alpha ramp between transparent and opaque thresholds.", + ) + parser.add_argument( + "--transparent-threshold", + type=float, + default=12.0, + help="Soft-matte distance at or below which pixels become fully transparent.", + ) + parser.add_argument( + "--opaque-threshold", + type=float, + default=96.0, + help="Soft-matte distance at or above which pixels become fully opaque.", + ) + parser.add_argument( + "--edge-feather", + type=float, + default=0.0, + help="Optional alpha blur radius for softened edges, 0-64.", + ) + parser.add_argument( + "--edge-contract", + type=int, + default=0, + help="Shrink the visible alpha matte by this many pixels before feathering.", + ) + parser.add_argument( + "--spill-cleanup", + dest="spill_cleanup", + action="store_true", + help="Reduce obvious key-color spill on opaque pixels.", + ) + parser.add_argument( + "--despill", + dest="spill_cleanup", + action="store_true", + help="Alias for --spill-cleanup; decontaminate key-color edge spill.", + ) + parser.add_argument("--force", action="store_true", help="Overwrite an existing output file.") + return parser + + +def main() -> None: + parser = _build_parser() + args = parser.parse_args() + _validate_args(args) + _remove_chroma_key(args) + + +if __name__ == "__main__": + main() From 
cd15fcc9ae04038d1d1f995b6c964a47b88766f7 Mon Sep 17 00:00:00 2001 From: Gabriel C Date: Wed, 22 Apr 2026 22:33:55 +0800 Subject: [PATCH 4/5] Simplify ImageGen default prompt --- codex-rs/skills/src/assets/samples/imagegen/agents/openai.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codex-rs/skills/src/assets/samples/imagegen/agents/openai.yaml b/codex-rs/skills/src/assets/samples/imagegen/agents/openai.yaml index 6caf48e74c4e..5e01d4410ef4 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/agents/openai.yaml +++ b/codex-rs/skills/src/assets/samples/imagegen/agents/openai.yaml @@ -3,4 +3,4 @@ interface: short_description: "Generate or edit images for websites, games, and more" icon_small: "./assets/imagegen-small.svg" icon_large: "./assets/imagegen.png" - default_prompt: "Generate or edit the visual assets for this task with the built-in `image_gen` tool by default. First confirm that the task actually calls for a raster image; if the project already has SVG/vector/code-native assets and the user wants to extend or match those, do not use this skill. If the task includes reference images, treat them as references unless the user clearly wants an existing image modified. For multi-asset requests, loop built-in calls; the word `batch` alone is not CLI opt-in. For transparent-image requests, still use built-in `image_gen` first by prompting for a flat chroma-key background, then remove that background locally with `python scripts/remove_chroma_key.py --input --out --auto-key border --soft-matte --transparent-threshold 12 --opaque-threshold 220 --despill` and save the alpha PNG/WebP. Do not silently downgrade from built-in `image_gen` or CLI `gpt-image-2` to CLI `gpt-image-1.5`. 
If true/native transparency, a complex transparent subject, or failed chroma-key validation may require `gpt-image-1.5 --background transparent --output-format png`, explain that `gpt-image-2` does not support `background=transparent` and ask the user before proceeding, unless the user already explicitly requested `gpt-image-1.5`, `scripts/image_gen.py`, or CLI fallback. Keep CLI-only controls such as `generate-batch`, `quality`, `input_fidelity`, masks, and output paths on that fallback path." + default_prompt: "Use $imagegen to make or edit an image for this project." From 74a06a28b1150c6133103500b2ef64f3fa26f341 Mon Sep 17 00:00:00 2001 From: Gabriel C Date: Wed, 22 Apr 2026 22:41:52 +0800 Subject: [PATCH 5/5] Fix imagegen chroma-key helper memory use --- .../src/assets/samples/imagegen/SKILL.md | 20 +++- .../assets/samples/imagegen/references/cli.md | 2 +- .../samples/imagegen/references/image-api.md | 2 +- .../samples/imagegen/references/prompting.md | 2 +- .../imagegen/references/sample-prompts.md | 4 +- .../imagegen/scripts/remove_chroma_key.py | 110 ++++++++++-------- 6 files changed, 84 insertions(+), 56 deletions(-) diff --git a/codex-rs/skills/src/assets/samples/imagegen/SKILL.md b/codex-rs/skills/src/assets/samples/imagegen/SKILL.md index 28f44a810be7..4285e5e6419e 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/SKILL.md +++ b/codex-rs/skills/src/assets/samples/imagegen/SKILL.md @@ -23,7 +23,7 @@ Within CLI fallback, the CLI exposes three subcommands: Rules: - Use the built-in `image_gen` tool by default for normal image generation and editing requests. - Do not switch to CLI fallback for ordinary quality, size, or file-path control. -- If the user explicitly asks for a transparent image/background, stay on built-in `image_gen` first: prompt for a flat removable chroma-key background, then remove it locally with `python scripts/remove_chroma_key.py`. 
+- If the user explicitly asks for a transparent image/background, stay on built-in `image_gen` first: prompt for a flat removable chroma-key background, then remove it locally with the installed helper at `$CODEX_HOME/skills/.system/imagegen/scripts/remove_chroma_key.py`. - Never silently switch from built-in `image_gen` or CLI `gpt-image-2` to CLI `gpt-image-1.5`. Treat this as a model/path downgrade and ask the user before doing it, unless the user has already explicitly requested `gpt-image-1.5`, `scripts/image_gen.py`, or CLI fallback. - If a transparent request appears too complex for clean chroma-key removal, asks for true/native transparency, or local removal fails validation, explain that true transparency requires CLI `gpt-image-1.5 --background transparent --output-format png` because `gpt-image-2` does not support `background=transparent`, then ask whether to proceed. Run the CLI fallback only after the user confirms. - The word `batch` by itself does not mean CLI fallback. If the user asks for many assets or says to batch-generate assets without explicitly asking for CLI/API/model controls, stay on the built-in path and issue one built-in call per requested asset or variant. @@ -51,7 +51,7 @@ Fallback-only docs/resources for CLI mode: - `scripts/image_gen.py` Local post-processing helper: -- `scripts/remove_chroma_key.py`: removes a flat chroma-key background from a generated image and writes a PNG/WebP with alpha. Prefer auto-key sampling, soft matte, and despill for antialiased edges. +- `$CODEX_HOME/skills/.system/imagegen/scripts/remove_chroma_key.py`: removes a flat chroma-key background from a generated image and writes a PNG/WebP with alpha. Prefer auto-key sampling, soft matte, and despill for antialiased edges. 
## When to use - Generate a new image (concept art, product shot, cover, website hero) @@ -107,7 +107,7 @@ Assume the user wants a new image unless they clearly ask to change an existing - If the user's prompt is already specific and detailed, normalize it into a clear spec without adding creative requirements. - If the user's prompt is generic, add tasteful augmentation only when it materially improves output quality. 10. Use the built-in `image_gen` tool by default. -11. For transparent-output requests, follow the transparent image guidance below: generate with built-in `image_gen` on a flat chroma-key background, copy the selected output into the workspace or `tmp/imagegen/`, run `python scripts/remove_chroma_key.py`, and validate the alpha result before using it. If this path looks unsuitable or fails, ask before switching to CLI `gpt-image-1.5`. +11. For transparent-output requests, follow the transparent image guidance below: generate with built-in `image_gen` on a flat chroma-key background, copy the selected output into the workspace or `tmp/imagegen/`, run the installed `$CODEX_HOME/skills/.system/imagegen/scripts/remove_chroma_key.py` helper, and validate the alpha result before using it. If this path looks unsuitable or fails, ask before switching to CLI `gpt-image-1.5`. 12. Inspect outputs and validate: subject, style, composition, text accuracy, and invariants/avoid items. 13. Iterate with a single targeted change, then re-check. 14. For preview-only work, render the image inline; the underlying file may remain at the default `$CODEX_HOME/generated_images/...` path. @@ -124,7 +124,17 @@ Default sequence: 1. Use built-in `image_gen` to generate the requested subject on a perfectly flat solid chroma-key background. 2. Choose a key color that is unlikely to appear in the subject: default `#00ff00`, use `#ff00ff` for green subjects, and avoid `#0000ff` for blue subjects. 3. 
After generation, move or copy the selected source image from `$CODEX_HOME/generated_images/...` into the workspace or `tmp/imagegen/`.
-4. Run `python scripts/remove_chroma_key.py --input --out --auto-key border --soft-matte --transparent-threshold 12 --opaque-threshold 220 --despill`.
+4. Run the installed helper path, not a project-relative script path:
+   ```bash
+   python "${CODEX_HOME:-$HOME/.codex}/skills/.system/imagegen/scripts/remove_chroma_key.py" \
+     --input <source-image> \
+     --out <output.png> \
+     --auto-key border \
+     --soft-matte \
+     --transparent-threshold 12 \
+     --opaque-threshold 220 \
+     --despill
+   ```
 5. Validate that the output has an alpha channel, transparent corners, plausible subject coverage, and no obvious key-color fringe. If a thin fringe remains, retry once with `--edge-contract 1`; use `--edge-feather 0.25` only when the edge is visibly stair-stepped and the subject is not shiny or reflective.
 6. Save the final alpha PNG/WebP in the project if the asset is project-bound. Never leave a project-referenced transparent asset only under `$CODEX_HOME/*`.
@@ -343,4 +353,4 @@ If installation is not possible in this environment, tell the user which depende
 - `references/image-api.md`: fallback-only API/CLI parameter reference.
 - `references/codex-network.md`: fallback-only network/sandbox troubleshooting for CLI mode.
 - `scripts/image_gen.py`: fallback-only CLI implementation. Do not load or use it unless the user explicitly chooses CLI mode or explicitly confirms a transparent request's true CLI transparency fallback.
-- `scripts/remove_chroma_key.py`: local post-processing helper for built-in transparent-image requests.
+- `$CODEX_HOME/skills/.system/imagegen/scripts/remove_chroma_key.py`: local post-processing helper for built-in transparent-image requests.
diff --git a/codex-rs/skills/src/assets/samples/imagegen/references/cli.md b/codex-rs/skills/src/assets/samples/imagegen/references/cli.md index 3b57c8e91159..f4a5a63d3d95 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/references/cli.md +++ b/codex-rs/skills/src/assets/samples/imagegen/references/cli.md @@ -239,4 +239,4 @@ Notes: - API parameter quick reference for fallback CLI mode: `references/image-api.md` - Prompt examples shared across both top-level modes: `references/sample-prompts.md` - Network/sandbox notes for fallback CLI mode: `references/codex-network.md` -- Built-in-first transparent image workflow: `SKILL.md` and `scripts/remove_chroma_key.py` +- Built-in-first transparent image workflow: `SKILL.md` and `$CODEX_HOME/skills/.system/imagegen/scripts/remove_chroma_key.py` diff --git a/codex-rs/skills/src/assets/samples/imagegen/references/image-api.md b/codex-rs/skills/src/assets/samples/imagegen/references/image-api.md index 5e32e7c9fba3..db8567de2f4f 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/references/image-api.md +++ b/codex-rs/skills/src/assets/samples/imagegen/references/image-api.md @@ -68,7 +68,7 @@ Model-specific note for `input_fidelity`: ## Transparent backgrounds -`gpt-image-2` does not currently support the Image API `background=transparent` parameter. The skill's default transparent-image path is built-in `image_gen` with a flat chroma-key background, followed by local alpha extraction with `python scripts/remove_chroma_key.py`. +`gpt-image-2` does not currently support the Image API `background=transparent` parameter. The skill's default transparent-image path is built-in `image_gen` with a flat chroma-key background, followed by local alpha extraction with `python "${CODEX_HOME:-$HOME/.codex}/skills/.system/imagegen/scripts/remove_chroma_key.py"`. 
Use CLI `gpt-image-1.5` with `background=transparent` and a transparent-capable output format such as `png` or `webp` only after the user explicitly confirms that fallback, unless they already requested `gpt-image-1.5`, `scripts/image_gen.py`, or CLI fallback. If the user asks for true/native transparency, the subject is too complex for clean chroma-key removal, or local background removal fails validation, explain the tradeoff and ask before switching. diff --git a/codex-rs/skills/src/assets/samples/imagegen/references/prompting.md b/codex-rs/skills/src/assets/samples/imagegen/references/prompting.md index 45b92b37353c..9d2da42f196b 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/references/prompting.md +++ b/codex-rs/skills/src/assets/samples/imagegen/references/prompting.md @@ -77,7 +77,7 @@ Do not add: - Prompt for a perfectly flat solid chroma-key background, usually `#00ff00`; use `#ff00ff` when the subject is green, and avoid key colors that appear in the subject. - Explicitly prohibit shadows, gradients, floor planes, reflections, texture, and lighting variation in the background. - Ask for crisp edges, generous padding, and no use of the key color inside the subject. -- After generation, remove the background locally with `python scripts/remove_chroma_key.py --input --out --auto-key border --soft-matte --transparent-threshold 12 --opaque-threshold 220 --despill` and validate the alpha result before shipping it. +- After generation, remove the background locally with `python "${CODEX_HOME:-$HOME/.codex}/skills/.system/imagegen/scripts/remove_chroma_key.py" --input --out --auto-key border --soft-matte --transparent-threshold 12 --opaque-threshold 220 --despill` and validate the alpha result before shipping it. - Use soft matte and despill for antialiased edges; hard tolerance-only removal is mainly for flat pixel-art or exact-color fixtures. 
- Use CLI `gpt-image-1.5 --background transparent --output-format png` only after the user explicitly confirms the fallback, or when the user already explicitly requested `gpt-image-1.5`, `scripts/image_gen.py`, or CLI fallback. Ask first for true/native transparency requests, failed chroma-key validation, or complex transparent subjects such as hair, fur, glass, smoke, liquids, translucent materials, reflective objects, or soft shadows. diff --git a/codex-rs/skills/src/assets/samples/imagegen/references/sample-prompts.md b/codex-rs/skills/src/assets/samples/imagegen/references/sample-prompts.md index f8284e897fac..d94929555ea2 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/references/sample-prompts.md +++ b/codex-rs/skills/src/assets/samples/imagegen/references/sample-prompts.md @@ -13,7 +13,7 @@ When adapting a user's prompt: The labeled lines are prompt scaffolding, not a closed schema. `Asset type` and `Input images` are prompt-only scaffolding; the CLI does not expose them as dedicated flags. -Execution details such as explicit CLI flags, `quality`, `input_fidelity`, masks, output formats, and local output paths depend on mode. Use the built-in tool by default, including simple transparent-image requests. For transparent images, prompt for a flat chroma-key background and remove it locally with `python scripts/remove_chroma_key.py`; only apply CLI-specific controls when the user explicitly opts into fallback mode or explicitly confirms that the transparent request should use true CLI transparency. +Execution details such as explicit CLI flags, `quality`, `input_fidelity`, masks, output formats, and local output paths depend on mode. Use the built-in tool by default, including simple transparent-image requests. 
For transparent images, prompt for a flat chroma-key background and remove it locally with `python "${CODEX_HOME:-$HOME/.codex}/skills/.system/imagegen/scripts/remove_chroma_key.py"`; only apply CLI-specific controls when the user explicitly opts into fallback mode or explicitly confirms that the transparent request should use true CLI transparency. CLI model notes: - `gpt-image-2` is the fallback CLI default for new workflows. @@ -395,7 +395,7 @@ Scene/backdrop: perfectly flat solid #00ff00 chroma-key background for local bac Constraints: background must be one uniform color with no shadows, gradients, texture, reflections, floor plane, or lighting variation; crisp silhouette; generous padding; no halos or fringing; preserve label text exactly; no restyling; do not use #00ff00 anywhere in the subject ``` -Post-process note: after built-in generation, run `python scripts/remove_chroma_key.py --input --out --auto-key border --soft-matte --transparent-threshold 12 --opaque-threshold 220 --despill`. Ask before using CLI `gpt-image-1.5 --background transparent --output-format png` for true/native transparency, failed chroma-key validation, or complex subjects such as hair, fur, glass, smoke, liquids, translucent materials, reflections, or soft shadows, unless the user already explicitly requested `gpt-image-1.5`, `scripts/image_gen.py`, or CLI fallback. +Post-process note: after built-in generation, run `python "${CODEX_HOME:-$HOME/.codex}/skills/.system/imagegen/scripts/remove_chroma_key.py" --input --out --auto-key border --soft-matte --transparent-threshold 12 --opaque-threshold 220 --despill`. Ask before using CLI `gpt-image-1.5 --background transparent --output-format png` for true/native transparency, failed chroma-key validation, or complex subjects such as hair, fur, glass, smoke, liquids, translucent materials, reflections, or soft shadows, unless the user already explicitly requested `gpt-image-1.5`, `scripts/image_gen.py`, or CLI fallback. 
### style-transfer ``` diff --git a/codex-rs/skills/src/assets/samples/imagegen/scripts/remove_chroma_key.py b/codex-rs/skills/src/assets/samples/imagegen/scripts/remove_chroma_key.py index b9757c7d1892..50539877b05c 100644 --- a/codex-rs/skills/src/assets/samples/imagegen/scripts/remove_chroma_key.py +++ b/codex-rs/skills/src/assets/samples/imagegen/scripts/remove_chroma_key.py @@ -13,7 +13,7 @@ import re from statistics import median import sys -from typing import Iterable, Tuple +from typing import Tuple Color = Tuple[int, int, int] @@ -186,8 +186,8 @@ def _cleanup_spill(rgb: Color, key: Color, alpha: int = 255) -> Color: ) -def _alpha_for_pixels( - pixels: Iterable[tuple[int, int, int, int]], +def _apply_alpha_to_image( + image, *, key: Color, tolerance: int, @@ -195,36 +195,39 @@ def _alpha_for_pixels( soft_matte: bool, transparent_threshold: float, opaque_threshold: float, -) -> tuple[list[tuple[int, int, int, int]], int]: - output: list[tuple[int, int, int, int]] = [] +) -> int: + pixels = image.load() + width, height = image.size transparent = 0 - for red, green, blue, alpha in pixels: - rgb = (red, green, blue) - distance = _channel_distance(rgb, key) - key_like = _looks_key_colored(rgb, key, distance) - output_alpha = ( - min( - _soft_alpha(distance, transparent_threshold, opaque_threshold), - _dominance_alpha(rgb, key), + for y in range(height): + for x in range(width): + red, green, blue, alpha = pixels[x, y] + rgb = (red, green, blue) + distance = _channel_distance(rgb, key) + key_like = _looks_key_colored(rgb, key, distance) + output_alpha = ( + min( + _soft_alpha(distance, transparent_threshold, opaque_threshold), + _dominance_alpha(rgb, key), + ) + if soft_matte and key_like + else (0 if distance <= tolerance else 255) ) - if soft_matte and key_like - else (0 if distance <= tolerance else 255) - ) - output_alpha = int(round(output_alpha * (alpha / 255.0))) - if 0 < output_alpha <= ALPHA_NOISE_FLOOR: - output_alpha = 0 + output_alpha = 
int(round(output_alpha * (alpha / 255.0))) + if 0 < output_alpha <= ALPHA_NOISE_FLOOR: + output_alpha = 0 - if output_alpha == 0: - output.append((0, 0, 0, 0)) - transparent += 1 - continue + if output_alpha == 0: + pixels[x, y] = (0, 0, 0, 0) + transparent += 1 + continue - if spill_cleanup and key_like: - red, green, blue = _cleanup_spill(rgb, key, output_alpha) - output.append((red, green, blue, output_alpha)) + if spill_cleanup and key_like: + red, green, blue = _cleanup_spill(rgb, key, output_alpha) + pixels[x, y] = (red, green, blue, output_alpha) - return output, transparent + return transparent def _contract_alpha(image, pixels: int): @@ -232,7 +235,7 @@ def _contract_alpha(image, pixels: int): return image _, ImageFilter = _load_pillow() - alpha = image.split()[-1] + alpha = image.getchannel("A") for _ in range(pixels): alpha = alpha.filter(ImageFilter.MinFilter(3)) image.putalpha(alpha) @@ -244,7 +247,7 @@ def _apply_edge_feather(image, radius: float): return image _, ImageFilter = _load_pillow() - alpha = image.split()[-1] + alpha = image.getchannel("A") alpha = alpha.filter(ImageFilter.GaussianBlur(radius=radius)) image.putalpha(alpha) return image @@ -256,14 +259,28 @@ def _encode_image(image, output_format: str) -> bytes: return out.getvalue() -def _get_pixels(image): - getter = getattr(image, "get_flattened_data", None) - return getter() if getter else image.getdata() +def _alpha_counts(image) -> tuple[int, int, int]: + pixels = image.load() + width, height = image.size + total = 0 + transparent = 0 + partial = 0 + + for y in range(height): + for x in range(width): + alpha = pixels[x, y][3] + total += 1 + if alpha == 0: + transparent += 1 + elif alpha < 255: + partial += 1 + + return total, transparent, partial def _sample_border_key(image, mode: str) -> Color: - rgb = image.convert("RGB") - width, height = rgb.size + width, height = image.size + pixels = image.load() samples: list[Color] = [] if mode == "corners": @@ -277,18 +294,23 @@ def 
_sample_border_key(image, mode: str) -> Color: for left, top, right, bottom in boxes: for y in range(top, bottom): for x in range(left, right): - samples.append(rgb.getpixel((x, y))) + red, green, blue = pixels[x, y][:3] + samples.append((red, green, blue)) else: band = max(1, min(width, height, 6)) step = max(1, min(width, height) // 256) for x in range(0, width, step): for y in range(band): - samples.append(rgb.getpixel((x, y))) - samples.append(rgb.getpixel((x, height - 1 - y))) + red, green, blue = pixels[x, y][:3] + samples.append((red, green, blue)) + red, green, blue = pixels[x, height - 1 - y][:3] + samples.append((red, green, blue)) for y in range(0, height, step): for x in range(band): - samples.append(rgb.getpixel((x, y))) - samples.append(rgb.getpixel((width - 1 - x, y))) + red, green, blue = pixels[x, y][:3] + samples.append((red, green, blue)) + red, green, blue = pixels[width - 1 - x, y][:3] + samples.append((red, green, blue)) if not samples: _die("Could not sample background key color from image border.") @@ -313,8 +335,8 @@ def _remove_chroma_key(args: argparse.Namespace) -> None: else _parse_key_color(args.key_color) ) - pixels, transparent = _alpha_for_pixels( - _get_pixels(rgba), + transparent = _apply_alpha_to_image( + rgba, key=key, tolerance=args.tolerance, spill_cleanup=args.spill_cleanup, @@ -322,14 +344,10 @@ def _remove_chroma_key(args: argparse.Namespace) -> None: transparent_threshold=args.transparent_threshold, opaque_threshold=args.opaque_threshold, ) - rgba.putdata(pixels) rgba = _contract_alpha(rgba, args.edge_contract) rgba = _apply_edge_feather(rgba, args.edge_feather) - alpha_values = [pixel[3] for pixel in _get_pixels(rgba)] - total = len(alpha_values) - transparent_after = sum(1 for alpha in alpha_values if alpha == 0) - partial_after = sum(1 for alpha in alpha_values if 0 < alpha < 255) + total, transparent_after, partial_after = _alpha_counts(rgba) out.parent.mkdir(parents=True, exist_ok=True) output_format = "PNG" if 
out.suffix.lower() == ".png" else "WEBP"