In [5]:
# %% [1] 从全局 step 命名的 9 张图拼 3x3 大图 + 生成 JSONL 索引
import re
import json
from pathlib import Path
from PIL import Image
from tqdm import tqdm

# ==== Parameters ====
# Root directory that holds the per-episode sub-folders (names starting with "id").
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
root_dir = Path(r"F:\PangYe\bridgev2_DATA\extracted_nine_train")



In [None]:
output_dir = root_dir / "composed_images"      # directory that receives the composed 3x3 images
jsonl_path = root_dir / "composed_index.jsonl" # output path of the JSONL index
output_dir.mkdir(parents=True, exist_ok=True)

# Display-order mapping for the 3x3 grid: grid cell k shows selected frame grid_perm[k],
# so the nine frames read 1,2,3 / 6,5,4 / 7,8,9 (the middle row runs right-to-left).
grid_perm = [0, 1, 2, 5, 4, 3, 6, 7, 8]

# Captures the numeric part of a "stepXXXX.png" file name (case-insensitive).
step_pat = re.compile(r"step(\d+)\.png$", re.IGNORECASE)

def sorted_step_images(images_dir: Path):
    """Collect the ``step*.png`` files of ``images_dir`` ordered by step number.

    Files whose name does not match ``step_pat`` are ignored.

    Returns:
        A list of ``(path, step_idx)`` tuples in ascending ``step_idx`` order.
    """
    candidates = ((png, step_pat.search(png.name)) for png in images_dir.glob("step*.png"))
    numbered = [(png, int(hit.group(1))) for png, hit in candidates if hit]
    # sorted() is stable, so entries with equal step numbers keep their glob order.
    return sorted(numbered, key=lambda pair: pair[1])

def pick_n_indices(total: int, n: int = 9):
    """Pick ``n`` evenly spaced indices from ``range(total)``, both endpoints included.

    Args:
        total: Number of available items.
        n: Number of indices to pick.

    Returns:
        ``None`` when ``total < n`` (not enough items). Otherwise a list of ``n``
        indices running from ``0`` to ``total - 1``; ``[0]`` when ``n == 1`` and an
        empty list when ``n <= 0``.
    """
    if total < n:
        return None
    if n <= 0:
        return []
    if n == 1:
        # A single pick has no spacing to compute; the general formula would divide by
        # n - 1 == 0, so return the first index directly.
        return [0]
    last = total - 1
    # Positions i * last / (n - 1) are at least 1 apart because total >= n, so rounding
    # never maps two of them to the same index.
    return [int(round(i * last / (n - 1))) for i in range(n)]

# Episode folders are the direct sub-directories of root_dir whose name starts with "id".
ep_folders = sorted(p for p in root_dir.iterdir() if p.is_dir() and p.name.startswith("id"))
written = 0

pbar = tqdm(ep_folders, desc="Composing 3x3 and writing JSONL")

# Mode "w" truncates any index left by a previous run; a single handle then serves the
# whole loop instead of reopening the file once per record.
with open(jsonl_path, "w", encoding="utf-8") as jf:
    for ep_dir in pbar:
        images_dir = ep_dir / "images"
        if not images_dir.exists():
            pbar.write(f"[WARN] {ep_dir.name} 没有 images/ 目录，跳过。")
            continue

        # Collect the frames ordered by their global step number: [(path, step_idx), ...]
        items = sorted_step_images(images_dir)
        if len(items) < 9:
            pbar.write(f"[WARN] {ep_dir.name} 仅有 {len(items)} 张图(<9)，跳过。")
            continue

        # Pick 9 evenly spaced frames (first and last included).
        sel_idx = pick_n_indices(len(items), n=9)
        if sel_idx is None:
            pbar.write(f"[WARN] {ep_dir.name} 无法选取 9 帧，跳过。")
            continue
        selected = [items[i][0] for i in sel_idx]  # 9 Path objects

        out_path = output_dir / f"{ep_dir.name}_grid.png"

        imgs = []
        try:
            # Open one by one so that a failure part-way still closes the earlier handles.
            for frame_path in selected:
                imgs.append(Image.open(frame_path))

            # All nine frames must share one (width, height) to tile cleanly.
            if len({im.size for im in imgs}) != 1:
                pbar.write(f"[WARN] {ep_dir.name} 存在不同尺寸图像，跳过。")
                continue

            w, h = imgs[0].size
            big_img = Image.new("RGB", (w * 3, h * 3))

            # Grid cell k (row-major) receives the selected frame grid_perm[k].
            for k, perm_pos in enumerate(grid_perm):
                r, c = divmod(k, 3)
                big_img.paste(imgs[perm_pos], (c * w, r * h))

            big_img.save(out_path)
        finally:
            # Runs on success, on the size-mismatch `continue`, and on any exception.
            for im in imgs:
                im.close()

        # Read the instruction text; a missing or unreadable file yields an empty string.
        instr_path = ep_dir / "instruction.txt"
        if instr_path.exists():
            try:
                text_content = instr_path.read_text(encoding="utf-8").strip()
            except (OSError, UnicodeDecodeError):
                text_content = ""
                pbar.write(f"[WARN] {ep_dir.name} 读取 instruction.txt 失败，已置空。")
        else:
            text_content = ""
            pbar.write(f"[WARN] {ep_dir.name} 缺少 instruction.txt。")

        # One JSONL record per composed image.
        rec = {
            "idx": ep_dir.name,                                  # e.g. id00001
            "target": str(out_path.relative_to(output_dir)),     # composed image path relative to output_dir
            "text": text_content                                 # instruction text
        }
        jf.write(json.dumps(rec, ensure_ascii=False) + "\n")

        written += 1

pbar.close()
print(f"\n✅ 完成：共合成 {written} 张大图；索引已写入 {jsonl_path}.")


In [6]:
# ==== Parameters ====
output_dir = root_dir / "single_corner_images"  # output directory
jsonl_path = root_dir / "corner_index.jsonl"    # index file path
output_dir.mkdir(parents=True, exist_ok=True)

_STEP_NUM_PAT = re.compile(r"step(\d+)\.png$", re.IGNORECASE)


def _step_sort_key(path):
    """Order step files by their numeric step index rather than by file-name text.

    Names without a parsable number sort after all numbered ones, by name.
    """
    hit = _STEP_NUM_PAT.search(path.name)
    if hit:
        return (0, int(hit.group(1)), path.name)
    return (1, 0, path.name)


pbar = tqdm(sorted([p for p in root_dir.iterdir() if p.is_dir() and p.name.startswith("id")]),
            desc="Generating corner images")

written = 0

# Mode "w" truncates any previous index; the handle stays open for the whole loop.
with open(jsonl_path, "w", encoding="utf-8") as jf:
    for ep_dir in pbar:
        images_dir = ep_dir / "images"
        if not images_dir.exists():
            pbar.write(f"[WARN] {ep_dir.name} 缺少 images/ 文件夹，跳过。")
            continue

        step_files = list(images_dir.glob("step*.png"))
        if not step_files:
            pbar.write(f"[WARN] {ep_dir.name} 没有图片，跳过。")
            continue

        # Take the frame with the smallest step number. A plain lexicographic sort would
        # put "step10.png" before "step2.png" when the names are not zero-padded.
        first_img_path = min(step_files, key=_step_sort_key)

        # The context manager closes the source file even if paste fails.
        with Image.open(first_img_path) as img:
            w, h = img.size
            # Black 3x3-sized canvas with the first frame in the top-left cell.
            big_img = Image.new("RGB", (w * 3, h * 3), color=(0, 0, 0))
            big_img.paste(img, (0, 0))

        out_path = output_dir / f"{ep_dir.name}_corner.png"
        big_img.save(out_path)

        # One JSONL record per corner image.
        rec = {
            "idx": ep_dir.name,
            "source": str(out_path.relative_to(output_dir))  # path relative to output_dir
        }
        jf.write(json.dumps(rec, ensure_ascii=False) + "\n")

        written += 1

pbar.close()
print(f"\n✅ 已生成 {written} 张 corner 图，并写入索引文件：{jsonl_path}")

Generating corner images: 100%|██████████| 20000/20000 [19:37<00:00, 16.99it/s]


✅ 已生成 20000 张 corner 图，并写入索引文件：F:\PangYe\bridgev2_DATA\extracted_nine_train\corner_index.jsonl





In [None]:
jsonl_a = r"F:\PangYe\bridgev2_DATA\extracted_nine_test\composed_index.jsonl"   # first file (idx, target, text)
jsonl_b = r"F:\PangYe\bridgev2_DATA\extracted_nine_test\corner_index.jsonl"     # second file (idx, source)
output_jsonl = r"F:\PangYe\bridgev2_DATA\extracted_nine_test\merged_index.jsonl"  # merged output file

# ==== Read JSONL A into a lookup keyed by idx ====
records_a = {}
with open(jsonl_a, "r", encoding="utf-8") as fa:
    for line in fa:
        line = line.strip()
        if not line:
            # Blank lines are not JSON; skip them instead of failing in json.loads.
            continue
        rec = json.loads(line)
        idx = rec.get("idx")
        if idx:
            records_a[idx] = rec

# ==== Merge JSONL B onto A, one output record per usable line of B ====
merged = []
with open(jsonl_b, "r", encoding="utf-8") as fb:
    for line in tqdm(fb, desc="Merging by idx"):
        line = line.strip()
        if not line:
            continue
        rec_b = json.loads(line)
        idx = rec_b.get("idx")
        if not idx:
            continue
        # Copy A's record so records_a is left untouched and repeated idx values in B
        # do not end up sharing (and overwriting) one dict. Falls back to a bare
        # {"idx": ...} record when A has no entry for this idx.
        base = dict(records_a.get(idx, {"idx": idx}))
        base.update(rec_b)  # fields from B (e.g. source) are added / take precedence
        merged.append(base)

# ==== Write the result ====
with open(output_jsonl, "w", encoding="utf-8") as fout:
    for rec in merged:
        fout.write(json.dumps(rec, ensure_ascii=False) + "\n")

print(f"\n✅ 合并完成：共写入 {len(merged)} 条记录 → {output_jsonl}")

In [1]:
# %% [1] Read input_jsonl, draw n_per_sample random templates per sample for text augmentation, and write the augmented records to output_jsonl
import json
import random
from pathlib import Path
from tqdm import tqdm

# ===== Parameters =====
input_jsonl  = Path(r"C:\Users\ROG_STRIX\Desktop\材料\pairs_test.jsonl")      # input JSONL, one record per line
output_jsonl = Path(r"C:\Users\ROG_STRIX\Desktop\材料\pairs_test_augmented_index.jsonl")   # output JSONL of augmented records
n_per_sample = 1                                      # augmented records per sample (distinct templates, <= number of templates)
random_seed  = 1234                                   # random seed (change as needed); fixed for reproducibility

# ===== 102 条英文模板（占位符：[ACTION]）=====
TEMPLATES = [
    "This is a 3x3 sequence of 9 images, step by step showing how the robot is about to [ACTION].",
    "A 3x3 grid illustrates in 9 stages the robot's motion as it [ACTION].",
    "In this sequence of 9 sub-images, the robot is [ACTION].",
    "These nine frames describe, step by step, how the robot prepares to [ACTION].",
    "A series of 9 images (3x3) gradually depict the robot as it [ACTION].",
    "Shown here is a 3x3 arrangement of frames, narrating how the robot will [ACTION].",
    "Nine images organized in a 3x3 grid describe the robot carefully [ACTION].",
    "This 3x3 set demonstrates step by step how the robot proceeds to [ACTION].",
    "A sequence of 9 pictures in a 3x3 layout shows the robot's preparation for [ACTION].",
    "In this visual narrative, 9 sub-images arranged 3x3 depict the robot as it [ACTION].",
    "This is a 9-frame sequence (3x3) showing the robot's process of [ACTION].",
    "A 3x3 panel of images illustrates the robot step by step as it [ACTION].",
    "In these 9 frames, the robot is shown progressively moving to [ACTION].",
    "This collage of 9 sub-images demonstrates the robot's upcoming action: [ACTION].",
    "A structured 3x3 arrangement of images visually explains how the robot will [ACTION].",
    "This set of 9 frames tells the step-by-step story of the robot attempting to [ACTION].",
    "A sequence of 9 stages, displayed in a 3x3 grid, captures the robot as it [ACTION].",
    "This is a 3x3 visual sequence where the robot gradually performs [ACTION].",
    "Nine ordered sub-images, laid out 3x3, describe the robot on its way to [ACTION].",
    "This grid of 9 frames illustrates the stepwise procedure of the robot as it [ACTION].",
    "The 3x3 grid of nine sequential images narrates how the robot begins to [ACTION].",
    "A full 3x3 panel chronicles nine gradual steps as the robot proceeds to [ACTION].",
    "This 9-frame grid visually conveys each phase of the robot attempting to [ACTION].",
    "Here, nine evenly arranged images describe the robot's careful approach to [ACTION].",
    "A visual timeline of 9 sub-images, in a 3x3 layout, depicts the robot's plan to [ACTION].",
    "This illustrated 3x3 chart of nine photos shows how the robot steadily moves to [ACTION].",
    "Displayed in a 3x3 collage, the nine frames depict a progressive journey of the robot to [ACTION].",
    "The robot's motion to [ACTION] is broken down step by step across these nine images in a grid.",
    "A 9-step portrayal in a neat 3x3 grid showcases the robot's process of [ACTION].",
    "The robot's transition to [ACTION] is highlighted across this grid of 9 ordered images.",
    "A set of nine sequential visuals demonstrates the robot's progression toward [ACTION].",
    "This 3x3 mosaic of images illustrates the robot's incremental movement to [ACTION].",
    "The action sequence of nine images arranged 3x3 reveals the robot preparing to [ACTION].",
    "A methodical breakdown of 9 frames shows how the robot advances to [ACTION].",
    "These nine cells arranged in a square grid chronicle the robot's attempt to [ACTION].",
    "Step-by-step transformations are depicted in this 3x3 grid as the robot proceeds to [ACTION].",
    "Nine shots arranged in rows and columns explain how the robot accomplishes [ACTION].",
    "A storyboard of nine frames demonstrates the sequence of the robot trying to [ACTION].",
    "This structured 3x3 format of images visually narrates the robot's attempt to [ACTION].",
    "A continuous 9-frame depiction showcases the robot's progress as it works to [ACTION].",
    "The 3x3 chart of nine pictures portrays the robot's series of moves to [ACTION].",
    "Sequential snapshots (nine in total) arranged in a 3x3 matrix display the robot preparing to [ACTION].",
    "A concise storyboard of nine visuals in a grid outlines the robot's preparation for [ACTION].",
    "A strip of nine staged photos in a square grid exhibits how the robot approaches to [ACTION].",
    "This carefully arranged 3x3 compilation of frames reveals the robot's behavior leading to [ACTION].",
    "An array of nine sequential photos captures the robot's approach to [ACTION].",
    "A progressive storyline told in 9 grid cells illustrates the robot's execution of [ACTION].",
    "These nine sequential snapshots show in order how the robot moves toward [ACTION].",
    "The robot's maneuver to [ACTION] is detailed through this neatly ordered 9-image grid.",
    "A 3x3 sequence of nine frames shows the unfolding stages of the robot attempting to [ACTION].",
    "Nine chronological sub-images portray the robot's stepwise preparation for [ACTION].",
    "A visual instruction of nine shots in a 3x3 panel narrates how the robot proceeds to [ACTION].",
    "In this graphical 3x3 layout, nine pictures capture the robot's progression toward [ACTION].",
    "The grid-based visual of nine ordered photos illustrates the robot's effort to [ACTION].",
    "A composed 3x3 diagram of photos reveals the stages of the robot readying to [ACTION].",
    "Nine frames, structured in a grid, sequentially document the robot's journey to [ACTION].",
    "A composite of 9 sequential captures in a 3x3 format tells how the robot is about to [ACTION].",
    "The staged layout of nine pictures in a grid represents the robot's procedural approach to [ACTION].",
    "This panel of 3x3 visuals shows the step-by-step development as the robot works toward [ACTION].",
    "An ordered 9-frame depiction in a compact grid illustrates how the robot moves forward to [ACTION].",
    "Nine organized sub-images in three rows highlight the robot's preparatory process for [ACTION].",
    "A detailed 9-frame progression displayed in a 3x3 format outlines the robot's path to [ACTION].",
    "This grid of nine sequential photos captures the robot gradually preparing to [ACTION].",
    "Nine interconnected images arranged in a 3x3 frame reveal how the robot executes [ACTION].",
    "A dynamic 3x3 storyboard of photos showcases the robot's evolving movement toward [ACTION].",
    "These nine sequential visuals in a compact grid describe the robot's method of [ACTION].",
    "The robot's smooth transition to [ACTION] is portrayed in these 9 grid-based shots.",
    "A chain of nine consecutive photos in a grid narrates the robot's controlled steps to [ACTION].",
    "This clean 3x3 layout illustrates the robot's incremental progress toward [ACTION].",
    "A timeline of nine consecutive snapshots within a grid shows how the robot prepares to [ACTION].",
    "Nine images methodically ordered in rows and columns explain the robot's planned action to [ACTION].",
    "This ordered 9-frame visual panel demonstrates each stage as the robot attempts to [ACTION].",
    "A geometric 3x3 collage of images portrays the robot's coordinated effort to [ACTION].",
    "Nine logically arranged pictures depict the stages of the robot making its move to [ACTION].",
    "A composite square grid of 9 visual slices describes the robot's approach to [ACTION].",
    "These nine snapshots portray the continuous process as the robot progresses to [ACTION].",
    "This 3x3 storyboard presents the robot's behavior in phases as it begins to [ACTION].",
    "A structural layout of nine ordered visuals narrates how the robot steadily approaches [ACTION].",
    "The robot's operation to [ACTION] unfolds step by step in this set of 9 gridded images.",
    "A precise nine-frame chart in 3x3 order reveals the robot's deliberate preparation for [ACTION].",
    "This grid of nine ordered shots visually recounts how the robot performs [ACTION].",
    "A sequential journey of nine steps depicted in this compact grid highlights the robot's goal to [ACTION].",
    "The systematic 3x3 collection of photos illustrates the robot's careful moves to [ACTION].",
    "Nine chronologically arranged images in a square frame explain the robot's steady approach to [ACTION].",
    "A concise visual storyline unfolds in 9 frames to depict the robot's move toward [ACTION].",
    "A box of 3x3 sequential captures portrays how the robot gradually moves to [ACTION].",
    "These nine squares of imagery document the robot's transition toward [ACTION].",
    "A set of nine aligned visuals provides a clear depiction of the robot as it goes to [ACTION].",
    "The orderly 3x3 photo board outlines each moment of the robot preparing for [ACTION].",
    "This neat 9-frame arrangement visually guides us through the robot's steps to [ACTION].",
    "Nine progressively captured shots in a grid frame the robot's precise act of [ACTION].",
    "A structured progression of 9 cells in a grid illustrates how the robot aims to [ACTION].",
    "The robot's motion toward [ACTION] is depicted clearly through these 9 panel images.",
    "This grid-based representation of nine sequential frames charts the robot's stepwise action to [ACTION].",
    "Nine squares filled with sequential visuals display how the robot prepares itself to [ACTION].",
    "A concise but complete 9-step visualization outlines the robot's upcoming [ACTION].",
    "The 3x3 storyboard of ordered pictures captures the essence of the robot's move to [ACTION].",
    "A visually descriptive sequence of 9 frames in a grid layout demonstrates the robot's approach to [ACTION].",
    "Nine neatly organized cells in a visual chart map out the robot's advance toward [ACTION].",
    "This collective 3x3 framework of sequential frames illustrates the phases of the robot's [ACTION].",
    "A narrated storyline in nine image steps, presented in a grid, highlights the robot's motion to [ACTION].",
    "These nine chronologically placed visuals give a clear overview of the robot progressing to [ACTION].",
]

# ===== Main logic =====
# A dedicated generator seeded with random_seed draws the same sequence as seeding the
# module-level generator would, without resetting global random state for other cells.
rng = random.Random(random_seed)

if n_per_sample < 1 or n_per_sample > len(TEMPLATES):
    raise ValueError(f"n_per_sample must be in [1, {len(TEMPLATES)}], got {n_per_sample}")

count_in, count_out = 0, 0
with open(input_jsonl, "r", encoding="utf-8") as fin, open(output_jsonl, "w", encoding="utf-8") as fout:
    for line in tqdm(fin, desc="Per-sample unique template augmentation"):
        line = line.strip()
        if not line:
            continue
        rec = json.loads(line)
        count_in += 1  # counts every parsed record, including those skipped below

        # Lower-case once per record. Trailing periods are dropped so templates ending
        # in "[ACTION]." do not produce "..", and templates that embed [ACTION]
        # mid-sentence do not get a stray period.
        action = (rec.get("text") or "").strip().rstrip(".").rstrip().lower()
        if not action:
            # Records without action text are skipped.
            continue

        # 1) Draw n_per_sample distinct templates independently for this sample.
        per_sample_templates = rng.sample(TEMPLATES, n_per_sample)

        # 2) Emit one augmented record per template; all other fields are copied as-is.
        for tmpl in per_sample_templates:
            rec_out = {**rec, "text": tmpl.replace("[ACTION]", action)}
            fout.write(json.dumps(rec_out, ensure_ascii=False) + "\n")
            count_out += 1

print(f"\n✅ Done. Read {count_in} samples, wrote {count_out} augmented records → {output_jsonl}")
print("ℹ️  For each sample, templates are unique within that sample (no intra-sample duplicates).")

Per-sample unique template augmentation: 1014it [00:00, 165809.91it/s]


✅ Done. Read 1014 samples, wrote 1014 augmented records → C:\Users\ROG_STRIX\Desktop\材料\pairs_test_augmented_index.jsonl
ℹ️  For each sample, templates are unique within that sample (no intra-sample duplicates).





In [None]:
# %% [1] 使用 pandas 读取 JSONL 文件并查看内容
import pandas as pd
from pathlib import Path


# Load the JSONL index (one JSON object per line) into a DataFrame.
file_path = r"F:\PangYe\bridgev2_DATA\extracted_nine\merged_index.jsonl"
df = pd.read_json(file_path, lines=True)

# Show the first rows, then the frame's dimensions and column names.
preview = df.head()
print(preview)
print("\nDataFrame shape:", df.shape)
print("\nColumns:", df.columns.tolist())



In [None]:
# Keep only these four columns, in this order (a missing column raises KeyError).
df = df.loc[:, ["idx", "source", "target", "text"]]
print(df.head())

In [None]:
# Write the frame out as JSONL (one record per line). The path is relative, so the file
# lands in the current working directory; force_ascii=False keeps non-ASCII text unescaped.
df.to_json("merged_index_reordered.jsonl", orient="records", lines=True, force_ascii=False)

In [9]:
# ===== 102 条英文模板（占位符：[ACTION]）=====
TEMPLATES = [
    "This is a 3x3 sequence of 9 images, step by step showing how the robot is about to [ACTION].",
    "A 3x3 grid illustrates in 9 stages the robot's motion as it [ACTION].",
    "In this sequence of 9 sub-images, the robot is [ACTION].",
    "These nine frames describe, step by step, how the robot prepares to [ACTION].",
    "A series of 9 images (3x3) gradually depict the robot as it [ACTION].",
    "Shown here is a 3x3 arrangement of frames, narrating how the robot will [ACTION].",
    "Nine images organized in a 3x3 grid describe the robot carefully [ACTION].",
    "This 3x3 set demonstrates step by step how the robot proceeds to [ACTION].",
    "A sequence of 9 pictures in a 3x3 layout shows the robot's preparation for [ACTION].",
    "In this visual narrative, 9 sub-images arranged 3x3 depict the robot as it [ACTION].",
    "This is a 9-frame sequence (3x3) showing the robot's process of [ACTION].",
    "A 3x3 panel of images illustrates the robot step by step as it [ACTION].",
    "In these 9 frames, the robot is shown progressively moving to [ACTION].",
    "This collage of 9 sub-images demonstrates the robot's upcoming action: [ACTION].",
    "A structured 3x3 arrangement of images visually explains how the robot will [ACTION].",
    "This set of 9 frames tells the step-by-step story of the robot attempting to [ACTION].",
    "A sequence of 9 stages, displayed in a 3x3 grid, captures the robot as it [ACTION].",
    "This is a 3x3 visual sequence where the robot gradually performs [ACTION].",
    "Nine ordered sub-images, laid out 3x3, describe the robot on its way to [ACTION].",
    "This grid of 9 frames illustrates the stepwise procedure of the robot as it [ACTION].",
    "The 3x3 grid of nine sequential images narrates how the robot begins to [ACTION].",
    "A full 3x3 panel chronicles nine gradual steps as the robot proceeds to [ACTION].",
    "This 9-frame grid visually conveys each phase of the robot attempting to [ACTION].",
    "Here, nine evenly arranged images describe the robot's careful approach to [ACTION].",
    "A visual timeline of 9 sub-images, in a 3x3 layout, depicts the robot's plan to [ACTION].",
    "This illustrated 3x3 chart of nine photos shows how the robot steadily moves to [ACTION].",
    "Displayed in a 3x3 collage, the nine frames depict a progressive journey of the robot to [ACTION].",
    "The robot's motion to [ACTION] is broken down step by step across these nine images in a grid.",
    "A 9-step portrayal in a neat 3x3 grid showcases the robot's process of [ACTION].",
    "The robot's transition to [ACTION] is highlighted across this grid of 9 ordered images.",
    "A set of nine sequential visuals demonstrates the robot's progression toward [ACTION].",
    "This 3x3 mosaic of images illustrates the robot's incremental movement to [ACTION].",
    "The action sequence of nine images arranged 3x3 reveals the robot preparing to [ACTION].",
    "A methodical breakdown of 9 frames shows how the robot advances to [ACTION].",
    "These nine cells arranged in a square grid chronicle the robot's attempt to [ACTION].",
    "Step-by-step transformations are depicted in this 3x3 grid as the robot proceeds to [ACTION].",
    "Nine shots arranged in rows and columns explain how the robot accomplishes [ACTION].",
    "A storyboard of nine frames demonstrates the sequence of the robot trying to [ACTION].",
    "This structured 3x3 format of images visually narrates the robot's attempt to [ACTION].",
    "A continuous 9-frame depiction showcases the robot's progress as it works to [ACTION].",
    "The 3x3 chart of nine pictures portrays the robot's series of moves to [ACTION].",
    "Sequential snapshots (nine in total) arranged in a 3x3 matrix display the robot preparing to [ACTION].",
    "A concise storyboard of nine visuals in a grid outlines the robot's preparation for [ACTION].",
    "A strip of nine staged photos in a square grid exhibits how the robot approaches to [ACTION].",
    "This carefully arranged 3x3 compilation of frames reveals the robot's behavior leading to [ACTION].",
    "An array of nine sequential photos captures the robot's approach to [ACTION].",
    "A progressive storyline told in 9 grid cells illustrates the robot's execution of [ACTION].",
    "These nine sequential snapshots show in order how the robot moves toward [ACTION].",
    "The robot's maneuver to [ACTION] is detailed through this neatly ordered 9-image grid.",
    "A 3x3 sequence of nine frames shows the unfolding stages of the robot attempting to [ACTION].",
    "Nine chronological sub-images portray the robot's stepwise preparation for [ACTION].",
    "A visual instruction of nine shots in a 3x3 panel narrates how the robot proceeds to [ACTION].",
    "In this graphical 3x3 layout, nine pictures capture the robot's progression toward [ACTION].",
    "The grid-based visual of nine ordered photos illustrates the robot's effort to [ACTION].",
    "A composed 3x3 diagram of photos reveals the stages of the robot readying to [ACTION].",
    "Nine frames, structured in a grid, sequentially document the robot's journey to [ACTION].",
    "A composite of 9 sequential captures in a 3x3 format tells how the robot is about to [ACTION].",
    "The staged layout of nine pictures in a grid represents the robot's procedural approach to [ACTION].",
    "This panel of 3x3 visuals shows the step-by-step development as the robot works toward [ACTION].",
    "An ordered 9-frame depiction in a compact grid illustrates how the robot moves forward to [ACTION].",
    "Nine organized sub-images in three rows highlight the robot's preparatory process for [ACTION].",
    "A detailed 9-frame progression displayed in a 3x3 format outlines the robot's path to [ACTION].",
    "This grid of nine sequential photos captures the robot gradually preparing to [ACTION].",
    "Nine interconnected images arranged in a 3x3 frame reveal how the robot executes [ACTION].",
    "A dynamic 3x3 storyboard of photos showcases the robot's evolving movement toward [ACTION].",
    "These nine sequential visuals in a compact grid describe the robot's method of [ACTION].",
    "The robot's smooth transition to [ACTION] is portrayed in these 9 grid-based shots.",
    "A chain of nine consecutive photos in a grid narrates the robot's controlled steps to [ACTION].",
    "This clean 3x3 layout illustrates the robot's incremental progress toward [ACTION].",
    "A timeline of nine consecutive snapshots within a grid shows how the robot prepares to [ACTION].",
    "Nine images methodically ordered in rows and columns explain the robot's planned action to [ACTION].",
    "This ordered 9-frame visual panel demonstrates each stage as the robot attempts to [ACTION].",
    "A geometric 3x3 collage of images portrays the robot's coordinated effort to [ACTION].",
    "Nine logically arranged pictures depict the stages of the robot making its move to [ACTION].",
    "A composite square grid of 9 visual slices describes the robot's approach to [ACTION].",
    "These nine snapshots portray the continuous process as the robot progresses to [ACTION].",
    "This 3x3 storyboard presents the robot's behavior in phases as it begins to [ACTION].",
    "A structural layout of nine ordered visuals narrates how the robot steadily approaches [ACTION].",
    "The robot's operation to [ACTION] unfolds step by step in this set of 9 gridded images.",
    "A precise nine-frame chart in 3x3 order reveals the robot's deliberate preparation for [ACTION].",
    "This grid of nine ordered shots visually recounts how the robot performs [ACTION].",
    "A sequential journey of nine steps depicted in this compact grid highlights the robot's goal to [ACTION].",
    "The systematic 3x3 collection of photos illustrates the robot's careful moves to [ACTION].",
    "Nine chronologically arranged images in a square frame explain the robot's steady approach to [ACTION].",
    "A concise visual storyline unfolds in 9 frames to depict the robot's move toward [ACTION].",
    "A box of 3x3 sequential captures portrays how the robot gradually moves to [ACTION].",
    "These nine squares of imagery document the robot's transition toward [ACTION].",
    "A set of nine aligned visuals provides a clear depiction of the robot as it goes to [ACTION].",
    "The orderly 3x3 photo board outlines each moment of the robot preparing for [ACTION].",
    "This neat 9-frame arrangement visually guides us through the robot's steps to [ACTION].",
    "Nine progressively captured shots in a grid frame the robot's precise act of [ACTION].",
    "A structured progression of 9 cells in a grid illustrates how the robot aims to [ACTION].",
    "The robot's motion toward [ACTION] is depicted clearly through these 9 panel images.",
    "This grid-based representation of nine sequential frames charts the robot's stepwise action to [ACTION].",
    "Nine squares filled with sequential visuals display how the robot prepares itself to [ACTION].",
    "A concise but complete 9-step visualization outlines the robot's upcoming [ACTION].",
    "The 3x3 storyboard of ordered pictures captures the essence of the robot's move to [ACTION].",
    "A visually descriptive sequence of 9 frames in a grid layout demonstrates the robot's approach to [ACTION].",
    "Nine neatly organized cells in a visual chart map out the robot's advance toward [ACTION].",
    "This collective 3x3 framework of sequential frames illustrates the phases of the robot's [ACTION].",
    "A narrated storyline in nine image steps, presented in a grid, highlights the robot's motion to [ACTION].",
    "These nine chronologically placed visuals give a clear overview of the robot progressing to [ACTION].",
]

In [10]:
import unicodedata

def find_non_ascii(text):
    """Return, in order of appearance, every character of ``text`` with a code point above 127."""
    offenders = []
    for ch in text:
        if ord(ch) >= 128:
            offenders.append(ch)
    return offenders

# Print the index and offending characters of every template that is not pure ASCII.
for position, template in enumerate(TEMPLATES):
    offenders = find_non_ascii(template)
    if not offenders:
        continue
    print(f"{position}: {offenders}")


In [3]:
def normalize_templates(templates):
    """Return a new list in which typographic characters are replaced by ASCII ones.

    Replaced characters: multiplication sign, right single quote, curly double quotes,
    en dash and non-breaking space. The input strings are not modified.
    """
    # All keys are single characters and no replacement is itself a key, so a one-pass
    # translation table gives the same result as replacing them one after another.
    ascii_table = str.maketrans({
        "×": "x",
        "’": "'",
        "“": '"',
        "”": '"',
        "–": "-",
        "\u00A0": " ",
    })
    return [template.translate(ascii_table) for template in templates]


In [4]:
# ASCII-normalized copy of TEMPLATES; the original list is left unchanged.
NORMALIZED_TEMPLATES = normalize_templates(TEMPLATES)

In [6]:
# List every normalized template with its zero-based position.
print("\nNormalized Templates:")
for position, template in enumerate(NORMALIZED_TEMPLATES):
    print(f"{position}: {template}")


Normalized Templates:
0: This is a 3x3 sequence of 9 images, step by step showing how the robot is about to [ACTION].
1: A 3x3 grid illustrates in 9 stages the robot's motion as it [ACTION].
2: In this sequence of 9 sub-images, the robot is [ACTION].
3: These nine frames describe, step by step, how the robot prepares to [ACTION].
4: A series of 9 images (3x3) gradually depict the robot as it [ACTION].
5: Shown here is a 3x3 arrangement of frames, narrating how the robot will [ACTION].
6: Nine images organized in a 3x3 grid describe the robot carefully [ACTION].
7: This 3x3 set demonstrates step by step how the robot proceeds to [ACTION].
8: A sequence of 9 pictures in a 3x3 layout shows the robot's preparation for [ACTION].
9: In this visual narrative, 9 sub-images arranged 3x3 depict the robot as it [ACTION].
10: This is a 9-frame sequence (3x3) showing the robot's process of [ACTION].
11: A 3x3 panel of images illustrates the robot step by step as it [ACTION].
12: In these 9 frames,