In [1]:
import json
import random

# Input and output paths
input_file = "dogs2.jsonl"
train_file = "train.jsonl"
val_file = "val.jsonl"

# Seed the global RNG so the shuffle (and therefore the split) is reproducible
random.seed(42)

# Load all records, skipping blank lines.
# The encoding is explicit so parsing does not depend on the platform locale;
# the cells that consume these files later also read them as UTF-8.
with open(input_file, "r", encoding="utf-8") as f:
    records = [json.loads(line.strip()) for line in f if line.strip()]

# Shuffle in place, then take the first 80% as train and the remainder as val
random.shuffle(records)
split_idx = int(0.8 * len(records))
train_records = records[:split_idx]
val_records = records[split_idx:]

# Save train split (one JSON object per line)
with open(train_file, "w", encoding="utf-8") as f:
    for rec in train_records:
        f.write(json.dumps(rec) + "\n")

# Save validation split (one JSON object per line)
with open(val_file, "w", encoding="utf-8") as f:
    for rec in val_records:
        f.write(json.dumps(rec) + "\n")

print(f"Split complete: {len(train_records)} train / {len(val_records)} val")

Split complete: 878 train / 220 val


In [1]:
import os
import json
from PIL import Image

def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def get_image_sizes(jsonl_path):
    """Print the smallest and largest image (by pixel area) listed in a JSONL file.

    Each record's "image" value is treated as a file path. Records whose path
    does not exist on disk are skipped silently; any exception raised while
    handling a record is reported as a warning and that record is skipped.

    Args:
        jsonl_path: Path of the JSONL file whose records are inspected.

    Returns:
        None. Results are printed.
    """
    dimensions = []

    for idx, entry in enumerate(load_jsonl(jsonl_path)):
        try:
            candidate = entry.get("image")
            if os.path.exists(candidate):
                with Image.open(candidate) as picture:
                    # The size is read from the RGB-converted image.
                    rgb = picture.convert("RGB")
                    dimensions.append(tuple(rgb.size))
        except Exception as err:
            print(f"[WARN] Skipping index {idx}: {err}")

    if len(dimensions) == 0:
        print("❌ No valid images found.")
        return

    def pixel_count(dim):
        """Return width * height for a (width, height) pair."""
        return dim[0] * dim[1]

    # min/max return the first extreme element, so ties resolve in file order
    smallest = min(dimensions, key=pixel_count)
    largest = max(dimensions, key=pixel_count)

    print(f"✅ Total images checked: {len(dimensions)}")
    print(f"🔽 Min size: {smallest[0]}x{smallest[1]} = {pixel_count(smallest)} pixels")
    print(f"🔼 Max size: {largest[0]}x{largest[1]} = {pixel_count(largest)} pixels")

# Example usage: report the smallest and largest image referenced by train.jsonl
get_image_sizes("train.jsonl")


✅ Total images checked: 878
🔽 Min size: 640x640 = 409600 pixels
🔼 Max size: 640x640 = 409600 pixels


In [1]:
"""
jsonl_to_json.py
Convert file.jsonl  ➜  file.json
Usage:  python jsonl_to_json.py   (paths are hard-coded in the __main__ block: train.jsonl -> train.json)
"""

import json
import sys
from pathlib import Path

def jsonl_to_json(src_path: str, dst_path: str) -> None:
    """Convert a JSON Lines file into a single pretty-printed JSON array.

    Args:
        src_path: Path of the UTF-8 ``.jsonl`` input; blank lines are ignored.
        dst_path: Path of the ``.json`` output, written as UTF-8 with a
            2-space indent and non-ASCII characters left unescaped.

    Returns:
        None. A one-line summary is printed after the file is written.
    """
    parsed = []

    # Parse each non-blank line as its own JSON document
    with open(src_path, "r", encoding="utf-8") as reader:
        for raw_line in reader:
            if raw_line.strip():
                parsed.append(json.loads(raw_line))

    # Emit all documents together as one JSON array
    with open(dst_path, "w", encoding="utf-8") as writer:
        json.dump(parsed, writer, ensure_ascii=False, indent=2)

    print(f"✅  Wrote {len(parsed)} records to {dst_path}")

if __name__ == "__main__":
    # Paths are fixed here rather than taken from the command line.
    src = "train.jsonl"
    dst = "train.json"

    # Convert only when the input exists; otherwise exit with a message.
    if Path(src).is_file():
        jsonl_to_json(src, dst)
    else:
        sys.exit(f"❌  Input file not found: {src}")


✅  Wrote 878 records to train.json


In [5]:
import json
import sys
from pathlib import Path
from typing import Optional

PREFIX = "../../../.."          # what you want to prepend


def process_json(src: Path, dst: Path, prefix: Optional[str] = None) -> None:
    """Prepend a prefix to the "image" field of every record in a JSON array file.

    Args:
        src: Path of the input file containing one JSON array of records.
        dst: Path of the output file; written as UTF-8, 2-space indented,
            with non-ASCII characters left unescaped.
        prefix: String to prepend to each "image" value. When ``None``
            (the default), the module-level ``PREFIX`` is used.

    Returns:
        None. A one-line summary is printed after the file is written.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If ``src`` does not contain valid JSON.
    """
    if prefix is None:
        prefix = PREFIX

    # Context managers make sure both handles are closed (and the output is
    # flushed) deterministically; UTF-8 is explicit because ensure_ascii=False
    # may emit non-ASCII characters.
    with src.open("r", encoding="utf-8") as fin:
        records = json.load(fin)

    for rec in records:
        if "image" in rec:
            # NOTE(review): plain string concatenation, no path separator is
            # inserted — presumably "image" values start with "/"; confirm.
            rec["image"] = f"{prefix}{rec['image']}"

    with dst.open("w", encoding="utf-8") as fout:
        json.dump(records, fout, ensure_ascii=False, indent=2)
    print(f"✅  Wrote {len(records)} records to {dst}")


def process_jsonl(src: Path, dst: Path, prefix: Optional[str] = None) -> None:
    """Prepend a prefix to the "image" field of every record in a JSONL file.

    Args:
        src: Path of the line-delimited JSON input; blank lines are skipped.
        dst: Path of the line-delimited JSON output; written as UTF-8 with
            non-ASCII characters left unescaped.
        prefix: String to prepend to each "image" value. When ``None``
            (the default), the module-level ``PREFIX`` is used.

    Returns:
        None. A one-line summary is printed after the file is written.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if prefix is None:
        prefix = PREFIX

    total = 0
    # UTF-8 is explicit on both sides because ensure_ascii=False may emit
    # non-ASCII characters that a locale-default encoding could reject.
    with src.open("r", encoding="utf-8") as fin, dst.open("w", encoding="utf-8") as fout:
        for line in fin:
            if not line.strip():
                continue
            rec = json.loads(line)
            if "image" in rec:
                # NOTE(review): plain string concatenation, no path separator
                # is inserted — presumably "image" values start with "/"; confirm.
                rec["image"] = f"{prefix}{rec['image']}"
            fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
            total += 1
    print(f"✅  Wrote {total} lines to {dst}")


if __name__ == "__main__":
    # Prefix the image paths of the JSON-array training file into a new file.
    src_path, dst_path = Path("train.json"), Path("train_prefix.json")
    process_json(src_path, dst_path)


✅  Wrote 878 records to train_prefix.json


In [7]:
import json, sys, os
TOKEN = "<image>\n"          # or whatever DEFAULT_IMAGE_TOKEN equals

src = "train_prefix.json"      # input
dst = "train_wtoken.json"      # output

# Read with a context manager so the handle is closed deterministically;
# the input was written as UTF-8 with non-ASCII characters unescaped.
with open(src, "r", encoding="utf-8") as fin:
    records = json.load(fin)

# For records that reference an image, make sure the first conversation turn
# (when it comes from the human side) starts with the image token.
# NOTE(review): assumes every record with "image" has a non-empty
# "conversations" list of {"from", "value"} dicts — confirm against the data.
for rec in records:
    if "image" in rec:
        first = rec["conversations"][0]
        # Skip turns that already contain the token so re-running is harmless
        if first["from"] == "human" and TOKEN not in first["value"]:
            first["value"] = f"{TOKEN}{first['value']}"

# Explicit UTF-8 because ensure_ascii=False may emit non-ASCII characters;
# the with-block guarantees the output is flushed and closed.
with open(dst, "w", encoding="utf-8") as fout:
    json.dump(records, fout, ensure_ascii=False, indent=2)
print("✅ wrote", dst)


✅ wrote train_wtoken.json
