In [1]:
import json
import random

# Input and output paths
input_file = "dogs2.jsonl"
train_file = "train.jsonl"
val_file = "val.jsonl"

# Tunables: RNG seed (for a reproducible shuffle) and the share of records
# that goes to the training split; the remainder becomes the validation split.
SEED = 42
TRAIN_FRACTION = 0.8


def _write_jsonl(path, rows):
    """Write each item of `rows` to `path` as one JSON document per line."""
    # Pin UTF-8 so the file does not depend on the platform's locale encoding.
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(row) + "\n" for row in rows)


# Set random seed for reproducibility
random.seed(SEED)

# Load all records, ignoring blank lines. The encoding is explicit because the
# default for open() is locale-dependent, while JSON text is exchanged as UTF-8.
with open(input_file, "r", encoding="utf-8") as f:
    records = [json.loads(line.strip()) for line in f if line.strip()]

# Shuffle in place, then cut at the train/validation boundary
random.shuffle(records)
split_idx = int(TRAIN_FRACTION * len(records))
train_records = records[:split_idx]
val_records = records[split_idx:]

# Save both splits
_write_jsonl(train_file, train_records)
_write_jsonl(val_file, val_records)

print(f"Split complete: {len(train_records)} train / {len(val_records)} val")

Split complete: 878 train / 220 val


In [1]:
import os
import json
from PIL import Image

def load_jsonl(path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        path: Path of the file to read; one JSON document per line.

    Returns:
        A list holding the parsed value of every non-blank line, in file order.
        Blank or whitespace-only lines are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8: the default encoding of open() depends on the
    # platform locale, which would mis-read non-ASCII text on some systems.
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def get_image_sizes(jsonl_path):
    """Print the smallest and largest image, by pixel area, listed in a JSONL file.

    Every record is read with ``load_jsonl`` and its ``"image"`` value is used
    as the path of the image file. Records that cannot be measured are skipped:

    * no ``"image"`` value, or an error while opening/decoding the image:
      a ``[WARN]`` line naming the record index is printed;
    * the path does not exist on disk: the record is counted and the total is
      reported in a single ``[WARN]`` line after the scan.

    Args:
        jsonl_path: Path of the JSON Lines file to scan.

    Returns:
        None. The image count and the min/max sizes are printed; if no image
        could be measured, only a "no valid images" message is printed.
    """
    records = load_jsonl(jsonl_path)
    sizes = []
    missing_files = 0

    for i, rec in enumerate(records):
        try:
            image_path = rec.get("image")
            if not image_path:
                # Without this guard os.path.exists(None) raises a TypeError
                # whose message does not say what is actually wrong.
                print(f"[WARN] Skipping index {i}: record has no 'image' path")
                continue
            if not os.path.exists(image_path):
                missing_files += 1
                continue

            with Image.open(image_path) as img:
                # Image.open is lazy (see Pillow docs); convert() forces the
                # pixel data to be decoded, so a file that opens but cannot be
                # decoded raises here and is skipped. The size is unaffected.
                img = img.convert("RGB")
                width, height = img.size
                sizes.append((width, height))
        except Exception as e:
            # Best-effort scan: one bad record must not abort the whole report.
            print(f"[WARN] Skipping index {i}: {e}")

    if missing_files:
        # Surface missing paths instead of dropping them silently; otherwise an
        # all-missing dataset is indistinguishable from an empty one.
        print(f"[WARN] {missing_files} record(s) skipped: image file not found")

    if not sizes:
        print("❌ No valid images found.")
        return

    # Attach the pixel count to each size so min/max can rank by area
    sizes_by_area = [(w, h, w * h) for w, h in sizes]
    min_size = min(sizes_by_area, key=lambda x: x[2])
    max_size = max(sizes_by_area, key=lambda x: x[2])

    print(f"✅ Total images checked: {len(sizes)}")
    print(f"🔽 Min size: {min_size[0]}x{min_size[1]} = {min_size[2]} pixels")
    print(f"🔼 Max size: {max_size[0]}x{max_size[1]} = {max_size[2]} pixels")

# Example usage: print the image count and the min/max image size (by pixel
# area) for the records listed in train.jsonl.
get_image_sizes("train.jsonl")


✅ Total images checked: 878
🔽 Min size: 640x640 = 409600 pixels
🔼 Max size: 640x640 = 409600 pixels
