In [2]:
from pathlib import Path
from collections import defaultdict
import shutil
import random

# paths
image_dir = Path("/home/siu856582712/Documents/dataset/Clinical/images")
label_dir = Path("/home/siu856582712/Documents/dataset/Clinical/labels")
output_base = Path("/home/siu856582712/Documents/dataset/Clinical_split")

# target instance counts
target_train_per_class = 834
class_totals = {0: 3371, 1: 1462, 2: 5799, 3: 1192}
target_val = {k: int((v - target_train_per_class) * 2 / 3) for k, v in class_totals.items()}
target_test = {k: class_totals[k] - target_train_per_class - target_val[k] for k in range(4)}

# parse labels → per-image class counts
image_to_class_counts = defaultdict(lambda: defaultdict(int))
for label_path in label_dir.glob("*.txt"):
    img_name = f"{label_path.stem}.jpg"
    with open(label_path, "r") as f:
        for line in f:
            parts = line.strip().split()
            if not parts:
                continue
            cls = int(parts[0])
            image_to_class_counts[img_name][cls] += 1

# build (class → list of (image, count)) index
class_to_image_list = defaultdict(list)
for img, counts in image_to_class_counts.items():
    for cls, cnt in counts.items():
        class_to_image_list[cls].append((img, cnt))

# select training images greedily by per-class instance counts
train_set = set()
class_instance_counter = {i: 0 for i in range(4)}

for cls in range(4):
    random.shuffle(class_to_image_list[cls])
    for img, cnt in class_to_image_list[cls]:
        if class_instance_counter[cls] >= target_train_per_class:
            break
        train_set.add(img)
        class_instance_counter[cls] += cnt

# assign remaining images to val/test without overlap
remaining_images = set(image_to_class_counts.keys()) - train_set
val_set, test_set = set(), set()
val_counts = {i: 0 for i in range(4)}
test_counts = {i: 0 for i in range(4)}
used_set = set()

def assign_split(counts, targets, split_set, used, class_id):
    for img in list(remaining_images - used):
        cnt = image_to_class_counts[img].get(class_id, 0)
        if cnt == 0:
            continue
        if counts[class_id] >= targets[class_id]:
            break
        split_set.add(img)
        used.add(img)
        counts[class_id] += cnt

for cls in range(4):
    assign_split(val_counts, target_val, val_set, used_set, cls)
for cls in range(4):
    assign_split(test_counts, target_test, test_set, used_set, cls)

# copy files to split folders
def copy_split(split_set, split_name):
    img_out = output_base / split_name / "images"
    lbl_out = output_base / split_name / "labels"
    img_out.mkdir(parents=True, exist_ok=True)
    lbl_out.mkdir(parents=True, exist_ok=True)
    for img in split_set:
        shutil.copy(image_dir / img, img_out / img)
        lbl = f"{Path(img).stem}.txt"
        shutil.copy(label_dir / lbl, lbl_out / lbl)

copy_split(train_set, "train")
copy_split(val_set, "val")
copy_split(test_set, "test")

# summary
def count_instances(split_set):
    counts = {i: 0 for i in range(4)}
    for img in split_set:
        for cls, cnt in image_to_class_counts[img].items():
            counts[cls] += cnt
    return counts

print("Done. Summary:")
print(f"Train images: {len(train_set)} | Instances: {count_instances(train_set)}")
print(f"Val   images: {len(val_set)} | Instances: {count_instances(val_set)}")
print(f"Test  images: {len(test_set)} | Instances: {count_instances(test_set)}")

# write data.yaml for Ultralytics
yaml_path = output_base / "data.yaml"
with open(yaml_path, "w") as f:
    f.write(f"path: {output_base}\n")
    f.write("train: train/images\n")
    f.write("val: val/images\n")
    f.write("test: test/images\n")
    f.write("names: ['neg_cocci', 'pos_cocci', 'neg_bacilli', 'pos_bacilli']\n")
print(f"data.yaml saved to: {yaml_path}")


✅ Done! Summary:
Train images: 2008 | Instances: {0: 1200, 1: 999, 2: 1419, 3: 911}
Val   images: 3629   | Instances: {0: 1877, 1: 463, 2: 4086, 3: 281}
Test  images: 368  | Instances: {0: 294, 1: 0, 2: 294, 3: 0}

✅ YOLOv8 data.yaml saved to: /home/siu856582712/Documents/dataset/Clinical_split/data.yaml


In [None]:
#the seond attempt

In [3]:
from pathlib import Path
from collections import defaultdict
import shutil

# paths
image_dir = Path("/home/siu856582712/Documents/dataset/Clinical/images")
label_dir = Path("/home/siu856582712/Documents/dataset/Clinical/labels")
output_dir = Path("/home/siu856582712/Documents/dataset/Clinical_split_final")
output_dir.mkdir(parents=True, exist_ok=True)

# per-image class counts
image_to_class_counts = defaultdict(lambda: defaultdict(int))
class_totals = {0: 3371, 1: 1462, 2: 5799, 3: 1192}

for label_path in label_dir.glob("*.txt"):
    img_name = f"{label_path.stem}.jpg"
    with open(label_path, "r") as f:
        for line in f:
            parts = line.strip().split()
            if not parts:
                continue
            cls = int(parts[0])
            image_to_class_counts[img_name][cls] += 1

# sweep training targets; keep the best-balanced result
best_result = None
for target in range(900, 690, -10):
    train_set = set()
    train_counts = {i: 0 for i in range(4)}
    remaining = set(image_to_class_counts.keys())

    used_images = set()
    while True:
        changed = False
        for img in list(remaining):
            if img in used_images:
                continue
            add_this = False
            for cls in range(4):
                if train_counts[cls] >= target:
                    continue
                cnt = image_to_class_counts[img].get(cls, 0)
                if cnt > 0:
                    add_this = True
                    break
            if add_this:
                train_set.add(img)
                used_images.add(img)
                for cls, cnt in image_to_class_counts[img].items():
                    train_counts[cls] += cnt
                changed = True
        if not changed:
            break

    if all(train_counts[c] >= target for c in range(4)):
        balance = max(train_counts.values()) - min(train_counts.values())
        if best_result is None or balance < best_result["balance"]:
            best_result = {
                "target": target,
                "train_set": train_set.copy(),
                "train_counts": train_counts.copy(),
                "balance": balance,
            }

if best_result is None:
    raise RuntimeError("No balanced training split found in the given target range.")

# split remaining into val/test following residual proportions
train_set = best_result["train_set"]
train_counts = best_result["train_counts"]
remaining = set(image_to_class_counts.keys()) - train_set

val_set, test_set = set(), set()
val_counts = {i: 0 for i in range(4)}
test_counts = {i: 0 for i in range(4)}
val_target = {i: int((class_totals[i] - train_counts[i]) * 2 / 3) for i in range(4)}
test_target = {i: class_totals[i] - train_counts[i] - val_target[i] for i in range(4)}

used = set()

for cls in range(4):
    for img in list(remaining - used):
        cnt = image_to_class_counts[img].get(cls, 0)
        if cnt == 0:
            continue
        if val_counts[cls] < val_target[cls]:
            val_set.add(img)
            used.add(img)
            for c, n in image_to_class_counts[img].items():
                val_counts[c] += n

for cls in range(4):
    for img in list(remaining - used):
        cnt = image_to_class_counts[img].get(cls, 0)
        if cnt == 0:
            continue
        if test_counts[cls] < test_target[cls]:
            test_set.add(img)
            used.add(img)
            for c, n in image_to_class_counts[img].items():
                test_counts[c] += n

# copy files into split folders
def copy_files(split_set, split_name):
    img_out = output_dir / split_name / "images"
    lbl_out = output_dir / split_name / "labels"
    img_out.mkdir(parents=True, exist_ok=True)
    lbl_out.mkdir(parents=True, exist_ok=True)
    for img in split_set:
        shutil.copy(image_dir / img, img_out / img)
        lbl = f"{Path(img).stem}.txt"
        shutil.copy(label_dir / lbl, lbl_out / lbl)

copy_files(train_set, "train")
copy_files(val_set, "val")
copy_files(test_set, "test")

# write YOLOv8 data.yaml
with open(output_dir / "data.yaml", "w") as f:
    f.write(f"path: {output_dir}\n")
    f.write("train: train/images\n")
    f.write("val: val/images\n")
    f.write("test: test/images\n")
    f.write("names: ['neg_cocci', 'pos_cocci', 'neg_bacilli', 'pos_bacilli']\n")

# summary
def count_instances(split_set):
    counts = {i: 0 for i in range(4)}
    for img in split_set:
        for cls, cnt in image_to_class_counts[img].items():
            counts[cls] += cnt
    return counts

print("Final Split Summary:")
print(f"Train ({len(train_set)} images): {count_instances(train_set)}")
print(f"Val   ({len(val_set)} images): {count_instances(val_set)}")
print(f"Test  ({len(test_set)} images): {count_instances(test_set)}")
print(f"data.yaml created at: {output_dir / 'data.yaml'}")


✅ Final Split Summary:
Train (1585 images): {0: 831, 1: 749, 2: 1068, 3: 701}
Val   (2965 images): {0: 1880, 1: 475, 2: 3154, 3: 327}
Test  (1455 images): {0: 660, 1: 238, 2: 1577, 3: 164}
📄 YOLOv8 data.yaml created at: /home/siu856582712/Documents/dataset/Clinical_split_final/data.yaml


In [5]:
# # code for enhance the numbe of insatnces for class 2 , the spliting for the seond attempt
# Scan each YOLO label file in the training label folder.

# Filter out files that contain only class 2 instances.

# Count the number of class 2 instances in those files.

# Accumulate such files until we reach or slightly exceed ~343 instances.

# Split those selected files:

# 2/3 to validation set.

# 1/3 to test set.

# Move both .txt and .jpg files for those selected samples from train/ to either val/ or test/ folders.

# Double-check final class 2 instance counts and print confirmatio

In [4]:
import os
import shutil
from pathlib import Path

# === Define paths ===
train_labels_dir = Path("/home/siu856582712/Documents/dataset/dataset/train/labels")
train_images_dir = Path("/home/siu856582712/Documents/dataset/dataset/train/images")
val_labels_dir   = Path("/home/siu856582712/Documents/dataset/dataset/val/labels")
val_images_dir   = Path("/home/siu856582712/Documents/dataset/dataset/val/images")
test_labels_dir  = Path("/home/siu856582712/Documents/dataset/dataset/test/labels")
test_images_dir  = Path("/home/siu856582712/Documents/dataset/dataset/test/images")

# === Configuration ===
target_reduce_instances = 343  # reduce from 1068 to ~725

# === Step 1: Collect class-2-only files ===
selected_files = []
total_class2_instances = 0

for label_file in train_labels_dir.glob("*.txt"):
    with open(label_file, 'r') as f:
        lines = f.readlines()
    classes = [int(line.strip().split()[0]) for line in lines]
    if all(c == 2 for c in classes):  # Only class 2
        instance_count = len(classes)
        selected_files.append((label_file.name, instance_count))
        total_class2_instances += instance_count
        if total_class2_instances >= target_reduce_instances:
            break

print(f"Selected {len(selected_files)} files with {total_class2_instances} class-2 instances to move.")

# === Step 2: Split into val/test (2:1) ===
val_split = int(len(selected_files) * 2 / 3)
val_files = selected_files[:val_split]
test_files = selected_files[val_split:]

def move_files(file_list, from_labels, from_images, to_labels, to_images):
    for fname, _ in file_list:
        label_path = from_labels / fname
        image_path = from_images / (fname.replace(".txt", ".jpg"))

        # Check both exist
        if label_path.exists() and image_path.exists():
            shutil.move(str(label_path), str(to_labels / fname))
            shutil.move(str(image_path), str(to_images / image_path.name))

# === Step 3: Move the files ===
move_files(val_files, train_labels_dir, train_images_dir, val_labels_dir, val_images_dir)
move_files(test_files, train_labels_dir, train_images_dir, test_labels_dir, test_images_dir)

# === Step 4: Final report ===
print(f"Moved {len(val_files)} files to validation set.")
print(f"Moved {len(test_files)} files to test set.")
print(" Balancing complete.")


Selected 213 files with 347 class-2 instances to move.
Moved 142 files to validation set.
Moved 71 files to test set.
✅ Balancing complete.


In [None]:
# Class 0 Reduction:

# Reduce from 831 → ~725 in the training set by moving ~106 class 0 instances.

# Only from .txt files that contain only class 0.

# Move 2/3 to val, 1/3 to test.

# Move corresponding .jpg image files as well.

# Print per-class instance counts:

# Count how many instances of each class (0–3) are in:

# Train

# Val

# Test

# Then show total per class across all.

In [6]:
import os
import shutil
from pathlib import Path
from collections import Counter

# === Define paths ===
train_labels_dir = Path("/home/siu856582712/Documents/dataset/dataset/train/labels")
train_images_dir = Path("/home/siu856582712/Documents/dataset/dataset/train/images")
val_labels_dir   = Path("/home/siu856582712/Documents/dataset/dataset/val/labels")
val_images_dir   = Path("/home/siu856582712/Documents/dataset/dataset/val/images")
test_labels_dir  = Path("/home/siu856582712/Documents/dataset/dataset/test/labels")
test_images_dir  = Path("/home/siu856582712/Documents/dataset/dataset/test/images")

# === Part 1: Rebalance Class 0 ===
target_reduce_instances_class0 = 106  # Reduce from 831 to ~725
selected_class0_files = []
total_class0_instances = 0

# Step 1: Find files with only class 0
for label_file in train_labels_dir.glob("*.txt"):
    with open(label_file, 'r') as f:
        lines = f.readlines()
    classes = [int(line.strip().split()[0]) for line in lines]
    if all(c == 0 for c in classes):
        instance_count = len(classes)
        selected_class0_files.append((label_file.name, instance_count))
        total_class0_instances += instance_count
        if total_class0_instances >= target_reduce_instances_class0:
            break

print(f"\n📦 Selected {len(selected_class0_files)} class-0-only files with {total_class0_instances} instances to move...")

# Step 2: Split and move files
def move_files(file_list, from_labels, from_images, to_labels, to_images):
    for fname, _ in file_list:
        label_path = from_labels / fname
        image_path = from_images / (fname.replace(".txt", ".jpg"))

        if label_path.exists() and image_path.exists():
            shutil.move(str(label_path), str(to_labels / fname))
            shutil.move(str(image_path), str(to_images / image_path.name))

val_split = int(len(selected_class0_files) * 2 / 3)
val_class0_files = selected_class0_files[:val_split]
test_class0_files = selected_class0_files[val_split:]

move_files(val_class0_files, train_labels_dir, train_images_dir, val_labels_dir, val_images_dir)
move_files(test_class0_files, train_labels_dir, train_images_dir, test_labels_dir, test_images_dir)

print(f"✅ Moved {len(val_class0_files)} class-0 files to validation set.")
print(f"✅ Moved {len(test_class0_files)} class-0 files to test set.")

# === Part 2: Count class instances in each split ===
def count_class_instances(labels_dir):
    counts = Counter()
    for label_file in labels_dir.glob("*.txt"):
        with open(label_file, 'r') as f:
            lines = f.readlines()
        classes = [int(line.strip().split()[0]) for line in lines]
        counts.update(classes)
    return counts

train_counts = count_class_instances(train_labels_dir)
val_counts   = count_class_instances(val_labels_dir)
test_counts  = count_class_instances(test_labels_dir)

# === Print results ===
all_classes = [0, 1, 2, 3]
print("\n📊 Final Class Counts:")
print("Split      | Class 0 | Class 1 | Class 2 | Class 3 | Total")
print("-----------------------------------------------------------")

def format_counts(name, counts):
    total = sum(counts[c] for c in all_classes)
    values = " | ".join(f"{counts.get(c, 0):7d}" for c in all_classes)
    print(f"{name:<10} | {values} | {total:5d}")

format_counts("Train", train_counts)
format_counts("Val", val_counts)
format_counts("Test", test_counts)

# === Total across all ===
total_counts = Counter()
for c in all_classes:
    total_counts[c] = train_counts.get(c, 0) + val_counts.get(c, 0) + test_counts.get(c, 0)

format_counts("TOTAL", total_counts)



📦 Selected 73 class-0-only files with 106 instances to move...
✅ Moved 48 class-0 files to validation set.
✅ Moved 25 class-0 files to test set.

📊 Final Class Counts:
Split      | Class 0 | Class 1 | Class 2 | Class 3 | Total
-----------------------------------------------------------
Train      |     725 |     749 |     721 |     701 |  2896
Val        |    1954 |     475 |    3368 |     327 |  6124
Test       |     692 |     238 |    1710 |     164 |  2804
TOTAL      |    3371 |    1462 |    5799 |    1192 | 11824


In [None]:
# Final Plan for Class 1 Rebalancing
# Target: Reduce class 1 in train/ from 749 to ~700 (removing ~49 or slightly more).

# Approach:

# Look for label files with only class 1.

# Collect enough to remove ≥49 class 1 instances.

# Move 2/3 to val/, 1/3 to test/, with their .jpg images.

# Then, print updated per-class counts across train, val, test, and total.

In [7]:
import os
import shutil
from pathlib import Path
from collections import Counter

# === Define paths ===
train_labels_dir = Path("/home/siu856582712/Documents/dataset/dataset/train/labels")
train_images_dir = Path("/home/siu856582712/Documents/dataset/dataset/train/images")
val_labels_dir   = Path("/home/siu856582712/Documents/dataset/dataset/val/labels")
val_images_dir   = Path("/home/siu856582712/Documents/dataset/dataset/val/images")
test_labels_dir  = Path("/home/siu856582712/Documents/dataset/dataset/test/labels")
test_images_dir  = Path("/home/siu856582712/Documents/dataset/dataset/test/images")

# === Shared move function ===
def move_files(file_list, from_labels, from_images, to_labels, to_images):
    for fname, _ in file_list:
        label_path = from_labels / fname
        image_path = from_images / (fname.replace(".txt", ".jpg"))
        if label_path.exists() and image_path.exists():
            shutil.move(str(label_path), str(to_labels / fname))
            shutil.move(str(image_path), str(to_images / image_path.name))

# === Step: Rebalance Class 1 ===
target_reduce_instances_class1 = 49  # Reduce from 749 to ~700
selected_class1_files = []
total_class1_instances = 0

for label_file in train_labels_dir.glob("*.txt"):
    with open(label_file, 'r') as f:
        lines = f.readlines()
    classes = [int(line.strip().split()[0]) for line in lines]
    if all(c == 1 for c in classes):  # only class 1
        instance_count = len(classes)
        selected_class1_files.append((label_file.name, instance_count))
        total_class1_instances += instance_count
        if total_class1_instances >= target_reduce_instances_class1:
            break

print(f"\n📦 Selected {len(selected_class1_files)} class-1-only files with {total_class1_instances} instances to move...")

val_split = int(len(selected_class1_files) * 2 / 3)
val_class1_files = selected_class1_files[:val_split]
test_class1_files = selected_class1_files[val_split:]

move_files(val_class1_files, train_labels_dir, train_images_dir, val_labels_dir, val_images_dir)
move_files(test_class1_files, train_labels_dir, train_images_dir, test_labels_dir, test_images_dir)

print(f"✅ Moved {len(val_class1_files)} class-1 files to validation set.")
print(f"✅ Moved {len(test_class1_files)} class-1 files to test set.")

# === Final Count Report ===
def count_class_instances(labels_dir):
    counts = Counter()
    for label_file in labels_dir.glob("*.txt"):
        with open(label_file, 'r') as f:
            lines = f.readlines()
        classes = [int(line.strip().split()[0]) for line in lines]
        counts.update(classes)
    return counts

train_counts = count_class_instances(train_labels_dir)
val_counts   = count_class_instances(val_labels_dir)
test_counts  = count_class_instances(test_labels_dir)

# === Print Summary ===
all_classes = [0, 1, 2, 3]
print("\n📊 Final Class Counts:")
print("Split      | Class 0 | Class 1 | Class 2 | Class 3 | Total")
print("-----------------------------------------------------------")

def format_counts(name, counts):
    total = sum(counts[c] for c in all_classes)
    values = " | ".join(f"{counts.get(c, 0):7d}" for c in all_classes)
    print(f"{name:<10} | {values} | {total:5d}")

format_counts("Train", train_counts)
format_counts("Val", val_counts)
format_counts("Test", test_counts)

# === Totals Across All ===
total_counts = Counter()
for c in all_classes:
    total_counts[c] = train_counts.get(c, 0) + val_counts.get(c, 0) + test_counts.get(c, 0)

format_counts("TOTAL", total_counts)



📦 Selected 36 class-1-only files with 49 instances to move...
✅ Moved 24 class-1 files to validation set.
✅ Moved 12 class-1 files to test set.

📊 Final Class Counts:
Split      | Class 0 | Class 1 | Class 2 | Class 3 | Total
-----------------------------------------------------------
Train      |     725 |     700 |     721 |     701 |  2847
Val        |    1954 |     508 |    3368 |     327 |  6157
Test       |     692 |     254 |    1710 |     164 |  2820
TOTAL      |    3371 |    1462 |    5799 |    1192 | 11824


In [None]:
# Move enough .txt files from val to test such that about 190 class 0 instances shift over — only using files that contain only class 0.

In [8]:
import shutil
from pathlib import Path
from collections import Counter

# === Paths ===
val_labels_dir  = Path("/home/siu856582712/Documents/dataset/dataset/val/labels")
val_images_dir  = Path("/home/siu856582712/Documents/dataset/dataset/val/images")
test_labels_dir = Path("/home/siu856582712/Documents/dataset/dataset/test/labels")
test_images_dir = Path("/home/siu856582712/Documents/dataset/dataset/test/images")
train_labels_dir = Path("/home/siu856582712/Documents/dataset/dataset/train/labels")

# === Move function ===
def move_files(file_list, from_labels, from_images, to_labels, to_images):
    for fname, _ in file_list:
        label_path = from_labels / fname
        image_path = from_images / (fname.replace(".txt", ".jpg"))
        if label_path.exists() and image_path.exists():
            shutil.move(str(label_path), str(to_labels / fname))
            shutil.move(str(image_path), str(to_images / image_path.name))

# === Step 1: Move ~190 class-0 instances from val to test ===
target_move_instances = 190
selected_files = []
total_moved = 0

for label_file in val_labels_dir.glob("*.txt"):
    with open(label_file, 'r') as f:
        lines = f.readlines()
    classes = [int(line.strip().split()[0]) for line in lines]
    if all(c == 0 for c in classes):
        instance_count = len(classes)
        selected_files.append((label_file.name, instance_count))
        total_moved += instance_count
        if total_moved >= target_move_instances:
            break

# Move them
move_files(selected_files, val_labels_dir, val_images_dir, test_labels_dir, test_images_dir)

print(f"\n✅ Moved {len(selected_files)} files containing {total_moved} class-0 instances from val → test.")

# === Step 2: Count Instances in All Splits ===
def count_class_instances(labels_dir):
    counts = Counter()
    for label_file in labels_dir.glob("*.txt"):
        with open(label_file, 'r') as f:
            lines = f.readlines()
        classes = [int(line.strip().split()[0]) for line in lines]
        counts.update(classes)
    return counts

# Collect counts
train_counts = count_class_instances(train_labels_dir)
val_counts   = count_class_instances(val_labels_dir)
test_counts  = count_class_instances(test_labels_dir)

# === Print Results ===
all_classes = [0, 1, 2, 3]
print("\n📊 Final Class Counts:")
print("Split      | Class 0 | Class 1 | Class 2 | Class 3 | Total")
print("-----------------------------------------------------------")

def format_counts(name, counts):
    total = sum(counts[c] for c in all_classes)
    values = " | ".join(f"{counts.get(c, 0):7d}" for c in all_classes)
    print(f"{name:<10} | {values} | {total:5d}")

format_counts("Train", train_counts)
format_counts("Val", val_counts)
format_counts("Test", test_counts)

# Totals
total_counts = Counter()
for c in all_classes:
    total_counts[c] = train_counts.get(c, 0) + val_counts.get(c, 0) + test_counts.get(c, 0)
format_counts("TOTAL", total_counts)



✅ Moved 138 files containing 190 class-0 instances from val → test.

📊 Final Class Counts:
Split      | Class 0 | Class 1 | Class 2 | Class 3 | Total
-----------------------------------------------------------
Train      |     725 |     700 |     721 |     701 |  2847
Val        |    1764 |     508 |    3368 |     327 |  5967
Test       |     882 |     254 |    1710 |     164 |  3010
TOTAL      |    3371 |    1462 |    5799 |    1192 | 11824


In [None]:
# Final Plan to Boost Class 3 in Train Set
# Current class 3 in training: 701

# Target: ~834
# → So we need to add ~133 instances of class 3 to the train set

# ✅ Steps:
# Look inside val/labels and test/labels for .txt files that contain only class 3.

# Accumulate enough files to get at least 133 class 3 instances.

# Move these files and their corresponding .jpg images to:

# train/labels

# train/images



In [9]:
import shutil
from pathlib import Path
from collections import Counter

# === Paths ===
train_labels_dir = Path("/home/siu856582712/Documents/dataset/dataset/train/labels")
train_images_dir = Path("/home/siu856582712/Documents/dataset/dataset/train/images")
val_labels_dir   = Path("/home/siu856582712/Documents/dataset/dataset/val/labels")
val_images_dir   = Path("/home/siu856582712/Documents/dataset/dataset/val/images")
test_labels_dir  = Path("/home/siu856582712/Documents/dataset/dataset/test/labels")
test_images_dir  = Path("/home/siu856582712/Documents/dataset/dataset/test/images")

# === Move function ===
def move_files(file_list, from_labels, from_images, to_labels, to_images):
    for fname, _ in file_list:
        label_path = from_labels / fname
        image_path = from_images / (fname.replace(".txt", ".jpg"))
        if label_path.exists() and image_path.exists():
            shutil.move(str(label_path), str(to_labels / fname))
            shutil.move(str(image_path), str(to_images / image_path.name))

# === Step: Add ~133 class-3-only instances to train ===
target_add_instances = 133
selected_files = []
moved_instances = 0

# First search in val
for label_file in val_labels_dir.glob("*.txt"):
    with open(label_file, 'r') as f:
        lines = f.readlines()
    classes = [int(line.strip().split()[0]) for line in lines]
    if all(c == 3 for c in classes):
        instance_count = len(classes)
        selected_files.append((label_file.name, instance_count, "val"))
        moved_instances += instance_count
        if moved_instances >= target_add_instances:
            break

# If not enough from val, continue in test
if moved_instances < target_add_instances:
    for label_file in test_labels_dir.glob("*.txt"):
        with open(label_file, 'r') as f:
            lines = f.readlines()
        classes = [int(line.strip().split()[0]) for line in lines]
        if all(c == 3 for c in classes):
            instance_count = len(classes)
            selected_files.append((label_file.name, instance_count, "test"))
            moved_instances += instance_count
            if moved_instances >= target_add_instances:
                break

# === Move selected files to train ===
val_class3 = [(f, n) for f, n, src in selected_files if src == "val"]
test_class3 = [(f, n) for f, n, src in selected_files if src == "test"]

move_files(val_class3, val_labels_dir, val_images_dir, train_labels_dir, train_images_dir)
move_files(test_class3, test_labels_dir, test_images_dir, train_labels_dir, train_images_dir)

print(f"\n✅ Moved {len(val_class3)} files from val and {len(test_class3)} from test to train.")
print(f"✅ Added approximately {moved_instances} class-3 instances to train set.")

# === Final Count Summary ===
def count_class_instances(labels_dir):
    counts = Counter()
    for label_file in labels_dir.glob("*.txt"):
        with open(label_file, 'r') as f:
            lines = f.readlines()
        classes = [int(line.strip().split()[0]) for line in lines]
        counts.update(classes)
    return counts

# Collect counts
train_counts = count_class_instances(train_labels_dir)
val_counts   = count_class_instances(val_labels_dir)
test_counts  = count_class_instances(test_labels_dir)

# === Print Results ===
all_classes = [0, 1, 2, 3]
print("\n📊 Final Class Counts:")
print("Split      | Class 0 | Class 1 | Class 2 | Class 3 | Total")
print("-----------------------------------------------------------")

def format_counts(name, counts):
    total = sum(counts[c] for c in all_classes)
    values = " | ".join(f"{counts.get(c, 0):7d}" for c in all_classes)
    print(f"{name:<10} | {values} | {total:5d}")

format_counts("Train", train_counts)
format_counts("Val", val_counts)
format_counts("Test", test_counts)

# Totals
total_counts = Counter()
for c in all_classes:
    total_counts[c] = train_counts.get(c, 0) + val_counts.get(c, 0) + test_counts.get(c, 0)
format_counts("TOTAL", total_counts)



✅ Moved 85 files from val and 0 from test to train.
✅ Added approximately 134 class-3 instances to train set.

📊 Final Class Counts:
Split      | Class 0 | Class 1 | Class 2 | Class 3 | Total
-----------------------------------------------------------
Train      |     725 |     700 |     721 |     835 |  2981
Val        |    1764 |     508 |    3368 |     193 |  5833
Test       |     882 |     254 |    1710 |     164 |  3010
TOTAL      |    3371 |    1462 |    5799 |    1192 | 11824


In [None]:
# Current Test = 164 (too high)

# Target Test = 119

# So we must move ~45 class-3 instances from test → val

# Only from files that have only class 3

In [10]:
import shutil
from pathlib import Path
from collections import Counter

# === Define paths ===
train_labels_dir = Path("/home/siu856582712/Documents/dataset/dataset/train/labels")
train_images_dir = Path("/home/siu856582712/Documents/dataset/dataset/train/images")
val_labels_dir   = Path("/home/siu856582712/Documents/dataset/dataset/val/labels")
val_images_dir   = Path("/home/siu856582712/Documents/dataset/dataset/val/images")
test_labels_dir  = Path("/home/siu856582712/Documents/dataset/dataset/test/labels")
test_images_dir  = Path("/home/siu856582712/Documents/dataset/dataset/test/images")

# === Move function ===
def move_files(file_list, from_labels, from_images, to_labels, to_images):
    for fname, _ in file_list:
        label_path = from_labels / fname
        image_path = from_images / (fname.replace(".txt", ".jpg"))
        if label_path.exists() and image_path.exists():
            shutil.move(str(label_path), str(to_labels / fname))
            shutil.move(str(image_path), str(to_images / image_path.name))

# === Step: Move ~45 class-3-only instances from test → val ===
target_move_instances = 45
selected_files = []
moved_instances = 0

for label_file in test_labels_dir.glob("*.txt"):
    with open(label_file, 'r') as f:
        lines = f.readlines()
    classes = [int(line.strip().split()[0]) for line in lines]
    if all(c == 3 for c in classes):
        instance_count = len(classes)
        selected_files.append((label_file.name, instance_count))
        moved_instances += instance_count
        if moved_instances >= target_move_instances:
            break

# Move them
move_files(selected_files, test_labels_dir, test_images_dir, val_labels_dir, val_images_dir)

print(f"\n✅ Moved {len(selected_files)} files with {moved_instances} class-3 instances from test → val.")

# === Final Count Report ===
def count_class_instances(labels_dir):
    counts = Counter()
    for label_file in labels_dir.glob("*.txt"):
        with open(label_file, 'r') as f:
            lines = f.readlines()
        classes = [int(line.strip().split()[0]) for line in lines]
        counts.update(classes)
    return counts

# Collect counts
train_counts = count_class_instances(train_labels_dir)
val_counts   = count_class_instances(val_labels_dir)
test_counts  = count_class_instances(test_labels_dir)

# === Print Summary ===
all_classes = [0, 1, 2, 3]
print("\n📊 Final Class Counts:")
print("Split      | Class 0 | Class 1 | Class 2 | Class 3 | Total")
print("-----------------------------------------------------------")

def format_counts(name, counts):
    total = sum(counts[c] for c in all_classes)
    values = " | ".join(f"{counts.get(c, 0):7d}" for c in all_classes)
    print(f"{name:<10} | {values} | {total:5d}")

format_counts("Train", train_counts)
format_counts("Val", val_counts)
format_counts("Test", test_counts)

# Totals
total_counts = Counter()
for c in all_classes:
    total_counts[c] = train_counts.get(c, 0) + val_counts.get(c, 0) + test_counts.get(c, 0)
format_counts("TOTAL", total_counts)



✅ Moved 34 files with 46 class-3 instances from test → val.

📊 Final Class Counts:
Split      | Class 0 | Class 1 | Class 2 | Class 3 | Total
-----------------------------------------------------------
Train      |     725 |     700 |     721 |     835 |  2981
Val        |    1764 |     508 |    3368 |     239 |  5879
Test       |     882 |     254 |    1710 |     118 |  2964
TOTAL      |    3371 |    1462 |    5799 |    1192 | 11824


In [None]:
# 🔹 Step 1 — Add ~113 class 2 instances to train:
# Search val/labels and test/labels for .txt files with only class 2

# Move enough to reach 113 class 2 instances into train

# Move both .txt and corresponding .jpg

# 🔹 Step 2 — Fix val:test ratio (2:1) for remaining class 2:
# After step 1, recompute remaining class 2 in val/test

# Calculate target 2:1 ratio (val = 2x, test = x)

# If current val:test ≠ 2:1, move class-2-only files from val → test until it is



In [11]:
import shutil
from pathlib import Path
from collections import Counter

# === Define paths ===
train_labels_dir = Path("/home/siu856582712/Documents/dataset/dataset/train/labels")
train_images_dir = Path("/home/siu856582712/Documents/dataset/dataset/train/images")
val_labels_dir   = Path("/home/siu856582712/Documents/dataset/dataset/val/labels")
val_images_dir   = Path("/home/siu856582712/Documents/dataset/dataset/val/images")
test_labels_dir  = Path("/home/siu856582712/Documents/dataset/dataset/test/labels")
test_images_dir  = Path("/home/siu856582712/Documents/dataset/dataset/test/images")

# === Move function ===
def move_files(file_list, from_labels, from_images, to_labels, to_images):
    for fname, _ in file_list:
        label_path = from_labels / fname
        image_path = from_images / (fname.replace(".txt", ".jpg"))
        if label_path.exists() and image_path.exists():
            shutil.move(str(label_path), str(to_labels / fname))
            shutil.move(str(image_path), str(to_images / image_path.name))

# === Step 1: Move ~113 class-2-only instances to train ===
target_add_instances = 113
selected_files = []
moved_instances = 0

# Search val and test for class-2-only label files
for src_label_dir, src_img_dir, src_name in [(val_labels_dir, val_images_dir, "val"),
                                             (test_labels_dir, test_images_dir, "test")]:
    for label_file in src_label_dir.glob("*.txt"):
        with open(label_file, 'r') as f:
            lines = f.readlines()
        classes = [int(line.strip().split()[0]) for line in lines]
        if all(c == 2 for c in classes):
            instance_count = len(classes)
            selected_files.append((label_file.name, instance_count, src_name))
            moved_instances += instance_count
            if moved_instances >= target_add_instances:
                break
    if moved_instances >= target_add_instances:
        break

# Group and move to train
val_to_train = [(f, n) for f, n, src in selected_files if src == "val"]
test_to_train = [(f, n) for f, n, src in selected_files if src == "test"]

move_files(val_to_train, val_labels_dir, val_images_dir, train_labels_dir, train_images_dir)
move_files(test_to_train, test_labels_dir, test_images_dir, train_labels_dir, train_images_dir)

print(f"\n✅ Moved {len(val_to_train)} files from val and {len(test_to_train)} from test to train.")
print(f"✅ Added approximately {moved_instances} class-2 instances to train.")

# === Step 2: Fix val:test 2:1 ratio for remaining class 2 ===
def count_class_instances(labels_dir):
    counts = Counter()
    for label_file in labels_dir.glob("*.txt"):
        with open(label_file, 'r') as f:
            lines = f.readlines()
        classes = [int(line.strip().split()[0]) for line in lines]
        counts.update(classes)
    return counts

val_counts = count_class_instances(val_labels_dir)
test_counts = count_class_instances(test_labels_dir)

current_val_2 = val_counts.get(2, 0)
current_test_2 = test_counts.get(2, 0)
remaining_total = current_val_2 + current_test_2
ideal_test = remaining_total // 3
ideal_val = remaining_total - ideal_test

print(f"\nClass 2 post-move: val={current_val_2}, test={current_test_2}")
print(f" Target 2:1 ratio → val={ideal_val}, test={ideal_test}")

# If test has too many, move from test → val
if current_test_2 > ideal_test:
    to_move = current_test_2 - ideal_test
    moved = 0
    files_to_move = []
    for label_file in test_labels_dir.glob("*.txt"):
        with open(label_file, 'r') as f:
            lines = f.readlines()
        classes = [int(line.strip().split()[0]) for line in lines]
        if all(c == 2 for c in classes):
            count = len(classes)
            files_to_move.append((label_file.name, count))
            moved += count
            if moved >= to_move:
                break
    move_files(files_to_move, test_labels_dir, test_images_dir, val_labels_dir, val_images_dir)
    print(f" Moved {len(files_to_move)} files from test → val to fix ratio.")

# If test has too few, move from val → test
elif current_test_2 < ideal_test:
    to_move = ideal_test - current_test_2
    moved = 0
    files_to_move = []
    for label_file in val_labels_dir.glob("*.txt"):
        with open(label_file, 'r') as f:
            lines = f.readlines()
        classes = [int(line.strip().split()[0]) for line in lines]
        if all(c == 2 for c in classes):
            count = len(classes)
            files_to_move.append((label_file.name, count))
            moved += count
            if moved >= to_move:
                break
    move_files(files_to_move, val_labels_dir, val_images_dir, test_labels_dir, test_images_dir)
    print(f" Moved {len(files_to_move)} files from val → test to fix ratio.")

# === Step 3: Final count print ===
train_counts = count_class_instances(train_labels_dir)
val_counts   = count_class_instances(val_labels_dir)
test_counts  = count_class_instances(test_labels_dir)
all_classes = [0, 1, 2, 3]

print("\n📊 Final Class Counts:")
print("Split      | Class 0 | Class 1 | Class 2 | Class 3 | Total")
print("-----------------------------------------------------------")

def format_counts(name, counts):
    total = sum(counts[c] for c in all_classes)
    values = " | ".join(f"{counts.get(c, 0):7d}" for c in all_classes)
    print(f"{name:<10} | {values} | {total:5d}")

format_counts("Train", train_counts)
format_counts("Val", val_counts)
format_counts("Test", test_counts)

# Totals
total_counts = Counter()
for c in all_classes:
    total_counts[c] = train_counts.get(c, 0) + val_counts.get(c, 0) + test_counts.get(c, 0)
format_counts("TOTAL", total_counts)



✅ Moved 76 files from val and 0 from test to train.
✅ Added approximately 113 class-2 instances to train.

🎯 Class 2 post-move: val=3255, test=1710
📐 Target 2:1 ratio → val=3310, test=1655
🔁 Moved 39 files from test → val to fix ratio.

📊 Final Class Counts:
Split      | Class 0 | Class 1 | Class 2 | Class 3 | Total
-----------------------------------------------------------
Train      |     725 |     700 |     834 |     835 |  3094
Val        |    1764 |     508 |    3311 |     239 |  5822
Test       |     882 |     254 |    1654 |     118 |  2908
TOTAL      |    3371 |    1462 |    5799 |    1192 | 11824


In [None]:
# 📈 Increase training class 1 to ~834
# → We need to move ~134 class 1 instances from val/test to train
# → Only using .txt files with only class 1

# ⚖️ After moving, adjust remaining val:test class 1 to a 2:1 ratio

In [12]:
import shutil
from pathlib import Path
from collections import Counter

# === Define paths ===
train_labels_dir = Path("/home/siu856582712/Documents/dataset/dataset/train/labels")
train_images_dir = Path("/home/siu856582712/Documents/dataset/dataset/train/images")
val_labels_dir   = Path("/home/siu856582712/Documents/dataset/dataset/val/labels")
val_images_dir   = Path("/home/siu856582712/Documents/dataset/dataset/val/images")
test_labels_dir  = Path("/home/siu856582712/Documents/dataset/dataset/test/labels")
test_images_dir  = Path("/home/siu856582712/Documents/dataset/dataset/test/images")

# === Move function ===
def move_files(file_list, from_labels, from_images, to_labels, to_images):
    for fname, _ in file_list:
        label_path = from_labels / fname
        image_path = from_images / (fname.replace(".txt", ".jpg"))
        if label_path.exists() and image_path.exists():
            shutil.move(str(label_path), str(to_labels / fname))
            shutil.move(str(image_path), str(to_images / image_path.name))

# === Step 1: Move ~134 class-1-only instances to train ===
target_add_instances = 134
selected_files = []
moved_instances = 0

for src_label_dir, src_img_dir, src_name in [(val_labels_dir, val_images_dir, "val"),
                                             (test_labels_dir, test_images_dir, "test")]:
    for label_file in src_label_dir.glob("*.txt"):
        with open(label_file, 'r') as f:
            lines = f.readlines()
        classes = [int(line.strip().split()[0]) for line in lines]
        if all(c == 1 for c in classes):
            instance_count = len(classes)
            selected_files.append((label_file.name, instance_count, src_name))
            moved_instances += instance_count
            if moved_instances >= target_add_instances:
                break
    if moved_instances >= target_add_instances:
        break

# Group by source
val_to_train = [(f, n) for f, n, src in selected_files if src == "val"]
test_to_train = [(f, n) for f, n, src in selected_files if src == "test"]

# Move to train
move_files(val_to_train, val_labels_dir, val_images_dir, train_labels_dir, train_images_dir)
move_files(test_to_train, test_labels_dir, test_images_dir, train_labels_dir, train_images_dir)

print(f"\n✅ Moved {len(val_to_train)} files from val and {len(test_to_train)} from test to train.")
print(f"✅ Added approximately {moved_instances} class-1 instances to train.")

# === Step 2: Fix val:test 2:1 ratio for remaining class 1 ===
def count_class_instances(labels_dir):
    counts = Counter()
    for label_file in labels_dir.glob("*.txt"):
        with open(label_file, 'r') as f:
            lines = f.readlines()
        classes = [int(line.strip().split()[0]) for line in lines]
        counts.update(classes)
    return counts

val_counts = count_class_instances(val_labels_dir)
test_counts = count_class_instances(test_labels_dir)

current_val_1 = val_counts.get(1, 0)
current_test_1 = test_counts.get(1, 0)
remaining_total = current_val_1 + current_test_1
ideal_test = remaining_total // 3
ideal_val = remaining_total - ideal_test

print(f"\n🎯 Class 1 post-move: val={current_val_1}, test={current_test_1}")
print(f"📐 Target 2:1 ratio → val={ideal_val}, test={ideal_test}")

# Fix the imbalance
if current_test_1 > ideal_test:
    to_move = current_test_1 - ideal_test
    moved = 0
    files_to_move = []
    for label_file in test_labels_dir.glob("*.txt"):
        with open(label_file, 'r') as f:
            lines = f.readlines()
        classes = [int(line.strip().split()[0]) for line in lines]
        if all(c == 1 for c in classes):
            count = len(classes)
            files_to_move.append((label_file.name, count))
            moved += count
            if moved >= to_move:
                break
    move_files(files_to_move, test_labels_dir, test_images_dir, val_labels_dir, val_images_dir)
    print(f"🔁 Moved {len(files_to_move)} files from test → val to fix ratio.")

elif current_test_1 < ideal_test:
    to_move = ideal_test - current_test_1
    moved = 0
    files_to_move = []
    for label_file in val_labels_dir.glob("*.txt"):
        with open(label_file, 'r') as f:
            lines = f.readlines()
        classes = [int(line.strip().split()[0]) for line in lines]
        if all(c == 1 for c in classes):
            count = len(classes)
            files_to_move.append((label_file.name, count))
            moved += count
            if moved >= to_move:
                break
    move_files(files_to_move, val_labels_dir, val_images_dir, test_labels_dir, test_images_dir)
    print(f"🔁 Moved {len(files_to_move)} files from val → test to fix ratio.")

# === Step 3: Print final class counts ===
train_counts = count_class_instances(train_labels_dir)
val_counts   = count_class_instances(val_labels_dir)
test_counts  = count_class_instances(test_labels_dir)
all_classes = [0, 1, 2, 3]

print("\n📊 Final Class Counts:")
print("Split      | Class 0 | Class 1 | Class 2 | Class 3 | Total")
print("-----------------------------------------------------------")

def format_counts(name, counts):
    total = sum(counts[c] for c in all_classes)
    values = " | ".join(f"{counts.get(c, 0):7d}" for c in all_classes)
    print(f"{name:<10} | {values} | {total:5d}")

format_counts("Train", train_counts)
format_counts("Val", val_counts)
format_counts("Test", test_counts)

# Totals
total_counts = Counter()
for c in all_classes:
    total_counts[c] = train_counts.get(c, 0) + val_counts.get(c, 0) + test_counts.get(c, 0)
format_counts("TOTAL", total_counts)



✅ Moved 97 files from val and 0 from test to train.
✅ Added approximately 134 class-1 instances to train.

🎯 Class 1 post-move: val=374, test=254
📐 Target 2:1 ratio → val=419, test=209
🔁 Moved 33 files from test → val to fix ratio.

📊 Final Class Counts:
Split      | Class 0 | Class 1 | Class 2 | Class 3 | Total
-----------------------------------------------------------
Train      |     725 |     834 |     834 |     835 |  3228
Val        |    1764 |     419 |    3311 |     239 |  5733
Test       |     882 |     209 |    1654 |     118 |  2863
TOTAL      |    3371 |    1462 |    5799 |    1192 | 11824


In [None]:
# 📈 Increase train class 0 instances to ~834
# → Add ~109 class 0 instances to train
# → Only from files that contain only class 0

# ⚖️ Adjust the remaining val:test class 0 ratio to 2:1

In [13]:
from pathlib import Path
from collections import Counter
import shutil

# paths
train_labels_dir = Path("/home/siu856582712/Documents/dataset/dataset/train/labels")
train_images_dir = Path("/home/siu856582712/Documents/dataset/dataset/train/images")
val_labels_dir   = Path("/home/siu856582712/Documents/dataset/dataset/val/labels")
val_images_dir   = Path("/home/siu856582712/Documents/dataset/dataset/val/images")
test_labels_dir  = Path("/home/siu856582712/Documents/dataset/dataset/test/labels")
test_images_dir  = Path("/home/siu856582712/Documents/dataset/dataset/test/images")

def read_classes(label_path: Path):
    classes = []
    with open(label_path, "r") as f:
        for line in f:
            parts = line.strip().split()
            if parts:
                classes.append(int(parts[0]))
    return classes

def move_files(file_list, from_labels, from_images, to_labels, to_images):
    for fname, _ in file_list:
        label_path = from_labels / fname
        image_path = from_images / fname.replace(".txt", ".jpg")
        if label_path.exists() and image_path.exists():
            (to_labels).mkdir(parents=True, exist_ok=True)
            (to_images).mkdir(parents=True, exist_ok=True)
            shutil.move(str(label_path), str(to_labels / fname))
            shutil.move(str(image_path), str(to_images / image_path.name))

# step 1 — move ~109 class-0-only files from val/test to train
target_add_instances = 109
selected_files, moved_instances = [], 0

for src_label_dir, src_img_dir, src_name in [
    (val_labels_dir, val_images_dir, "val"),
    (test_labels_dir, test_images_dir, "test"),
]:
    for label_file in src_label_dir.glob("*.txt"):
        classes = read_classes(label_file)
        if classes and all(c == 0 for c in classes):
            cnt = len(classes)
            selected_files.append((label_file.name, cnt, src_name))
            moved_instances += cnt
            if moved_instances >= target_add_instances:
                break
    if moved_instances >= target_add_instances:
        break

val_to_train  = [(f, n) for f, n, src in selected_files if src == "val"]
test_to_train = [(f, n) for f, n, src in selected_files if src == "test"]
move_files(val_to_train,  val_labels_dir,  val_images_dir,  train_labels_dir, train_images_dir)
move_files(test_to_train, test_labels_dir, test_images_dir, train_labels_dir, train_images_dir)

print(f"\nMoved {len(val_to_train)} files from val and {len(test_to_train)} from test to train.")
print(f"Added ~{moved_instances} class-0 instances to train.")

# step 2 — enforce val:test ≈ 2:1 for remaining class-0
def count_class_instances(labels_dir: Path) -> Counter:
    counts = Counter()
    for label_file in labels_dir.glob("*.txt"):
        counts.update(read_classes(label_file))
    return counts

val_counts  = count_class_instances(val_labels_dir)
test_counts = count_class_instances(test_labels_dir)

current_val_0  = val_counts.get(0, 0)
current_test_0 = test_counts.get(0, 0)
remaining_total = current_val_0 + current_test_0
ideal_test = remaining_total // 3
ideal_val  = remaining_total - ideal_test

print(f"\nClass 0 post-move: val={current_val_0}, test={current_test_0}")
print(f"Target ratio 2:1  → val={ideal_val}, test={ideal_test}")

def rebalance_zero(from_lbl, from_img, to_lbl, to_img, need_instances):
    moved, files = 0, []
    for label_file in from_lbl.glob("*.txt"):
        cls_list = read_classes(label_file)
        if cls_list and all(c == 0 for c in cls_list):
            cnt = len(cls_list)
            files.append((label_file.name, cnt))
            moved += cnt
            if moved >= need_instances:
                break
    move_files(files, from_lbl, from_img, to_lbl, to_img)
    return len(files)

if current_test_0 > ideal_test:
    moved_files = rebalance_zero(test_labels_dir, test_images_dir, val_labels_dir, val_images_dir,
                                 current_test_0 - ideal_test)
    print(f"Moved {moved_files} files test → val to fix ratio.")
elif current_test_0 < ideal_test:
    moved_files = rebalance_zero(val_labels_dir, val_images_dir, test_labels_dir, test_images_dir,
                                 ideal_test - current_test_0)
    print(f"Moved {moved_files} files val → test to fix ratio.")

# step 3 — final class counts
train_counts = count_class_instances(train_labels_dir)
val_counts   = count_class_instances(val_labels_dir)
test_counts  = count_class_instances(test_labels_dir)

all_classes = [0, 1, 2, 3]
print("\nFinal Class Counts:")
print("Split      | Class 0 | Class 1 | Class 2 | Class 3 | Total")
print("-----------------------------------------------------------")

def format_counts(name, counts):
    total = sum(counts.get(c, 0) for c in all_classes)
    cols = " | ".join(f"{counts.get(c, 0):7d}" for c in all_classes)
    print(f"{name:<10} | {cols} | {total:5d}")

format_counts("Train", train_counts)
format_counts("Val",   val_counts)
format_counts("Test",  test_counts)

total_counts = Counter({c: train_counts.get(c, 0) + val_counts.get(c, 0) + test_counts.get(c, 0)
                        for c in all_classes})
format_counts("TOTAL", total_counts)



✅ Moved 78 files from val and 0 from test to train.
✅ Added approximately 110 class-0 instances to train.

🎯 Class 0 post-move: val=1654, test=882
📐 Target 2:1 ratio → val=1691, test=845
🔁 Moved 24 files from test → val to fix ratio.

📊 Final Class Counts:
Split      | Class 0 | Class 1 | Class 2 | Class 3 | Total
-----------------------------------------------------------
Train      |     835 |     834 |     834 |     835 |  3338
Val        |    1691 |     419 |    3311 |     239 |  5660
Test       |     845 |     209 |    1654 |     118 |  2826
TOTAL      |    3371 |    1462 |    5799 |    1192 | 11824
