In [None]:
# Use the %pip magic rather than !pip: %pip installs into the environment of the
# running kernel, whereas !pip runs whichever pip happens to be first on the
# shell PATH, which may belong to a different Python.
%pip install ultralytics
%matplotlib inline

### Dataset split summary (images/labels/missing/empty)

In [None]:
from pathlib import Path

# Root folder of the dataset; each split lives in DATASET_DIR/<split>/{images,labels}.
DATASET_DIR = Path("dataset")
# Split folder names that the summary cells look for under DATASET_DIR.
SPLITS = ["train", "valid", "test"]
# Lower-cased file suffixes that are counted as images.
IMG_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".bmp"}


def split_summary(split: str) -> dict:
    """Summarise how image files and label files pair up inside one split.

    Images are the regular files in ``DATASET_DIR/<split>/images`` whose
    lower-cased suffix is in ``IMG_EXTS``; labels are the ``*.txt`` entries in
    ``DATASET_DIR/<split>/labels``. Images and labels are matched by file stem.

    A missing ``images`` or ``labels`` folder is treated as an empty folder, so
    a half-populated split shows up as zero counts instead of raising.

    Args:
        split: Name of the split sub-folder, e.g. ``"train"``.

    Returns:
        A dict with the keys ``split``, ``images``, ``labels``,
        ``missing_labels`` (images without a label), ``extra_labels`` (labels
        without an image), ``empty_label_files`` (labels containing only
        whitespace), and ``missing_label_names`` / ``extra_label_names`` (the
        first ten sorted stems of each kind).
    """
    img_dir = DATASET_DIR / split / "images"
    lbl_dir = DATASET_DIR / split / "labels"

    # Guard both folders the same way so a missing one counts as "no files".
    if img_dir.is_dir():
        imgs = [p for p in img_dir.iterdir() if p.is_file() and p.suffix.lower() in IMG_EXTS]
    else:
        imgs = []
    lbls = list(lbl_dir.glob("*.txt")) if lbl_dir.is_dir() else []

    img_stems = {p.stem for p in imgs}
    lbl_stems = {p.stem for p in lbls}

    missing_labels = sorted(img_stems - lbl_stems)
    extra_labels = sorted(lbl_stems - img_stems)

    # A label file is "empty" when nothing but whitespace is left after decoding.
    empty_label_count = sum(
        1 for lp in lbls
        if not lp.read_text(encoding="utf-8", errors="ignore").strip()
    )

    return {
        "split": split,
        "images": len(imgs),
        "labels": len(lbls),
        "missing_labels": len(missing_labels),
        "extra_labels": len(extra_labels),
        "empty_label_files": empty_label_count,
        # Only a short sample of names is returned to keep the cell output readable.
        "missing_label_names": missing_labels[:10],
        "extra_label_names": extra_labels[:10],
    }

# Build one summary per split, skipping any split whose folder is not on disk.
rows = [
    split_summary(split_name)
    for split_name in SPLITS
    if (DATASET_DIR / split_name).exists()
]

rows


[{'split': 'train',
  'images': 312,
  'labels': 312,
  'missing_labels': 0,
  'extra_labels': 0,
  'empty_label_files': 0,
  'missing_label_names': [],
  'extra_label_names': []},
 {'split': 'valid',
  'images': 39,
  'labels': 39,
  'missing_labels': 0,
  'extra_labels': 0,
  'empty_label_files': 0,
  'missing_label_names': [],
  'extra_label_names': []},
 {'split': 'test',
  'images': 39,
  'labels': 39,
  'missing_labels': 0,
  'extra_labels': 0,
  'empty_label_files': 0,
  'missing_label_names': [],
  'extra_label_names': []}]

### Boxes per split + per class

In [7]:
# Number of class ids the label files are expected to use (ids 0 .. NUM_CLASSES - 1).
NUM_CLASSES = 13


def count_boxes_in_split(split: str, num_classes: int) -> dict:
    """Count annotation lines per class id in one split's label files.

    Every ``*.txt`` file in ``DATASET_DIR/<split>/labels`` is read line by
    line. A line is counted as a box when it has exactly five
    whitespace-separated fields and its first field parses to a class id in
    ``range(num_classes)``. Blank lines are skipped. Any other non-blank line
    (wrong field count, unparsable class id, or class id out of range) is
    counted as bad rather than being dropped silently.

    Args:
        split: Name of the split sub-folder, e.g. ``"train"``.
        num_classes: Number of valid class ids; valid ids are ``0 .. num_classes - 1``.

    Returns:
        A dict with the keys ``split``, ``total_boxes``, ``boxes_per_class``
        (a list of length ``num_classes``) and ``bad_lines_or_files`` (the
        number of rejected lines).
    """
    lbl_dir = DATASET_DIR / split / "labels"
    boxes_per_class = [0] * num_classes
    bad_lines = 0

    for lp in lbl_dir.glob("*.txt"):
        txt = lp.read_text(encoding="utf-8", errors="ignore")

        for line in txt.splitlines():
            parts = line.split()
            if not parts:
                # Blank or whitespace-only line: not an annotation, not an error.
                continue
            if len(parts) != 5:
                bad_lines += 1
                continue
            try:
                # Accept ids written as floats such as "3.0".
                cls = int(float(parts[0]))
            except (ValueError, OverflowError):
                # ValueError: non-numeric text or NaN; OverflowError: infinity.
                bad_lines += 1
                continue

            if 0 <= cls < num_classes:
                boxes_per_class[cls] += 1
            else:
                # Report out-of-range ids so the totals cannot silently under-count.
                bad_lines += 1

    return {
        "split": split,
        "total_boxes": sum(boxes_per_class),
        "boxes_per_class": boxes_per_class,
        "bad_lines_or_files": bad_lines,
    }

# Gather box counts for every split whose folder is present under DATASET_DIR.
box_stats = [
    count_boxes_in_split(split_name, NUM_CLASSES)
    for split_name in SPLITS
    if (DATASET_DIR / split_name).exists()
]

box_stats


[{'split': 'train',
  'total_boxes': 436,
  'boxes_per_class': [18, 48, 56, 13, 49, 9, 46, 36, 48, 44, 11, 31, 27],
  'bad_lines_or_files': 0},
 {'split': 'valid',
  'total_boxes': 60,
  'boxes_per_class': [0, 9, 3, 0, 9, 2, 4, 4, 5, 8, 11, 3, 2],
  'bad_lines_or_files': 0},
 {'split': 'test',
  'total_boxes': 50,
  'boxes_per_class': [0, 9, 2, 1, 10, 0, 4, 1, 6, 5, 1, 3, 8],
  'bad_lines_or_files': 0}]

### Check that the same filenames exist in images and labels

In [5]:
from pathlib import Path

def check_split(split):
    """Print image/label counts for one split and how many images lack a label.

    Only regular files whose lower-cased suffix is in the notebook-level
    ``IMG_EXTS`` set count as images, so sub-folders and stray files in the
    images folder are not reported as images with missing labels. Paths are
    resolved against the notebook-level ``DATASET_DIR``. Images and labels are
    matched by file stem.

    Args:
        split: Name of the split sub-folder, e.g. ``"train"``.
    """
    img_dir = DATASET_DIR / split / "images"
    lbl_dir = DATASET_DIR / split / "labels"
    imgs = {
        p.stem
        for p in img_dir.glob("*")
        if p.is_file() and p.suffix.lower() in IMG_EXTS
    }
    lbls = {p.stem for p in lbl_dir.glob("*.txt")}
    print(f"{split}: images={len(imgs)} labels={len(lbls)} missing_labels={len(imgs - lbls)}")

# Reuse the notebook-level SPLITS and DATASET_DIR settings instead of repeating
# the split names and dataset path here, so this check cannot drift from them.
for s in SPLITS:
    if (DATASET_DIR / s).exists():
        check_split(s)


train: images=312 labels=312 missing_labels=0
valid: images=39 labels=39 missing_labels=0
test: images=39 labels=39 missing_labels=0


## Model Training