# Diurnas - resplit treino/validação
Notebook para recombinar e dividir o dataset em treino/validação com proporções editáveis.
- Usa labels vazios como BACKGROUND (imagem sem objetos).
- Gera estrutura YOLO em OUTPUT_ROOT/train|valid/{images,labels}.


In [24]:
from pathlib import Path
import random
import shutil
import hashlib
from collections import Counter, defaultdict

# ==== CONFIG ====
# Project root: the parent folder when the working directory is named
# 'notebooks' (case-insensitive), otherwise the working directory itself.
CWD = Path.cwd()
PROJECT_ROOT = CWD.parent if CWD.name.lower() == 'notebooks' else CWD

# Folder scanned recursively for 'labels/*.txt' files.
SOURCE_ROOT = PROJECT_ROOT / 'data' / 'diurno'  # or PROJECT_ROOT / 'dataset' / 'Diurnas'
# Destination that receives train|valid/{images,labels}.
OUTPUT_ROOT = PROJECT_ROOT / 'dataset' / 'Diurnas_resplit_v1'
VAL_RATIO = 0.2  # fraction sent to validation; change here (e.g. 0.1, 0.2, 0.3)
SEED = 42  # seed for the random generator used by the split functions

SPLIT_STRATEGY = 'primary_label'  # 'primary_label' or 'multilabel_greedy'
COPY_MODE = 'copy'  # 'copy' or 'move'
DRY_RUN = False  # True = do not copy/move any files

WRITE_DATA_YAML = True
CLASS_NAMES = ['N1', 'N2', 'N3']  # BACKGROUND is not listed here

ALLOW_RENAME_ON_CONFLICT = False  # True to rename outputs when names are repeated


In [25]:
# Image extensions (lower-case, with leading dot) accepted as dataset images.
IMG_EXTS = ['.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff', '.webp']

def find_image_for_label(label_path: Path):
    """Return the image that belongs to a YOLO label file, or None.

    The image is looked up in the sibling ``images`` folder of the folder
    holding ``label_path`` (``<root>/labels/x.txt`` -> ``<root>/images/x.*``).

    Args:
        label_path: Path to a ``.txt`` label file.

    Returns:
        Path of the matching image, or None when the ``images`` folder does
        not exist or holds no file with the same stem and an extension
        listed in ``IMG_EXTS`` (compared case-insensitively).
    """
    images_dir = label_path.parent.parent / 'images'
    if not images_dir.is_dir():
        return None
    stem = label_path.stem

    # Fast path: exact lower-case extension, tried in IMG_EXTS order.
    for ext in IMG_EXTS:
        candidate = images_dir / f'{stem}{ext}'
        if candidate.exists():
            return candidate

    # Fallback for differently-cased extensions (e.g. '.JPG' on a
    # case-sensitive filesystem). Comparing names instead of globbing keeps
    # stems containing glob metacharacters such as '[1]' working, and the
    # extension filter stops sidecar files (.json, .xml, ...) from being
    # returned as if they were images. Sorting makes the pick deterministic.
    known_exts = set(IMG_EXTS)
    matches = sorted(
        entry for entry in images_dir.iterdir()
        if entry.is_file()
        and entry.stem == stem
        and entry.suffix.lower() in known_exts
    )
    return matches[0] if matches else None

def read_label_info(label_path: Path):
    """Read a YOLO label file and count the rows per class id.

    Only the first whitespace-separated token of each row is used; it is
    parsed as a number and truncated to int, so ``"1"`` and ``"1.0"`` both
    count as class 1. Rows whose first token is not a finite number are
    skipped.

    Args:
        label_path: Path to the ``.txt`` label file.

    Returns:
        A ``(labels, counts)`` tuple: the set of class ids present and a
        Counter mapping class id -> number of rows. Both are empty for an
        empty (background) file.
    """
    # 'utf-8-sig' drops a leading byte-order mark if present; without it the
    # BOM sticks to the first token and that row is silently discarded.
    text = label_path.read_text(encoding='utf-8-sig', errors='ignore')
    counts = Counter()
    for line in text.splitlines():
        parts = line.split()
        if not parts:
            continue
        try:
            cls = int(float(parts[0]))
        except (ValueError, OverflowError):
            # ValueError: non-numeric or NaN token; OverflowError: infinity.
            continue
        counts[cls] += 1
    return set(counts), counts

def primary_label(counts: Counter):
    """Return the dominant class of an image, or 'BACKGROUND' if it has none.

    The dominant class is the one with the most rows; ties go to the
    smallest class id.
    """
    if len(counts) == 0:
        return 'BACKGROUND'
    # Smallest (-count, class id) pair == highest count, then lowest id.
    best_cls, _ = min(counts.items(), key=lambda item: (-item[1], item[0]))
    return best_cls

def collect_samples(source_root: Path):
    """Pair every label file under ``source_root`` with its image.

    Label files are every ``*.txt`` directly inside any folder named
    ``labels`` below ``source_root``.

    Args:
        source_root: Root folder searched recursively.

    Returns:
        A ``(samples, missing_images)`` tuple. ``samples`` is a list of dicts
        with keys ``image_path``, ``label_path``, ``labels`` (set of class
        ids), ``label_counts`` (Counter) and ``primary`` (dominant class or
        'BACKGROUND'). ``missing_images`` lists the label paths for which no
        image was found.
    """
    # Directory traversal order depends on the filesystem. The split functions
    # shuffle this list with a seeded RNG, so the input order must be fixed
    # for the same SEED to give the same split on every machine.
    label_files = sorted(source_root.rglob('labels/*.txt'))
    samples = []
    missing_images = []
    for label_path in label_files:
        image_path = find_image_for_label(label_path)
        if image_path is None:
            missing_images.append(label_path)
            continue
        labels, counts = read_label_info(label_path)
        samples.append({
            'image_path': image_path,
            'label_path': label_path,
            'labels': labels,
            'label_counts': counts,
            'primary': primary_label(counts),
        })
    return samples, missing_images

def find_name_conflicts(samples):
    """Group samples whose output files would collide.

    Samples are grouped by image *stem* rather than full file name: the label
    written for an image is named ``<stem>.txt``, so ``a.jpg`` and ``a.png``
    coming from different folders would share (and overwrite) one label file
    even though the image names differ.

    Args:
        samples: Iterable of sample dicts with an ``image_path`` Path.

    Returns:
        Dict mapping each conflicting stem to the list of samples (two or
        more) that share it. Empty when there is no conflict.
    """
    by_stem = defaultdict(list)
    for s in samples:
        by_stem[s['image_path'].stem].append(s)
    return {stem: items for stem, items in by_stem.items() if len(items) > 1}

def make_unique_name(path: Path):
    """Build a file name made unique by a short hash of the full path.

    The result is ``<stem>__<first 8 hex chars of md5(str(path))><suffix>``.
    """
    digest = hashlib.md5(str(path).encode('utf-8')).hexdigest()
    short_hash = digest[:8]
    return path.stem + '__' + short_hash + path.suffix


In [26]:
# Scan the source tree, then report totals and up to five orphan labels.
samples, missing_images = collect_samples(SOURCE_ROOT)
print(f'Total samples: {len(samples)}')
print(f'Missing images for labels: {len(missing_images)}')
if missing_images:
    print('Example missing labels:')
    for p in missing_images[:5]:
        print('  ', p)

# Stop here on name clashes unless renaming was explicitly enabled.
conflicts = find_name_conflicts(samples)
print(f'Filename conflicts: {len(conflicts)}')
if not ALLOW_RENAME_ON_CONFLICT and conflicts:
    raise RuntimeError('Found duplicate image names. Set ALLOW_RENAME_ON_CONFLICT=True or fix names.')


Total samples: 1788
Missing images for labels: 0
Filename conflicts: 0


In [27]:
def summarize_image_counts(samples):
    """Count classes per image and per instance.

    Returns:
        ``(image_counts, instance_counts)``: the first Counter holds, per
        class, the number of images containing it (images without labels are
        counted under 'BACKGROUND'); the second holds, per class, the total
        number of label rows.
    """
    per_image = Counter()
    per_instance = Counter()
    for sample in samples:
        present = sample['labels']
        for cls in (present if present else ('BACKGROUND',)):
            per_image[cls] += 1
        # Counter.update adds the counts of a mapping to the running totals.
        per_instance.update(sample['label_counts'])
    return per_image, per_instance

# Whole-dataset summary; keys are ordered by their string form because class
# ids can appear next to the 'BACKGROUND' string.
image_counts, instance_counts = summarize_image_counts(samples)
print('Image-level counts (presence per image):')
for k in sorted(image_counts, key=str):
    print(f'  {k}: {image_counts[k]}')
print('Instance counts (rows in labels):')
for k in sorted(instance_counts, key=str):
    print(f'  {k}: {instance_counts[k]}')


Image-level counts (presence per image):
  0: 712
  1: 325
  2: 289
  BACKGROUND: 462
Instance counts (rows in labels):
  0: 712
  1: 325
  2: 290


In [28]:
def split_primary(samples, val_ratio, seed):
    """Stratified train/val split keyed on each sample's ``primary`` class.

    Samples are bucketed by ``sample['primary']``; every bucket is shuffled
    and its first ``round(len(bucket) * val_ratio)`` items go to validation.

    Args:
        samples: Sequence of sample dicts with a ``primary`` key.
        val_ratio: Fraction of each bucket sent to validation.
        seed: Seed for the private random generator.

    Returns:
        ``(train, val)`` lists, each shuffled.
    """
    buckets = defaultdict(list)
    for sample in samples:
        buckets[sample['primary']].append(sample)

    rng = random.Random(seed)
    train_part, val_part = [], []
    for bucket in buckets.values():
        rng.shuffle(bucket)
        cut = int(round(len(bucket) * val_ratio))
        val_part += bucket[:cut]
        train_part += bucket[cut:]

    # Final shuffles so the outputs are not grouped by class (train first).
    for part in (train_part, val_part):
        rng.shuffle(part)
    return train_part, val_part

def split_multilabel_greedy(samples, val_ratio, seed):
    """Greedy train/val split that balances every label at image level.

    Each label (images without labels count as 'BACKGROUND') gets a validation
    target of ``round(images_with_label * val_ratio)``. Samples are then moved
    to validation one at a time, always serving the label that is furthest
    from its target. Finally the validation size is corrected to
    ``round(len(samples) * val_ratio)``.

    Args:
        samples: Sequence of sample dicts with a ``labels`` set.
        val_ratio: Target validation fraction.
        seed: Seed for the private random generator.

    Returns:
        ``(train, val)`` lists, each shuffled.
    """
    rng = random.Random(seed)
    # Collect the label vocabulary and how many images contain each label.
    labels = set()
    total_counts = Counter()
    for s in samples:
        label_set = s['labels'] if s['labels'] else {'BACKGROUND'}
        for lbl in label_set:
            labels.add(lbl)
            total_counts[lbl] += 1

    # Sort by string form: the vocabulary can hold 'BACKGROUND' next to the
    # values coming from the samples' label sets.
    labels = sorted(labels, key=lambda x: str(x))
    # Per-label number of validation images wanted / already assigned.
    target_val = {lbl: int(round(total_counts[lbl] * val_ratio)) for lbl in labels}
    current_val = {lbl: 0 for lbl in labels}

    # Inverted index: label -> indices of the samples that contain it.
    label_to_indices = {lbl: set() for lbl in labels}
    for idx, s in enumerate(samples):
        label_set = s['labels'] if s['labels'] else {'BACKGROUND'}
        for lbl in label_set:
            label_to_indices[lbl].add(idx)

    def need(lbl):
        # Validation images still required for this label (negative when the
        # target has been exceeded).
        return target_val[lbl] - current_val[lbl]

    def sample_score(idx):
        # Sum of the unmet needs over the labels of this sample.
        label_set = samples[idx]['labels'] if samples[idx]['labels'] else {'BACKGROUND'}
        return sum(max(need(lbl), 0) for lbl in label_set)

    remaining = set(range(len(samples)))
    val_set = set()

    while remaining:
        # Serve the label with the largest unmet need; on ties prefer the
        # label that appears in fewer images.
        label = max(labels, key=lambda l: (need(l), -total_counts[l]))
        if need(label) <= 0:
            # Every label has reached (or passed) its target.
            break
        candidates = list(label_to_indices[label] & remaining)
        if not candidates:
            # No sample left for this label: mark the target as met so the
            # loop can move on instead of selecting this label again.
            current_val[label] = target_val[label]
            continue
        # Highest score wins; rng.random() is the tie-breaker.
        best = max(candidates, key=lambda idx: (sample_score(idx), rng.random()))
        val_set.add(best)
        remaining.remove(best)
        label_set = samples[best]['labels'] if samples[best]['labels'] else {'BACKGROUND'}
        for lbl in label_set:
            current_val[lbl] += 1

    # Size correction. current_val is not updated below, so sample_score
    # reflects the needs as they were when the greedy loop ended.
    train_set = set(remaining)
    target_val_size = int(round(len(samples) * val_ratio))
    if len(val_set) < target_val_size:
        # Validation too small: promote the highest-scoring train samples.
        needed = target_val_size - len(val_set)
        candidates = list(train_set)
        candidates.sort(key=sample_score, reverse=True)
        for idx in candidates[:needed]:
            train_set.remove(idx)
            val_set.add(idx)
    elif len(val_set) > target_val_size:
        # Validation too large: return the lowest-scoring samples to train.
        extra = len(val_set) - target_val_size
        candidates = list(val_set)
        candidates.sort(key=sample_score)
        for idx in candidates[:extra]:
            val_set.remove(idx)
            train_set.add(idx)

    train = [samples[i] for i in train_set]
    val = [samples[i] for i in val_set]
    rng.shuffle(train)
    rng.shuffle(val)
    return train, val


In [29]:
# Reject unknown strategies first, then run the selected splitter.
if SPLIT_STRATEGY not in ('primary_label', 'multilabel_greedy'):
    raise ValueError('Unknown SPLIT_STRATEGY')
_split_fn = split_primary if SPLIT_STRATEGY == 'primary_label' else split_multilabel_greedy
train_samples, val_samples = _split_fn(samples, VAL_RATIO, SEED)

print(f'Train: {len(train_samples)} | Val: {len(val_samples)}')


Train: 1431 | Val: 357


In [30]:
def summarize_split(samples, title):
    """Print the image-level and instance-level class counts of one split.

    Args:
        samples: Sample dicts of the split.
        title: Heading printed before the two tables.
    """
    per_image, per_instance = summarize_image_counts(samples)
    print(title)
    sections = (
        ('  Image-level counts:', per_image),
        ('  Instance counts:', per_instance),
    )
    for heading, table in sections:
        print(heading)
        # Keys ordered by string form, as class ids can mix with 'BACKGROUND'.
        for key in sorted(table, key=str):
            print(f'    {key}: {table[key]}')

summarize_split(train_samples, 'Train split')
summarize_split(val_samples, 'Val split')


Train split
  Image-level counts:
    0: 570
    1: 260
    2: 231
    BACKGROUND: 370
  Instance counts:
    0: 570
    1: 260
    2: 232
Val split
  Image-level counts:
    0: 142
    1: 65
    2: 58
    BACKGROUND: 92
  Instance counts:
    0: 142
    1: 65
    2: 58


In [31]:
# List the train images that contain more than one instance of target_cls,
# most instances first.
target_cls = 2
multi = [
    (s["label_counts"].get(target_cls, 0), s["image_path"], s["label_path"])
    for s in train_samples
    if s["label_counts"].get(target_cls, 0) > 1
]

print(f"Imagens com >1 instancia da classe {target_cls}: {len(multi)}")
for cnt, img, lbl in sorted(multi, key=lambda entry: entry[0], reverse=True):
    print(f"{cnt}x | {img} | {lbl}")


Imagens com >1 instancia da classe 2: 1
2x | c:\Users\GainTech0014\Documents\yolov8_model\data\diurno\images\20260106095842.jpg | c:\Users\GainTech0014\Documents\yolov8_model\data\diurno\labels\20260106095842.txt


In [32]:
def ensure_dirs(root: Path):
    """Create ``train|valid/{images,labels}`` under ``root`` if missing."""
    for subset in ('train', 'valid'):
        for kind in ('images', 'labels'):
            (root / subset / kind).mkdir(parents=True, exist_ok=True)

def get_output_name(sample):
    """Return the file name used for the sample's image in the output tree.

    With ALLOW_RENAME_ON_CONFLICT enabled every name gets a path-hash suffix;
    otherwise the original image name is kept.
    """
    image_path = sample['image_path']
    return make_unique_name(image_path) if ALLOW_RENAME_ON_CONFLICT else image_path.name

def copy_or_move(src: Path, dst: Path, mode: str):
    """Transfer ``src`` to ``dst`` by copying or moving.

    Args:
        src: Existing file.
        dst: Destination path.
        mode: 'copy' (``shutil.copy2``) or 'move' (``shutil.move``).

    Raises:
        ValueError: If ``mode`` is neither 'copy' nor 'move'.
    """
    if mode not in ('copy', 'move'):
        raise ValueError('COPY_MODE must be copy or move')
    transfer = shutil.copy2 if mode == 'copy' else shutil.move
    transfer(src, dst)

def write_split(samples, subset):
    """Copy or move each sample's image and label into ``OUTPUT_ROOT/subset``.

    The label is stored as ``<output image stem>.txt``. Nothing is
    transferred while DRY_RUN is true.

    Args:
        samples: Sample dicts with ``image_path`` and ``label_path``.
        subset: Sub-folder name under OUTPUT_ROOT (e.g. 'train' or 'valid').
    """
    images_dir = OUTPUT_ROOT / subset / 'images'
    labels_dir = OUTPUT_ROOT / subset / 'labels'
    for sample in samples:
        out_name = get_output_name(sample)
        if not DRY_RUN:
            label_name = f'{Path(out_name).stem}.txt'
            copy_or_move(sample['image_path'], images_dir / out_name, COPY_MODE)
            copy_or_move(sample['label_path'], labels_dir / label_name, COPY_MODE)

# Materialise both splits on disk unless this is a dry run.
if DRY_RUN:
    print('DRY_RUN=True: no files written')
else:
    ensure_dirs(OUTPUT_ROOT)
    write_split(train_samples, 'train')
    write_split(val_samples, 'valid')
    print('Split written to:', OUTPUT_ROOT)


Split written to: c:\Users\GainTech0014\Documents\yolov8_model\dataset\Diurnas_resplit_v1


In [33]:
def write_data_yaml(output_root: Path, class_names):
    """Write a ``data.yaml`` describing the split to ``output_root``.

    The file lists the relative train/val image folders, the number of
    classes (``nc``) and the class names. Nothing is written when
    ``class_names`` is empty; with DRY_RUN enabled the content is only
    printed.

    Args:
        output_root: Folder that receives ``data.yaml`` (created if missing).
        class_names: Class names in class-id order (list or tuple), or a
            dict, which is written with its Python repr.
    """
    import json  # local import: only needed to serialise the names safely

    if not class_names:
        return
    if isinstance(class_names, dict):
        # Left unchanged: the dict repr is emitted as before.
        names_yaml = str(class_names)
    else:
        # A JSON array is also a valid YAML flow sequence, whereas the Python
        # repr is not for tuples, so serialise the names through json.
        names_yaml = json.dumps([str(name) for name in class_names], ensure_ascii=False)
    yaml_text = (
        'train: train/images\n'
        'val: valid/images\n\n'
        f'nc: {len(class_names)}\n'
        f'names: {names_yaml}\n'
    )
    if DRY_RUN:
        print('DRY_RUN=True: would write data.yaml with:')
        print(yaml_text)
        return
    # The folder may not exist yet if this cell runs before the split is
    # written to disk.
    output_root.mkdir(parents=True, exist_ok=True)
    (output_root / 'data.yaml').write_text(yaml_text, encoding='utf-8')

if WRITE_DATA_YAML:
    write_data_yaml(OUTPUT_ROOT, CLASS_NAMES)
