In [None]:
# @title
# --- montar Drive + preparar ambiente YOLOv5 ----
from pathlib import Path
import subprocess, sys, os

def sh(cmd, cwd=None):
    """Run ``cmd`` (an argument list) and return its captured stdout.

    Args:
        cmd: Command and arguments, passed to ``subprocess.run`` as a list.
        cwd: Optional working directory for the child process.

    Returns:
        The text the process wrote to stdout.

    Raises:
        RuntimeError: If the process exits with a non-zero return code; the
            captured stdout and stderr are printed first.
    """
    result = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True)
    if result.returncode == 0:
        return result.stdout
    # Show everything the process wrote before failing loudly.
    for captured in (result.stdout, result.stderr):
        print(captured)
    raise RuntimeError(f"Erro {result.returncode}: {' '.join(cmd)}")

# 1) Mount Google Drive. Best-effort: any failure (e.g. the google.colab
#    import failing) is only reported, so the rest of the cell still runs.
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=False)
except Exception as e:
    print("ℹ️ Não estou no Colab ou já montado:", e)

# 2) Paths (always the "(original)" copy of the dataset)
DATA_DIR = Path('/content/drive/MyDrive/Colab Notebooks/Br-AsPavDam/Br-AsPavDam (original)')
ROOT     = DATA_DIR.parent
IMGROOT  = DATA_DIR/'images'
LBLDIR   = DATA_DIR/'labels'

print('DATA_DIR =', DATA_DIR)
# Explicit raise instead of `assert`: assertions are stripped under `python -O`,
# and the training cell raises FileNotFoundError for this same condition.
if not (IMGROOT.exists() and LBLDIR.exists()):
    raise FileNotFoundError("images/ ou labels/ não encontrados dentro do (original)")

# 3) YOLOv5 checkout (the pipeline is meant to run without ultralytics v8)
YOLO_DIR = Path('/content/yolov5')
if YOLO_DIR.exists():
    # Optional: keep an existing checkout clean and in sync with upstream.
    sh(["git", "fetch", "--all"], cwd=str(YOLO_DIR))
    sh(["git", "reset", "--hard", "origin/master"], cwd=str(YOLO_DIR))
else:
    sh(["git", "clone", "https://github.com/ultralytics/yolov5", str(YOLO_DIR)])

# Uninstall ultralytics v8 (meant to avoid conflicts).
# NOTE(review): the sanity-check output further down still reports 'ultralytics'
# as installed — presumably the requirements install below pulls it back in;
# confirm whether this uninstall has any lasting effect.
sh([sys.executable, "-m", "pip", "uninstall", "-y", "ultralytics"])
# Install the YOLOv5 requirements from inside the checkout.
sh([sys.executable, "-m", "pip", "install", "-qr", "requirements.txt"], cwd=str(YOLO_DIR))

print("✅ Ambiente pronto. Agora rode o sanity check.")


Mounted at /content/drive
DATA_DIR = /content/drive/MyDrive/Colab Notebooks/Br-AsPavDam/Br-AsPavDam (original)
✅ Ambiente pronto. Agora rode o sanity check.


In [None]:
# @title
#SANITY CHECKS

# -*- coding: utf-8 -*-
"""
Sanity check — BR-AsPavDam (original) -> ready for stratified K-fold (flat folds)
- Does NOT modify the dataset or any original file
- Checks:
  1) YOLOv5 environment (without ultralytics v8)
  2) Folder/file structure
  3) Class consistency (one subfolder per class)
  4) Labels exist and are valid
  5) Duplicate stems across classes (conflicts)
  6) K-fold feasibility (smallest class >= K_FOLDS)
  7) Stratification preview (per-fold distribution)
  8) Class IDs are inside the NAMES range
If a critical check fails, a RuntimeError with a short message is raised.
"""
import os, sys, re, json, collections
from pathlib import Path

# ==== BASIC CONFIG (adjust if needed) =========================================
ROOT     = Path('/content/drive/MyDrive/Colab Notebooks/Br-AsPavDam/')
DATA_DIR = ROOT/'Br-AsPavDam (original)'
IMGROOT  = DATA_DIR/'images'   # expected: images/<Class>/*.{jpg,png,...}
LBLDIR   = DATA_DIR/'labels'   # expected: labels/*.txt (SINGLE folder)
YOLO_DIR = Path('/content/yolov5')

NAMES    = ['Fissures','Shoving','Ravelling','Pothole','Patch']
K_FOLDS  = 4
EXTS     = {'.jpg','.jpeg','.png','.bmp'}

# ==== 1) ENVIRONMENT ==========================================================
print('== 1) Ambiente ======================================================')
# 1a) YOLOv5 repo must already be cloned
if not YOLO_DIR.exists():
    print(f"❌ YOLOv5 não encontrado em: {YOLO_DIR}")
    print("   -> Rode:  !git clone https://github.com/ultralytics/yolov5 /content/yolov5")
    raise RuntimeError("YOLOv5 ausente")
else:
    print(f"✅ YOLOv5 encontrado: {YOLO_DIR}")

# 1b) ultralytics (YOLOv8) should not be installed; this is only a warning
try:
    import ultralytics  # noqa
    print("⚠️  Pacote 'ultralytics' (YOLOv8) está instalado. Isso pode conflitar com YOLOv5.")
    print("   -> Recomendo NÃO usá-lo neste pipeline. Se der erro de import/pesos, desinstale:")
    print("      !pip uninstall -y ultralytics")
except Exception:
    # Broad on purpose: any import failure means the package is not usable here.
    print("✅ 'ultralytics' (v8) não aparenta estar carregado (bom para YOLOv5).")

# 1c) GPU availability (informational only)
try:
    import torch
    print(f"ℹ️ Torch CUDA disponível? {torch.cuda.is_available()}")
except Exception:
    print("ℹ️ Torch não importou; YOLOv5 tentará instalar requirements quando você treinar.")

# ==== 2) FOLDER STRUCTURE =====================================================
print('\n== 2) Estrutura de dados =============================================')
errors = []
if not DATA_DIR.exists():
    errors.append(f"Base não encontrada: {DATA_DIR}")
if not IMGROOT.exists():
    errors.append(f"Pasta de imagens não encontrada: {IMGROOT}")
if not LBLDIR.exists():
    errors.append(f"Pasta de labels não encontrada: {LBLDIR}")

if errors:
    for e in errors: print("❌", e)
    raise RuntimeError("Estrutura de pastas ausente/inepta.")
else:
    print(f"✅ Estrutura base ok:\n   - {IMGROOT}\n   - {LBLDIR}")

# ==== 3) CLASS CONSISTENCY (subfolders) =======================================
print('\n== 3) Consistência de classes (subpastas de images/) ==================')
name2idx = {n.lower(): i for i, n in enumerate(NAMES)}
subdirs  = [p for p in IMGROOT.iterdir() if p.is_dir()]
unknown  = [p.name for p in subdirs if p.name.lower() not in name2idx]
if unknown:
    print("❌ Subpastas não mapeadas em NAMES:", unknown)
    print("   -> Ajuste NAMES ou renomeie pastas.")
    raise RuntimeError("Classes desconhecidas nas subpastas de images/")
else:
    print("✅ Todas as subpastas de images/ estão mapeadas em NAMES.")

# ==== 4) SCAN IMAGES & LABELS =================================================
print('\n== 4) Escaneando imagens & labels ====================================')
counts      = collections.Counter()
missing_lbl = 0
imgs_by_cls = collections.defaultdict(list)
stems_seen  = collections.defaultdict(list)  # stem -> [class1, class2, ...]
bad_ext     = 0

for class_dir in sorted(subdirs, key=lambda p: p.name.lower()):
    cname = class_dir.name
    for img in class_dir.rglob('*'):
        if img.is_dir():
            continue
        if img.suffix.lower() not in EXTS:
            bad_ext += 1
            continue
        stem = img.stem
        stems_seen[stem].append(cname)
        lbl = LBLDIR/f"{stem}.txt"
        if not lbl.exists():
            missing_lbl += 1
            continue
        counts[cname] += 1
        imgs_by_cls[cname].append((img, lbl))

total_imgs = sum(counts.values())
print(f"✅ Total de imagens COM label: {total_imgs}")
print("   Por classe:", dict(counts))
if bad_ext:
    print(f"ℹ️ Arquivos ignorados por extensão não suportada: {bad_ext}")
if missing_lbl:
    print(f"⚠️ Labels ausentes (imagens ignoradas): {missing_lbl}")

# ==== 5) DUPLICATE STEMS ACROSS CLASSES (real conflict) =======================
print('\n== 5) Conflitos de nome (stem) entre classes ==========================')
dups = {stem: cls for stem, cls in stems_seen.items() if len(set(cls)) > 1}
if dups:
    print(f"❌ Encontrados {len(dups)} stems iguais em classes diferentes (conflito sério):")
    # Show only the first few so the output stays readable.
    for i, (stem, cls) in enumerate(dups.items()):
        print(f"   - {stem}: {sorted(set(cls))}")
        if i >= 9:
            print("   ... (mais ocultos)")
            break
    print("   -> IMPACTO: duas imagens de classes distintas apontam para o MESMO labels/<stem>.txt.")
    print("      Isso invalida o dataset. Corrija nomes (stems únicos) ou gere labels distintos por imagem.")
    raise RuntimeError("Conflito de stems entre classes.")
else:
    print("✅ Não há stems duplicados entre classes (bom).")

# ==== 6) LABEL CONTENT VALIDATION =============================================
print('\n== 6) Validando conteúdo das labels (classe e formato) ================')
invalid_cls_ids = 0
empty_labels    = 0
bad_lines       = 0
for cname, pairs in imgs_by_cls.items():
    for _, lbl in pairs:
        try:
            # Context manager: thousands of label files are read here, so the
            # handles must be closed deterministically.
            with open(lbl) as fh:
                lines = [ln.strip() for ln in fh if ln.strip()]
        except (OSError, UnicodeDecodeError):
            # An unreadable file is counted as one bad entry instead of aborting.
            bad_lines += 1
            continue
        if not lines:
            empty_labels += 1
            continue
        for ln in lines:
            parts = ln.split()
            if len(parts) < 5:
                bad_lines += 1
                continue
            try:
                cid = int(parts[0])
            except ValueError:
                bad_lines += 1
                continue
            if not (0 <= cid < len(NAMES)):
                invalid_cls_ids += 1

if empty_labels:
    print(f"⚠️ Labels vazias (0 caixas): {empty_labels}")
if bad_lines:
    print(f"⚠️ Linhas de label inválidas (formato): {bad_lines}")
if invalid_cls_ids:
    print(f"❌ IDs de classe fora do range [0..{len(NAMES)-1}]: {invalid_cls_ids}")
    raise RuntimeError("Labels com class IDs inválidos.")
else:
    print("✅ Labels com IDs de classe válidos.")

# ==== 7) K-FOLD FEASIBILITY ===================================================
print('\n== 7) Viabilidade K-Fold ==============================================')
# Look at every class folder, not only those with a Counter entry: a folder
# whose images all lack labels has count 0 and must fail this check.
min_class = min((counts[p.name] for p in subdirs), default=0)
if min_class < K_FOLDS:
    print(f"❌ Menor classe tem {min_class} amostras com label — insuficiente para K={K_FOLDS}.")
    print("   -> Remova K-fold ou colete mais dados/ajuste K.")
    raise RuntimeError("Menor classe < K.")
else:
    print(f"✅ K={K_FOLDS} viável: menor classe tem {min_class} amostras.")

# ==== 8) STRATIFICATION PREVIEW (per-fold distribution) =======================
print('\n== 8) Preview da estratificação (folds proporcionais) =================')
try:
    import numpy as np
    from sklearn.model_selection import StratifiedKFold
except Exception:
    print("❌ Precisa do scikit-learn e numpy para checar estratificação.")
    print("   -> !pip install -q scikit-learn numpy")
    raise

# Build X (image paths) and y (class index taken from the image's folder).
y = []
X = []
for cname, pairs in imgs_by_cls.items():
    for img, lbl in pairs:
        X.append(img)
        y.append(name2idx[cname.lower()])
y = np.array(y)

skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=42)
# Classes that exist in the data; each one must show up in every validation fold.
expected_classes = {int(c) for c in np.unique(y)}
ok = True
for fold, (_, val_idx) in enumerate(skf.split(X, y), 1):
    fold_counts = collections.Counter(int(c) for c in y[val_idx])
    dist = {NAMES[i]: fold_counts[i] for i in sorted(fold_counts)}
    print(f"   Fold {fold}: {dist}")
    # A Counter never stores zero entries, so an absent class is detected by
    # comparing against the full set of classes instead of looking for cnt == 0.
    if expected_classes - set(fold_counts):
        ok = False
if not ok:
    print("❌ Algum fold ficou sem amostras de uma classe — improvável com K válido, mas cheque dados.")
    raise RuntimeError("Estratificação problemática (classe vazia em fold).")
else:
    print("✅ Estratificação OK: todas as classes aparecem em todos os folds.")

# ==== 9) FINAL SUMMARY ========================================================
print('\n== ✅ RESUMO ===========================================================')
print(f"- Classes (NAMES): {NAMES}")
print(f"- Total de imagens COM label: {total_imgs}")
print(f"- Distribuição por classe: {dict(counts)}")
print(f"- Menor classe: {min_class}  |  K: {K_FOLDS}  → OK")
if missing_lbl:
    print(f"- Aviso: labels ausentes ignoradas: {missing_lbl}")
if empty_labels or bad_lines:
    print(f"- Avisos de labels: vazias={empty_labels}, formato_ruim={bad_lines}")
# Report the K that was actually checked rather than a hard-coded value.
print(f"\nPronto para treinar com K={K_FOLDS} (flat folds). 🚀")


✅ YOLOv5 encontrado: /content/yolov5
Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
⚠️  Pacote 'ultralytics' (YOLOv8) está instalado. Isso pode conflitar com YOLOv5.
   -> Recomendo NÃO usá-lo neste pipeline. Se der erro de import/pesos, desinstale:
      !pip uninstall -y ultralytics
ℹ️ Torch CUDA disponível? True

✅ Estrutura base ok:
   - /content/drive/MyDrive/Colab Notebooks/Br-AsPavDam/Br-AsPavDam (original)/images
   - /content/drive/MyDrive/Colab Notebooks/Br-AsPavDam/Br-AsPavDam (original)/labels

✅ Todas as subpastas de images/ estão mapeadas em NAMES.

✅ Total de imagens COM label: 2167
   Por classe: {'Fissures': 1026, 'Patch': 452, 'Pothole': 74, 'Ravelling': 607, 'Shoving': 8}
⚠️ Labels ausentes (imagens ignorada

KeyboardInterrupt: 

In [None]:
# -*- coding: utf-8 -*-
"""
BR-AsPavDam – 4-Fold StratifiedKFold (flat folds) + IC95% + OOF
----------------------------------------------------------------
• 5 classes  : Fissures, Shoving, Ravelling, Pothole, Patch
• Dataset    : images/<Classe>/* ; labels/*.txt (pasta única)
• Folds      : K=4 estratificado pela pasta da imagem
• Treino     : 30 épocas · batch 16 · imgsz 640 · cos-lr · patience=10 · seed=42
• Validação  : val.py --conf-thres 0.25 --save-json --save-txt
• Métricas   : sklearn (precision, recall, f1 weighted) + Confusion Matrix (IoU=0.50)
• Estatística: média, desvio e IC95% (t-Student e bootstrap) entre folds
• Saída      : runs/train (original)/cv4_foldX* e cv4_results.json (médias e ICs)
"""
import os, json, shutil, subprocess, yaml, math
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# (Optional) disable W&B to avoid warnings
os.environ['WANDB_DISABLED'] = 'true'

# ---------------------------------------------------------------------
# 0) Mount Google Drive (Colab)
# ---------------------------------------------------------------------
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

# ---------------------------------------------------------------------
# 1) Main paths (the "(original)" dataset folder is never modified)
# ---------------------------------------------------------------------
ROOT      = Path('/content/drive/MyDrive/Colab Notebooks/Br-AsPavDam/')
DATA_DIR  = ROOT/'Br-AsPavDam (original)'
# Checkpoint passed to train.py via --weights.
PRET_W    = ROOT/'rddc2020/yolov5/weights/IMSC/last_100_100_640_16.pt'

YOLO_DIR  = Path('/content/yolov5')
TRAIN_PY  = YOLO_DIR/'train.py'
VAL_PY    = YOLO_DIR/'val.py'

# Used as --project for both the train and val runs; final reports also go here.
RUNS      = ROOT/'runs/train (original)'
# Class names; the list index is the class id written into the fold config.
NAMES     = ['Fissures','Shoving','Ravelling','Pothole','Patch']
K_FOLDS   = 4      # number of stratified folds
IMG_SIZE  = 640    # --imgsz for train.py and val.py
BATCH     = 16     # --batch-size
EPOCHS    = 30     # --epochs
PATIENCE  = 10     # --patience
SEED      = 42     # fold shuffling and --seed
CONF_THR  = 0.25   # --conf-thres for val.py and the sklearn metrics
IOU_THR   = 0.50   # IoU needed to match a prediction to a ground-truth box

# ---------------------------------------------------------------------
# 2) Helper para shell
# ---------------------------------------------------------------------
def run(cmd, desc, cwd=None):
    """Run ``cmd`` as a subprocess, echoing a banner and its captured output.

    Args:
        cmd: Command and arguments as a list.
        desc: Short description printed in the banner before the command runs.
        cwd: Optional working directory for the child process.

    Raises:
        RuntimeError: If the process exits with a non-zero return code.

    Output is captured, so it is printed only after the process has finished.
    """
    separator = '─'*60
    print(f'\n🛠️ {desc}\n' + separator)
    proc = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True)
    for captured in (proc.stdout, proc.stderr):
        print(captured)
    if proc.returncode != 0:
        raise RuntimeError(f'Erro {proc.returncode}: {" ".join(cmd)}')

# ---------------------------------------------------------------------
# 3) Setup YOLOv5 (sem "ultralytics" v8)
# ---------------------------------------------------------------------
# Clone YOLOv5 only when the checkout is missing.
if YOLO_DIR.exists() is False:
    run(
        ['git','clone','https://github.com/ultralytics/yolov5', str(YOLO_DIR)],
        'Clonando YOLOv5',
    )
# (optional) pin a stable, already-tested commit:
# run(['git','reset','--hard','5c4c2a9'], 'Pinando commit estável', cwd=str(YOLO_DIR))
run(
    ['pip','install','-qr','requirements.txt'],
    'Instalando requirements YOLOv5',
    cwd=str(YOLO_DIR),
)

# ---------------------------------------------------------------------
# 4) Minimal sanity check: pretrained weights and dataset folders exist
# ---------------------------------------------------------------------
if not PRET_W.exists():
    raise FileNotFoundError(f'Peso não encontrado: {PRET_W}')
IMGROOT = DATA_DIR/'images'
LBLDIR  = DATA_DIR/'labels'
if not (IMGROOT.exists() and LBLDIR.exists()):
    raise FileNotFoundError('Esperado: images/<Classe>/* e labels/*.txt dentro do (original)')

# ---------------------------------------------------------------------
# 5) Load the dataset and the stratification vector (from the image folder)
# ---------------------------------------------------------------------
EXTS     = ('.jpg', '.jpeg', '.png', '.bmp')
NAME2IDX = dict((name.lower(), idx) for idx, name in enumerate(NAMES))

imgs = []
y_for_strat = []
missing_labels = 0
for folder in sorted(entry for entry in IMGROOT.iterdir() if entry.is_dir()):
    class_idx = NAME2IDX.get(folder.name.lower())
    if class_idx is None:
        # Folder name is not one of NAMES: skip it but make that visible.
        print(f"⚠️  Ignorando pasta não mapeada: {folder.name}")
        continue
    for candidate in sorted(folder.rglob('*')):
        if candidate.suffix.lower() in EXTS:
            # An image is kept only when labels/<stem>.txt exists.
            if (LBLDIR/f"{candidate.stem}.txt").exists():
                imgs.append(candidate)
                y_for_strat.append(class_idx)
            else:
                missing_labels += 1

if len(imgs) == 0:
    raise RuntimeError("Nenhuma imagem válida encontrada. Verifique IMGROOT/LBLDIR.")
y_for_strat = np.array(y_for_strat)
print("✅ Total de imagens:", len(imgs))
per_class_totals = {NAMES[i]: int((y_for_strat==i).sum()) for i in range(len(NAMES))}
print("📦 Distribuição por classe:", per_class_totals)
if missing_labels:
    print(f"ℹ️ Labels ausentes ignoradas: {missing_labels}")
# ---------------------------------------------------------------------
# 6) Utilitários p/ IOU, leitura de .txt YOLO e métricas sklearn
# ---------------------------------------------------------------------
def xywhn_to_xyxy(box):
    """Convert a centre-format box ``[cx, cy, w, h]`` to corners ``[x1, y1, x2, y2]``.

    The result is a float NumPy array in the same units as the input
    (callers in this file pass normalised YOLO coordinates).
    """
    cx, cy, w, h = box
    half_w = w/2
    half_h = h/2
    corners = [cx - half_w, cy - half_h, cx + half_w, cy + half_h]
    return np.array(corners, dtype=float)

def iou(a, b):
    """Return the intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

    Negative widths/heights are clamped to zero, so disjoint or inverted
    boxes yield 0.0.
    """
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
    inter = overlap_w * overlap_h
    area_a = max(0, ax2 - ax1) * max(0, ay2 - ay1)
    area_b = max(0, bx2 - bx1) * max(0, by2 - by1)
    # The tiny epsilon keeps the division defined when both boxes are degenerate.
    return inter / (area_a + area_b - inter + 1e-16)

def load_gt_txt(txt_path):
    """Read ground-truth boxes from a YOLO-format label file.

    Each usable line is ``cls cx cy w h``; lines with fewer than five
    tokens are skipped.

    Args:
        txt_path: Path (``str`` or ``Path``) of the label file.

    Returns:
        A list of ``{'cls': int, 'xyxy': np.ndarray}`` dicts, one per box.
        The list is empty when the file does not exist.

    Raises:
        ValueError: If a line has a non-numeric class id or coordinate.
    """
    txt_path = Path(txt_path)
    items = []
    if not txt_path.exists():
        return items
    # Context manager: this runs once per validation image, so the handle
    # must be closed deterministically instead of left to the garbage collector.
    with open(txt_path) as fh:
        for ln in fh:
            p = ln.split()
            if len(p) < 5:
                continue
            c = int(p[0])
            cx, cy, w, h = map(float, p[1:5])
            items.append({'cls': c, 'xyxy': xywhn_to_xyxy([cx, cy, w, h])})
    return items

def load_pred_txt(txt_path):
    """Read predicted boxes from a YOLOv5 ``--save-txt`` label file.

    YOLOv5 writes one detection per line as ``cls cx cy w h`` and appends
    the confidence as a sixth column only when ``--save-conf`` is given.
    Both layouts are accepted:

    * five tokens: ``cls cx cy w h``; confidence defaults to 1.0, because
      the file only holds detections that already passed val.py's own
      confidence threshold;
    * six or more tokens: ``cls cx cy w h conf``.

    Lines with fewer than five tokens are skipped.

    Args:
        txt_path: Path (``str`` or ``Path``) of the prediction file.

    Returns:
        A list of ``{'cls': int, 'conf': float, 'xyxy': np.ndarray}`` dicts.
        The list is empty when the file does not exist.

    Raises:
        ValueError: If a line has a non-numeric field.
    """
    txt_path = Path(txt_path)
    items = []
    if not txt_path.exists():
        return items
    with open(txt_path) as fh:
        for ln in fh:
            p = ln.split()
            if len(p) < 5:
                continue
            c = int(p[0])
            # Coordinates always follow the class id; confidence, when
            # present, is the LAST column — not the second one.
            cx, cy, w, h = map(float, p[1:5])
            conf = float(p[5]) if len(p) >= 6 else 1.0
            items.append({'cls': c, 'conf': conf, 'xyxy': xywhn_to_xyxy([cx, cy, w, h])})
    return items

def sklearn_metrics_from_txt(gt_dir, pred_dir, num_classes, iou_thr=0.50, conf_thr=0.25,
                             class_names=None, save_png=None, return_y=False):
    """Build sklearn classification metrics from YOLO-format txt files.

    For every ``*.txt`` in ``gt_dir`` the same-named file in ``pred_dir`` is
    loaded. Each ground-truth box is paired with the still-unused prediction
    of highest IoU; the pair counts as a match when that IoU is at least
    ``iou_thr``. Two extra labels are used: ``num_classes`` ("background",
    the true label of unmatched predictions) and ``num_classes + 1``
    ("missed", the predicted label of unmatched ground-truth boxes).

    Args:
        gt_dir: Directory with ground-truth label files.
        pred_dir: Directory with prediction files.
        num_classes: Number of real classes.
        iou_thr: Minimum IoU for a match.
        conf_thr: Predictions below this confidence are discarded.
        class_names: Optional display names for the real classes.
        save_png: If truthy, path where the confusion-matrix figure is saved.
        return_y: If true, also return the raw ``y_true`` / ``y_pred`` lists.

    Returns:
        ``(report_text, report_dict, confusion_matrix)``, followed by
        ``y_true, y_pred`` when ``return_y`` is true.
    """
    background_id = num_classes       # true label given to false positives
    missed_id = num_classes + 1       # predicted label given to false negatives
    y_true = []
    y_pred = []

    gt_root = Path(gt_dir)
    pred_root = Path(pred_dir)
    for gt_path in gt_root.glob('*.txt'):
        truths = load_gt_txt(gt_path)
        detections = load_pred_txt(pred_root/f'{gt_path.stem}.txt')
        kept = sorted(
            (d for d in detections if d.get('conf', 1.0) >= conf_thr),
            key=lambda d: d.get('conf', 0),
            reverse=True,
        )

        matched = set()
        for truth in truths:
            top_idx, top_iou = -1, 0.0
            for idx, det in enumerate(kept):
                if idx not in matched:
                    overlap = iou(truth['xyxy'], det['xyxy'])
                    if overlap > top_iou:
                        top_iou, top_idx = overlap, idx
            y_true.append(truth['cls'])
            if top_idx >= 0 and top_iou >= iou_thr:
                matched.add(top_idx)
                y_pred.append(kept[top_idx]['cls'])
            else:
                y_pred.append(missed_id)

        # Predictions never paired with a ground-truth box are false positives.
        leftovers = [det['cls'] for idx, det in enumerate(kept) if idx not in matched]
        y_true.extend([background_id] * len(leftovers))
        y_pred.extend(leftovers)

    base_names = class_names or [str(i) for i in range(num_classes)]
    names = base_names + ['background','missed']
    labels = [*range(num_classes), background_id, missed_id]

    report_kwargs = dict(labels=labels, target_names=names, zero_division=0)
    rpt_txt = classification_report(y_true, y_pred, **report_kwargs)
    rpt_dict = classification_report(y_true, y_pred, output_dict=True, **report_kwargs)
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    if save_png:
        ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=names).plot(xticks_rotation=45)
        plt.tight_layout()
        plt.savefig(save_png, dpi=200)
        plt.close()

    outputs = (rpt_txt, rpt_dict, cm)
    if return_y:
        outputs += (y_true, y_pred)
    return outputs

# ---------------------------------------------------------------------
# 7) K-Fold + treino/val + métricas
# ---------------------------------------------------------------------
# Shuffled stratified splitter; SEED makes the fold assignment reproducible.
skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=SEED)
fold_metrics = []   # one sklearn 'weighted avg' dict per validated fold
fold_maps    = []   # one dict of YOLO mAP values per fold, when they could be read
all_oof_true = []   # out-of-fold true labels, accumulated over all folds
all_oof_pred = []   # out-of-fold predicted labels, aligned with all_oof_true

def safe_copy_pair(img_path, lbl_dir_single, dst_img_dir, dst_lbl_dir):
    """Copy an image and its label into flat fold folders under a class-prefixed name.

    Both copies are named ``<ClassFolder>__<original name>``, where
    ``<ClassFolder>`` is the name of the image's parent directory, so files
    with the same name in different class folders do not collide.

    Args:
        img_path: Source image path.
        lbl_dir_single: Directory holding ``<stem>.txt`` for every image.
        dst_img_dir: Destination image directory (created if missing).
        dst_lbl_dir: Destination label directory (created if missing).
    """
    for folder in (dst_img_dir, dst_lbl_dir):
        folder.mkdir(parents=True, exist_ok=True)
    prefix = f"{img_path.parent.name}__"   # e.g. "Fissures__"
    label_name = f"{img_path.stem}.txt"
    shutil.copy(img_path, dst_img_dir/(prefix + img_path.name))
    shutil.copy(lbl_dir_single/label_name, dst_lbl_dir/(prefix + label_name))

# For each fold: materialise a flat dataset copy, train, validate, then
# collect the sklearn metrics and (when available) the YOLO mAP values.
for fold, (train_idx, val_idx) in enumerate(skf.split(imgs, y_for_strat), 1):
    print(f"\n📂 Fold {fold}/{K_FOLDS}")

    # Fold folder (temporary, flat); rebuilt from scratch on every run.
    fold_dir = ROOT/f'cv4_fold{fold}'
    if fold_dir.exists(): shutil.rmtree(fold_dir)
    (fold_dir/'images/train').mkdir(parents=True)
    (fold_dir/'images/val').mkdir(parents=True)
    (fold_dir/'labels/train').mkdir(parents=True)
    (fold_dir/'labels/val').mkdir(parents=True)

    # Copy images + labels (class-name prefix avoids file-name collisions).
    for subset, idxs in [('train', train_idx), ('val', val_idx)]:
        dst_img = fold_dir/f'images/{subset}'
        dst_lbl = fold_dir/f'labels/{subset}'
        for i in idxs:
            safe_copy_pair(imgs[i], LBLDIR, dst_img, dst_lbl)

    # Temporary dataset config for this fold. The context manager guarantees
    # the YAML is flushed and closed before train.py reads it.
    cfg_tmp = fold_dir/'config.yaml'
    with open(cfg_tmp, 'w') as cfg_file:
        yaml.safe_dump({
            'path' : str(fold_dir),
            'train': 'images/train',
            'val'  : 'images/val',
            'names': {i:n for i,n in enumerate(NAMES)}
        }, cfg_file)

    # Training
    run([
        'python', str(TRAIN_PY),
        '--imgsz', str(IMG_SIZE),
        '--batch-size', str(BATCH),
        '--epochs', str(EPOCHS),
        '--data', str(cfg_tmp),
        '--weights', str(PRET_W),
        '--project', str(RUNS),
        '--name', f'cv4_fold{fold} (original)',
        '--exist-ok',
        '--patience', str(PATIENCE),
        '--cos-lr',
        '--seed', str(SEED)
    ], f'Treinando Fold {fold}', cwd=str(YOLO_DIR))

    # Validation with the best checkpoint of this fold
    best = RUNS/f'cv4_fold{fold} (original)'/'weights/best.pt'
    val_out = RUNS/f'cv4_fold{fold}_val (original)'
    if best.exists():
        run([
            'python', str(VAL_PY),
            '--data', str(cfg_tmp),
            '--weights', str(best),
            '--imgsz', str(IMG_SIZE),
            '--conf-thres', str(CONF_THR),
            '--project', str(RUNS),
            '--name', f'cv4_fold{fold}_val (original)',
            '--exist-ok',
            '--save-json',
            '--save-txt'
        ], f'Validação Fold {fold}', cwd=str(YOLO_DIR))

        # sklearn metrics (weighted) + confusion matrix + out-of-fold labels
        gt_dir   = fold_dir/'labels/val'
        pred_dir = val_out/'labels'  # where val.py saves the prediction .txt files
        rep_txt, rep_dict, cm, y_true, y_pred = sklearn_metrics_from_txt(
            gt_dir, pred_dir,
            num_classes=len(NAMES),
            iou_thr=IOU_THR, conf_thr=CONF_THR,
            class_names=NAMES,
            save_png=(val_out/'confusion_matrix_sklearn.png'),
            return_y=True
        )
        print("\n📊 Classification Report (inclui weighted avg):\n", rep_txt)
        fold_metrics.append(rep_dict['weighted avg'])
        all_oof_true.extend(y_true)
        all_oof_pred.extend(y_pred)

        # Try to collect the YOLO mAP for this fold (if a results file exists).
        # NOTE(review): it is not evident that val.py writes results.csv or
        # results.json into its output folder — confirm; otherwise fold_maps
        # stays empty and no 'yolo_map' statistics are produced.
        fold_map = {}
        res_csv = val_out/'results.csv'
        res_json = val_out/'results.json'
        try:
            if res_csv.exists():
                import pandas as pd
                df = pd.read_csv(res_csv)
                # YOLOv5 pads its CSV headers with spaces; strip them so the
                # key lookups below can match.
                df.columns = df.columns.str.strip()
                last = df.iloc[-1].to_dict()
                for k_old, k_new in [
                    ('map_0.5', 'mAP50'),
                    ('map_0.5:0.95', 'mAP50-95'),
                    ('metrics/mAP_0.5', 'mAP50'),
                    ('metrics/mAP_0.5:0.95', 'mAP50-95'),
                    ('metrics/mAP50(B)', 'mAP50'),
                    ('metrics/mAP50-95(B)', 'mAP50-95'),
                    ('map', 'mAP50-95')
                ]:
                    if k_old in last: fold_map[k_new] = float(last[k_old])
            elif res_json.exists():
                with open(res_json) as res_file:
                    j = json.load(res_file)
                for k_old, k_new in [
                    ('map', 'mAP50-95'),
                    ('map50', 'mAP50'),
                    ('metrics/mAP_0.5', 'mAP50'),
                    ('metrics/mAP_0.5:0.95', 'mAP50-95')
                ]:
                    if k_old in j: fold_map[k_new] = float(j[k_old])
        except Exception as e:
            # mAP collection is best-effort; a read/parse failure must not
            # discard the fold that has just been trained.
            print(f"ℹ️ Não foi possível ler mAP do YOLO neste fold: {e}")

        if fold_map:
            fold_maps.append(fold_map)
    else:
        print(f"⚠️  best.pt não encontrado no fold {fold}")

# ---------------------------------------------------------------------
# 8) Estatísticas entre folds (IC95%) e OOF agregado
# ---------------------------------------------------------------------
def mean_sd_ci95(values):
    """Return the mean, sample SD and a t-based 95% confidence interval.

    The critical value comes from a small two-sided t table indexed by
    degrees of freedom (``n - 1``, for 1 to 9); 1.96 is used for any other
    value. With a single value the SD and the interval half-width are 0.

    Returns:
        ``(mean, sd, (lower, upper))``.
    """
    count = len(values)
    mean = float(np.mean(values))
    critical_by_dof = dict(zip(
        range(1, 10),
        (12.706, 4.303, 3.182, 2.776, 2.571, 2.447, 2.365, 2.306, 2.262),
    ))
    spread = 0.0
    margin = 0.0
    if count > 1:
        spread = float(np.std(values, ddof=1))
        critical = critical_by_dof.get(count - 1, 1.96)
        margin = critical * (spread / max(1, np.sqrt(count)))
    return mean, spread, (mean - margin, mean + margin)

def bootstrap_ci(values, B=10000, alpha=0.05, seed=42):
    """Percentile-bootstrap confidence interval for the mean of ``values``.

    Args:
        values: Sequence of numbers.
        B: Number of bootstrap resamples.
        alpha: Significance level; the interval covers ``1 - alpha``.
        seed: Seed for the NumPy random generator, so results are reproducible.

    Returns:
        ``(lower, upper)`` as Python floats.
    """
    rng = np.random.default_rng(seed)
    data = np.array(values, dtype=float)
    size = len(data)
    # One resample (with replacement, same size as the data) per iteration.
    resampled_means = [
        rng.choice(data, size=size, replace=True).mean() for _ in range(B)
    ]
    lower_pct = 100*alpha/2
    upper_pct = 100*(1-alpha/2)
    lower, upper = np.percentile(resampled_means, [lower_pct, upper_pct])
    return float(lower), float(upper)

def _fold_summary(vals):
    """Summarise one metric across folds: mean, SD and both 95% CIs (6 decimals)."""
    m, sd, (lo, hi) = mean_sd_ci95(vals)
    blo, bhi = bootstrap_ci(vals)
    return {
        'mean': round(m, 6),
        'std': round(sd, 6),
        'ci95_lo_t': round(lo, 6),
        'ci95_hi_t': round(hi, 6),
        'ci95_lo_boot': round(blo, 6),
        'ci95_hi_boot': round(bhi, 6),
        'n_folds': len(vals)
    }

stats = {}
if fold_metrics:
    # sklearn 'weighted avg' metrics, one value per validated fold.
    keys = [k for k in fold_metrics[0].keys() if k in ('precision','recall','f1-score')]
    for k in keys:
        stats[k] = _fold_summary([fm[k] for fm in fold_metrics])

    if fold_maps:
        # Sorted so the key order in the saved JSON is the same on every run
        # (plain set iteration order is not guaranteed).
        all_keys = sorted(set().union(*[fm.keys() for fm in fold_maps]))
        stats['yolo_map'] = {}
        for k in all_keys:
            vals = [fm[k] for fm in fold_maps if k in fm]
            if not vals:
                continue
            stats['yolo_map'][k] = _fold_summary(vals)

# Save the final JSON
RUNS.mkdir(parents=True, exist_ok=True)
with open(RUNS/'cv4_results.json','w') as f:
    json.dump(stats, f, indent=2)

print("\n✅ Estatísticas entre folds (IC95%):")
print(json.dumps(stats, indent=2))

# Aggregated out-of-fold (OOF) report over every validated fold
if all_oof_true:
    names  = NAMES + ['background','missed']
    labels = list(range(len(NAMES))) + [len(NAMES), len(NAMES)+1]
    oof_report_txt  = classification_report(all_oof_true, all_oof_pred, labels=labels, target_names=names, zero_division=0)
    oof_cm = confusion_matrix(all_oof_true, all_oof_pred, labels=labels)
    with open(RUNS/'cv4_oof_report.txt','w') as f:
        f.write(oof_report_txt)
    disp = ConfusionMatrixDisplay(confusion_matrix=oof_cm, display_labels=names)
    disp.plot(xticks_rotation=45)
    plt.tight_layout()
    plt.savefig(RUNS/'cv4_oof_confusion_matrix.png', dpi=200)
    plt.close()
    print("\n📊 OOF Classification Report (todo o dataset, sem vazamento):\n", oof_report_txt)

print("\n🎉 Pipeline CV K=4 finalizado.")


[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
  with torch.cuda.amp.autocast(amp):

      18/29      16.6G    0.03358    0.02765   0.006872         78        640:  86%|████████▋ | 88/102 [03:40<00:30,  2.20s/it]
  with torch.cuda.amp.autocast(amp):

      18/29      16.6G    0.03359    0.02766   0.006875         64        640:  87%|████████▋ | 89/102 [03:41<00:22,  1.75s/it]
  with torch.cuda.amp.autocast(amp):

      18/29      16.6G    0.03361    0.02771   0.006899         66        640:  88%|████████▊ | 90/102 [03:41<00:17,  1.44s/it]
  with torch.cuda.amp.autocast(amp):

      18/29      16.6G    0.03361    0.02766   0.006958         55        640:  89%|████████▉ | 91/102 [03:42<00:13,  1.23s/it]
  with torch.cuda.amp.autocast(amp):

      18/29      16.6G    0.03361    0.02774    0.00695         81        640:  90%|█████████ | 92/102 [03:57<00:10,  1.07s/it]
  with torch.cuda.amp.autocast(amp):

      18/29      16.6G     0.0336    0.02775   0.006923    

In [7]:
# RELATÓRIOS

# ======= RELATÓRIO PARCIAL (folds presentes: 1 e 2) ==================
import json, numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Project root on Drive and the folder holding the train/val run outputs.
ROOT      = Path('/content/drive/MyDrive/Colab Notebooks/Br-AsPavDam')
RUNS_DIR  = ROOT/'runs/train (original)'
NAMES     = ['Fissures','Shoving','Ravelling','Pothole','Patch']
IOU_THR   = 0.50
CONF_THR  = 0.25
FOLDS_TO_USE = [1,2]  # use only the folds that exist

def xywhn_to_xyxy(box):
    """Convert ``[cx, cy, w, h]`` to a float array ``[x1, y1, x2, y2]`` (same units)."""
    cx, cy, w, h = box
    left, top = cx-w/2, cy-h/2
    right, bottom = cx+w/2, cy+h/2
    return np.array([left, top, right, bottom], float)

def iou(a,b):
    """Intersection-over-union of two ``[x1, y1, x2, y2]`` boxes (0.0 when disjoint)."""
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    # Overlap extents are clamped at zero so disjoint boxes contribute nothing.
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(by1, ay1))
    inter = overlap_w * overlap_h
    area_a = max(0, ax2 - ax1) * max(0, ay2 - ay1)
    area_b = max(0, bx2 - bx1) * max(0, by2 - by1)
    # Epsilon keeps the division defined for two zero-area boxes.
    union = area_a + area_b - inter + 1e-16
    return inter / union

def load_gt_txt(p):
    """Read ground-truth boxes (``cls cx cy w h`` per line) from a YOLO label file.

    Args:
        p: Path (``str`` or ``Path``) of the label file.

    Returns:
        A list of ``{'cls': int, 'xyxy': np.ndarray}`` dicts; empty when the
        file does not exist. Lines with fewer than five tokens are skipped.

    Raises:
        ValueError: If a line has a non-numeric class id or coordinate.
    """
    p = Path(p)
    out = []
    if not p.exists():
        return out
    # Context manager so the handle is closed deterministically; this runs
    # once per validation image.
    with open(p) as fh:
        for ln in fh:
            s = ln.split()
            if len(s) < 5:
                continue
            c = int(s[0])
            cx, cy, w, h = map(float, s[1:5])
            out.append({'cls': c, 'xyxy': xywhn_to_xyxy([cx, cy, w, h])})
    return out

def load_pred_txt(p):
    """Read predicted boxes from a YOLOv5 ``--save-txt`` label file.

    YOLOv5 writes ``cls cx cy w h`` per detection and appends the
    confidence as a sixth column only with ``--save-conf``. Five-token
    lines get confidence 1.0; for six or more tokens the confidence is the
    sixth column. Lines with fewer than five tokens are skipped.

    Args:
        p: Path (``str`` or ``Path``) of the prediction file.

    Returns:
        A list of ``{'cls': int, 'conf': float, 'xyxy': np.ndarray}`` dicts;
        empty when the file does not exist.

    Raises:
        ValueError: If a line has a non-numeric field.
    """
    p = Path(p)
    out = []
    if not p.exists():
        return out
    with open(p) as fh:
        for ln in fh:
            s = ln.split()
            if len(s) < 5:
                continue
            c = int(s[0])
            # Coordinates always follow the class id; the confidence, when
            # present, is the LAST column — not the second one.
            cx, cy, w, h = map(float, s[1:5])
            conf = float(s[5]) if len(s) >= 6 else 1.0
            out.append({'cls': c, 'conf': conf, 'xyxy': xywhn_to_xyxy([cx, cy, w, h])})
    return out

def sklearn_from_txt(gt_dir, pred_dir, num_classes, iou_thr=0.5, conf_thr=0.25, class_names=None, cm_png=None):
    """Build sklearn classification metrics from YOLO label/prediction files.

    For every ``*.txt`` in ``gt_dir`` the prediction file with the same stem is
    read from ``pred_dir``. Predictions below ``conf_thr`` are dropped and the
    rest are ordered by descending confidence. Each ground-truth box is then
    paired with the still-unused prediction of highest IoU, regardless of
    class:

    * IoU >= ``iou_thr``  -> (GT class, predicted class)
    * otherwise           -> (GT class, extra label "missed")

    Predictions left unpaired are recorded as (extra label "background",
    predicted class).

    Args:
        gt_dir: Directory with ground-truth label files.
        pred_dir: Directory with prediction files.
        num_classes: Number of real classes; ids ``num_classes`` and
            ``num_classes + 1`` are used for "background" and "missed".
        iou_thr: Minimum IoU for a GT/prediction pair to count as matched.
        conf_thr: Minimum confidence for a prediction to be kept.
        class_names: Optional list of display names for the real classes.
        cm_png: Optional path; when truthy the confusion matrix is saved there.

    Returns:
        ``(report_text, report_dict, confusion_matrix)``.
    """
    bg_label = num_classes
    miss_label = num_classes + 1
    gt_root = Path(gt_dir)
    pred_root = Path(pred_dir)
    y_true, y_pred = [], []

    for gt_path in gt_root.glob('*.txt'):
        truths = load_gt_txt(gt_path)
        candidates = load_pred_txt(pred_root / f'{gt_path.stem}.txt')
        # Keep confident predictions only, highest confidence first.
        kept = sorted(
            (d for d in candidates if d.get('conf', 1.0) >= conf_thr),
            key=lambda d: d.get('conf', 0),
            reverse=True,
        )

        matched = set()
        for truth in truths:
            # Find the unused prediction that overlaps this GT box the most.
            top_idx, top_iou = -1, 0
            for idx, pred in enumerate(kept):
                if idx not in matched:
                    overlap = iou(truth['xyxy'], pred['xyxy'])
                    if overlap > top_iou:
                        top_iou, top_idx = overlap, idx

            y_true.append(truth['cls'])
            if top_idx >= 0 and top_iou >= iou_thr:
                matched.add(top_idx)
                y_pred.append(kept[top_idx]['cls'])
            else:
                y_pred.append(miss_label)

        # Whatever was never paired with a GT box counts against "background".
        for idx, pred in enumerate(kept):
            if idx not in matched:
                y_true.append(bg_label)
                y_pred.append(pred['cls'])

    base_names = class_names or [str(i) for i in range(num_classes)]
    names = base_names + ['background', 'missed']
    labels = [*range(num_classes), bg_label, miss_label]

    report_kwargs = dict(labels=labels, target_names=names, zero_division=0)
    rpt_txt = classification_report(y_true, y_pred, **report_kwargs)
    rpt_dict = classification_report(y_true, y_pred, output_dict=True, **report_kwargs)
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    if cm_png:
        display_obj = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=names)
        display_obj.plot(xticks_rotation=45)
        plt.tight_layout()
        plt.savefig(cm_png, dpi=200)
        plt.close()

    return rpt_txt, rpt_dict, cm

def read_map_metrics(val_dir):
    """Read mAP@0.5 and mAP@0.5:0.95 from a results file in ``val_dir``.

    ``results.csv`` is preferred (its last row is used); ``results.json`` is
    consulted only when no CSV exists. Several known column/key spellings are
    tried; when more than one is present, the later one in the lookup order
    wins.

    Args:
        val_dir: Directory (``str`` or ``Path``) to look in.

    Returns:
        ``(mAP50, mAP50_95)`` as floats; each is ``None`` when not found.
    """
    val_dir = Path(val_dir)
    m50 = m5095 = None
    csvp = val_dir/'results.csv'
    jsp  = val_dir/'results.json'
    if csvp.exists():
        df = pd.read_csv(csvp)
        if df.empty:
            # Header-only file: there is no last row to read.
            return m50, m5095
        # Header names may be padded with spaces (YOLOv5 writes them
        # right-aligned), which would otherwise never match the keys below.
        df.columns = [str(col).strip() for col in df.columns]
        last = df.iloc[-1].to_dict()
        for k_old, k_new in [('map_0.5','mAP50'), ('map_0.5:0.95','mAP50-95'),
                             ('metrics/mAP_0.5','mAP50'), ('metrics/mAP_0.5:0.95','mAP50-95'),
                             ('map50','mAP50'), ('map','mAP50-95')]:
            if k_old in last:
                if k_new == 'mAP50':
                    m50 = float(last[k_old])
                else:
                    m5095 = float(last[k_old])
    elif jsp.exists():
        # Context manager so the JSON handle is closed deterministically.
        with open(jsp) as fh:
            j = json.load(fh)
        if 'map50' in j: m50 = float(j['map50'])
        if 'map'   in j: m5095 = float(j['map'])
        if 'metrics/mAP_0.5' in j: m50 = float(j['metrics/mAP_0.5'])
        if 'metrics/mAP_0.5:0.95' in j: m5095 = float(j['metrics/mAP_0.5:0.95'])
    return m50, m5095

# ====== process the available folds ======
# For each requested fold: compare saved predictions with the validation
# labels, collect the weighted sklearn metrics plus any mAP values found, and
# keep the per-class table for later export.
rows=[]; perclass_by_fold={}
for fold in FOLDS_TO_USE:
    fold_dir  = ROOT/f'cv4_fold{fold}'
    gt_dir    = fold_dir/'labels/val'
    val_dir   = RUNS_DIR/f'cv4_fold{fold}_val (original)'
    pred_dir  = val_dir/'labels'

    # A fold is usable only when both ground truth and predictions exist.
    if not gt_dir.exists() or not pred_dir.exists():
        print(f"⚠️ Fold {fold} incompleto: {gt_dir} ou {pred_dir} ausente.")
        continue

    rpt_txt, rpt_dict, cm = sklearn_from_txt(
        gt_dir, pred_dir, num_classes=len(NAMES),
        iou_thr=IOU_THR, conf_thr=CONF_THR, class_names=NAMES,
        cm_png=(val_dir/'confusion_matrix_sklearn.png')
    )
    # "weighted avg" entry of the sklearn report
    w = rpt_dict['weighted avg']
    row = {
        'fold': fold,
        'precision_w': w['precision'],
        'recall_w':    w['recall'],
        'f1_w':        w['f1-score'],
    }
    # per-class rows (real classes only; background/missed are left out)
    perclass_df = pd.DataFrame({cls: rpt_dict[cls] for cls in NAMES}).T[['precision','recall','f1-score','support']]
    perclass_by_fold[fold] = perclass_df
    # mAP values, if a results file exists in the validation directory
    m50, m5095 = read_map_metrics(val_dir)
    row['mAP50']    = m50
    row['mAP50-95'] = m5095
    rows.append(row)

# Final tables
if rows:
    df = pd.DataFrame(rows).set_index('fold').sort_index()
    print("\n== MÉTRICAS POR FOLD (weighted + mAP) ==")
    display(df)

    print("\n== MÉDIA ENTRE FOLDS DISPONÍVEIS ==")
    print(df.mean(numeric_only=True))

    # save CSVs/JSON
    out_dir = RUNS_DIR
    df.to_csv(out_dir/'cv_partial_fold_metrics.csv')
    # stack the per-class tables, tagging each row with its fold
    perclass_stacked = []
    for f, pc in perclass_by_fold.items():
        tmp = pc.copy()
        tmp.insert(0,'fold', f)
        perclass_stacked.append(tmp.reset_index().rename(columns={'index':'class'}))
    if perclass_stacked:
        perclass_all = pd.concat(perclass_stacked, ignore_index=True)
        perclass_all.to_csv(out_dir/'cv_partial_per_class.csv', index=False)

    summary = {
        'folds_used': [int(i) for i in df.index.tolist()],
        'weighted_mean': {k: float(v) for k,v in df[['precision_w','recall_w','f1_w']].mean(numeric_only=True).items()},
        'map_mean':     {k: float(v) for k,v in df[['mAP50','mAP50-95']].mean(numeric_only=True).items() if not np.isnan(v)},
        # Derived from the folds actually processed and the thresholds in use,
        # so the note stays truthful when a fold is skipped or a constant changes.
        'note': f'Resultados parciais com folds disponíveis (k={len(df)}). '
                f'IoU={IOU_THR:.2f}, conf={CONF_THR:.2f}, sklearn weighted.'
    }
    # Context manager guarantees the summary is flushed and the handle closed.
    with open(out_dir/'cv_partial_summary.json','w') as summary_fh:
        json.dump(summary, summary_fh, indent=2)
    print("\nArquivos gerados em:", out_dir)
    print("- cv_partial_fold_metrics.csv")
    print("- cv_partial_per_class.csv")
    print("- cv_partial_summary.json")
else:
    print(f"Nenhum fold completo (GT + predições) encontrado para {' e '.join(str(i) for i in FOLDS_TO_USE)}.")



== MÉTRICAS POR FOLD (weighted + mAP) ==


Unnamed: 0_level_0,precision_w,recall_w,f1_w,mAP50,mAP50-95
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.392873,0.407801,0.400059,,
2,0.405614,0.402358,0.402251,,



== MÉDIA ENTRE FOLDS DISPONÍVEIS ==
precision_w    0.399244
recall_w       0.405080
f1_w           0.401155
dtype: float64

Arquivos gerados em: /content/drive/MyDrive/Colab Notebooks/Br-AsPavDam/runs/train (original)
- cv_partial_fold_metrics.csv
- cv_partial_per_class.csv
- cv_partial_summary.json
