In [3]:
import gc
import os
import torch
import pandas as pd
from ultralytics import YOLO
from pathlib import Path

# --- Configuration ---
DATASET_BASE_PATH = Path("D:/dental/AKUDENTALlast")  # Base path to your folds
EPOCHS = 150
BATCH_SIZE = 4
IMGSZ = 512
PROJECT_NAME = "AKUDENTAL_5_Fold_SEG_Results_large"

MODEL_SIZE_MAP = {
    "large": "yolo11l-seg.pt"   # change to yolov8l-seg.pt if needed
}
MODEL_SIZES_TO_TRAIN = ["large"]

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Using device: {device}")

# Store results for final summary
all_results = []

# Loop through each of the 5 folds
for fold_num in range(5):
    fold_name = f"FOLD_{fold_num}"
    dataset_folder = DATASET_BASE_PATH / f"AKUDENTAL_YOLO_{fold_name}"
    yaml_path = dataset_folder / "dataset.yaml"

    if not yaml_path.exists():
        print(f"❌ YAML file not found for {fold_name} at {yaml_path}. Skipping.")
        continue

    print(f"\n{'='*25}\n K-FOLD VALIDATION: STARTING {fold_name}\n{'='*25}")

    for model_size in MODEL_SIZES_TO_TRAIN:
        model_name = MODEL_SIZE_MAP[model_size]
        run_name = f"{model_name.split('.')[0]}_{fold_name}"

        print(f"\n--- Training Model: {model_name} on {fold_name} ---")

        # Clean up memory
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # 1. TRAINING
        model = YOLO(model_name)
        model.train(
            data=str(yaml_path),
            epochs=EPOCHS,
            batch=BATCH_SIZE,
            imgsz=IMGSZ,
            device=device,
            project=PROJECT_NAME,
            name=run_name,
            patience=50,
            optimizer="AdamW",
            seed=42,
            fliplr=0,
            mosaic=0
        )

        # 2. LOAD BEST MODEL
        best_model_path = Path(PROJECT_NAME) / run_name / "weights" / "best.pt"
        if not best_model_path.exists():
            print(f"❌ Could not find best weights at {best_model_path}. Skipping inference.")
            continue

        model = YOLO(best_model_path)

        # 3. SAVE PREDICTIONS ON TEST SET
        print(f"\n--- Predicting on Test Images for {run_name} ---")
        test_images_path = dataset_folder / "images" / "test"
        model.predict(
            source=str(test_images_path),
            save=True,
            project=PROJECT_NAME,
            name=f"predict_{run_name}",
            exist_ok=True,
            conf=0.5
        )
        print(f"✅ Prediction images saved for {run_name}")

        # 4. VALIDATION & METRICS
        print(f"\n--- Calculating Metrics for {run_name} ---")

        # Validation split
        metrics_val = model.val(
            data=str(yaml_path),
            split='val',
            project=PROJECT_NAME,
            name=f"validate_{run_name}_val",
            save_json=True,
            save_txt=True,
            exist_ok=True
        )

        # Test split
        metrics_test = model.val(
            data=str(yaml_path),
            split='test',
            project=PROJECT_NAME,
            name=f"validate_{run_name}_test",
            save_json=True,
            save_txt=True,
            exist_ok=True
        )

        # Build full summary
        result_summary = {
            'fold': fold_num,
            'model_size': model_size,

            # Validation (Boxes)
            'mAP50-95(B)_val': metrics_val.box.map,
            'mAP50(B)_val': metrics_val.box.map50,
            'mAP75(B)_val': metrics_val.box.map75,
            'precision(B)_val': metrics_val.box.mp,
            'recall(B)_val': metrics_val.box.mr,
            'f1(B)_val': metrics_val.box.f1,

            # Validation (Masks)
            'mAP50-95(M)_val': metrics_val.seg.map,
            'mAP50(M)_val': metrics_val.seg.map50,
            'mAP75(M)_val': metrics_val.seg.map75,
            'precision(M)_val': metrics_val.seg.mp,
            'recall(M)_val': metrics_val.seg.mr,
            'f1(M)_val': metrics_val.seg.f1,

            # Test (Boxes)
            'mAP50-95(B)_test': metrics_test.box.map,
            'mAP50(B)_test': metrics_test.box.map50,
            'mAP75(B)_test': metrics_test.box.map75,
            'precision(B)_test': metrics_test.box.mp,
            'recall(B)_test': metrics_test.box.mr,
            'f1(B)_test': metrics_test.box.f1,

            # Test (Masks)
            'mAP50-95(M)_test': metrics_test.seg.map,
            'mAP50(M)_test': metrics_test.seg.map50,
            'mAP75(M)_test': metrics_test.seg.map75,
            'precision(M)_test': metrics_test.seg.mp,
            'recall(M)_test': metrics_test.seg.mr,
            'f1(M)_test': metrics_test.seg.f1,
        }

        all_results.append(result_summary)

        # Save per-fold CSV
        fold_metrics_path = Path(PROJECT_NAME) / f"{run_name}_metrics.csv"
        pd.DataFrame([result_summary]).to_csv(fold_metrics_path, index=False)
        print(f"📊 Metrics saved at {fold_metrics_path}")

        # Cleanup
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# --- Final Summary ---
print(f"\n{'='*25}\n✅ ALL FOLDS AND MODELS TRAINED SUCCESSFULLY \n{'='*25}")

results_df = pd.DataFrame(all_results)
print("📈 Performance Metrics Across All Folds:")
print(results_df.to_string())

# Save combined metrics
csv_path = Path(PROJECT_NAME) / "all_folds_metrics.csv"
results_df.to_csv(csv_path, index=False)
print(f"\nFull results summary saved to: {csv_path}")


🚀 Using device: cuda

 K-FOLD VALIDATION: STARTING FOLD_0

--- Training Model: yolo11l-seg.pt on FOLD_0 ---
Ultralytics 8.3.203  Python-3.10.13 torch-2.2.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3090, 24575MiB)
[34m[1mengine\trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=4, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=D:\dental\AKUDENTALlast\AKUDENTAL_YOLO_FOLD_0\dataset.yaml, degrees=0.0, deterministic=True, device=0, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=150, erasing=0.4, exist_ok=False, fliplr=0, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=512, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolo11l-seg.pt, momentum=0.937, mosaic=0, 

In [46]:
import os
import cv2
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ultralytics import YOLO
from torchvision.ops import box_iou
import seaborn as sns
from tqdm.auto import tqdm
from matplotlib.patches import Rectangle, Polygon, Patch
import yaml
import json
import warnings

warnings.filterwarnings('ignore')

# Helper: hex to RGB
def hex_to_rgb(hex_color):
    hex_color = hex_color.lstrip('#')
    return [int(hex_color[i:i + 2], 16) for i in (0, 2, 4)]

# Load color map
def load_colormap(colormap_path, class_to_idx):
    try:
        with open(colormap_path, 'r') as f:
            cmap = json.load(f)
        norm = {}
        for name, hex_col in cmap.items():
            if name.lower() == 'background' or name not in class_to_idx:
                continue
            cid = class_to_idx[name]
            rgb = hex_to_rgb(hex_col)
            norm[cid] = [c / 255.0 for c in rgb]
        if not norm:
            defaults = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 1, 0], [1, 0, 1]]
            for cid in class_to_idx.values():
                norm[cid] = defaults[cid % len(defaults)]
        return norm
    except Exception as e:
        print(f"Warning: could not load colormap ({e}), using defaults.")
        defaults = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 1, 0], [1, 0, 1]]
        return {cid: defaults[cid % len(defaults)] for cid in class_to_idx.values()}

# Load YOLO .txt annotations for segmentation
def load_annotations_seg(fold_dir, split):
    """Load both bounding boxes and segmentation masks from YOLO format"""
    ann = {}
    img_dir = os.path.join(fold_dir, 'images', split)
    lbl_dir = os.path.join(fold_dir, 'labels', split)

    if not os.path.exists(img_dir) or not os.path.exists(lbl_dir):
        print(f"Missing directories: {img_dir} or {lbl_dir}")
        return ann

    for fn in os.listdir(lbl_dir):
        if not fn.endswith('.txt'):
            continue
        name = os.path.splitext(fn)[0]
        lbl_path = os.path.join(lbl_dir, fn)

        img_path = None
        for ext in ('.jpg', '.jpeg', '.png'):
            cand = os.path.join(img_dir, name + ext)
            if os.path.exists(cand):
                img_path = cand
                break

        if img_path is None: continue

        img = cv2.imread(img_path)
        if img is None: continue

        h, w = img.shape[:2]
        boxes, labels, segments = [], [], []

        with open(lbl_path) as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) < 5: continue

                cid = int(parts[0])
                coords = list(map(float, parts[1:]))

                if len(coords) > 5:  # Segmentation format (polygon)
                    poly = np.array(coords).reshape(-1, 2)
                    poly[:, 0] *= w
                    poly[:, 1] *= h

                    x_min, y_min = poly.min(axis=0)
                    x_max, y_max = poly.max(axis=0)

                    boxes.append([x_min, y_min, x_max - x_min, y_max - y_min])
                    labels.append(cid)
                    segments.append(poly)

        if boxes:
            ann[name] = {
                'boxes': np.array(boxes),
                'labels': np.array(labels),
                'segments': segments,
                'image_path': img_path,
                'image_size': (h, w)
            }

    return ann

# Compute IoU for segmentation masks
def compute_mask_iou(mask1, mask2):
    """Compute IoU between two binary masks"""
    if mask1 is None or mask2 is None: return 0.0
    intersection = np.logical_and(mask1, mask2).sum()
    union = np.logical_or(mask1, mask2).sum()
    return intersection / union if union > 0 else 0.0

# Create mask from polygon
def polygon_to_mask(polygon, image_shape):
    """Convert polygon to binary mask"""
    if polygon is None: return None
    mask = np.zeros(image_shape[:2], dtype=np.uint8)
    if len(polygon) >= 3:
        poly_int = np.array(polygon, dtype=np.int32).reshape(-1, 1, 2)
        cv2.fillPoly(mask, [poly_int], 1)
    return mask.astype(bool)

# REVISED FUNCTION: Computes metrics for BOTH bbox and segm, including mAP@75
def compute_per_image_metrics_seg(gt_boxes, gt_labels, gt_segments,
                                  pred_boxes, pred_labels, pred_scores, pred_masks,
                                  num_classes, image_shape, iou_threshold=0.5, conf_threshold=0.5):
    """Computes per-image metrics for both bounding box and segmentation, including mAP@75."""
    metrics = {
        'map_bbox': 0., 'map_50_bbox': 0., 'map_75_bbox': 0.,
        'map_segm': 0., 'map_50_segm': 0., 'map_75_segm': 0.,
        'mask_iou': 0., 'dice_score': 0.
    }
    for c in range(num_classes):
        metrics[f'map_class_bbox{c}'] = 0.
        metrics[f'map_class_segm{c}'] = 0.
        metrics[f'mask_iou_class{c}'] = 0.

    def calculate_ap(iou_matrix, num_gt, num_pred, iou_thresh):
        if num_gt == 0 or num_pred == 0: return 0.0
        
        used_gt, tp = set(), 0
        precisions, recalls = [], []
        
        for i in range(num_pred):
            best_iou, best_j = iou_thresh - 1e-6, -1
            if i < iou_matrix.shape[0]:
                for j, iou in enumerate(iou_matrix[i]):
                    if iou > best_iou and j not in used_gt:
                        best_iou, best_j = iou, j
            
            if best_j != -1:
                used_gt.add(best_j)
                tp += 1
            
            fp = (i + 1) - tp
            precisions.append(tp / (tp + fp))
            recalls.append(tp / num_gt)

        if not precisions: return 0.0
        
        recalls = np.concatenate(([0.], recalls, [1.]))
        precisions = np.concatenate(([0.], precisions, [0.]))
        for i in range(len(precisions) - 2, -1, -1):
            precisions[i] = max(precisions[i], precisions[i+1])
        
        indices = np.where(recalls[:-1] != recalls[1:])[0] + 1
        ap = np.sum((recalls[indices] - recalls[indices-1]) * precisions[indices])
        return ap

    try:
        if not len(pred_scores): return metrics
        keep = np.array(pred_scores) >= conf_threshold
        if not np.any(keep): return metrics
        pred_boxes = np.array(pred_boxes)[keep]
        pred_labels = np.array(pred_labels)[keep]
        pred_scores = np.array(pred_scores)[keep]
        filtered_pred_masks = [m for m, k in zip(pred_masks, keep) if k] if pred_masks else []

        if not len(gt_boxes): return metrics
        gt_boxes = np.array(gt_boxes)
        gt_labels = np.array(gt_labels)

        gt_xy = np.zeros_like(gt_boxes)
        gt_xy[:, 0:2] = gt_boxes[:, 0:2]
        gt_xy[:, 2:4] = gt_boxes[:, 0:2] + gt_boxes[:, 2:4]
        gt_masks = [polygon_to_mask(seg, image_shape) for seg in gt_segments]

        iou_thresholds = [round(x, 2) for x in np.arange(0.5, 1.0, 0.05)]
        ap_per_bbox = {t: [0.] * num_classes for t in iou_thresholds}
        ap_per_segm = {t: [0.] * num_classes for t in iou_thresholds}
        mask_iou_per_class = [0.] * num_classes
        
        for cls in range(num_classes):
            gt_idx = np.where(gt_labels == cls)[0]
            pred_idx = np.where(pred_labels == cls)[0]

            if not len(gt_idx) or not len(pred_idx): continue

            order = pred_idx[np.argsort(pred_scores[pred_idx])[::-1]]
            
            box_iou_mat = box_iou(torch.tensor(pred_boxes[order]), torch.tensor(gt_xy[gt_idx])).numpy()
            
            class_pred_masks = [filtered_pred_masks[i] for i in order if i < len(filtered_pred_masks)]
            class_gt_masks = [gt_masks[i] for i in gt_idx]
            mask_iou_mat = np.zeros((len(class_pred_masks), len(class_gt_masks)))
            for i, p_mask in enumerate(class_pred_masks):
                for j, g_mask in enumerate(class_gt_masks):
                    if p_mask is not None and g_mask is not None:
                        mask_iou_mat[i, j] = compute_mask_iou(p_mask, g_mask)

            for t in iou_thresholds:
                ap_per_bbox[t][cls] = calculate_ap(box_iou_mat, len(gt_idx), len(order), t)
                ap_per_segm[t][cls] = calculate_ap(mask_iou_mat, len(gt_idx), len(order), t)

            if mask_iou_mat.size > 0:
                best_matches = mask_iou_mat.max(axis=1)
                mask_iou_per_class[cls] = np.mean(best_matches) if len(best_matches) > 0 else 0.0

        def aggregate_map(ap_results):
            valid_classes = [c for c in range(num_classes) if any(ap_results[t][c] > 0 for t in iou_thresholds)]
            if not valid_classes: return 0., 0., 0.
            map50 = np.mean([ap_results[0.5][c] for c in valid_classes])
            map75 = np.mean([ap_results[0.75][c] for c in valid_classes])
            mean_ap = np.mean([np.mean([ap_results[t][c] for c in valid_classes]) for t in iou_thresholds])
            return mean_ap, map50, map75

        metrics['map_bbox'], metrics['map_50_bbox'], metrics['map_75_bbox'] = aggregate_map(ap_per_bbox)
        metrics['map_segm'], metrics['map_50_segm'], metrics['map_75_segm'] = aggregate_map(ap_per_segm)
        
        valid_mask_classes = [c for c in range(num_classes) if mask_iou_per_class[c] > 0]
        if valid_mask_classes:
            metrics['mask_iou'] = np.mean([mask_iou_per_class[c] for c in valid_mask_classes])
            metrics['dice_score'] = 2 * metrics['mask_iou'] / (1 + metrics['mask_iou']) if metrics['mask_iou'] > 0 else 0.0

        for c in range(num_classes):
            metrics[f'map_class_bbox{c}'] = np.mean([ap_per_bbox[t][c] for t in iou_thresholds])
            metrics[f'map_class_segm{c}'] = np.mean([ap_per_segm[t][c] for t in iou_thresholds])
            metrics[f'mask_iou_class{c}'] = mask_iou_per_class[c]
            
        return metrics

    except Exception as e:
        print(f"Error in per-image metrics: {e}")
        return metrics


# Save comparison images with segmentation
def save_yolo_seg_images(image, target, prediction, metrics, output_dir,
                         color_mapping, idx_to_class, img_name=None):
    try:
        os.makedirs(output_dir, exist_ok=True)
        img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        fig, axs = plt.subplots(1, 3, figsize=(24, 8), facecolor='white')
        fig.suptitle(f"Analysis for: {img_name}", fontsize=16)

        # Plot Original
        axs[0].imshow(img_rgb)
        axs[0].set_title("Original Image")
        axs[0].axis('off')

        # Plot Ground Truth
        axs[1].imshow(img_rgb)
        axs[1].set_title("Ground Truth")
        axs[1].axis('off')
        for seg, lbl in zip(target.get('segments', []), target.get('labels', [])):
            if seg is not None:
                col = color_mapping.get(int(lbl), [1, 0, 0])
                polygon = Polygon(seg.reshape(-1, 2), closed=True, facecolor=col, alpha=0.5,
                                  edgecolor=col, linewidth=2)
                axs[1].add_patch(polygon)

        # Plot Predictions
        axs[2].imshow(img_rgb)
        axs[2].set_title(f"Predictions (Segm mAP@50: {metrics.get('map_50_segm', 0):.3f})")
        axs[2].axis('off')

        pred_masks = prediction.get('masks', [])
        pred_labels = prediction.get('labels', [])
        pred_scores = prediction.get('scores', [])

        keep = pred_scores >= 0.5
        if any(keep) and pred_masks is not None and len(pred_masks) == len(keep):
            filtered_masks = [m for m, k in zip(pred_masks, keep) if k]
            filtered_labels = pred_labels[keep]
            for mask, lbl in zip(filtered_masks, filtered_labels):
                if mask is not None:
                    col = color_mapping.get(int(lbl), [1, 0, 0])
                    overlay = np.zeros_like(img_rgb)
                    # Ensure mask has the same H,W dimensions as the overlay
                    if mask.shape == overlay.shape[:2]:
                         overlay[mask] = [int(c * 255) for c in col]
                         axs[2].imshow(overlay, alpha=0.6)

        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        plt.savefig(os.path.join(output_dir, "comparison.png"), dpi=100)
        plt.close()

    except Exception as e:
        print(f"Error saving comparison images for {img_name}: {e}")

# Main Evaluation Orchestrator
def evaluate_yolo_seg_5fold(
        dataset_base_dir, weights_base_dir, colormap_path, base_results_dir, device="cuda",
        iou_threshold=0.5, conf_threshold=0.25
):
    class_names = [
        "11", "12", "13", "14", "15", "16", "17", "18", "21", "22", "23", "24", "25",
        "26", "27", "28", "31", "32", "33", "34", "35", "36", "37", "38", "41", "42",
        "43", "44", "45", "46", "47", "48", "Bridge", "Filling-Crown", "Implant"
    ]
    num_classes = len(class_names)
    c2i = {name: i for i, name in enumerate(class_names)}
    i2c = {i: name for i, name in enumerate(class_names)}
    cmap = load_colormap(colormap_path, c2i)

    all_results_summary = []

    for fold in range(5):
        print(f"\n===== Evaluating FOLD {fold} =====")
        fold_dir = os.path.join(dataset_base_dir, f"AKUDENTAL_YOLO_FOLD_{fold}")
        weights_path = os.path.join(weights_base_dir, f"yolo11l-seg_FOLD_{fold}", "weights", "best.pt")

        if not os.path.exists(weights_path):
            print(f"  Weights not found, skipping: {weights_path}")
            continue

        try:
            model = YOLO(weights_path).to(device)
            print(f"  Loaded model from: {weights_path}")
        except Exception as e:
            print(f"  Error loading model: {e}")
            continue

        res_dir = os.path.join(base_results_dir, f"eval_fold_{fold}")
        os.makedirs(res_dir, exist_ok=True)

        for split in ("val", "test"):
            print(f"\n  Processing '{split}' split...")
            anns = load_annotations_seg(fold_dir, split)
            if not anns:
                print(f"    No annotations found for '{split}'.")
                continue

            print(f"    Found {len(anns)} annotated images.")
            per_image_results = []

            img_out_dir = os.path.join(res_dir, split, "images")
            
            for name, ann in tqdm(anns.items(), desc=f"  {split} progress"):
                img = cv2.imread(ann["image_path"])
                if img is None: continue

                try:
                    preds = model.predict(ann["image_path"], conf=conf_threshold, verbose=False)[0]

                    pb = preds.boxes.xyxy.cpu().numpy()
                    pl = preds.boxes.cls.cpu().numpy().astype(int)
                    ps = preds.boxes.conf.cpu().numpy()

                    pred_masks = []
                    if preds.masks is not None:
                        mask_data = preds.masks.data.cpu().numpy()
                        orig_h, orig_w = ann["image_size"]
                        for mask in mask_data:
                            resized_mask = cv2.resize(mask, (orig_w, orig_h), interpolation=cv2.INTER_NEAREST)
                            pred_masks.append(resized_mask > 0.5)

                    met_img = compute_per_image_metrics_seg(
                        ann["boxes"], ann["labels"], ann["segments"],
                        pb, pl, ps, pred_masks,
                        num_classes, ann["image_size"], iou_threshold, conf_threshold
                    )
                    per_image_results.append({"image": name, **met_img})
                    
                    # Limit visualization to first 5 images of val split in first fold to save time
                    if fold == 0 and split == 'val' and len(per_image_results) <= 5:
                        out_img_subdir = os.path.join(img_out_dir, name)
                        save_yolo_seg_images(img, ann, {"boxes": pb, "labels": pl, "scores": ps, "masks": pred_masks},
                                             met_img, out_img_subdir, cmap, i2c, img_name=name)

                except Exception as e:
                    print(f"    Error processing {ann['image_path']}: {e}")

            if not per_image_results: continue

            df_per_image = pd.DataFrame(per_image_results)
            df_per_image.to_csv(os.path.join(res_dir, f"{split}_per_image_metrics.csv"), index=False)

            summary = {
                "fold": fold, "split": split, "num_images": len(per_image_results),
                "map_bbox_mean": df_per_image['map_bbox'].mean(),
                "map_50_bbox_mean": df_per_image['map_50_bbox'].mean(),
                "map_75_bbox_mean": df_per_image['map_75_bbox'].mean(),
                "map_bbox_std": df_per_image['map_bbox'].std(),
                "map_50_bbox_std": df_per_image['map_50_bbox'].std(),
                "map_75_bbox_std": df_per_image['map_75_bbox'].std(),
                "map_segm_mean": df_per_image['map_segm'].mean(),
                "map_50_segm_mean": df_per_image['map_50_segm'].mean(),
                "map_75_segm_mean": df_per_image['map_75_segm'].mean(),
                "map_segm_std": df_per_image['map_segm'].std(),
                "map_50_segm_std": df_per_image['map_50_segm'].std(),
                "map_75_segm_std": df_per_image['map_75_segm'].std(),
                "mask_iou_mean": df_per_image['mask_iou'].mean(),
                "dice_score_mean": df_per_image['dice_score'].mean(),
                "mask_iou_std": df_per_image['mask_iou'].std(),
            }
            all_results_summary.append(summary)

    # Final Aggregation and Reporting
    if all_results_summary:
        final_df = pd.DataFrame(all_results_summary)
        final_df.to_csv(os.path.join(base_results_dir, "summary_by_fold.csv"), index=False)

        agg_df = final_df.groupby('split').agg(['mean', 'std']).round(4)
        agg_df.to_csv(os.path.join(base_results_dir, "aggregated_summary.csv"))

        print("\n===== Aggregated Results (Mean and Std across folds) =====")
        print(agg_df)


if __name__ == "__main__":
    # --- Configuration ---
    # PLEASE UPDATE THESE PATHS
    dataset_base_dir = r"D:\dental\AKUDENTALlast"
    weights_base_dir = r"D:\dental\AKUDENTAL_5_Fold_SEG_Results_large"
    colormap_path = r"D:\dental\color_mapping.json"
    base_results_dir = r"D:\dental\evaluation_results_final_all_metrics"

    print("Starting YOLO Segmentation 5-Fold Evaluation...")
    print(f"Dataset: {dataset_base_dir}")
    print(f"Weights: {weights_base_dir}")
    print(f"Results Dir: {base_results_dir}")

    evaluate_yolo_seg_5fold(
        dataset_base_dir=dataset_base_dir,
        weights_base_dir=weights_base_dir,
        colormap_path=colormap_path,
        base_results_dir=base_results_dir,
        device="cuda" if torch.cuda.is_available() else "cpu",
        conf_threshold=0.5  # A confidence of 0.5 is a good default for mAP calculation
    )

    print("\n" + "=" * 50)
    print("Evaluation Complete!")
    print(f"All results saved to: {base_results_dir}")
    print("=" * 50)

Starting YOLO Segmentation 5-Fold Evaluation...
Dataset: D:\dental\AKUDENTALlast
Weights: D:\dental\AKUDENTAL_5_Fold_SEG_Results_large
Results Dir: D:\dental\evaluation_results_final_all_metrics

===== Evaluating FOLD 0 =====
  Loaded model from: D:\dental\AKUDENTAL_5_Fold_SEG_Results_large\yolo11l-seg_FOLD_0\weights\best.pt

  Processing 'val' split...
    Found 33 annotated images.


  val progress:   0%|          | 0/33 [00:00<?, ?it/s]


  Processing 'test' split...
    Found 34 annotated images.


  test progress:   0%|          | 0/34 [00:00<?, ?it/s]


===== Evaluating FOLD 1 =====
  Loaded model from: D:\dental\AKUDENTAL_5_Fold_SEG_Results_large\yolo11l-seg_FOLD_1\weights\best.pt

  Processing 'val' split...
    Found 33 annotated images.


  val progress:   0%|          | 0/33 [00:00<?, ?it/s]


  Processing 'test' split...
    Found 34 annotated images.


  test progress:   0%|          | 0/34 [00:00<?, ?it/s]


===== Evaluating FOLD 2 =====
  Loaded model from: D:\dental\AKUDENTAL_5_Fold_SEG_Results_large\yolo11l-seg_FOLD_2\weights\best.pt

  Processing 'val' split...
    Found 33 annotated images.


  val progress:   0%|          | 0/33 [00:00<?, ?it/s]


  Processing 'test' split...
    Found 34 annotated images.


  test progress:   0%|          | 0/34 [00:00<?, ?it/s]


===== Evaluating FOLD 3 =====
  Loaded model from: D:\dental\AKUDENTAL_5_Fold_SEG_Results_large\yolo11l-seg_FOLD_3\weights\best.pt

  Processing 'val' split...
    Found 33 annotated images.


  val progress:   0%|          | 0/33 [00:00<?, ?it/s]


  Processing 'test' split...
    Found 33 annotated images.


  test progress:   0%|          | 0/33 [00:00<?, ?it/s]


===== Evaluating FOLD 4 =====
  Loaded model from: D:\dental\AKUDENTAL_5_Fold_SEG_Results_large\yolo11l-seg_FOLD_4\weights\best.pt

  Processing 'val' split...
    Found 33 annotated images.


  val progress:   0%|          | 0/33 [00:00<?, ?it/s]


  Processing 'test' split...
    Found 33 annotated images.


  test progress:   0%|          | 0/33 [00:00<?, ?it/s]


===== Aggregated Results (Mean and Std across folds) =====
      fold         num_images         map_bbox_mean         map_50_bbox_mean  \
      mean     std       mean     std          mean     std             mean   
split                                                                          
test   2.0  1.5811       33.6  0.5477        0.7059  0.0150           0.9841   
val    2.0  1.5811       33.0  0.0000        0.7012  0.0194           0.9759   

              map_75_bbox_mean          ... map_50_segm_std          \
          std             mean     std  ...            mean     std   
split                                   ...                           
test   0.0055           0.8382  0.0280  ...          0.0533  0.0672   
val    0.0146           0.8275  0.0354  ...          0.0653  0.0637   

      map_75_segm_std         mask_iou_mean         dice_score_mean         \
                 mean     std          mean     std            mean    std   
split                      

In [48]:
import os
import cv2
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ultralytics import YOLO
import seaborn as sns
from tqdm.auto import tqdm
from matplotlib.patches import Rectangle, Polygon, Patch
import yaml
import json
import warnings
from pathlib import Path
import shutil

warnings.filterwarnings('ignore')

class UltralyticsYOLOEvaluator:
    def __init__(self, dataset_base_dir, weights_base_dir, colormap_path, 
                 base_results_dir, device="cuda", conf_threshold=0.25):
        self.dataset_base_dir = dataset_base_dir
        self.weights_base_dir = weights_base_dir
        self.colormap_path = colormap_path
        self.base_results_dir = base_results_dir
        self.device = device
        self.conf_threshold = conf_threshold
        
        # Class names for dental dataset
        self.class_names = [
            "11", "12", "13", "14", "15", "16", "17", "18", "21", "22", "23", "24", "25",
            "26", "27", "28", "31", "32", "33", "34", "35", "36", "37", "38", "41", "42",
            "43", "44", "45", "46", "47", "48", "Bridge", "Filling-Crown", "Implant"
        ]
        self.num_classes = len(self.class_names)
        self.class_to_idx = {name: i for i, name in enumerate(self.class_names)}
        self.idx_to_class = {i: name for i, name in enumerate(self.class_names)}
        
        self.color_mapping = self.load_colormap()
        
    def hex_to_rgb(self, hex_color):
        """Convert hex color to RGB"""
        hex_color = hex_color.lstrip('#')
        return [int(hex_color[i:i + 2], 16) for i in (0, 2, 4)]

    def load_colormap(self):
        """Load color mapping from JSON file"""
        try:
            with open(self.colormap_path, 'r') as f:
                cmap = json.load(f)
            
            normalized_cmap = {}
            for name, hex_col in cmap.items():
                if name.lower() == 'background' or name not in self.class_to_idx:
                    continue
                cid = self.class_to_idx[name]
                rgb = self.hex_to_rgb(hex_col)
                normalized_cmap[cid] = [c / 255.0 for c in rgb]
            
            # Fill missing classes with default colors
            if not normalized_cmap:
                defaults = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 1, 0], [1, 0, 1]]
                for cid in self.class_to_idx.values():
                    normalized_cmap[cid] = defaults[cid % len(defaults)]
                    
            return normalized_cmap
            
        except Exception as e:
            print(f"Warning: could not load colormap ({e}), using defaults.")
            defaults = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 1, 0], [1, 0, 1]]
            return {cid: defaults[cid % len(defaults)] for cid in self.class_to_idx.values()}

    def create_dataset_yaml(self, fold_dir, split):
        """Create a temporary dataset.yaml file for Ultralytics evaluation"""
        yaml_path = os.path.join(fold_dir, f"temp_{split}_dataset.yaml")
        
        dataset_config = {
            'path': fold_dir,
            'train': f'images/train',  # Not used in validation, but required
            'val': f'images/{split}',
            'test': f'images/{split}' if split == 'test' else None,
            'nc': self.num_classes,
            'names': self.class_names
        }
        
        # Remove None values
        dataset_config = {k: v for k, v in dataset_config.items() if v is not None}
        
        with open(yaml_path, 'w') as f:
            yaml.dump(dataset_config, f, default_flow_style=False)
            
        return yaml_path

    def evaluate_with_ultralytics(self, model, dataset_yaml_path, split='val'):
        """Use Ultralytics built-in validation method"""
        try:
            # Run validation using Ultralytics
            results = model.val(
                data=dataset_yaml_path,
                conf=self.conf_threshold,
                iou=0.5,
                device=self.device,
                save_json=True,
                save_hybrid=True,
                plots=True,
                verbose=True,
                split=split
            )
            
            return results
            
        except Exception as e:
            print(f"Error during Ultralytics validation: {e}")
            return None

    def extract_per_image_metrics(self, results, image_names):
        """Extract per-image metrics from Ultralytics results if available"""
        per_image_metrics = []
        
        try:
            # Get overall metrics
            metrics = results.results_dict if hasattr(results, 'results_dict') else {}
            
            # For each image, create a record
            for img_name in image_names:
                img_metrics = {
                    'image': img_name,
                    'map_50_95': metrics.get('metrics/mAP50-95(B)', 0.0),
                    'map_50': metrics.get('metrics/mAP50(B)', 0.0),
                    'map_75': metrics.get('metrics/mAP75(B)', 0.0),
                    'map_50_95_mask': metrics.get('metrics/mAP50-95(M)', 0.0),
                    'map_50_mask': metrics.get('metrics/mAP50(M)', 0.0),
                    'map_75_mask': metrics.get('metrics/mAP75(M)', 0.0),
                    'precision': metrics.get('metrics/precision(B)', 0.0),
                    'recall': metrics.get('metrics/recall(B)', 0.0),
                }
                per_image_metrics.append(img_metrics)
                
        except Exception as e:
            print(f"Error extracting per-image metrics: {e}")
            
        return per_image_metrics

    def save_individual_image_predictions(self, model, fold_dir, split, output_dir, max_images=10):
        """Save individual image predictions with visualizations"""
        images_dir = os.path.join(fold_dir, 'images', split)
        if not os.path.exists(images_dir):
            return []
            
        image_files = [f for f in os.listdir(images_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
        image_files = image_files[:max_images]  # Limit for demonstration
        
        individual_results = []
        
        for img_file in tqdm(image_files, desc=f"Processing individual images ({split})"):
            img_path = os.path.join(images_dir, img_file)
            img_name = os.path.splitext(img_file)[0]
            
            try:
                # Run prediction on single image
                results = model.predict(
                    source=img_path,
                    conf=self.conf_threshold,
                    save=False,
                    verbose=False
                )
                
                if results and len(results) > 0:
                    result = results[0]
                    
                    # Extract metrics for this image
                    img_metrics = {
                        'image': img_name,
                        'num_detections': len(result.boxes) if result.boxes is not None else 0,
                        'num_masks': len(result.masks.data) if result.masks is not None else 0,
                        'avg_confidence': float(result.boxes.conf.mean()) if result.boxes is not None and len(result.boxes.conf) > 0 else 0.0,
                        'max_confidence': float(result.boxes.conf.max()) if result.boxes is not None and len(result.boxes.conf) > 0 else 0.0,
                        'min_confidence': float(result.boxes.conf.min()) if result.boxes is not None and len(result.boxes.conf) > 0 else 0.0,
                    }
                    
                    # Add class-specific counts
                    if result.boxes is not None and len(result.boxes.cls) > 0:
                        classes, counts = np.unique(result.boxes.cls.cpu().numpy(), return_counts=True)
                        for cls_id, count in zip(classes, counts):
                            class_name = self.class_names[int(cls_id)]
                            img_metrics[f'count_{class_name}'] = int(count)
                    
                    individual_results.append(img_metrics)
                    
                    # Save visualization
                    self.save_prediction_visualization(result, img_path, output_dir, img_name)
                    
            except Exception as e:
                print(f"Error processing {img_file}: {e}")
                
        return individual_results

    def save_prediction_visualization(self, result, img_path, output_dir, img_name):
        """Save prediction visualization"""
        try:
            os.makedirs(os.path.join(output_dir, 'visualizations'), exist_ok=True)
            
            # Load original image
            img = cv2.imread(img_path)
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            
            fig, axes = plt.subplots(1, 2, figsize=(16, 8))
            
            # Original image
            axes[0].imshow(img_rgb)
            axes[0].set_title(f'Original: {img_name}')
            axes[0].axis('off')
            
            # Predictions
            axes[1].imshow(img_rgb)
            axes[1].set_title(f'Predictions (conf>{self.conf_threshold})')
            axes[1].axis('off')
            
            # Draw bounding boxes
            if result.boxes is not None and len(result.boxes) > 0:
                boxes = result.boxes.xyxy.cpu().numpy()
                classes = result.boxes.cls.cpu().numpy()
                scores = result.boxes.conf.cpu().numpy()
                
                for box, cls, score in zip(boxes, classes, scores):
                    if score >= self.conf_threshold:
                        x1, y1, x2, y2 = box
                        class_name = self.class_names[int(cls)]
                        color = self.color_mapping.get(int(cls), [1, 0, 0])
                        
                        rect = Rectangle((x1, y1), x2-x1, y2-y1, 
                                       linewidth=2, edgecolor=color, 
                                       facecolor='none')
                        axes[1].add_patch(rect)
                        
                        axes[1].text(x1, y1-5, f'{class_name}: {score:.2f}',
                                   bbox=dict(facecolor=color, alpha=0.7),
                                   fontsize=8, color='white')
            
            # Draw masks if available
            if result.masks is not None and len(result.masks.data) > 0:
                masks = result.masks.data.cpu().numpy()
                classes = result.boxes.cls.cpu().numpy()
                
                for mask, cls in zip(masks, classes):
                    if mask is not None:
                        color = self.color_mapping.get(int(cls), [1, 0, 0])
                        # Create colored overlay
                        colored_mask = np.zeros((*mask.shape, 3))
                        colored_mask[mask > 0.5] = color
                        axes[1].imshow(colored_mask, alpha=0.4)
            
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, 'visualizations', f'{img_name}_prediction.png'), 
                       dpi=150, bbox_inches='tight')
            plt.close()
            
        except Exception as e:
            print(f"Error saving visualization for {img_name}: {e}")

    def evaluate_fold(self, fold, per_image_evaluation=True, max_images_per_split=10):
        """Evaluate a single fold using Ultralytics methods"""
        print(f"\n===== Evaluating FOLD {fold} =====")
        
        fold_dir = os.path.join(self.dataset_base_dir, f"AKUDENTAL_YOLO_FOLD_{fold}")
        weights_path = os.path.join(self.weights_base_dir, f"yolo11l-seg_FOLD_{fold}", "weights", "best.pt")
        
        if not os.path.exists(weights_path):
            print(f"  Weights not found, skipping: {weights_path}")
            return None
            
        try:
            model = YOLO(weights_path).to(self.device)
            print(f"  Loaded model from: {weights_path}")
        except Exception as e:
            print(f"  Error loading model: {e}")
            return None
            
        fold_results = {'fold': fold}
        res_dir = os.path.join(self.base_results_dir, f"fold_{fold}")
        os.makedirs(res_dir, exist_ok=True)
        
        for split in ['val', 'test']:
            print(f"\n  Evaluating '{split}' split...")
            
            # Create temporary dataset YAML
            dataset_yaml = self.create_dataset_yaml(fold_dir, split)
            
            try:
                # Run Ultralytics evaluation
                results = self.evaluate_with_ultralytics(model, dataset_yaml, split)
                
                if results is not None:
                    # Extract metrics
                    metrics = results.results_dict if hasattr(results, 'results_dict') else {}
                    
                    split_results = {
                        f'{split}_map_50_95_bbox': metrics.get('metrics/mAP50-95(B)', 0.0),
                        f'{split}_map_50_bbox': metrics.get('metrics/mAP50(B)', 0.0),
                        f'{split}_map_75_bbox': metrics.get('metrics/mAP75(B)', 0.0),
                        f'{split}_map_50_95_mask': metrics.get('metrics/mAP50-95(M)', 0.0),
                        f'{split}_map_50_mask': metrics.get('metrics/mAP50(M)', 0.0),
                        f'{split}_map_75_mask': metrics.get('metrics/mAP75(M)', 0.0),
                        f'{split}_precision': metrics.get('metrics/precision(B)', 0.0),
                        f'{split}_recall': metrics.get('metrics/recall(B)', 0.0),
                    }
                    
                    fold_results.update(split_results)
                    
                    print(f"    {split.upper()} Results:")
                    print(f"      mAP@50-95 (bbox): {split_results[f'{split}_map_50_95_bbox']:.4f}")
                    print(f"      mAP@50 (bbox): {split_results[f'{split}_map_50_bbox']:.4f}")
                    print(f"      mAP@50-95 (mask): {split_results[f'{split}_map_50_95_mask']:.4f}")
                    print(f"      mAP@50 (mask): {split_results[f'{split}_map_50_mask']:.4f}")
                    
                    # Per-image evaluation if requested
                    if per_image_evaluation:
                        print(f"    Running per-image evaluation for {split}...")
                        individual_results = self.save_individual_image_predictions(
                            model, fold_dir, split, res_dir, max_images_per_split
                        )
                        
                        if individual_results:
                            df_individual = pd.DataFrame(individual_results)
                            df_individual.to_csv(
                                os.path.join(res_dir, f'{split}_per_image_results.csv'), 
                                index=False
                            )
                            print(f"    Saved {len(individual_results)} individual image results")
                
            except Exception as e:
                print(f"  Error evaluating {split}: {e}")
            finally:
                # Clean up temporary YAML file
                if os.path.exists(dataset_yaml):
                    os.remove(dataset_yaml)
        
        return fold_results

    def run_5fold_evaluation(self, per_image_evaluation=True, max_images_per_split=10):
        """Run evaluation on all 5 folds"""
        print("Starting YOLO Segmentation 5-Fold Evaluation with Ultralytics...")
        print(f"Dataset: {self.dataset_base_dir}")
        print(f"Weights: {self.weights_base_dir}")
        print(f"Results Dir: {self.base_results_dir}")
        print(f"Per-image evaluation: {per_image_evaluation}")
        
        os.makedirs(self.base_results_dir, exist_ok=True)
        
        all_fold_results = []
        
        for fold in range(5):
            fold_result = self.evaluate_fold(fold, per_image_evaluation, max_images_per_split)
            if fold_result is not None:
                all_fold_results.append(fold_result)
        
        if all_fold_results:
            # Save summary results
            df_summary = pd.DataFrame(all_fold_results)
            df_summary.to_csv(os.path.join(self.base_results_dir, 'fold_summary.csv'), index=False)
            
            # Calculate aggregated statistics
            numeric_cols = df_summary.select_dtypes(include=[np.number]).columns
            numeric_cols = [col for col in numeric_cols if col != 'fold']
            
            agg_stats = df_summary[numeric_cols].agg(['mean', 'std']).round(4)
            agg_stats.to_csv(os.path.join(self.base_results_dir, 'aggregated_results.csv'))
            
            print("\n" + "="*60)
            print("FINAL AGGREGATED RESULTS (Mean ± Std)")
            print("="*60)
            
            for col in numeric_cols:
                mean_val = agg_stats.loc['mean', col]
                std_val = agg_stats.loc['std', col]
                print(f"{col:30s}: {mean_val:.4f} ± {std_val:.4f}")
            
            print("="*60)
            print(f"Results saved to: {self.base_results_dir}")
            print("="*60)
        
        return all_fold_results


# Usage example
if __name__ == "__main__":
    # Configuration - UPDATE THESE PATHS
    config = {
        'dataset_base_dir': r"D:\dental\AKUDENTALlast",
        'weights_base_dir': r"D:\dental\AKUDENTAL_5_Fold_SEG_Results_large",
        'colormap_path': r"D:\dental\color_mapping.json",
        'base_results_dir': r"D:\dental\evaluation_results_ultralytics_native",
        'device': "cuda" if torch.cuda.is_available() else "cpu",
        'conf_threshold': 0.25
    }
    
    # Initialize evaluator
    evaluator = UltralyticsYOLOEvaluator(**config)
    
    # Run evaluation
    results = evaluator.run_5fold_evaluation(
        per_image_evaluation=True,    # Set to False to skip per-image analysis
        max_images_per_split=10       # Limit images for visualization (set higher as needed)
    )
    
    print("\nEvaluation Complete!")

Starting YOLO Segmentation 5-Fold Evaluation with Ultralytics...
Dataset: D:\dental\AKUDENTALlast
Weights: D:\dental\AKUDENTAL_5_Fold_SEG_Results_large
Results Dir: D:\dental\evaluation_results_ultralytics_native
Per-image evaluation: True

===== Evaluating FOLD 0 =====
  Loaded model from: D:\dental\AKUDENTAL_5_Fold_SEG_Results_large\yolo11l-seg_FOLD_0\weights\best.pt

  Evaluating 'val' split...
Ultralytics 8.3.203  Python-3.10.13 torch-2.2.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3090, 24575MiB)
YOLO11l-seg summary (fused): 203 layers, 27,611,577 parameters, 0 gradients, 142.0 GFLOPs
[34m[1mval: [0mFast image access  (ping: 0.00.0 ms, read: 2699.11192.2 MB/s, size: 790.4 KB)
[K[34m[1mval: [0mScanning D:\dental\AKUDENTALlast\AKUDENTAL_YOLO_FOLD_0\labels\val.cache... 33 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 33/33 33.0Kit/s 0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100% ━━

Processing individual images (val):   0%|          | 0/10 [00:00<?, ?it/s]

    Saved 10 individual image results

  Evaluating 'test' split...
Ultralytics 8.3.203  Python-3.10.13 torch-2.2.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3090, 24575MiB)
[34m[1mval: [0mFast image access  (ping: 0.00.0 ms, read: 3543.2436.8 MB/s, size: 719.9 KB)
[K[34m[1mval: [0mScanning D:\dental\AKUDENTALlast\AKUDENTAL_YOLO_FOLD_0\labels\test.cache... 34 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 34/34 37.3Kit/s 0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 3/3 0.2it/s 13.4s11.8s
                   all         34       1033      0.915      0.891      0.932      0.666      0.916      0.891       0.93      0.532
  11 - Central Incisor         31         31      0.975      0.968      0.981       0.69      0.975      0.968      0.981      0.553
  12 - Lateral Incisor         31         31      0.953      0.935      0.965      0.651      0.953      0.935      0.965  

Processing individual images (test):   0%|          | 0/10 [00:00<?, ?it/s]

    Saved 10 individual image results

===== Evaluating FOLD 1 =====
  Loaded model from: D:\dental\AKUDENTAL_5_Fold_SEG_Results_large\yolo11l-seg_FOLD_1\weights\best.pt

  Evaluating 'val' split...
Ultralytics 8.3.203  Python-3.10.13 torch-2.2.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3090, 24575MiB)
YOLO11l-seg summary (fused): 203 layers, 27,611,577 parameters, 0 gradients, 142.0 GFLOPs
[34m[1mval: [0mFast image access  (ping: 0.00.0 ms, read: 3560.8278.2 MB/s, size: 646.0 KB)
[K[34m[1mval: [0mScanning D:\dental\AKUDENTALlast\AKUDENTAL_YOLO_FOLD_1\labels\val.cache... 33 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 33/33 33.1Kit/s 0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 3/3 0.2it/s 12.6s11.6s
                   all         33       1028      0.945      0.933      0.955      0.685      0.934      0.923      0.943      0.541
  11 - Central Incisor         32   

Processing individual images (val):   0%|          | 0/10 [00:00<?, ?it/s]

    Saved 10 individual image results

  Evaluating 'test' split...
Ultralytics 8.3.203  Python-3.10.13 torch-2.2.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3090, 24575MiB)
[34m[1mval: [0mFast image access  (ping: 0.00.0 ms, read: 3042.2285.2 MB/s, size: 719.8 KB)
[K[34m[1mval: [0mScanning D:\dental\AKUDENTALlast\AKUDENTAL_YOLO_FOLD_1\labels\test.cache... 34 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 34/34  0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 3/3 0.2it/s 14.3s12.2s
                   all         34       1004      0.924       0.94      0.953       0.69      0.904       0.92      0.936      0.539
  11 - Central Incisor         34         34      0.938      0.884      0.963      0.729      0.906      0.855      0.914      0.548
  12 - Lateral Incisor         33         33          1      0.939       0.97      0.692          1      0.939       0.97      0.519

Processing individual images (test):   0%|          | 0/10 [00:00<?, ?it/s]

    Saved 10 individual image results

===== Evaluating FOLD 2 =====
  Loaded model from: D:\dental\AKUDENTAL_5_Fold_SEG_Results_large\yolo11l-seg_FOLD_2\weights\best.pt

  Evaluating 'val' split...
Ultralytics 8.3.203  Python-3.10.13 torch-2.2.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3090, 24575MiB)
YOLO11l-seg summary (fused): 203 layers, 27,611,577 parameters, 0 gradients, 142.0 GFLOPs
[34m[1mval: [0mFast image access  (ping: 0.00.0 ms, read: 3675.9721.6 MB/s, size: 649.2 KB)
[K[34m[1mval: [0mScanning D:\dental\AKUDENTALlast\AKUDENTAL_YOLO_FOLD_2\labels\val.cache... 33 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 33/33 33.0Kit/s 0.0s


KeyboardInterrupt: 

In [50]:
import os
import pandas as pd
import yaml
from ultralytics import YOLO
import torch
from pathlib import Path

def create_dataset_yaml(fold_dir, split):
    """Create temporary dataset.yaml for evaluation"""
    yaml_path = os.path.join(fold_dir, f"temp_{split}.yaml")
    
    # Dental dataset class names
    class_names = [
        "11", "12", "13", "14", "15", "16", "17", "18", "21", "22", "23", "24", "25",
        "26", "27", "28", "31", "32", "33", "34", "35", "36", "37", "38", "41", "42",
        "43", "44", "45", "46", "47", "48", "Bridge", "Filling-Crown", "Implant"
    ]
    
    dataset_config = {
        'path': fold_dir,
        'train': 'images/train',  # Required but not used
        'val': f'images/{split}',
        'nc': len(class_names),
        'names': class_names
    }
    
    with open(yaml_path, 'w') as f:
        yaml.dump(dataset_config, f, default_flow_style=False)
    
    return yaml_path

def evaluate_fold_fixed(fold, dataset_base_dir, weights_base_dir, device="cuda"):
    """Evaluate one fold using correct metric access"""
    
    fold_dir = os.path.join(dataset_base_dir, f"AKUDENTAL_YOLO_FOLD_{fold}")
    weights_path = os.path.join(weights_base_dir, f"yolo11l-seg_FOLD_{fold}", "weights", "best.pt")
    
    if not os.path.exists(weights_path):
        print(f"Weights not found for fold {fold}: {weights_path}")
        return None
    
    try:
        model = YOLO(weights_path).to(device)
        print(f"Evaluating Fold {fold}...")
    except Exception as e:
        print(f"Error loading model for fold {fold}: {e}")
        return None
    
    results = {'fold': fold}
    
    # Evaluate both val and test splits
    for split in ['val', 'test']:
        dataset_yaml = create_dataset_yaml(fold_dir, split)
        
        try:
            # Run evaluation
            metrics = model.val(
                data=dataset_yaml,
                conf=0.25,
                iou=0.5,
                device=device,
                verbose=False,
                plots=False,
                save=False
            )
            
            # CORRECT way to access metrics according to Ultralytics docs
            # For bounding boxes
            bbox_map50_95 = float(metrics.box.map) if metrics.box.map is not None else 0.0
            bbox_map50 = float(metrics.box.map50) if metrics.box.map50 is not None else 0.0  
            bbox_map75 = float(metrics.box.map75) if metrics.box.map75 is not None else 0.0
            
            # For segmentation masks (if available)
            mask_map50_95 = float(metrics.seg.map) if hasattr(metrics, 'seg') and metrics.seg.map is not None else 0.0
            mask_map50 = float(metrics.seg.map50) if hasattr(metrics, 'seg') and metrics.seg.map50 is not None else 0.0
            mask_map75 = float(metrics.seg.map75) if hasattr(metrics, 'seg') and metrics.seg.map75 is not None else 0.0
            
            results[f'{split}_mAP_50_bbox'] = bbox_map50
            results[f'{split}_mAP_75_bbox'] = bbox_map75
            results[f'{split}_mAP_50_95_bbox'] = bbox_map50_95
            results[f'{split}_mAP_50_mask'] = mask_map50
            results[f'{split}_mAP_75_mask'] = mask_map75
            results[f'{split}_mAP_50_95_mask'] = mask_map50_95
            
            print(f"  {split.upper()}: mAP@50(bbox)={bbox_map50:.4f}, mAP@75(bbox)={bbox_map75:.4f}, "
                  f"mAP@50(mask)={mask_map50:.4f}, mAP@75(mask)={mask_map75:.4f}")
                
        except Exception as e:
            print(f"  Error evaluating {split}: {e}")
        finally:
            # Cleanup
            if os.path.exists(dataset_yaml):
                os.remove(dataset_yaml)
    
    return results

def run_fixed_5fold_evaluation(dataset_base_dir, weights_base_dir, output_dir):
    """Run corrected evaluation on all 5 folds"""
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    print(f"Dataset: {dataset_base_dir}")
    print(f"Weights: {weights_base_dir}")
    print("="*60)
    
    all_results = []
    
    # Evaluate each fold
    for fold in range(5):
        fold_result = evaluate_fold_fixed(fold, dataset_base_dir, weights_base_dir, device)
        if fold_result:
            all_results.append(fold_result)
    
    if not all_results:
        print("No results obtained!")
        return
    
    # Create DataFrame and save results
    df = pd.DataFrame(all_results)
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(os.path.join(output_dir, 'corrected_fold_results.csv'), index=False)
    
    # Calculate averages
    numeric_cols = [col for col in df.columns if col != 'fold']
    averages = df[numeric_cols].mean()
    stds = df[numeric_cols].std()
    
    # Create summary
    summary = []
    for col in numeric_cols:
        summary.append({
            'metric': col,
            'mean': averages[col],
            'std': stds[col]
        })
    
    summary_df = pd.DataFrame(summary)
    summary_df.to_csv(os.path.join(output_dir, 'corrected_summary.csv'), index=False)
    
    # Print results
    print("\n" + "="*60)
    print("CORRECTED RESULTS - AVERAGE mAP SCORES ACROSS 5 FOLDS")
    print("="*60)
    
    # Group by split and metric type
    splits = ['val', 'test']
    metric_types = ['mAP_50_bbox', 'mAP_75_bbox', 'mAP_50_95_bbox', 
                   'mAP_50_mask', 'mAP_75_mask', 'mAP_50_95_mask']
    
    for split in splits:
        print(f"\n{split.upper()} SET:")
        print("-" * 30)
        print("  Bounding Box:")
        for metric in ['mAP_50_bbox', 'mAP_75_bbox', 'mAP_50_95_bbox']:
            col_name = f"{split}_{metric}"
            if col_name in averages:
                print(f"    {metric:12s}: {averages[col_name]:.4f} ± {stds[col_name]:.4f}")
        
        print("  Segmentation:")  
        for metric in ['mAP_50_mask', 'mAP_75_mask', 'mAP_50_95_mask']:
            col_name = f"{split}_{metric}"
            if col_name in averages:
                print(f"    {metric:12s}: {averages[col_name]:.4f} ± {stds[col_name]:.4f}")
    
    print("\n" + "="*60)
    print(f"Corrected results saved to: {output_dir}")
    print("="*60)
    
    return df, summary_df

# Test single fold function
def test_single_fold_metrics(fold, dataset_base_dir, weights_base_dir):
    """Test metric extraction on a single fold for debugging"""
    
    fold_dir = os.path.join(dataset_base_dir, f"AKUDENTAL_YOLO_FOLD_{fold}")
    weights_path = os.path.join(weights_base_dir, f"yolo11l-seg_FOLD_{fold}", "weights", "best.pt")
    
    if not os.path.exists(weights_path):
        print(f"Weights not found: {weights_path}")
        return
        
    model = YOLO(weights_path)
    dataset_yaml = create_dataset_yaml(fold_dir, 'val')
    
    try:
        print(f"Testing metrics extraction for fold {fold}...")
        metrics = model.val(data=dataset_yaml, verbose=True)
        
        print(f"\nAvailable attributes in metrics object:")
        print(f"- hasattr 'box': {hasattr(metrics, 'box')}")
        print(f"- hasattr 'seg': {hasattr(metrics, 'seg')}")
        
        if hasattr(metrics, 'box'):
            print(f"Box metrics:")
            print(f"  - map (50:95): {metrics.box.map}")
            print(f"  - map50: {metrics.box.map50}")  
            print(f"  - map75: {metrics.box.map75}")
            print(f"  - maps: {metrics.box.maps}")
        
        if hasattr(metrics, 'seg'):
            print(f"Segmentation metrics:")
            print(f"  - map (50:95): {metrics.seg.map}")
            print(f"  - map50: {metrics.seg.map50}")
            print(f"  - map75: {metrics.seg.map75}")
        
        # Also check results_dict for comparison
        if hasattr(metrics, 'results_dict'):
            print(f"\nResults dict keys: {list(metrics.results_dict.keys())}")
            
    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
    finally:
        if os.path.exists(dataset_yaml):
            os.remove(dataset_yaml)

# Usage
if __name__ == "__main__":
    # UPDATE THESE PATHS
    dataset_base_dir = r"D:\dental\AKUDENTALlast"
    weights_base_dir = r"D:\dental\AKUDENTAL_5_Fold_SEG_Results_large"
    output_dir = r"D:\dental\corrected_eval_results"
    
    # First, test single fold to verify metrics are working
    print("Testing single fold first...")
    test_single_fold_metrics(0, dataset_base_dir, weights_base_dir)
    
    # Then run full evaluation
    print("\n" + "="*60)
    print("Running full 5-fold evaluation...")
    results = run_fixed_5fold_evaluation(
        dataset_base_dir=dataset_base_dir,
        weights_base_dir=weights_base_dir,
        output_dir=output_dir
    )
    
    print("\n✅ Corrected evaluation complete!")

Testing single fold first...
Testing metrics extraction for fold 0...
Ultralytics 8.3.203  Python-3.10.13 torch-2.2.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3090, 24575MiB)
YOLO11l-seg summary (fused): 203 layers, 27,611,577 parameters, 0 gradients, 142.0 GFLOPs
[34m[1mval: [0mFast image access  (ping: 0.00.0 ms, read: 2421.1368.0 MB/s, size: 664.5 KB)
[K[34m[1mval: [0mScanning D:\dental\AKUDENTALlast\AKUDENTAL_YOLO_FOLD_0\labels\val.cache... 33 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 33/33 33.1Kit/s 0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 3/3 1.2it/s 2.5s1.6ss
                   all         33        950      0.886      0.863        0.9      0.609      0.844      0.809      0.846      0.474
  11 - Central Incisor         30         30      0.873          1      0.992      0.638      0.847      0.967      0.971      0.502
  12 - Lateral Incisor         31

In [45]:
import pandas as pd
import glob
from scipy import stats
import os
import re

def load_yolo_data_by_fold(base_path, split_name='test'):
    """
    Loads YOLO's fold-by-fold CSVs, extracts the fold number from the
    file path, adds it as a 'fold' column, and combines them.
    """
    search_pattern = os.path.join(base_path, f"eval_fold_*", f"{split_name}_per_image_metrics.csv")
    csv_files = glob.glob(search_pattern)

    if not csv_files:
        print(f"Warning: No YOLO CSV files found for pattern: {search_pattern}")
        return pd.DataFrame()

    print(f"Found {len(csv_files)} fold-specific CSV files for YOLO on '{split_name}' split.")

    all_dfs = []
    for f in csv_files:
        try:
            fold_dir_name = os.path.basename(os.path.dirname(f))
            fold_number = int(re.search(r'\d+', fold_dir_name).group())
        except (AttributeError, IndexError):
            print(f"Warning: Could not extract fold number from path '{f}'. Skipping this file.")
            continue

        df = pd.read_csv(f)
        df['fold'] = fold_number
        all_dfs.append(df)

    if not all_dfs:
        return pd.DataFrame()

    combined_df = pd.concat(all_dfs, ignore_index=True)
    return combined_df

def run_comparison(yolo_path, mrcnn_test_path, mrcnn_val_path, metrics_to_test):
    """
    Main function to load, merge, and compare model results for both val and test splits.
    """
    for split in ['val', 'test']:
        print("\n" + "="*50)
        print(f"🚀 STARTING ANALYSIS FOR '{split.upper()}' SPLIT")
        print("="*50)

        # --- 1. Load Data ---
        df_yolo = load_yolo_data_by_fold(yolo_path, split)
        
        mrcnn_csv_path = mrcnn_val_path if split == 'val' else mrcnn_test_path
        
        print(f"\nLoading merged Mask R-CNN data from: {mrcnn_csv_path}")
        try:
            df_maskrcnn = pd.read_csv(mrcnn_csv_path)
        except FileNotFoundError:
            print(f"ERROR: Mask R-CNN file not found for '{split}' split at '{mrcnn_csv_path}'. Skipping split.")
            continue

        # --- 2. Validate and Merge Data ---
        if df_yolo.empty or df_maskrcnn.empty:
            print(f"\nData for '{split}' split is missing for one or both models. Skipping comparison.")
            continue
        
        if 'image_key' in df_maskrcnn.columns:
            df_maskrcnn.rename(columns={'image_key': 'image'}, inplace=True)
        
        if 'fold' not in df_maskrcnn.columns or 'image' not in df_maskrcnn.columns:
            print(f"\nCRITICAL ERROR: Mask R-CNN CSV for '{split}' is missing 'image' or 'fold' column. Skipping.")
            continue

        try:
            df_yolo['fold'] = df_yolo['fold'].astype(int)
            df_yolo['image'] = df_yolo['image'].astype(str)
            df_maskrcnn['fold'] = df_maskrcnn['fold'].astype(int)
            df_maskrcnn['image'] = df_maskrcnn['image'].astype(str)
            df_maskrcnn['image'] = df_maskrcnn['image'].str.replace(r'\.\w+$', '', regex=True)
        except Exception as e:
            print(f"\nERROR converting data types for '{split}' split: {e}. Skipping.")
            continue

        merged_df = pd.merge(df_yolo, df_maskrcnn, on=['image', 'fold'], suffixes=('_yolo', '_mrcnn'))
        print(f"\nSuccessfully merged {len(merged_df)} common image/fold pairs for comparison on '{split}' split.")

        # --- 3. Perform and Print Paired T-Tests ---
        if len(merged_df) > 0:
            print(f"\n--- Paired T-Test Results for '{split.upper()}' Split ---")
            for metric in metrics_to_test:
                metric_yolo = f'{metric}_yolo'
                metric_mrcnn = f'{metric}_mrcnn'

                if metric_yolo in merged_df.columns and metric_mrcnn in merged_df.columns:
                    valid_pairs = merged_df[[metric_yolo, metric_mrcnn]].dropna()
                    if len(valid_pairs) < 2:
                        print(f"\nMetric: {metric} - SKIPPED (Not enough valid data points)")
                        continue

                    t_statistic, p_value = stats.ttest_rel(valid_pairs[metric_yolo], valid_pairs[metric_mrcnn])

                    print(f"\nMetric: {metric}")
                    print(f"  - Mean YOLOv11-seg:  {valid_pairs[metric_yolo].mean():.4f}")
                    print(f"  - Mean Mask R-CNN:   {valid_pairs[metric_mrcnn].mean():.4f}")
                    print(f"  - T-statistic:       {t_statistic:.4f}")
                    print(f"  - P-value:           {p_value:.4f}")

                    if p_value < 0.05:
                        winner = "YOLOv11-seg" if valid_pairs[metric_yolo].mean() > valid_pairs[metric_mrcnn].mean() else "Mask R-CNN"
                        print(f"  - ✅ Result: The difference is statistically significant. The {winner} model is better.")
                    else:
                        print(f"  - ❌ Result: The difference is not statistically significant.")
                else:
                    print(f"\nMetric: {metric} - SKIPPED (column not found in both datasets)")
        else:
            print("\nNo common images found to compare after merging.")

# --- Main Execution ---
if __name__ == "__main__":
    # 1. DEFINE YOUR PATHS AND METRICS
    # ------------------------------------
    yolo_base_path = r"D:\dental\evaluation_results_final_all_metrics"
    
    # **IMPORTANT**: Provide the full paths to BOTH Mask R-CNN files
    mrcnn_test_csv = r"D:\dental\5fold_experiment_fixed\evaluation_results\new_test_per_image_metrics.csv"
    mrcnn_val_csv = r"D:\dental\5fold_experiment_fixed\evaluation_results\validation_per_image_metrics.csv"

    metrics = ['map_50_bbox', 'map_segm', 'map_50_segm', 'map_75_segm']

    # 2. RUN THE COMPARISON
    # ------------------------------------
    run_comparison(yolo_base_path, mrcnn_test_csv, mrcnn_val_csv, metrics)


🚀 STARTING ANALYSIS FOR 'VAL' SPLIT
Found 5 fold-specific CSV files for YOLO on 'val' split.

Loading merged Mask R-CNN data from: D:\dental\5fold_experiment_fixed\evaluation_results\validation_per_image_metrics.csv

Successfully merged 13 common image/fold pairs for comparison on 'val' split.

--- Paired T-Test Results for 'VAL' Split ---

Metric: map_50_bbox - SKIPPED (column not found in both datasets)

Metric: map_segm
  - Mean YOLOv11-seg:  0.3989
  - Mean Mask R-CNN:   0.6559
  - T-statistic:       -8.8196
  - P-value:           0.0000
  - ✅ Result: The difference is statistically significant. The Mask R-CNN model is better.

Metric: map_50_segm
  - Mean YOLOv11-seg:  0.9380
  - Mean Mask R-CNN:   0.8952
  - T-statistic:       1.4348
  - P-value:           0.1769
  - ❌ Result: The difference is not statistically significant.

Metric: map_75_segm
  - Mean YOLOv11-seg:  0.2520
  - Mean Mask R-CNN:   0.8165
  - T-statistic:       -13.3511
  - P-value:           0.0000
  - ✅ Result:

In [53]:
import os
import cv2
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ultralytics import YOLO
import seaborn as sns
from tqdm.auto import tqdm
from matplotlib.patches import Rectangle, Polygon, Patch
import yaml
import json
import warnings
from pathlib import Path
import shutil

warnings.filterwarnings('ignore')

class UltralyticsYOLOEvaluator:
    def __init__(self, dataset_base_dir, weights_base_dir, colormap_path, 
                 base_results_dir, device="cuda", conf_threshold=0.25):
        self.dataset_base_dir = dataset_base_dir
        self.weights_base_dir = weights_base_dir
        self.colormap_path = colormap_path
        self.base_results_dir = base_results_dir
        self.device = device
        self.conf_threshold = conf_threshold
        
        # Class names for dental dataset
        self.class_names = [
            "11", "12", "13", "14", "15", "16", "17", "18", "21", "22", "23", "24", "25",
            "26", "27", "28", "31", "32", "33", "34", "35", "36", "37", "38", "41", "42",
            "43", "44", "45", "46", "47", "48", "Bridge", "Filling-Crown", "Implant"
        ]
        self.num_classes = len(self.class_names)
        self.class_to_idx = {name: i for i, name in enumerate(self.class_names)}
        self.idx_to_class = {i: name for i, name in enumerate(self.class_names)}
        
        self.color_mapping = self.load_colormap()
        
    def hex_to_rgb(self, hex_color):
        """Convert hex color to RGB"""
        hex_color = hex_color.lstrip('#')
        return [int(hex_color[i:i + 2], 16) for i in (0, 2, 4)]

    def load_colormap(self):
        """Load color mapping from JSON file"""
        try:
            with open(self.colormap_path, 'r') as f:
                cmap = json.load(f)
            
            normalized_cmap = {}
            for name, hex_col in cmap.items():
                if name.lower() == 'background' or name not in self.class_to_idx:
                    continue
                cid = self.class_to_idx[name]
                rgb = self.hex_to_rgb(hex_col)
                normalized_cmap[cid] = [c / 255.0 for c in rgb]
            
            # Fill missing classes with default colors
            if not normalized_cmap:
                defaults = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 1, 0], [1, 0, 1]]
                for cid in self.class_to_idx.values():
                    normalized_cmap[cid] = defaults[cid % len(defaults)]
                    
            return normalized_cmap
            
        except Exception as e:
            print(f"Warning: could not load colormap ({e}), using defaults.")
            defaults = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 1, 0], [1, 0, 1]]
            return {cid: defaults[cid % len(defaults)] for cid in self.class_to_idx.values()}

    def create_dataset_yaml(self, fold_dir, split):
        """Create a temporary dataset.yaml file for Ultralytics evaluation"""
        yaml_path = os.path.join(fold_dir, f"temp_{split}_dataset.yaml")
        
        dataset_config = {
            'path': fold_dir,
            'train': f'images/train',  # Not used in validation, but required
            'val': f'images/{split}',
            'test': f'images/{split}' if split == 'test' else None,
            'nc': self.num_classes,
            'names': self.class_names
        }
        
        # Remove None values
        dataset_config = {k: v for k, v in dataset_config.items() if v is not None}
        
        with open(yaml_path, 'w') as f:
            yaml.dump(dataset_config, f, default_flow_style=False)
            
        return yaml_path

    def evaluate_with_ultralytics(self, model, dataset_yaml_path, split='val'):
        """Use Ultralytics built-in validation method"""
        try:
            # Run validation using Ultralytics
            results = model.val(
                data=dataset_yaml_path,
                conf=self.conf_threshold,
                iou=0.5,
                device=self.device,
                save_json=True,
                save_hybrid=True,
                plots=True,
                verbose=True,
                split=split
            )
            
            return results
            
        except Exception as e:
            print(f"Error during Ultralytics validation: {e}")
            return None

    def extract_per_class_metrics(self, results):
        """Extract per-class metrics from Ultralytics results"""
        per_class_metrics = []
        
        try:
            # Extract box metrics
            box_maps = None
            box_map50 = None
            box_map75 = None
            box_precision = None
            box_recall = None
            
            if hasattr(results, 'box'):
                box_maps = results.box.maps if hasattr(results.box, 'maps') else None
                box_map50 = results.box.ap50 if hasattr(results.box, 'ap50') else None
                box_map75 = results.box.ap75 if hasattr(results.box, 'ap75') else None
                box_precision = results.box.p if hasattr(results.box, 'p') else None
                box_recall = results.box.r if hasattr(results.box, 'r') else None
            
            # Extract segmentation metrics (using 'seg' attribute, not 'mask')
            seg_maps = None
            seg_map50 = None
            seg_map75 = None
            seg_precision = None
            seg_recall = None
            
            if hasattr(results, 'seg'):
                seg_maps = results.seg.maps if hasattr(results.seg, 'maps') else None
                seg_map50 = results.seg.ap50 if hasattr(results.seg, 'ap50') else None
                seg_map75 = results.seg.ap75 if hasattr(results.seg, 'ap75') else None
                seg_precision = results.seg.p if hasattr(results.seg, 'p') else None
                seg_recall = results.seg.r if hasattr(results.seg, 'r') else None
            
            # Create per-class records
            for class_idx, class_name in enumerate(self.class_names):
                class_metrics = {
                    'class_id': class_idx,
                    'class_name': class_name,
                    'bbox_map50_95': float(box_maps[class_idx]) if box_maps is not None and class_idx < len(box_maps) else 0.0,
                    'bbox_map50': float(box_map50[class_idx]) if box_map50 is not None and class_idx < len(box_map50) else 0.0,
                    'bbox_map75': float(box_map75[class_idx]) if box_map75 is not None and class_idx < len(box_map75) else 0.0,
                    'bbox_precision': float(box_precision[class_idx]) if box_precision is not None and class_idx < len(box_precision) else 0.0,
                    'bbox_recall': float(box_recall[class_idx]) if box_recall is not None and class_idx < len(box_recall) else 0.0,
                    'mask_map50_95': float(seg_maps[class_idx]) if seg_maps is not None and class_idx < len(seg_maps) else 0.0,
                    'mask_map50': float(seg_map50[class_idx]) if seg_map50 is not None and class_idx < len(seg_map50) else 0.0,
                    'mask_map75': float(seg_map75[class_idx]) if seg_map75 is not None and class_idx < len(seg_map75) else 0.0,
                    'mask_precision': float(seg_precision[class_idx]) if seg_precision is not None and class_idx < len(seg_precision) else 0.0,
                    'mask_recall': float(seg_recall[class_idx]) if seg_recall is not None and class_idx < len(seg_recall) else 0.0,
                }
                per_class_metrics.append(class_metrics)
            
            print(f"    Successfully extracted metrics for {len(per_class_metrics)} classes")
            if seg_maps is not None:
                print(f"    Mask metrics available: mAP@50-95 range [{seg_maps.min():.4f}, {seg_maps.max():.4f}]")
                    
        except Exception as e:
            print(f"    Error extracting per-class metrics: {e}")
            import traceback
            traceback.print_exc()
            
        return per_class_metrics

    def calculate_total_averages(self, per_class_df):
        """Calculate total averages from per-class metrics"""
        numeric_cols = per_class_df.select_dtypes(include=[np.number]).columns
        numeric_cols = [col for col in numeric_cols if col not in ['class_id']]
        
        averages = {}
        for col in numeric_cols:
            # Filter out zero values for more accurate averaging (optional)
            non_zero_values = per_class_df[col][per_class_df[col] > 0]
            if len(non_zero_values) > 0:
                averages[f'{col}_mean'] = float(non_zero_values.mean())
                averages[f'{col}_std'] = float(non_zero_values.std())
            else:
                averages[f'{col}_mean'] = 0.0
                averages[f'{col}_std'] = 0.0
            
            # Also include overall mean (including zeros)
            averages[f'{col}_overall_mean'] = float(per_class_df[col].mean())
        
        return averages

    def extract_per_image_metrics(self, results, image_names):
        """Extract per-image metrics from Ultralytics results if available"""
        per_image_metrics = []
        
        try:
            # Get overall metrics
            metrics = results.results_dict if hasattr(results, 'results_dict') else {}
            
            # For each image, create a record
            for img_name in image_names:
                img_metrics = {
                    'image': img_name,
                    'map_50_95': metrics.get('metrics/mAP50-95(B)', 0.0),
                    'map_50': metrics.get('metrics/mAP50(B)', 0.0),
                    'map_75': metrics.get('metrics/mAP75(B)', 0.0),
                    'map_50_95_mask': metrics.get('metrics/mAP50-95(M)', 0.0),
                    'map_50_mask': metrics.get('metrics/mAP50(M)', 0.0),
                    'map_75_mask': metrics.get('metrics/mAP75(M)', 0.0),
                    'precision': metrics.get('metrics/precision(B)', 0.0),
                    'recall': metrics.get('metrics/recall(B)', 0.0),
                }
                per_image_metrics.append(img_metrics)
                
        except Exception as e:
            print(f"Error extracting per-image metrics: {e}")
            
        return per_image_metrics

    def save_individual_image_predictions(self, model, fold_dir, split, output_dir, max_images=10):
        """Save individual image predictions with visualizations"""
        images_dir = os.path.join(fold_dir, 'images', split)
        if not os.path.exists(images_dir):
            return []
            
        image_files = [f for f in os.listdir(images_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
        image_files = image_files[:max_images]  # Limit for demonstration
        
        individual_results = []
        
        for img_file in tqdm(image_files, desc=f"Processing individual images ({split})"):
            img_path = os.path.join(images_dir, img_file)
            img_name = os.path.splitext(img_file)[0]
            
            try:
                # Run prediction on single image
                results = model.predict(
                    source=img_path,
                    conf=self.conf_threshold,
                    save=False,
                    verbose=False
                )
                
                if results and len(results) > 0:
                    result = results[0]
                    
                    # Extract metrics for this image
                    img_metrics = {
                        'image': img_name,
                        'num_detections': len(result.boxes) if result.boxes is not None else 0,
                        'num_masks': len(result.masks.data) if result.masks is not None else 0,
                        'avg_confidence': float(result.boxes.conf.mean()) if result.boxes is not None and len(result.boxes.conf) > 0 else 0.0,
                        'max_confidence': float(result.boxes.conf.max()) if result.boxes is not None and len(result.boxes.conf) > 0 else 0.0,
                        'min_confidence': float(result.boxes.conf.min()) if result.boxes is not None and len(result.boxes.conf) > 0 else 0.0,
                    }
                    
                    # Add class-specific counts
                    if result.boxes is not None and len(result.boxes.cls) > 0:
                        classes, counts = np.unique(result.boxes.cls.cpu().numpy(), return_counts=True)
                        for cls_id, count in zip(classes, counts):
                            class_name = self.class_names[int(cls_id)]
                            img_metrics[f'count_{class_name}'] = int(count)
                    
                    individual_results.append(img_metrics)
                    
                    # Save visualization
                    self.save_prediction_visualization(result, img_path, output_dir, img_name)
                    
            except Exception as e:
                print(f"Error processing {img_file}: {e}")
                
        return individual_results

    def save_prediction_visualization(self, result, img_path, output_dir, img_name):
        """Save prediction visualization"""
        try:
            os.makedirs(os.path.join(output_dir, 'visualizations'), exist_ok=True)
            
            # Load original image
            img = cv2.imread(img_path)
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            
            fig, axes = plt.subplots(1, 2, figsize=(16, 8))
            
            # Original image
            axes[0].imshow(img_rgb)
            axes[0].set_title(f'Original: {img_name}')
            axes[0].axis('off')
            
            # Predictions
            axes[1].imshow(img_rgb)
            axes[1].set_title(f'Predictions (conf>{self.conf_threshold})')
            axes[1].axis('off')
            
            # Draw bounding boxes
            if result.boxes is not None and len(result.boxes) > 0:
                boxes = result.boxes.xyxy.cpu().numpy()
                classes = result.boxes.cls.cpu().numpy()
                scores = result.boxes.conf.cpu().numpy()
                
                for box, cls, score in zip(boxes, classes, scores):
                    if score >= self.conf_threshold:
                        x1, y1, x2, y2 = box
                        class_name = self.class_names[int(cls)]
                        color = self.color_mapping.get(int(cls), [1, 0, 0])
                        
                        rect = Rectangle((x1, y1), x2-x1, y2-y1, 
                                       linewidth=2, edgecolor=color, 
                                       facecolor='none')
                        axes[1].add_patch(rect)
                        
                        axes[1].text(x1, y1-5, f'{class_name}: {score:.2f}',
                                   bbox=dict(facecolor=color, alpha=0.7),
                                   fontsize=8, color='white')
            
            # Draw masks if available
            if result.masks is not None and len(result.masks.data) > 0:
                masks = result.masks.data.cpu().numpy()
                classes = result.boxes.cls.cpu().numpy()
                
                for mask, cls in zip(masks, classes):
                    if mask is not None:
                        color = self.color_mapping.get(int(cls), [1, 0, 0])
                        # Create colored overlay
                        colored_mask = np.zeros((*mask.shape, 3))
                        colored_mask[mask > 0.5] = color
                        axes[1].imshow(colored_mask, alpha=0.4)
            
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, 'visualizations', f'{img_name}_prediction.png'), 
                       dpi=150, bbox_inches='tight')
            plt.close()
            
        except Exception as e:
            print(f"Error saving visualization for {img_name}: {e}")

    def evaluate_fold(self, fold, per_image_evaluation=True, max_images_per_split=10):
        """Evaluate a single fold using Ultralytics methods"""
        print(f"\n===== Evaluating FOLD {fold} =====")
        
        fold_dir = os.path.join(self.dataset_base_dir, f"AKUDENTAL_YOLO_FOLD_{fold}")
        weights_path = os.path.join(self.weights_base_dir, f"yolo11l-seg_FOLD_{fold}", "weights", "best.pt")
        
        if not os.path.exists(weights_path):
            print(f"  Weights not found, skipping: {weights_path}")
            return None
            
        try:
            model = YOLO(weights_path).to(self.device)
            print(f"  Loaded model from: {weights_path}")
        except Exception as e:
            print(f"  Error loading model: {e}")
            return None
            
        fold_results = {'fold': fold}
        res_dir = os.path.join(self.base_results_dir, f"fold_{fold}")
        os.makedirs(res_dir, exist_ok=True)
        
        for split in ['val', 'test']:
            print(f"\n  Evaluating '{split}' split...")
            
            # Create temporary dataset YAML
            dataset_yaml = self.create_dataset_yaml(fold_dir, split)
            
            try:
                # Run Ultralytics evaluation
                results = self.evaluate_with_ultralytics(model, dataset_yaml, split)
                
                if results is not None:
                    # Extract metrics
                    metrics = results.results_dict if hasattr(results, 'results_dict') else {}
                    
                    split_results = {
                        f'{split}_map_50_95_bbox': metrics.get('metrics/mAP50-95(B)', 0.0),
                        f'{split}_map_50_bbox': metrics.get('metrics/mAP50(B)', 0.0),
                        f'{split}_map_75_bbox': metrics.get('metrics/mAP75(B)', 0.0),
                        f'{split}_map_50_95_mask': metrics.get('metrics/mAP50-95(M)', 0.0),
                        f'{split}_map_50_mask': metrics.get('metrics/mAP50(M)', 0.0),
                        f'{split}_map_75_mask': metrics.get('metrics/mAP75(M)', 0.0),
                        f'{split}_precision': metrics.get('metrics/precision(B)', 0.0),
                        f'{split}_recall': metrics.get('metrics/recall(B)', 0.0),
                    }
                    
                    fold_results.update(split_results)
                    
                    print(f"    {split.upper()} Results:")
                    print(f"      mAP@50-95 (bbox): {split_results[f'{split}_map_50_95_bbox']:.4f}")
                    print(f"      mAP@50 (bbox): {split_results[f'{split}_map_50_bbox']:.4f}")
                    print(f"      mAP@50-95 (mask): {split_results[f'{split}_map_50_95_mask']:.4f}")
                    print(f"      mAP@50 (mask): {split_results[f'{split}_map_50_mask']:.4f}")
                    
                    # Extract and save per-class metrics
                    print(f"    Extracting per-class metrics for {split}...")
                    per_class_metrics = self.extract_per_class_metrics(results)
                    
                    if per_class_metrics:
                        df_per_class = pd.DataFrame(per_class_metrics)
                        per_class_csv = os.path.join(res_dir, f'{split}_per_class_metrics.csv')
                        df_per_class.to_csv(per_class_csv, index=False)
                        print(f"    Saved per-class metrics to: {per_class_csv}")
                        
                        # Calculate and save total averages
                        total_averages = self.calculate_total_averages(df_per_class)
                        total_averages['fold'] = fold
                        total_averages['split'] = split
                        
                        df_averages = pd.DataFrame([total_averages])
                        averages_csv = os.path.join(res_dir, f'{split}_total_averages.csv')
                        df_averages.to_csv(averages_csv, index=False)
                        print(f"    Saved total averages to: {averages_csv}")
                        
                        # Print summary
                        print(f"    Per-class averages (non-zero):")
                        print(f"      BBox mAP@50: {total_averages['bbox_map50_mean']:.4f} ± {total_averages['bbox_map50_std']:.4f}")
                        print(f"      Mask mAP@50: {total_averages['mask_map50_mean']:.4f} ± {total_averages['mask_map50_std']:.4f}")
                    
                    # Per-image evaluation if requested
                    if per_image_evaluation:
                        print(f"    Running per-image evaluation for {split}...")
                        individual_results = self.save_individual_image_predictions(
                            model, fold_dir, split, res_dir, max_images_per_split
                        )
                        
                        if individual_results:
                            df_individual = pd.DataFrame(individual_results)
                            df_individual.to_csv(
                                os.path.join(res_dir, f'{split}_per_image_results.csv'), 
                                index=False
                            )
                            print(f"    Saved {len(individual_results)} individual image results")
                
            except Exception as e:
                print(f"  Error evaluating {split}: {e}")
            finally:
                # Clean up temporary YAML file
                if os.path.exists(dataset_yaml):
                    os.remove(dataset_yaml)
        
        return fold_results

    def run_5fold_evaluation(self, per_image_evaluation=True, max_images_per_split=10):
        """Run evaluation on all 5 folds"""
        print("Starting YOLO Segmentation 5-Fold Evaluation with Ultralytics...")
        print(f"Dataset: {self.dataset_base_dir}")
        print(f"Weights: {self.weights_base_dir}")
        print(f"Results Dir: {self.base_results_dir}")
        print(f"Per-image evaluation: {per_image_evaluation}")
        
        os.makedirs(self.base_results_dir, exist_ok=True)
        
        all_fold_results = []
        all_per_class_metrics = {'val': [], 'test': []}
        all_total_averages = {'val': [], 'test': []}
        
        for fold in range(5):
            fold_result = self.evaluate_fold(fold, per_image_evaluation, max_images_per_split)
            if fold_result is not None:
                all_fold_results.append(fold_result)
                
                # Aggregate per-class metrics across folds
                for split in ['val', 'test']:
                    per_class_csv = os.path.join(self.base_results_dir, f"fold_{fold}", f'{split}_per_class_metrics.csv')
                    if os.path.exists(per_class_csv):
                        df = pd.read_csv(per_class_csv)
                        df['fold'] = fold
                        all_per_class_metrics[split].append(df)
                    
                    averages_csv = os.path.join(self.base_results_dir, f"fold_{fold}", f'{split}_total_averages.csv')
                    if os.path.exists(averages_csv):
                        df = pd.read_csv(averages_csv)
                        all_total_averages[split].append(df)
        
        if all_fold_results:
            # Save summary results
            df_summary = pd.DataFrame(all_fold_results)
            df_summary.to_csv(os.path.join(self.base_results_dir, 'fold_summary.csv'), index=False)
            
            # Calculate aggregated statistics
            numeric_cols = df_summary.select_dtypes(include=[np.number]).columns
            numeric_cols = [col for col in numeric_cols if col != 'fold']
            
            agg_stats = df_summary[numeric_cols].agg(['mean', 'std']).round(4)
            agg_stats.to_csv(os.path.join(self.base_results_dir, 'aggregated_results.csv'))
            
            # Save aggregated per-class metrics
            for split in ['val', 'test']:
                if all_per_class_metrics[split]:
                    df_all_per_class = pd.concat(all_per_class_metrics[split], ignore_index=True)
                    df_all_per_class.to_csv(
                        os.path.join(self.base_results_dir, f'{split}_all_folds_per_class_metrics.csv'),
                        index=False
                    )
                    print(f"\nSaved aggregated per-class metrics for {split}")
                    
                    # Calculate per-class averages across folds
                    grouped = df_all_per_class.groupby('class_name')
                    numeric_cols_class = df_all_per_class.select_dtypes(include=[np.number]).columns
                    numeric_cols_class = [col for col in numeric_cols_class if col not in ['class_id', 'fold']]
                    
                    per_class_agg = grouped[numeric_cols_class].agg(['mean', 'std']).round(4)
                    per_class_agg.to_csv(
                        os.path.join(self.base_results_dir, f'{split}_per_class_aggregated.csv')
                    )
                    print(f"Saved per-class aggregated statistics for {split}")
                
                if all_total_averages[split]:
                    df_all_averages = pd.concat(all_total_averages[split], ignore_index=True)
                    df_all_averages.to_csv(
                        os.path.join(self.base_results_dir, f'{split}_all_folds_total_averages.csv'),
                        index=False
                    )
                    print(f"Saved total averages across all folds for {split}")
            
            print("\n" + "="*60)
            print("FINAL AGGREGATED RESULTS (Mean ± Std)")
            print("="*60)
            
            for col in numeric_cols:
                mean_val = agg_stats.loc['mean', col]
                std_val = agg_stats.loc['std', col]
                print(f"{col:30s}: {mean_val:.4f} ± {std_val:.4f}")
            
            print("="*60)
            print(f"Results saved to: {self.base_results_dir}")
            print("="*60)
        
        return all_fold_results


# Usage example
if __name__ == "__main__":
    # Configuration - UPDATE THESE PATHS
    config = {
        'dataset_base_dir': r"D:\dental\AKUDENTALlast",
        'weights_base_dir': r"D:\dental\AKUDENTAL_5_Fold_SEG_Results_large",
        'colormap_path': r"D:\dental\color_mapping.json",
        'base_results_dir': r"D:\dental\evaluation_results_ultralytics_native_plot",
        'device': "cuda" if torch.cuda.is_available() else "cpu",
        'conf_threshold': 0.25
    }
    
    # Initialize evaluator
    evaluator = UltralyticsYOLOEvaluator(**config)
    
    # Run evaluation
    results = evaluator.run_5fold_evaluation(
        per_image_evaluation=True,    # Set to False to skip per-image analysis
        max_images_per_split=10       # Limit images for visualization (set higher as needed)
    )
    
    print("\nEvaluation Complete!")

Starting YOLO Segmentation 5-Fold Evaluation with Ultralytics...
Dataset: D:\dental\AKUDENTALlast
Weights: D:\dental\AKUDENTAL_5_Fold_SEG_Results_large
Results Dir: D:\dental\evaluation_results_ultralytics_native_plot
Per-image evaluation: True

===== Evaluating FOLD 0 =====
  Loaded model from: D:\dental\AKUDENTAL_5_Fold_SEG_Results_large\yolo11l-seg_FOLD_0\weights\best.pt

  Evaluating 'val' split...
Ultralytics 8.3.203  Python-3.10.13 torch-2.2.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3090, 24575MiB)
YOLO11l-seg summary (fused): 203 layers, 27,611,577 parameters, 0 gradients, 142.0 GFLOPs
[34m[1mval: [0mFast image access  (ping: 0.00.0 ms, read: 2446.6640.4 MB/s, size: 736.3 KB)
[K[34m[1mval: [0mScanning D:\dental\AKUDENTALlast\AKUDENTAL_YOLO_FOLD_0\labels\val.cache... 33 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 33/33 33.0Kit/s 0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100

Processing individual images (val):   0%|          | 0/10 [00:00<?, ?it/s]

    Saved 10 individual image results

  Evaluating 'test' split...
Ultralytics 8.3.203  Python-3.10.13 torch-2.2.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3090, 24575MiB)
[34m[1mval: [0mFast image access  (ping: 0.00.0 ms, read: 2511.0935.1 MB/s, size: 713.2 KB)
[K[34m[1mval: [0mScanning D:\dental\AKUDENTALlast\AKUDENTAL_YOLO_FOLD_0\labels\test.cache... 34 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 34/34 34.0Kit/s 0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 3/3 0.2it/s 13.0s11.5s
                   all         34       1033      0.915      0.891      0.932      0.666      0.916      0.891       0.93      0.532
  11 - Central Incisor         31         31      0.975      0.968      0.981       0.69      0.975      0.968      0.981      0.553
  12 - Lateral Incisor         31         31      0.953      0.935      0.965      0.651      0.953      0.935      0.965  

Processing individual images (test):   0%|          | 0/10 [00:00<?, ?it/s]

    Saved 10 individual image results

===== Evaluating FOLD 1 =====
  Loaded model from: D:\dental\AKUDENTAL_5_Fold_SEG_Results_large\yolo11l-seg_FOLD_1\weights\best.pt

  Evaluating 'val' split...
Ultralytics 8.3.203  Python-3.10.13 torch-2.2.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3090, 24575MiB)
YOLO11l-seg summary (fused): 203 layers, 27,611,577 parameters, 0 gradients, 142.0 GFLOPs
[34m[1mval: [0mFast image access  (ping: 0.00.0 ms, read: 3058.3564.3 MB/s, size: 639.0 KB)
[K[34m[1mval: [0mScanning D:\dental\AKUDENTALlast\AKUDENTAL_YOLO_FOLD_1\labels\val.cache... 33 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 33/33 33.0Kit/s 0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 3/3 0.2it/s 12.7s11.5s
                   all         33       1028      0.945      0.933      0.955      0.685      0.934      0.923      0.943      0.541
  11 - Central Incisor         32   

Processing individual images (val):   0%|          | 0/10 [00:00<?, ?it/s]

    Saved 10 individual image results

  Evaluating 'test' split...
Ultralytics 8.3.203  Python-3.10.13 torch-2.2.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3090, 24575MiB)
[34m[1mval: [0mFast image access  (ping: 0.00.0 ms, read: 3531.3586.2 MB/s, size: 719.7 KB)
[K[34m[1mval: [0mScanning D:\dental\AKUDENTALlast\AKUDENTAL_YOLO_FOLD_1\labels\test.cache... 34 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 34/34 33.9Kit/s 0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 3/3 0.2it/s 12.3s10.7s
                   all         34       1004      0.924       0.94      0.953       0.69      0.904       0.92      0.936      0.539
  11 - Central Incisor         34         34      0.938      0.884      0.963      0.729      0.906      0.855      0.914      0.548
  12 - Lateral Incisor         33         33          1      0.939       0.97      0.692          1      0.939       0.97  

Processing individual images (test):   0%|          | 0/10 [00:00<?, ?it/s]

    Saved 10 individual image results

===== Evaluating FOLD 2 =====
  Loaded model from: D:\dental\AKUDENTAL_5_Fold_SEG_Results_large\yolo11l-seg_FOLD_2\weights\best.pt

  Evaluating 'val' split...
Ultralytics 8.3.203  Python-3.10.13 torch-2.2.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3090, 24575MiB)
YOLO11l-seg summary (fused): 203 layers, 27,611,577 parameters, 0 gradients, 142.0 GFLOPs
[34m[1mval: [0mFast image access  (ping: 0.00.0 ms, read: 2523.2545.6 MB/s, size: 776.6 KB)
[K[34m[1mval: [0mScanning D:\dental\AKUDENTALlast\AKUDENTAL_YOLO_FOLD_2\labels\val.cache... 33 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 33/33  0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 3/3 0.2it/s 12.2s11.1s
                   all         33        982      0.935      0.936      0.961      0.695      0.937      0.914      0.949      0.548
  11 - Central Incisor         32         32 

Processing individual images (val):   0%|          | 0/10 [00:00<?, ?it/s]

    Saved 10 individual image results

  Evaluating 'test' split...
Ultralytics 8.3.203  Python-3.10.13 torch-2.2.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3090, 24575MiB)
[34m[1mval: [0mFast image access  (ping: 0.00.0 ms, read: 3502.1533.3 MB/s, size: 701.9 KB)
[K[34m[1mval: [0mScanning D:\dental\AKUDENTALlast\AKUDENTAL_YOLO_FOLD_2\labels\test.cache... 34 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 34/34  0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 3/3 0.2it/s 12.7s11.2s
                   all         34       1018      0.913      0.916       0.94      0.674      0.901      0.904       0.93      0.527
  11 - Central Incisor         33         33      0.941       0.97      0.975      0.711      0.941       0.97      0.975      0.586
  12 - Lateral Incisor         33         33      0.956      0.939      0.981       0.71      0.955      0.939      0.981      0.537

Processing individual images (test):   0%|          | 0/10 [00:00<?, ?it/s]

    Saved 10 individual image results

===== Evaluating FOLD 3 =====
  Loaded model from: D:\dental\AKUDENTAL_5_Fold_SEG_Results_large\yolo11l-seg_FOLD_3\weights\best.pt

  Evaluating 'val' split...
Ultralytics 8.3.203  Python-3.10.13 torch-2.2.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3090, 24575MiB)
YOLO11l-seg summary (fused): 203 layers, 27,611,577 parameters, 0 gradients, 142.0 GFLOPs
[34m[1mval: [0mFast image access  (ping: 0.00.0 ms, read: 3260.4384.8 MB/s, size: 742.1 KB)
[K[34m[1mval: [0mScanning D:\dental\AKUDENTALlast\AKUDENTAL_YOLO_FOLD_3\labels\val.cache... 33 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 33/33  0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 3/3 0.3it/s 11.8s10.8s
                   all         33        976      0.922      0.905      0.942      0.683      0.883      0.925      0.933      0.529
  11 - Central Incisor         30         30 

Processing individual images (val):   0%|          | 0/10 [00:00<?, ?it/s]

    Saved 10 individual image results

  Evaluating 'test' split...
Ultralytics 8.3.203  Python-3.10.13 torch-2.2.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3090, 24575MiB)
[34m[1mval: [0mFast image access  (ping: 0.00.0 ms, read: 3333.3507.5 MB/s, size: 673.7 KB)
[K[34m[1mval: [0mScanning D:\dental\AKUDENTALlast\AKUDENTAL_YOLO_FOLD_3\labels\test.cache... 33 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 33/33 33.0Kit/s 0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 3/3 0.2it/s 13.1s12.0s
                   all         33       1038      0.925      0.912      0.939      0.695      0.911      0.898      0.928      0.533
  11 - Central Incisor         33         33      0.982       0.97      0.984      0.739       0.89      0.879       0.92      0.561
  12 - Lateral Incisor         33         33          1      0.989      0.995      0.674      0.969      0.958      0.973  

Processing individual images (test):   0%|          | 0/10 [00:00<?, ?it/s]

    Saved 10 individual image results

===== Evaluating FOLD 4 =====
  Loaded model from: D:\dental\AKUDENTAL_5_Fold_SEG_Results_large\yolo11l-seg_FOLD_4\weights\best.pt

  Evaluating 'val' split...
Ultralytics 8.3.203  Python-3.10.13 torch-2.2.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3090, 24575MiB)
YOLO11l-seg summary (fused): 203 layers, 27,611,577 parameters, 0 gradients, 142.0 GFLOPs
[34m[1mval: [0mFast image access  (ping: 0.00.0 ms, read: 3313.9617.3 MB/s, size: 684.6 KB)
[K[34m[1mval: [0mScanning D:\dental\AKUDENTALlast\AKUDENTAL_YOLO_FOLD_4\labels\val.cache... 33 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 33/33 33.0Kit/s 0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 3/3 0.2it/s 12.1s11.1s
                   all         33       1027      0.904      0.935      0.943       0.69      0.894      0.931      0.934      0.528
  11 - Central Incisor         32   

Processing individual images (val):   0%|          | 0/10 [00:00<?, ?it/s]

    Saved 10 individual image results

  Evaluating 'test' split...
Ultralytics 8.3.203  Python-3.10.13 torch-2.2.1+cu118 CUDA:0 (NVIDIA GeForce RTX 3090, 24575MiB)
[34m[1mval: [0mFast image access  (ping: 0.00.0 ms, read: 2936.6525.2 MB/s, size: 673.9 KB)
[K[34m[1mval: [0mScanning D:\dental\AKUDENTALlast\AKUDENTAL_YOLO_FOLD_4\labels\test.cache... 33 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 33/33 32.9Kit/s 0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 3/3 0.3it/s 11.6s10.4s
                   all         33        899      0.867      0.916      0.925      0.655      0.862      0.911      0.922      0.541
  11 - Central Incisor         29         29          1          1      0.995      0.749          1          1      0.995      0.646
  12 - Lateral Incisor         30         30      0.935      0.967       0.98        0.7      0.935      0.967       0.98  

Processing individual images (test):   0%|          | 0/10 [00:00<?, ?it/s]

    Saved 10 individual image results

Saved aggregated per-class metrics for val
Saved per-class aggregated statistics for val
Saved total averages across all folds for val

Saved aggregated per-class metrics for test
Saved per-class aggregated statistics for test
Saved total averages across all folds for test

FINAL AGGREGATED RESULTS (Mean ± Std)
val_map_50_95_bbox            : 0.6779 ± 0.0237
val_map_50_bbox               : 0.9410 ± 0.0222
val_map_75_bbox               : 0.0000 ± 0.0000
val_map_50_95_mask            : 0.5302 ± 0.0170
val_map_50_mask               : 0.9277 ± 0.0280
val_map_75_mask               : 0.0000 ± 0.0000
val_precision                 : 0.9203 ± 0.0207
val_recall                    : 0.9131 ± 0.0345
test_map_50_95_bbox           : 0.6759 ± 0.0165
test_map_50_bbox              : 0.9378 ± 0.0103
test_map_75_bbox              : 0.0000 ± 0.0000
test_map_50_95_mask           : 0.5344 ± 0.0057
test_map_50_mask              : 0.9292 ± 0.0050
test_map_75_mask        