In [1]:
import cv2
from PIL import Image
from ultralytics import YOLO
import matplotlib.pyplot as plt
import numpy as np
import os

In [2]:
model = YOLO("./runs/train/train/weights/best.pt")

In [None]:
results = model.predict(source="./MoNuSegTestData/test/images/", max_det=9999, iou=0.5)  # Display preds. Accepts all YOLO predict arguments

In [None]:
# evaluate test performance
def evaluate_test_set(results):
    # count total nuclei detected
    total_nuclei = 0
    for result in results:
        total_nuclei += len(result.boxes)
    
    print(f"test set total image number: {len(results)}")
    print(f"total nuclei detected: {total_nuclei}")
    print(f"average nuclei detected per image: {total_nuclei/len(results):.2f}")
    
    # calculate average confidence
    confidences = []
    for result in results:
        confidences.extend(result.boxes.conf.tolist())
    avg_confidence = sum(confidences) / len(confidences)
    print(f"average confidence: {avg_confidence:.4f}")
    
    # visualize detection result distribution
    plt.figure(figsize=(10, 6))
    plt.hist(confidences, bins=20)
    plt.title('detection confidence distribution')
    plt.xlabel('confidence')
    plt.ylabel('frequency')
    plt.show()

# start evaluation
evaluate_test_set(results)


In [6]:
def calculate_metrics(results, label_dir):
    # initialize counters
    total_gt = 0  # total ground truth boxes
    total_det = 0  # total detected boxes
    total_tp = 0  # true positives
    
    # iterate over each image result
    for result in results:
        # get image name
        image_name = result.path.split('/')[-1].replace('.png', '.txt')
        label_path = os.path.join(label_dir, image_name)
        
        # read ground truth
        gt_boxes = []
        if os.path.exists(label_path):
            with open(label_path, 'r') as f:
                for line in f:
                    # convert YOLO format to xyxy format
                    class_id, x_center, y_center, width, height = map(float, line.strip().split())
                    x1 = (x_center - width/2) * result.orig_shape[1]
                    y1 = (y_center - height/2) * result.orig_shape[0]
                    x2 = (x_center + width/2) * result.orig_shape[1]
                    y2 = (y_center + height/2) * result.orig_shape[0]
                    gt_boxes.append([x1, y1, x2, y2])
        
        # get detected boxes
        det_boxes = result.boxes.xyxy.cpu().numpy()
        det_scores = result.boxes.conf.cpu().numpy()
        
        # calculate IoU and match detected boxes and ground truth boxes
        ious = []
        for gt_box in gt_boxes:
            for det_box in det_boxes:
                iou = calculate_iou(gt_box, det_box)
                ious.append(iou)
        
        # use IoU threshold 0.5 to match detected boxes and ground truth boxes
        ious = np.array(ious).reshape(len(gt_boxes), len(det_boxes))
        matched = np.zeros(len(det_boxes))
        tp = 0
        
        for i in range(len(gt_boxes)):
            max_iou = np.max(ious[i])
            if max_iou >= 0.5:
                max_idx = np.argmax(ious[i])
                if not matched[max_idx]:
                    tp += 1
                    matched[max_idx] = 1
        
        total_gt += len(gt_boxes)
        total_det += len(det_boxes)
        total_tp += tp
    
    # calculate metrics
    precision = total_tp / total_det if total_det > 0 else 0
    recall = total_tp / total_gt if total_gt > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    
    return precision, recall, f1

def calculate_iou(box1, box2):
    # calculate IoU of two boxes
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])
    
    intersection = max(0, x2 - x1) * max(0, y2 - y1)
    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = box1_area + box2_area - intersection
    
    return intersection / union if union > 0 else 0

# calculate test set metrics
label_dir = "./MoNuSegTestData/yololabel/"
precision, recall, f1 = calculate_metrics(results, label_dir)


Precision: 0.7957
Recall: 0.9101
F1-score: 0.8491


In [6]:
def calculate_map(results, label_dir, iou_thresholds=None):
    """
    calculate mAP50 and mAP50-95

    Args:
        results: list of prediction results
        label_dir: directory of label files
        iou_thresholds: list of IoU thresholds, default is [0.5] and [0.5:0.95:0.05]
    """
    if iou_thresholds is None:
        iou_thresholds = [0.5] + list(np.arange(0.5, 1.0, 0.05))
    
    # store AP for each IoU threshold
    aps = []
    
    for iou_threshold in iou_thresholds:
        # store AP for each class
        class_aps = []
        
        # get all detected boxes and ground truth boxes
        all_detections = []
        all_ground_truths = []
        
        for result in results:
            image_path = result.path.split('/')[-1]
            label_path = os.path.join(label_dir, image_path.replace('.png', '.txt'))
            
            # get predicted boxes
            det_boxes = result.boxes.xyxy.cpu().numpy()
            det_scores = result.boxes.conf.cpu().numpy()
            
            # get ground truth boxes
            gt_boxes = []
            if os.path.exists(label_path):
                with open(label_path, 'r') as f:
                    for line in f:
                        class_id, x_center, y_center, width, height = map(float, line.strip().split())
                        # convert to xyxy format
                        x1 = (x_center - width/2) * 1000
                        y1 = (y_center - height/2) * 1000
                        x2 = (x_center + width/2) * 1000
                        y2 = (y_center + height/2) * 1000
                        gt_boxes.append([x1, y1, x2, y2])
            
            all_detections.append((det_boxes, det_scores))
            all_ground_truths.append(gt_boxes)
        
        # calculate AP
        ap = calculate_ap(all_detections, all_ground_truths, iou_threshold)
        aps.append(ap)
    
    # calculate mAP50 and mAP50-95
    map50 = aps[0]  # the first is IoU=0.5 AP
    map50_95 = np.mean(aps)  # the average AP of all IoU thresholds
    
    print(f"mAP50: {map50:.4f}")
    print(f"mAP50-95: {map50_95:.4f}")
    
    return map50, map50_95

def calculate_ap(all_detections, all_ground_truths, iou_threshold):
    """
    calculate AP for a single IoU threshold
    """
    # collect all predicted boxes and scores
    all_boxes = []
    all_scores = []
    all_gt_boxes = []
    
    for det_boxes, det_scores in all_detections:
        all_boxes.extend(det_boxes)
        all_scores.extend(det_scores)
    
    for gt_boxes in all_ground_truths:
        all_gt_boxes.extend(gt_boxes)
    
    # sort by confidence
    indices = np.argsort(all_scores)[::-1]
    all_boxes = np.array(all_boxes)[indices]
    all_scores = np.array(all_scores)[indices]
    
    # calculate precision and recall
    tp = np.zeros(len(all_boxes))
    fp = np.zeros(len(all_boxes))
    gt_matched = np.zeros(len(all_gt_boxes))
    
    for i, box in enumerate(all_boxes):
        max_iou = 0
        max_idx = -1
        
        for j, gt_box in enumerate(all_gt_boxes):
            if not gt_matched[j]:
                iou = calculate_iou(box, gt_box)
                if iou > max_iou:
                    max_iou = iou
                    max_idx = j
        
        if max_iou >= iou_threshold:
            tp[i] = 1
            gt_matched[max_idx] = 1
        else:
            fp[i] = 1
    
    # calculate cumulative values
    tp_cumsum = np.cumsum(tp)
    fp_cumsum = np.cumsum(fp)
    
    # calculate precision and recall
    precision = tp_cumsum / (tp_cumsum + fp_cumsum)
    recall = tp_cumsum / len(all_gt_boxes)
    
    # calculate AP (using 11-point interpolation)
    ap = 0
    for t in np.arange(0, 1.1, 0.1):
        if np.sum(recall >= t) == 0:
            p = 0
        else:
            p = np.max(precision[recall >= t])
        ap = ap + p / 11.0
    
    return ap

# calculate mAP50 and mAP50-95
map50, map50_95 = calculate_map(results, label_dir)


mAP50: 0.8789
mAP50-95: 0.5237
