# В этом ноутбуке сравнивались обученные модели RetinaNet и YOLOv8

In [47]:
from ultralytics import YOLO

import os
import cv2
import time
import torch
import gradio as gr
import numpy as np
from model import create_model
from datasets import create_valid_dataset, create_valid_loader
from sklearn.metrics import average_precision_score
from torchmetrics.detection.mean_ap import MeanAveragePrecision
from tqdm import tqdm

In [4]:
# Device used when loading the checkpoint and when placing the model and
# the validation tensors.
DEVICE = 'cpu'
# Class count passed to create_model
# (presumably includes the background class — TODO confirm).
NUM_CLASSES = 2

In [7]:
# Build the model and restore the weights stored under "model_state_dict"
# in the saved checkpoint file.
model = create_model(num_classes=NUM_CLASSES)
checkpoint = torch.load("./best_retina_model.pth", map_location=DEVICE)
model.load_state_dict(checkpoint["model_state_dict"])
# Move to the target device and switch to eval mode; the trailing semicolon
# only suppresses the notebook cell's echoed output.
model.to(DEVICE).eval();

In [16]:
def inference_on_image(orig_image: np.ndarray, resize_dim=None, threshold=0.25):
    """
    Run the detector on a single image and draw the detections on it.

    Relies on the module-level ``model`` and ``CLASSES`` objects.

    Args:
        orig_image: Image array in OpenCV BGR channel order (it is converted
            with ``cv2.COLOR_BGR2RGB`` before inference). The array is
            annotated IN PLACE and is also returned.
        resize_dim: If not None, the image is resized to
            (resize_dim, resize_dim) for inference and the predicted boxes
            are scaled back to the original resolution before drawing.
        threshold: Minimum confidence score for a detection to be drawn.

    Returns:
        Tuple ``(orig_image, fps, outputs)``: the annotated input image, the
        measured forward-pass rate in frames per second, and the unfiltered
        model outputs (list of dicts whose tensors were moved to CPU).
    """
    image = orig_image.copy()
    h_orig, w_orig = orig_image.shape[:2]
    # One random color per class; regenerated on every call.
    colors = np.random.uniform(0, 255, size=(len(CLASSES), 3))

    # Optionally resize for inference.
    if resize_dim is not None:
        image = cv2.resize(image, (resize_dim, resize_dim))

    # Convert BGR to RGB, normalize to [0..1], move channels to front (C,H,W)
    # and add a batch dimension.
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
    image_tensor = torch.tensor(image_rgb.transpose(2, 0, 1), dtype=torch.float).unsqueeze(0)
    # Keep the input on the same device as the model weights so the function
    # also works when the model is not on the CPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        image_tensor = image_tensor.to(first_param.device)

    # Time only the forward pass. perf_counter is monotonic and has a finer
    # resolution than time.time; guard against a zero interval anyway.
    start_time = time.perf_counter()
    with torch.no_grad():
        outputs = model(image_tensor)
    elapsed = time.perf_counter() - start_time
    fps = 1 / elapsed if elapsed > 0 else float("inf")
    fps_text = f"FPS: {fps:.2f}"

    # Move outputs to CPU numpy.
    outputs = [{k: v.cpu() for k, v in t.items()} for t in outputs]
    boxes = outputs[0]["boxes"].numpy()
    scores = outputs[0]["scores"].numpy()
    labels = outputs[0]["labels"].numpy().astype(int)

    # Filter out boxes with low confidence. Boolean indexing copies, so the
    # scaling below never touches the tensors returned in `outputs`.
    keep = scores >= threshold
    boxes = boxes[keep].astype(np.float64)
    labels = labels[keep]

    # If we resized for inference, rescale boxes back to the original size.
    # Scale in floating point first and round to pixels once at the end, so
    # coordinates are not truncated twice.
    if resize_dim is not None:
        boxes[:, [0, 2]] *= w_orig / resize_dim
        boxes[:, [1, 3]] *= h_orig / resize_dim
    boxes = boxes.astype(int)

    # Draw bounding boxes and class names.
    for box, label_idx in zip(boxes, labels):
        x1, y1, x2, y2 = (int(v) for v in box)
        class_name = CLASSES[label_idx] if 0 <= label_idx < len(CLASSES) else str(label_idx)
        # Plain Python floats: OpenCV expects a scalar tuple for the color.
        color = tuple(float(c) for c in colors[label_idx % len(colors)][::-1])
        cv2.rectangle(orig_image, (x1, y1), (x2, y2), color, 5)
        cv2.putText(orig_image, class_name, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 3)

    # Draw the FPS overlay exactly once, even when nothing was detected.
    cv2.putText(
        orig_image,
        fps_text,
        (int((w_orig / 2) - 50), 30),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.8,
        (0, 255, 0),
        2,
        cv2.LINE_AA,
    )
    return orig_image, fps, outputs

In [56]:
def validate(valid_data_loader, model, metric):
    """
    Run the model over a validation loader and compute detection metrics.

    Args:
        valid_data_loader: Sized iterable yielding ``(images, targets)``
            batches; each target is a dict with at least "boxes" and "labels".
        model: Detection model; it is switched to eval mode here.
        metric: Metric object exposing ``reset``, ``update`` and ``compute``.

    Returns:
        The value of ``metric.compute()`` over all collected predictions.
    """
    print("Validating")
    model.eval()

    ground_truth = []
    predictions = []

    # Progress bar over the validation batches.
    progress = tqdm(valid_data_loader, total=len(valid_data_loader))
    for images, targets in progress:
        images = [img.to(DEVICE) for img in images]
        targets = [{key: val.to(DEVICE) for key, val in tgt.items()} for tgt in targets]

        with torch.no_grad():
            outputs = model(images, targets)

        # Collect per-image ground truth and predictions on the CPU for the
        # mAP computation.
        for idx in range(len(images)):
            true_entry = {
                "boxes": targets[idx]["boxes"].detach().cpu(),
                "labels": targets[idx]["labels"].detach().cpu(),
            }
            pred_entry = {
                "boxes": outputs[idx]["boxes"].detach().cpu(),
                "scores": outputs[idx]["scores"].detach().cpu(),
                "labels": outputs[idx]["labels"].detach().cpu(),
            }
            predictions.append(pred_entry)
            ground_truth.append(true_entry)

    # Start from a clean metric state, then score everything in one update.
    metric.reset()
    metric.update(predictions, ground_truth)
    return metric.compute()

In [60]:
# mAP metric evaluated at a custom set of IoU thresholds.
# NOTE(review): the summary printed further down reports map_75 = -1,
# presumably because 0.75 is not in this list — confirm against the
# torchmetrics documentation; 'map' is then not a COCO-style mAP50-95.
metric = MeanAveragePrecision(iou_thresholds=[0.5, 0.8, 0.9])

In [66]:
# Build the evaluation dataset from the lite test directory and wrap it in a
# data loader using the project's helper functions.
valid_dataset = create_valid_dataset("./data/lite_data/test/")
valid_loader = create_valid_loader(valid_dataset)

  self._set_keys()


In [67]:
# Evaluate the loaded model on the test loader and keep the metric summary.
result = validate(valid_loader, model, metric)

Validating


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [01:23<00:00, 11.88s/it]


In [68]:
# Display the metric summary computed by validate().
result

{'map': tensor(0.1746),
 'map_50': tensor(0.4735),
 'map_75': tensor(-1.),
 'map_small': tensor(0.1777),
 'map_medium': tensor(0.2141),
 'map_large': tensor(-1.),
 'mar_1': tensor(0.1198),
 'mar_10': tensor(0.2656),
 'mar_100': tensor(0.3490),
 'mar_small': tensor(0.3333),
 'mar_medium': tensor(0.4583),
 'mar_large': tensor(-1.),
 'map_per_class': tensor(-1.),
 'mar_100_per_class': tensor(-1.),
 'classes': tensor(1, dtype=torch.int32)}

In [69]:
# Load the trained YOLOv8 weights and run the Ultralytics validation routine
# on the dataset described by data.yaml.
# NOTE(review): in Ultralytics, `iou` appears to be the NMS IoU threshold,
# not the IoU threshold of the mAP metric — confirm this is what is intended
# when comparing against the torchmetrics numbers above.
model_yolo = YOLO("./outputs/best_yolov8_model.pt")
result_yolo = model_yolo.val(data="./data.yaml", iou=0.5, verbose=True, plots=True)

Ultralytics 8.3.163  Python-3.10.8 torch-2.6.0+cpu CPU (AMD Ryzen 5 4600H with Radeon Graphics)
YOLOv8s summary (fused): 72 layers, 11,125,971 parameters, 0 gradients, 28.4 GFLOPs
[34m[1mval: [0mFast image access  (ping: 0.10.0 ms, read: 450.468.6 MB/s, size: 5128.0 KB)


[34m[1mval: [0mScanning C:\Users\maks6\Desktop\Моя жизнь\pet-projects\БПЛА\git\retina_train\lite_data_yolo_test\labels... 30 imag[0m

[34m[1mval: [0mNew cache created: C:\Users\maks6\Desktop\ \pet-projects\\git\retina_train\lite_data_yolo_test\labels.cache



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:19<0


                   all         30         66      0.641      0.333      0.367      0.124
Speed: 7.2ms preprocess, 527.8ms inference, 0.0ms loss, 18.5ms postprocess per image
Results saved to [1mruns\detect\val2[0m


# В текущей реализации RetinaNet показала лучший результат по метрике mAP50 (0.4735 против 0.367 у YOLOv8)