Add yolo v8 (#71)

openvinotoolkit · Sep 18, 2023 · 3623705 · 3623705
1 parent 03a6cee
commit 3623705
Show file tree

Hide file tree

Showing 19 changed files with 550 additions and 67 deletions.
diff --git a/.github/workflows/test_accuracy.yml b/.github/workflows/test_accuracy.yml
@@ -29,6 +29,7 @@ jobs:
       run: |
         source venv/bin/activate
         pytest --data=./data tests/python/accuracy/test_accuracy.py
+        DATA=data pytest --data=./data tests/python/accuracy/test_YOLOv8.py
     - name: Install CPP ependencies
       run: |
         sudo bash model_api/cpp/install_dependencies.sh
@@ -40,3 +41,4 @@ jobs:
     - name: Run CPP Test
       run: |
         build/test_accuracy -d data -p tests/python/accuracy/public_scope.json
+        DATA=data build/test_YOLOv8
diff --git a/docs/model-configuration.md b/docs/model-configuration.md
@@ -49,6 +49,9 @@ The list features only model wrappers which intoduce new configuration values in
 ###### `YoloV4`
 1. `anchors`: List - list of custom anchor values
 1. `masks`: List - list of mask, applied to anchors for each output layer
+###### `YOLOv5`, `YOLOv8`
+1. `agnostic_nms`: bool - if True, non-maximum suppression (NMS) is class-agnostic: boxes of all classes are treated as a single class during suppression
+1. `iou_threshold`: float - threshold for non-maximum suppression (NMS) intersection over union (IOU) filtering
 ###### `YOLOX`
 1. `iou_threshold`: float - threshold for non-maximum suppression (NMS) intersection over union (IOU) filtering
 #### `HpeAssociativeEmbedding`

diff --git a/model_api/cpp/models/include/models/detection_model_yolo.h b/model_api/cpp/models/include/models/detection_model_yolo.h
@@ -83,3 +83,24 @@ class ModelYolo : public DetectionModelExt {
     std::vector<int64_t> presetMasks;
     ov::Layout yoloRegionLayout = "NCHW";
 };
+
+class YOLOv5 : public DetectionModelExt {  // Detection wrapper for YOLOv5-style models that expose a single rank-3 output
+    // Reimplementation of ultralytics.YOLO
+    void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;  // checks input/output shapes and embeds preprocessing into the model
+    void updateModelInfo() override;  // stores model_type, agnostic_nms and iou_threshold under rt_info "model_info"
+    void init_from_config(const ov::AnyMap& top_priority, const ov::AnyMap& mid_priority);  // reads settings from both maps; presumably top_priority wins over mid_priority - confirm in get_from_any_maps
+    bool agnostic_nms = false;  // when true, postprocess() runs one NMS over all boxes instead of multiclass_nms()
+public:
+    YOLOv5(std::shared_ptr<ov::Model>& model, const ov::AnyMap& configuration);  // settings come from `configuration` and the model's "model_info" rt_info
+    YOLOv5(std::shared_ptr<InferenceAdapter>& adapter);  // settings come from adapter->getModelConfig()
+    std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;  // decodes boxes, applies NMS and returns a DetectionResult
+    static std::string ModelType;  // defined as "YOLOv5" in detection_model_yolo.cpp
+};
+
+class YOLOv8 : public YOLOv5 {  // Adds no behavior of its own; gives "YOLOv8" a distinct type and ModelType string
+public:
+    // YOLOv5 and YOLOv8 are identical in terms of inference
+    YOLOv8(std::shared_ptr<ov::Model>& model, const ov::AnyMap& configuration) : YOLOv5{model, configuration} {}  // forwards to the YOLOv5 constructor
+    YOLOv8(std::shared_ptr<InferenceAdapter>& adapter) : YOLOv5{adapter} {}  // forwards to the YOLOv5 constructor
+    static std::string ModelType;  // hides YOLOv5::ModelType; defined as "YOLOv8" in detection_model_yolo.cpp
+};
diff --git a/model_api/cpp/models/src/detection_model.cpp b/model_api/cpp/models/src/detection_model.cpp
@@ -91,6 +91,10 @@ std::unique_ptr<DetectionModel> DetectionModel::create_model(const std::string&
         detectionModel = std::unique_ptr<DetectionModel>(new ModelYoloX(model, configuration));
     } else if (model_type == ModelCenterNet::ModelType) {
         detectionModel = std::unique_ptr<DetectionModel>(new ModelCenterNet(model, configuration));
+    } else if (model_type == YOLOv5::ModelType) {
+        detectionModel = std::unique_ptr<DetectionModel>(new YOLOv5(model, configuration));
+    } else if (model_type == YOLOv8::ModelType) {
+        detectionModel = std::unique_ptr<DetectionModel>(new YOLOv8(model, configuration));
     } else {
         throw std::runtime_error("Incorrect or unsupported model_type is provided in the model_info section: " + model_type);
     }

diff --git a/model_api/cpp/models/src/detection_model_faceboxes.cpp b/model_api/cpp/models/src/detection_model_faceboxes.cpp
@@ -243,7 +243,7 @@ std::unique_ptr<ResultBase> ModelFaceBoxes::postprocess(InferenceResult& infResu
     std::vector<Anchor> boxes = filterBoxes(boxesTensor, anchors, scores.first, variance);
 
     // Apply Non-maximum Suppression
-    const std::vector<int> keep = nms(boxes, scores.second, iou_threshold);
+    const std::vector<size_t>& keep = nms(boxes, scores.second, iou_threshold);
 
     // Create detection result objects
     DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);

diff --git a/model_api/cpp/models/src/detection_model_ssd.cpp b/model_api/cpp/models/src/detection_model_ssd.cpp
@@ -161,12 +161,13 @@ std::unique_ptr<ResultBase> ModelSSD::postprocessSingleOutput(InferenceResult& i
                 0.f,
                 floatInputImgHeight);
             desc.width = clamp(
-                round((detections[i * numAndStep.objectSize + 5] * netInputWidth - padLeft) * invertedScaleX - desc.x),
+                round((detections[i * numAndStep.objectSize + 5] * netInputWidth - padLeft) * invertedScaleX),
                 0.f,
-                floatInputImgWidth);
+                floatInputImgWidth) - desc.x;
             desc.height = clamp(
-                round((detections[i * numAndStep.objectSize + 6] * netInputHeight - padTop) * invertedScaleY - desc.y),
-                0.f, floatInputImgHeight);
+                round((detections[i * numAndStep.objectSize + 6] * netInputHeight - padTop) * invertedScaleY),
+                0.f,
+                floatInputImgHeight) - desc.y;
             result->objects.push_back(desc);
         }
     }
@@ -222,12 +223,13 @@ std::unique_ptr<ResultBase> ModelSSD::postprocessMultipleOutputs(InferenceResult
                 0.f,
                 floatInputImgHeight);
             desc.width = clamp(
-                round((boxes[i * numAndStep.objectSize + 2] * widthScale - padLeft) * invertedScaleX - desc.x),
+                round((boxes[i * numAndStep.objectSize + 2] * widthScale - padLeft) * invertedScaleX),
                 0.f,
-                floatInputImgWidth);
+                floatInputImgWidth) - desc.x;
             desc.height = clamp(
-                round((boxes[i * numAndStep.objectSize + 3] * heightScale - padTop) * invertedScaleY - desc.y),
-                0.f, floatInputImgHeight);
+                round((boxes[i * numAndStep.objectSize + 3] * heightScale - padTop) * invertedScaleY),
+                0.f,
+                floatInputImgHeight) - desc.y;
             result->objects.push_back(desc);
         }
     }

diff --git a/model_api/cpp/models/src/detection_model_yolo.cpp b/model_api/cpp/models/src/detection_model_yolo.cpp
@@ -27,6 +27,7 @@
 #include <openvino/openvino.hpp>
 
 #include <utils/common.hpp>
+#include <utils/nms.hpp>
 #include <utils/slog.hpp>
 
 #include "models/internal_model_data.h"
@@ -504,3 +505,171 @@ ModelYolo::Region::Region(size_t classes,
         num = anchors.size() / 2;
     }
 }
+
+std::string YOLOv5::ModelType = "YOLOv5";  // model_type string matched by DetectionModel::create_model and written by updateModelInfo()
+
+void YOLOv5::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {  // Validates the single input/output and embeds preprocessing; throws std::runtime_error on any mismatch
+    const ov::Output<ov::Node>& input = model->input();
+    const ov::Shape& in_shape = input.get_partial_shape().get_max_shape();  // presumably the upper bounds of a possibly dynamic shape - confirm against ov::PartialShape docs
+    if (in_shape.size() != 4) {
+        throw std::runtime_error("YOLO: the rank of the input must be 4");
+    }
+    inputNames.push_back(input.get_any_name());
+    const ov::Layout& inputLayout = getInputLayout(input);
+    if (!embedded_processing) {  // embed resize/pad/scale only once
+        model = ImageModel::embedProcessing(model,
+                                inputNames[0],
+                                inputLayout,
+                                resizeMode,
+                                interpolationMode,
+                                ov::Shape{  // target spatial size, passed in {width, height} order
+                                    in_shape[ov::layout::width_idx(inputLayout)],
+                                    in_shape[ov::layout::height_idx(inputLayout)]
+                                },
+                                pad_value,
+                                reverse_input_channels,
+                                {},
+                                scale_values);
+
+        netInputWidth = in_shape[ov::layout::width_idx(inputLayout)];  // read back by postprocess() to undo the resize
+        netInputHeight = in_shape[ov::layout::height_idx(inputLayout)];  // NOTE(review): both are assigned only inside this branch - confirm they are set elsewhere when embedded_processing is already true
+
+        embedded_processing = true;
+    }
+
+    const ov::Output<const ov::Node>& output = model->output();
+    if (ov::element::Type_t::f32 != output.get_element_type()) {
+        throw std::runtime_error("YOLO: the output must be of precision f32");
+    }
+    const ov::Shape& out_shape = output.get_partial_shape().get_max_shape();
+    if (3 != out_shape.size()) {
+        throw std::runtime_error("YOLO: the output must be of rank 3");
+    }
+    if (!labels.empty() && labels.size() + 4 != out_shape[1]) {  // dim 1 must hold 4 box values plus one score per label; skipped when no labels are configured
+        throw std::runtime_error("YOLO: number of labels must be smaller than out_shape[1] by 4");
+    }
+}
+
+void YOLOv5::updateModelInfo() {  // Writes this wrapper's settings into the model's rt_info under "model_info"
+    DetectionModelExt::updateModelInfo();  // base class writes its own entries first
+    model->set_rt_info(YOLOv5::ModelType, "model_info", "model_type");  // NOTE(review): always records "YOLOv5"; YOLOv8 does not override this method - confirm that is intended
+    model->set_rt_info(agnostic_nms, "model_info", "agnostic_nms");
+    model->set_rt_info(iou_threshold, "model_info", "iou_threshold");
+}
+
+void YOLOv5::init_from_config(const ov::AnyMap& top_priority, const ov::AnyMap& mid_priority) {  // Loads YOLO settings from the two maps; the last argument of each get_from_any_maps call is the fallback default
+    pad_value = get_from_any_maps("pad_value", top_priority, mid_priority, 114);
+    if (top_priority.find("resize_type") == top_priority.end() && mid_priority.find("resize_type") == mid_priority.end()) {  // apply YOLO resize defaults only when neither map specifies resize_type
+        interpolationMode = cv::INTER_LINEAR;
+        resizeMode = RESIZE_KEEP_ASPECT_LETTERBOX;
+    }
+    reverse_input_channels = get_from_any_maps("reverse_input_channels", top_priority, mid_priority, true);
+    scale_values = get_from_any_maps("scale_values", top_priority, mid_priority, std::vector<float>{255.0f});
+    confidence_threshold = get_from_any_maps("confidence_threshold", top_priority, mid_priority, 0.25f);
+    agnostic_nms = get_from_any_maps("agnostic_nms", top_priority, mid_priority, agnostic_nms);  // fallback is the member's current value
+    iou_threshold = get_from_any_maps("iou_threshold", top_priority, mid_priority, 0.7f);
+}
+
+YOLOv5::YOLOv5(std::shared_ptr<ov::Model>& model, const ov::AnyMap& configuration)  // Builds the wrapper from an ov::Model plus user configuration
+        : DetectionModelExt(model, configuration) {
+    init_from_config(configuration, model->get_rt_info<ov::AnyMap>("model_info"));  // user configuration is passed as top_priority, the model's "model_info" rt_info as mid_priority
+}
+
+YOLOv5::YOLOv5(std::shared_ptr<InferenceAdapter>& adapter)  // Builds the wrapper on top of an existing inference adapter
+        : DetectionModelExt(adapter) {
+    init_from_config(adapter->getModelConfig(), ov::AnyMap{});  // the adapter's model config is the only source; mid_priority is empty
+}
+
+std::unique_ptr<ResultBase> YOLOv5::postprocess(InferenceResult& infResult) {  // Decodes the single output into boxes, applies NMS and maps boxes back to original-image coordinates; throws std::runtime_error on an unexpected output
+    if (1 != infResult.outputsData.size()) {
+        throw std::runtime_error("YOLO: expect 1 output");
+    }
+    const ov::Tensor& detectionsTensor = infResult.getFirstOutputTensor();
+    const ov::Shape& out_shape = detectionsTensor.get_shape();
+    if (3 != out_shape.size()) {
+        throw std::runtime_error("YOLO: the output must be of rank 3");
+    }
+    if (1 != out_shape[0]) {
+        throw std::runtime_error("YOLO: the first dim of the output must be 1");
+    }
+    size_t num_proposals = out_shape[2];  // data is indexed as [row * num_proposals + proposal], row < out_shape[1]
+    std::vector<Anchor> boxes;
+    std::vector<float> confidences;
+    std::vector<size_t> labelIDs;
+    const float* const detections = detectionsTensor.data<float>();
+    for (size_t i = 0; i < num_proposals; ++i) {
+        float confidence = 0.0f;  // best class score seen so far; scores <= 0 never win
+        size_t max_id = 0;  // row index of the best score; stays 0 if no score is positive
+        constexpr size_t LABELS_START = 4;  // rows 0..3 are the box values, class scores start at row 4
+        for (size_t j = LABELS_START; j < out_shape[1]; ++j) {
+            if (detections[j * num_proposals + i] > confidence) {
+                confidence = detections[j * num_proposals + i];
+                max_id = j;
+            }
+        }
+        if (confidence > confidence_threshold) {  // NOTE(review): with a negative threshold max_id may still be 0 and the subtraction below would wrap - confirm thresholds are non-negative
+            boxes.push_back(Anchor{  // rows 0,1 are used as the box center and rows 2,3 as width/height; converted to left/top/right/bottom
+                detections[0 * num_proposals + i] - detections[2 * num_proposals + i] / 2.0f,
+                detections[1 * num_proposals + i] - detections[3 * num_proposals + i] / 2.0f,
+                detections[0 * num_proposals + i] + detections[2 * num_proposals + i] / 2.0f,
+                detections[1 * num_proposals + i] + detections[3 * num_proposals + i] / 2.0f,
+            });
+            confidences.push_back(confidence);
+            labelIDs.push_back(max_id - LABELS_START);  // zero-based class index
+        }
+    }
+    constexpr bool includeBoundaries = false;
+    constexpr size_t keep_top_k = 30000;  // upper bound on candidates handed to NMS
+    std::vector<size_t> keep;  // indices into boxes/confidences/labelIDs that survive NMS
+    if (agnostic_nms) {
+        keep = nms(boxes, confidences, iou_threshold, includeBoundaries, keep_top_k);  // all boxes compete regardless of class
+    } else {
+        std::vector<AnchorLabeled> boxes_with_class;
+        boxes_with_class.reserve(boxes.size());
+        for (size_t i = 0; i < boxes.size(); ++i) {
+            boxes_with_class.emplace_back(boxes[i], int(labelIDs[i]));
+        }
+        keep = multiclass_nms(boxes_with_class, confidences, iou_threshold, includeBoundaries, keep_top_k);  // presumably suppresses only within the same label - confirm in nms.cpp
+    }
+    DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
+    auto base = std::unique_ptr<ResultBase>(result);  // owns the result; `result` stays as a typed alias for filling objects
+    const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
+    float floatInputImgWidth = float(internalData.inputImgWidth),
+         floatInputImgHeight = float(internalData.inputImgHeight);
+    float invertedScaleX = floatInputImgWidth / netInputWidth,  // factors that map network-input coordinates back to the original image
+          invertedScaleY = floatInputImgHeight / netInputHeight;
+    int padLeft = 0, padTop = 0;
+    if (RESIZE_KEEP_ASPECT == resizeMode || RESIZE_KEEP_ASPECT_LETTERBOX == resizeMode) {
+        invertedScaleX = invertedScaleY = std::max(invertedScaleX, invertedScaleY);  // aspect-preserving modes use one uniform factor, the larger ratio
+        if (RESIZE_KEEP_ASPECT_LETTERBOX == resizeMode) {
+            padLeft = (netInputWidth - int(std::round(floatInputImgWidth / invertedScaleX))) / 2;  // half of the unused network width (integer division)
+            padTop = (netInputHeight - int(std::round(floatInputImgHeight / invertedScaleY))) / 2;
+        }
+    }
+    for (size_t idx : keep) {  // remove padding, rescale, and clamp each kept box to the image bounds
+        DetectedObject desc;
+        desc.x = clamp(
+            round((boxes[idx].left - padLeft) * invertedScaleX),
+            0.f,
+            floatInputImgWidth);
+        desc.y = clamp(
+            round((boxes[idx].top - padTop) * invertedScaleY),
+            0.f,
+            floatInputImgHeight);
+        desc.width = clamp(
+            round((boxes[idx].right - padLeft) * invertedScaleX),
+            0.f,
+            floatInputImgWidth) - desc.x;  // right edge is clamped first, then turned into a width
+        desc.height = clamp(
+            round((boxes[idx].bottom - padTop) * invertedScaleY),
+            0.f,
+            floatInputImgHeight) - desc.y;  // bottom edge is clamped first, then turned into a height
+        desc.confidence = confidences[idx];
+        desc.labelID = static_cast<size_t>(labelIDs[idx]);
+        desc.label = getLabelName(desc.labelID);
+        result->objects.push_back(desc);
+    }
+    return base;
+}
+
+std::string YOLOv8::ModelType = "YOLOv8";  // model_type string matched by DetectionModel::create_model to construct a YOLOv8 wrapper
diff --git a/model_api/cpp/models/src/detection_model_yolox.cpp b/model_api/cpp/models/src/detection_model_yolox.cpp
@@ -190,8 +190,8 @@ std::unique_ptr<ResultBase> ModelYoloX::postprocess(InferenceResult& infResult)
     }
 
     // NMS for valid boxes
-    std::vector<int> keep = nms(validBoxes, scores, iou_threshold, true);
-    for (auto& index: keep) {
+    const std::vector<size_t>& keep = nms(validBoxes, scores, iou_threshold, true);
+    for (size_t index: keep) {
         // Create new detected box
         DetectedObject obj;
         obj.x = clamp(validBoxes[index].left, 0.f, static_cast<float>(scale.inputImgWidth));

diff --git a/model_api/cpp/utils/include/utils/nms.hpp b/model_api/cpp/utils/include/utils/nms.hpp
@@ -50,13 +50,13 @@ struct AnchorLabeled : public Anchor {
     AnchorLabeled() = default;
     AnchorLabeled(float _left, float _top, float _right, float _bottom, int _labelID) :
         Anchor(_left, _top, _right, _bottom), labelID(_labelID) {}
+    AnchorLabeled(const Anchor& coords, int labelID) : Anchor{coords}, labelID{labelID} {}
 };
 
 template <typename Anchor>
-std::vector<int> nms(const std::vector<Anchor>& boxes, const std::vector<float>& scores,
-                     const float thresh, bool includeBoundaries=false, size_t maxNum=0) {
-    if (maxNum == 0) {
-        maxNum = boxes.size();
+std::vector<size_t> nms(const std::vector<Anchor>& boxes, const std::vector<float>& scores, const float thresh, bool includeBoundaries=false, size_t keep_top_k=0) {
+    if (keep_top_k == 0) {
+        keep_top_k = boxes.size();
     }
     std::vector<float> areas(boxes.size());
     for (size_t i = 0; i < boxes.size(); ++i) {
@@ -67,25 +67,24 @@ std::vector<int> nms(const std::vector<Anchor>& boxes, const std::vector<float>&
     std::sort(order.begin(), order.end(), [&scores](int o1, int o2) { return scores[o1] > scores[o2]; });
 
     size_t ordersNum = 0;
-    for (; ordersNum < order.size() && scores[order[ordersNum]] >= 0  && ordersNum < maxNum; ordersNum++);
+    for (; ordersNum < order.size() && scores[order[ordersNum]] >= 0  && ordersNum < keep_top_k; ordersNum++);
 
-    std::vector<int> keep;
+    std::vector<size_t> keep;
     bool shouldContinue = true;
     for (size_t i = 0; shouldContinue && i < ordersNum; ++i) {
-        auto idx1 = order[i];
+        int idx1 = order[i];
         if (idx1 >= 0) {
             keep.push_back(idx1);
             shouldContinue = false;
             for (size_t j = i + 1; j < ordersNum; ++j) {
-                auto idx2 = order[j];
+                int idx2 = order[j];
                 if (idx2 >= 0) {
                     shouldContinue = true;
-                    auto overlappingWidth = std::fminf(boxes[idx1].right, boxes[idx2].right) - std::fmaxf(boxes[idx1].left, boxes[idx2].left);
-                    auto overlappingHeight = std::fminf(boxes[idx1].bottom, boxes[idx2].bottom) - std::fmaxf(boxes[idx1].top, boxes[idx2].top);
-                    auto intersection = overlappingWidth > 0 && overlappingHeight > 0 ? overlappingWidth * overlappingHeight : 0;
-                    auto overlap = intersection / (areas[idx1] + areas[idx2] - intersection);
-
-                    if (overlap >= thresh) {
+                    float overlappingWidth = std::fminf(boxes[idx1].right, boxes[idx2].right) - std::fmaxf(boxes[idx1].left, boxes[idx2].left);
+                    float overlappingHeight = std::fminf(boxes[idx1].bottom, boxes[idx2].bottom) - std::fmaxf(boxes[idx1].top, boxes[idx2].top);
+                    float intersection = overlappingWidth > 0 && overlappingHeight > 0 ? overlappingWidth * overlappingHeight : 0;
+                    float union_area = areas[idx1] + areas[idx2] - intersection;
+                    if (0.0f == union_area || intersection / union_area > thresh) {
                         order[j] = -1;
                     }
                 }
@@ -95,5 +94,5 @@ std::vector<int> nms(const std::vector<Anchor>& boxes, const std::vector<float>&
     return keep;
 }
 
-std::vector<int> multiclass_nms(const std::vector<AnchorLabeled>& boxes, const std::vector<float>& scores,
+std::vector<size_t> multiclass_nms(const std::vector<AnchorLabeled>& boxes, const std::vector<float>& scores,
                      const float iou_threshold=0.45f, bool includeBoundaries=false, size_t maxNum=200);
diff --git a/model_api/cpp/utils/src/nms.cpp b/model_api/cpp/utils/src/nms.cpp
@@ -19,7 +19,7 @@
 #include "utils/nms.hpp"
 
 
-std::vector<int> multiclass_nms(const std::vector<AnchorLabeled>& boxes, const std::vector<float>& scores,
+std::vector<size_t> multiclass_nms(const std::vector<AnchorLabeled>& boxes, const std::vector<float>& scores,
                      const float iou_threshold, bool includeBoundaries, size_t maxNum) {
     std::vector<Anchor> boxes_copy;
     boxes_copy.reserve(boxes.size());

diff --git a/model_api/python/openvino/model_api/models/__init__.py b/model_api/python/openvino/model_api/models/__init__.py
@@ -56,7 +56,7 @@
     add_rotated_rects,
     get_contours,
 )
-from .yolo import YOLO, YOLOF, YOLOX, YoloV3ONNX, YoloV4
+from .yolo import YOLO, YOLOF, YOLOX, YoloV3ONNX, YoloV4, YOLOv5, YOLOv8
 
 classification_models = [
     "resnet-18-pytorch",
@@ -118,6 +118,8 @@
     "YOLO",
     "YoloV3ONNX",
     "YoloV4",
+    "YOLOv5",
+    "YOLOv8",
     "YOLOF",
     "YOLOX",
     "ClassificationResult",