G-API Smart Framing Demo #3421
Changes from 12 commits
@@ -38,6 +38,7 @@
omz_demos_image_retrieval_demo_python
omz_demos_segmentation_demo_cpp
omz_demos_segmentation_demo_python
omz_demos_smart_framing_demo_cpp
omz_demos_image_translation_demo_python
omz_demos_instance_segmentation_demo_python
omz_demos_interactive_face_detection_demo_cpp
@@ -139,7 +140,8 @@ The Open Model Zoo includes the following demos:
- [Pedestrian Tracker C++ Demo](./pedestrian_tracker_demo/cpp/README.md) - Demo application for pedestrian tracking scenario.
- [Place Recognition Python\* Demo](./place_recognition_demo/python/README.md) - This demo demonstrates how to run Place Recognition models using OpenVINO™.
- [Security Barrier Camera C++ Demo](./security_barrier_camera_demo/cpp/README.md) - Vehicle Detection followed by the Vehicle Attributes and License-Plate Recognition, supports images/video and camera inputs.
- [Smart Framing C++ Demo](./smart_framing_demo/cpp_gapi/README.md) - Person Detection followed by the Smart Framing/Cropping and optionally Super Resolution, supports images/video and camera inputs.
- [Speech Recognition DeepSpeech Python\* Demo](./speech_recognition_deepspeech_demo/python/README.md) - Speech recognition demo: accepts an audio file with an English phrase on input and converts it into text. This demo does streaming audio data processing and can optionally provide current transcription of the processed part.
- [Speech Recognition QuartzNet Python\* Demo](./speech_recognition_quartznet_demo/python/README.md) - Speech recognition demo for QuartzNet: takes a whole audio file with an English phrase on input and converts it into text.
- [Speech Recognition Wav2Vec Python\* Demo](./speech_recognition_wav2vec_demo/python/README.md) - Speech recognition demo for Wav2Vec: takes a whole audio file with an English phrase on input and converts it into text.
- [Single Human Pose Estimation Python\* Demo](./single_human_pose_estimation_demo/python/README.md) - 2D human pose estimation demo.

> **Review:** Smart Framing C++ G-API Demo
> **Reply:** Done
@@ -0,0 +1,13 @@
# Copyright (C) 2022 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

file(GLOB_RECURSE SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
file(GLOB_RECURSE HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp)

add_demo(NAME smart_framing_demo_gapi
    SOURCES ${SOURCES}
    HEADERS ${HEADERS}
    INCLUDE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}/include"
    DEPENDENCIES monitors utils_gapi
    OPENCV_VERSION_REQUIRED 4.5.5)
@@ -0,0 +1,114 @@
# G-API Smart Framing Demo

This demo shows how to perform smart framing using G-API.

> **Review:** Please add a gif to show how the demo looks.

> **NOTE**: Only batch size of 1 is supported.
## How It Works
The demo application expects a yolo-v4-tiny-tf.xml object detection model in the Intermediate Representation (IR) format.
If super resolution is enabled (the default behaviour), the demo application also expects a single-image-super-resolution-1032.xml or single-image-super-resolution-1033.xml super resolution model in the IR format.

The use case for the demo is an online conference where only the people should be shown and most of the background cropped away. Super resolution can optionally be applied to minimize upscaling artifacts.
As input, the demo application accepts a path to a single image file, a video file, or a numeric ID of a web camera, specified with the command-line argument `-i`.

The demo workflow is the following:

1. The demo application reads image/video frames one by one and resizes them to fit into the input image blob of the network (`image`).
2. The demo visualizes the resulting smart framing.
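The cropping step behind smart framing can be pictured as the union of all detected person boxes, padded by a margin and clamped to the frame. The sketch below is a minimal stand-alone illustration under that assumption; `Box` and `smartFrame` are hypothetical names, not the demo's actual code:

```cpp
#include <algorithm>
#include <vector>

// Minimal axis-aligned box; stands in for cv::Rect2f in this sketch.
struct Box { float x, y, w, h; };

// Union all detected person boxes, pad by `margin` (a fraction of the
// union's size), and clamp to the frame. With no detections, the whole
// frame is returned unchanged.
Box smartFrame(const std::vector<Box>& persons, int frameW, int frameH,
               float margin = 0.1f) {
    if (persons.empty()) return {0.f, 0.f, float(frameW), float(frameH)};
    float x0 = persons[0].x, y0 = persons[0].y;
    float x1 = x0 + persons[0].w, y1 = y0 + persons[0].h;
    for (const Box& b : persons) {
        x0 = std::min(x0, b.x);
        y0 = std::min(y0, b.y);
        x1 = std::max(x1, b.x + b.w);
        y1 = std::max(y1, b.y + b.h);
    }
    const float padX = (x1 - x0) * margin, padY = (y1 - y0) * margin;
    x0 = std::max(0.f, x0 - padX);
    y0 = std::max(0.f, y0 - padY);
    x1 = std::min(float(frameW), x1 + padX);
    y1 = std::min(float(frameH), y1 + padY);
    return {x0, y0, x1 - x0, y1 - y0};
}
```

The resulting crop is then upscaled back to the output size, which is where the optional super resolution model comes in.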

> **NOTE**: By default, Open Model Zoo demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the demo application or reconvert your model using the Model Optimizer tool with the `--reverse_input_channels` argument specified. For more information about the argument, refer to the **When to Reverse Input Channels** section of [Embedding Preprocessing Computation](@ref openvino_docs_MO_DG_Additional_Optimization_Use_Cases).
## Preparing to Run

For demo input image or video files, refer to the section **Media Files Available for Demos** in the [Open Model Zoo Demos Overview](../../README.md).
The list of models supported by the demo is in the `<omz_dir>/demos/smart_framing_demo/cpp_gapi/models.lst` file.
This file can be used as a parameter for [Model Downloader](../../../tools/model_tools/README.md) and Converter to download and, if necessary, convert models to OpenVINO IR format (\*.xml + \*.bin).

An example of using the Model Downloader:

```sh
omz_downloader --list models.lst
```

An example of using the Model Converter:

```sh
omz_converter --list models.lst
```
### Supported Models

* yolo-v4-tiny-tf
* single-image-super-resolution-1032
* single-image-super-resolution-1033

> **NOTE**: Refer to the tables [Intel's Pre-Trained Models Device Support](../../../models/intel/device_support.md) and [Public Pre-Trained Models Device Support](../../../models/public/device_support.md) for the details on models inference support at different devices.
## Running

Run the application with the `-h` option to see the following usage message:

```
[ INFO ] OpenVINO Runtime version ......... <version>
[ INFO ] Build ........... <build>

smart_framing_demo_gapi [OPTION]
Options:

    -h                         Print a usage message.
    -i                         Required. An input to process. The input must be a single image, a folder of images, video file or camera id.
    -loop                      Optional. Enable reading the input in a loop.
    -o "<path>"                Optional. Name of the output file(s) to save.
    -limit "<num>"             Optional. Number of frames to store in output. If 0 is set, all frames are stored.
    -res "<WxH>"               Optional. Set camera resolution in format WxH.
    -m_yolo "<path>"           Required. Path to an .xml file with a trained YOLO v4 Tiny model.
    -at_sr "<type>"            Required if Super Resolution is not disabled by the apply_sr=false flag. Architecture type: Super Resolution - 3 channels input (3ch) or 1 channel input (1ch).
    -m_sr "<path>"             Required if Super Resolution is not disabled by the apply_sr=false flag. Path to an .xml file with a trained Super Resolution model.
    -kernel_package "<string>" Optional. G-API kernel package type: opencv, fluid (by default opencv is used).
    -d_yolo "<device>"         Optional. Target device for the YOLO v4 Tiny network (the list of available devices is shown below). The demo will look for a suitable plugin for a specified device. Default value is "CPU".
    -d_sr "<device>"           Optional. Target device for the Super Resolution network (the list of available devices is shown below). The demo will look for a suitable plugin for a specified device. Default value is "CPU".
    -t_conf_yolo               Optional. YOLO v4 Tiny confidence threshold.
    -t_box_iou_yolo            Optional. YOLO v4 Tiny box IOU threshold.
    -advanced_pp               Optional. Use advanced post-processing for the YOLO v4 Tiny.
    -apply_sr                  Optional. Use the Super Resolution post-processing model.
    -nireq "<integer>"         Optional. Number of infer requests. If this option is omitted, number of infer requests is determined automatically.
    -nthreads "<integer>"      Optional. Number of threads.
    -nstreams                  Optional. Number of streams to use for inference on the CPU or/and GPU in throughput mode (for HETERO and MULTI device cases use format <device1>:<nstreams1>,<device2>:<nstreams2> or just <nstreams>)
    -no_show                   Optional. Don't show output.
    -u                         Optional. List of monitors to show initially.

Available target devices: <targets>
```

> **Review** (on `-at_sr`): Both single-image-super-resolution (1032 and 1033) models accept 3 channel images. What for 1 channel input support?
> **Reply:** Some use cases and SR models require 1ch input. 3ch OMZ models are currently the default, but we want to have optional 1ch scenario support in place.

> **Review** (on `-apply_sr`): Can't we decide whether to apply super resolution post-processing based on whether the model was provided or not?
> **Reply:** Done
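The `-t_box_iou_yolo` threshold is compared against the intersection-over-union of candidate detection boxes during post-processing. A minimal stand-alone sketch of that metric (the `BoxF` type and `iou` helper are hypothetical names for illustration, not the demo's implementation):

```cpp
#include <algorithm>

// Axis-aligned box as (x, y, w, h), matching the usual detection layout.
struct BoxF { float x, y, w, h; };

// Intersection-over-union of two boxes: overlap area divided by the area
// of the union. Returns 0 for disjoint boxes.
float iou(const BoxF& a, const BoxF& b) {
    const float ix0 = std::max(a.x, b.x);
    const float iy0 = std::max(a.y, b.y);
    const float ix1 = std::min(a.x + a.w, b.x + b.w);
    const float iy1 = std::min(a.y + a.h, b.y + b.h);
    const float iw = std::max(0.f, ix1 - ix0);
    const float ih = std::max(0.f, iy1 - iy0);
    const float inter = iw * ih;
    const float uni = a.w * a.h + b.w * b.h - inter;
    return uni > 0.f ? inter / uni : 0.f;
}
```

During non-maximum suppression, a candidate box whose IoU with an already-kept box exceeds the threshold is discarded as a duplicate.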
Running the application with an empty list of options yields the short version of the usage message and an error message.

To run the demo, please provide paths to the models in the IR format and to an input video, image, or folder with images:

```bash
./smart_framing_demo_gapi -m_yolo <path_to_model> -i <path_to_file>
```

> **NOTE**: If you provide a single image as an input, the demo processes and renders it quickly, then exits. To continuously visualize inference results on the screen, apply the `-loop` option, which enforces processing a single image in a loop.

You can save processed results to a Motion JPEG AVI file or separate JPEG or PNG files using the `-o` option:

* To save processed results in an AVI file, specify the name of the output file with the `avi` extension, for example: `-o output.avi`.
* To save processed results as images, specify the template name of the output image file with a `jpg` or `png` extension, for example: `-o output_%03d.jpg`. The actual file names are constructed from the template at runtime by replacing the placeholder `%03d` with the frame number, resulting in the following: `output_000.jpg`, `output_001.jpg`, and so on.

To avoid disk space overrun in case of a continuous input stream, like a camera, you can limit the amount of data stored in the output file(s) with the `-limit` option. The default value is 1000. To change it, you can apply the `-limit N` option, where `N` is the number of frames to store.

> **NOTE**: Windows\* systems may not have the Motion JPEG codec installed by default. If this is the case, you can download the OpenCV FFMPEG back end using the PowerShell script provided with the OpenVINO™ install package and located at `<INSTALL_DIR>/opencv/ffmpeg-download.ps1`. The script should be run with administrative privileges if OpenVINO™ is installed in a system protected folder (this is a typical case). Alternatively, you can save results as images.
## Demo Output

The application uses OpenCV to display the resulting images.
The demo reports:

* **FPS**: average rate of video frame processing (frames per second).

## See Also

* [Open Model Zoo Demos](../../README.md)
* [Model Optimizer](https://docs.openvino.ai/latest/openvino_docs_MO_DG_Deep_Learning_Model_Optimizer_DevGuide.html)
* [Model Downloader](../../../tools/model_tools/README.md)
@@ -0,0 +1,136 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <opencv2/gapi/gkernel.hpp>
#include <opencv2/gapi/infer/ie.hpp>

#include <inference_engine.hpp>

namespace IE = InferenceEngine;

> **Review:** Does it make sense?
> **Reply:** Done

namespace custom {

const std::vector<std::string> coco_classes = {

> **Review:** I think you can load this from omz/data/dataset_classes/coco_80cl.txt
> **Reply:** @mpashchenkov are you aware of any examples of doing so for classes?
> **Follow-up:** FYI I think you can just add
"person", //0 | ||
"bicycle", //1 | ||
"car", //2 | ||
"motorcycle", //3 | ||
"airplane", //4 | ||
"bus", //5 | ||
"train", //6 | ||
"truck", //7 | ||
"boat", //8 | ||
"traffic light", //9 | ||
"fire hydrant", //10 | ||
"stop sign", //11 | ||
"parking meter", //12 | ||
"bench", //13 | ||
"bird", //14 | ||
"cat", //15 | ||
"dog", //16 | ||
"horse", //17 | ||
"sheep", //18 | ||
"cow", //19 | ||
"elephant", //20 | ||
"bear", //21 | ||
"zebra", //22 | ||
"giraffe", //23 | ||
"backpack", //24 | ||
"umbrella", //25 | ||
"handbag", //26 | ||
"tie", //27 | ||
"suitcase", //28 | ||
"frisbee", //29 | ||
"skis", //30 | ||
"snowboard", //31 | ||
"sports ball", //32 | ||
"kite", //33 | ||
"baseball bat", //34 | ||
"baseball glove", //35 | ||
"skateboard", //36 | ||
"surfboard", //37 | ||
"tennis racket", //38 | ||
"bottle", //39 | ||
"wine glass", //40 | ||
"cup", //41 | ||
"fork", //42 | ||
"knife", //43 | ||
"spoon", //44 | ||
"bowl", //45 | ||
"banana", //46 | ||
"apple", //47 | ||
"sandwich", //48 | ||
"orange", //49 | ||
"broccoli", //50 | ||
"carrot", //51 | ||
"hot dog", //52 | ||
"pizza", //53 | ||
"donut", //54 | ||
"cake", //55 | ||
"chair", //56 | ||
"couch", //57 | ||
"potted plant", //58 | ||
"bed", //59 | ||
"dining table", //60 | ||
"toilet", //61 | ||
"tv", //62 | ||
"laptop", //63 | ||
"mouse", //64 | ||
"remote", //65 | ||
"keyboard", //66 | ||
"cell phone", //67 | ||
"microwave", //68 | ||
"oven", //69 | ||
"toaster", //70 | ||
"sink", //71 | ||
"refrigerator", //72 | ||
"book", //73 | ||
"clock", //74 | ||
"vase", //75 | ||
"scissors", //76 | ||
"teddy bear", //77 | ||
"hair drier", //78 | ||
"toothbrush" //79 | ||
}; | ||
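As the reviewer suggests above, this class list could instead be loaded at runtime from a one-label-per-line file such as `coco_80cl.txt`. A hedged sketch of such a loader (the `loadLabels` helper is a hypothetical name; the demo currently keeps the list inline):

```cpp
#include <fstream>
#include <sstream>
#include <string>
#include <vector>

// Read class labels from a stream, one label per line. Blank lines are
// skipped and trailing carriage returns are tolerated (CRLF files).
std::vector<std::string> loadLabels(std::istream& in) {
    std::vector<std::string> labels;
    std::string line;
    while (std::getline(in, line)) {
        if (!line.empty() && line.back() == '\r') line.pop_back();
        if (!line.empty()) labels.push_back(line);
    }
    return labels;
}
```

It could be called as, e.g., `std::ifstream f("coco_80cl.txt"); auto classes = loadLabels(f);`, with the file path taken from a command-line flag.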

struct DetectedObject : public cv::Rect2f
{
    unsigned int labelID;
    std::string label;
    float confidence;
};

> **Review** (on lines +16 to +21): I doubt subclassing from Rect2f is a good idea. Not sure if it is bad, either. The reason we usually don't use structures like this in the pipeline, and instead produce multiple arrays of different types, is that it is easier to process those arrays with other functions like

using GDetections = cv::GArray<DetectedObject>;

G_API_OP(GYOLOv4TinyPostProcessingKernel, <GDetections(cv::GMat, cv::GMat, cv::GMat, float, float, bool)>, "custom.yolov4_tiny_post_processing") {
    static cv::GArrayDesc outMeta(const cv::GMatDesc&, const cv::GMatDesc&, const cv::GMatDesc&, const float, const float, const bool) {
        return cv::empty_array_desc();
    }
};

G_API_OP(GSmartFramingKernel, <cv::GMat(cv::GMat, GDetections)>, "custom.smart_framing") {
    static cv::GMatDesc outMeta(const cv::GMatDesc& in, const cv::GArrayDesc&) {
        return in;
    }
};

> **Review:** I assume it should be

G_API_OP(GSuperResolutionPostProcessingKernel, <cv::GMat(cv::GMat)>, "custom.super_resolution_post_processing") {
    static cv::GMatDesc outMeta(const cv::GMatDesc& in) {
        cv::GMatDesc out_desc(CV_8U /* depth */, in.dims[1] /* channels */, cv::Size(in.dims[3], in.dims[2]), false /* planar */);
        return out_desc;
    }
};

> **Review:** Please drop the Kernel suffix from all the operations.

G_API_OP(GCvt32Fto8U, <cv::GMat(cv::GMat)>, "custom.convertFP32ToU8") {
    static cv::GMatDesc outMeta(const cv::GMatDesc& in) {
        // NB: Input is an ND mat.
        return cv::GMatDesc{CV_8U, in.dims[1], cv::Size(in.dims[3], in.dims[2])};
    }
};

> **Review:** Don't we have this in our base framework? Should we plan extending the existing function instead?
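Element-wise, an FP32-to-U8 conversion like the one `GCvt32Fto8U` describes amounts to round-and-saturate into [0, 255]. A stand-alone approximation is sketched below; note OpenCV's `saturate_cast` rounds to nearest even, so this `std::lround`-based version may differ on exact .5 values, and the function name here is only borrowed from the op for illustration:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Round each float to the nearest integer and clamp to the uint8 range.
// Approximates cv::saturate_cast<uchar> applied per element.
std::vector<std::uint8_t> convertFP32ToU8(const std::vector<float>& src) {
    std::vector<std::uint8_t> dst;
    dst.reserve(src.size());
    for (float v : src) {
        const long r = std::lround(v);
        dst.push_back(static_cast<std::uint8_t>(std::min(255L, std::max(0L, r))));
    }
    return dst;
}
```

This is the per-element semantics only; the real kernel also reshapes the ND tensor into an interleaved image, as the `outMeta` above indicates.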

cv::gapi::GKernelPackage kernels();

} // namespace custom

> **Review:** omz_demos_smart_framing_demo_cpp --> omz_demos_smart_framing_demo_cpp_gapi
> **Reply:** Done