Merge pull request #3552 from wdkwyf/smartlab_models

[Smartlab]Update smartlab models
openvinotoolkit · Nov 4, 2022 · e7f5ffa · e7f5ffa
2 parents e801239 + 6d06462
commit e7f5ffa
Show file tree

Hide file tree

Showing 10 changed files with 295 additions and 244 deletions.
diff --git a/data/dataset_definitions.yml b/data/dataset_definitions.yml
@@ -1462,6 +1462,17 @@ datasets:
       input_suffix: in
       reference_suffix: out
 
+  -   name: online_mstcn_plus_encoder_dataset
+      data_source: annotation
+      reader: numpy_reader
+      annotation_conversion:
+          converter: feature_regression
+          reference_dir: image
+          input_dir: image
+          input_suffix: in
+          reference_suffix: out
+          use_bin_data: True
+
   - name: smartlab_detection_10cl_top
     data_source: object_detection/streams_1/top/images
     annotation_conversion:

diff --git a/models/intel/device_support.md b/models/intel/device_support.md
@@ -136,8 +136,10 @@
 | yolo-v2-tiny-ava-sparse-30-0001 | YES | YES | YES |
 | yolo-v2-tiny-ava-sparse-60-0001 | YES | YES | YES |
 | yolo-v2-tiny-vehicle-detection-0001 | YES | YES | YES |
-| smartlab-object-detection-0001 | YES |  |  |
-| smartlab-object-detection-0002 | YES |  |  |
-| smartlab-object-detection-0003 | YES |  |  |
-| smartlab-object-detection-0004 | YES |  |  |
+| smartlab-object-detection-0001 | YES | YES  |  |
+| smartlab-object-detection-0002 | YES | YES |  |
+| smartlab-object-detection-0003 | YES | YES |  |
+| smartlab-object-detection-0004 | YES | YES |  |
+| smartlab-action-recognition-0001 | YES | YES |  |
 | smartlab-sequence-modelling-0001 | YES | YES |  |
+| smartlab-sequence-modelling-0002 | YES | YES |  |
diff --git a/models/intel/index.md b/models/intel/index.md
@@ -475,8 +475,12 @@ Deep Learning models for online sequence modeling.
 
 | Model Name | Complexity (GFLOPs) | Size (Mp) |
 |------------|---------------------|-----------|
-| [smartlab-sequence-modelling-0001](./smartlab-sequence-modelling-0001/README.md) | 0.049   | 1.02    |
-
+| [smartlab-sequence-modelling-0001](./smartlab-sequence-modelling-0001/README.md) | 0.11   | 2.537    |
+| [smartlab-sequence-modelling-0002](./smartlab-sequence-modelling-0002/README.md) | 0.049   | 1.02    |
+| [smartlab-action-recognition-0001](./smartlab-action-recognition-0001/README.md) |      |        |
+|   smartlab-action-recognition-0001-encoder-side                                    | 0.611   | 3.387   |
+|   smartlab-action-recognition-0001-encoder-top                                    | 0.611   | 3.387   |
+|   smartlab-action-recognition-0001-decoder                                   | 0.008   | 4.099   |
 ## See Also
 
 * [Open Model Zoo Demos](../../demos/README.md)

diff --git a/models/intel/smartlab-action-recognition-0001/README.md b/models/intel/smartlab-action-recognition-0001/README.md
@@ -0,0 +1,75 @@
+# smartlab-action-recognition-0001 (composite)
+
+## Use Case and High-Level Description
+
+There are 3 models for smartlab action recognition, including two encoder models and one decoder model.
+
+These models are fine-tuned with the smartlab dataset to predict actions and can classify 3 types of action: "noise_action", "put_take" and "adjust_rider".
+
+## Example of the input data
+![](./assets/frame0001.jpg)
+
+## Example of the output
+Output `put_take` action
+
+## Composite model specification
+| Metric                                         | Value              |
+| ---------------------------------------------- | ------------------ |
+| Accuracy on the DSI1867                        | TODO               |
+| Source framework                               | PyTorch\*          |
+
+## Encoder models specification
+
+The smartlab-action-recognition-0001-encoder-* models have a Mobilenet-V2-like backbone and serve as the convolutional encoder part of the action recognition.
+
+There are two models, `smartlab-action-recognition-0001-encoder-side` and `smartlab-action-recognition-0001-encoder-top`, which have the same structure but different weights.
+
+| Metric  | Value |
+| ------- | ----- |
+| GFlops  | 0.611 |
+| MParams | 3.387 |
+
+### Inputs
+
+Image, name: `input_image`, shape: `1, 3, 224, 224` in the `B, C, H, W` format, where:
+
+- `B` - batch size
+- `C` - number of channels
+- `H` - image height
+- `W` - image width
+Expected color order is `BGR`
+
+### Outputs
+
+1. Name: `output_feature`, shape: `1, 1280`. Features from the encoder part of the action recognition head.
+
+## Decoder model specification
+
+The smartlab-action-recognition-0001-decoder is a fully connected decoder part which accepts features from top and front views, computed by the encoders, and predicts scores for actions across the following label list: `noise_action`, `put_take`, `adjust_rider`.
+
+| Metric  | Value |
+| ------- | ----- |
+| GFlops  | 0.008 |
+| MParams | 4.099 |
+
+### Inputs
+
+1. Name: `input_feature_1`, shape: `1, 1280`. Encoded features from topview.
+2. Name: `input_feature_2`, shape: `1, 1280`. Encoded features from frontview.
+
+### Outputs
+
+1. Name: `decoder_hidden`, shape: `1, 3`. The format is [`has_action_conf_score`, `action_1_logits`, `action_2_logits`]
+    * `has_action_conf_score` - confidence that the frame is an action frame. If it is greater than 0.5, the frame contains one of the specified actions.
+    * `action_1_logits` - confidence for the put_take action class
+    * `action_2_logits` - confidence for the adjust_rider action class
+
+Classification confidence scores in the [0, 1] range.
+## Demo usage
+The model can be used in the following demos provided by the Open Model Zoo to show its capabilities:
+
+- [smartlab_demo/python](../../../demos/smartlab_demo/python/README.md)
+
+## Legal Information
+
+[*] Other names and brands may be claimed as the property of others.
diff --git a/models/intel/smartlab-action-recognition-0001/assets/frame0001.jpg b/models/intel/smartlab-action-recognition-0001/assets/frame0001.jpg
diff --git a/models/intel/smartlab-sequence-modelling-0001/README.md b/models/intel/smartlab-sequence-modelling-0001/README.md
@@ -1,153 +1,33 @@
 # smartlab-sequence-modelling-0001
 
 ## Use Case and High-Level Description
-This is an online action segmentation network for 16 classes trained on Intel dataset. It is an online version of MSTCN++. The difference between online MSTCN++ and MSTCN++ is that the former accept stream video as input while the latter assume the whole video is given.
-
-For the original MSTCN++ model details see [paper](https://arxiv.org/abs/2006.09220)
+This is a feature extractor that is based on the Mobilenet-v3 network without the original classifier layer. The input is an RGB image and the output is a feature vector.
+For the original mobilenet-v3 model details see [PyTorch\* document](https://pytorch.org/vision/stable/models/generated/torchvision.models.mobilenet_v3_small.html#torchvision.models.mobilenet_v3_small) and [paper](https://arxiv.org/abs/1905.02244).
 
 ## Specification
 
 | Metric                          | Value                                     |
 |---------------------------------|-------------------------------------------|
-| GOPs                            | 0.048915                                  |
-| MParams                         | 1.018179                                  |
+| GOPs                            | 0.11                                  |
+| MParams                         | 2.537                                  |
 | Source framework                | PyTorch\*                                 |
 
-## Accuracy
-<table>
-    <tr>
-        <th colspan="2">Accuracy</th>
-        <th>noise/background</th>
-        <th>remove_support_sleeve</th>
-        <th>adjust_rider</th>
-        <th>adjust_nut</th>
-        <th>adjust_balancing</th>
-        <th>open_box</th>
-        <th>close_box</th>
-        <th>choose_weight</th>
-        <th>put_left</th>
-        <th>put_right</th>
-        <th>take_left</th>
-        <th>take_right</th>
-        <th>install support_sleeve</th>
-        <th>mean</th>
-        <th>mPR (P+R)/2</th>
-    </tr>
-    <tbody>
-        <tr>
-            <td rowspan=2>frame-level</td>
-            <td rowspan=1>precision</td>
-            <td>0.22</td>
-            <td>0.84</td>
-            <td>0.81</td>
-            <td>0.62</td>
-            <td>0.67</td>
-            <td>0.87</td>
-            <td>0.56</td>
-            <td>0.52</td>
-            <td>0.54</td>
-            <td>0.74</td>
-            <td>0.62</td>
-            <td>0.68</td>
-            <td>0.86</td>
-            <td>0.66</td>
-            <td rowspan=2>0.66</td>
-        </tr>
-        <tr>
-            <td rowspan=1>recall</td>
-            <td>0.4</td>
-            <td>0.95</td>
-            <td>0.83</td>
-            <td>0.86</td>
-            <td>0.43</td>
-            <td>0.8</td>
-            <td>0.31</td>
-            <td>0.52</td>
-            <td>0.68</td>
-            <td>0.65</td>
-            <td>0.62</td>
-            <td>0.51</td>
-            <td>0.92</td>
-            <td>0.65</td>
-        </tr>
-        <tr>
-            <td rowspan=2>segment IOU</td>
-            <td rowspan=1>precision</td>
-            <td>0.38</td>
-            <td>0.94</td>
-            <td>0.77</td>
-            <td>0.65</td>
-            <td>0.6</td>
-            <td>0.85</td>
-            <td>0.56</td>
-            <td>0.68</td>
-            <td>0.74</td>
-            <td>0.88</td>
-            <td>0.72</td>
-            <td>0.78</td>
-            <td>0.69</td>
-            <td>0.7</td>
-            <td rowspan=2>0.77</td>
-        </tr>
-        <tr>
-            <td>recall</td>
-            <td>0.64</td>
-            <td>1</td>
-            <td>0.96</td>
-            <td>0.94</td>
-            <td>0.62</td>
-            <td>0.96</td>
-            <td>0.48</td>
-            <td>0.77</td>
-            <td>0.91</td>
-            <td>0.88</td>
-            <td>0.83</td>
-            <td>0.85</td>
-            <td>1</td>
-            <td>0.83</td>
-        </tr>
-    </tbody>
-</table>
 
-Notice: In the accuracy report, feature extraction network is i3d-rgb, you can get this model from `../../public/i3d-rgb-tf/README.md`.
 
 ## Inputs
-The inputs to the network are feature vectors at each video frame, which should be the output of feature extraction network, such as [i3d-rgb-tf](../../public/i3d-rgb-tf/README.md) and [resnet-50-tf](../../public/resnet-50-tf/README.md), and feature outputs of the previous frame.
-
-You can check the i3d-rgb and smartlab-sequence-modelling-0001 usage in demos/smartlab_demo
 
-1. Input feature, name: `input`, shape: `1, 2048, 24`, format: `B, W, H`, where:
+Image, name: `input`, shape: `1, 3, 224, 224`, format: `B, C, H, W`, where:
 
    - `B` - batch size
-   - `W` - feature map width
-   - `H` - feature map height
-
-2. History feature 1, name: `fhis_in_0`, shape: `12, 64, 2048`, format: `C, H', W`,
-3. History feature 2, name: `fhis_in_1`, shape: `11, 64, 2048`, format: `C, H', W`,
-4. History feature 3, name: `fhis_in_2`, shape: `11, 64, 2048`, format: `C, H', W`,
-5. History feature 4, name: `fhis_in_3`, shape: `11, 64, 2048`, format: `C, H', W`, where:
+   - `C` - number of channels
+   - `H` - image height
+   - `W` - image width
 
-   - `C` - the channel number of feature vector
-   - `H`- feature map height
-   - `W` - feature map width
 
 ## Outputs
 
-The outputs also include two parts: predictions and four feature outputs. Predictions is the action classification and prediction results. Four Feature maps are the model layer features in past frames.
-1. Prediction, name: `output`, shape: `4, 1, 64, 24`, format: `C, B, H, W`,
-   - `C` - the channel number of feature vector
-   - `B` - batch size
-   - `H`- feature map height
-   - `W` - feature map width
-After post-process with argmax() function, the prediction result can be used to decide the action type of the current frame.
-2. History feature 1, name: `fhis_out_0`, shape: `12, 64, 2048`, format: `C, H, W`,
-3. History feature 2, name: `fhis_out_1`, shape: `11, 64, 2048`, format: `C, H, W`,
-4. History feature 3, name: `fhis_out_2`, shape: `11, 64, 2048`, format: `C, H, W`,
-5. History feature 4, name: `fhis_out_3`, shape: `11, 64, 2048`, format: `C, H, W`, where:
-
-   - `C` - the channel number of feature vector
-   - `H`- feature map height
-   - `W` - feature map width
+The model output has name: `output`, shape: `1, 576, 1, 1`.
+`576` is the length of the feature vector.
 
 ## Legal Information
 [*] Other names and brands may be claimed as the property of others.
diff --git a/models/intel/smartlab-sequence-modelling-0001/accuracy-check.yml b/models/intel/smartlab-sequence-modelling-0001/accuracy-check.yml
@@ -3,62 +3,20 @@ models:
     launchers:
       - framework: openvino
         adapter:
-          type: multi_output_regression
-          ignore_batch: True
-          outputs:
-            - output
-            - fhis_out_0
-            - fhis_out_1
-            - fhis_out_2
-            - fhis_out_3
+          type: regression
         inputs:
           - name: input
             type: INPUT
             value: .*input
-            shape: [1, 2048, 24]
-            layout: NHWC
-          - name: fhis_in_0
-            type: INPUT
-            value: .*fhis_in_0
-            shape: [12, 64, 2048]
-            layout: NHWC
-          - name: fhis_in_1
-            type: INPUT
-            value: .*fhis_in_1
-            shape: [11, 64, 2048]
-            layout: NHWC
-          - name: fhis_in_2
-            type: INPUT
-            value: .*fhis_in_2
-            shape: [11, 64, 2048]
-            layout: NHWC
-          - name: fhis_in_3
-            type: INPUT
-            value: .*fhis_in_3
-            shape: [11, 64, 2048]
-            layout: NHWC
-        allow_reshape_input: True
+            shape: [1, 3, 224, 224]
+            layout: NCHW
     datasets:
-      - name: online_mstcn_plus_dataset
+      - name: online_mstcn_plus_encoder_dataset
         metrics:
           - type: mae
-            max_error: True
             presenter: print_vector
             abs_threshold: 0.1
             rel_threshold: 1
             reference:
-              output@max_error: 1e-5
-              output@mean: 1e-5
-              output@std: 0
-              fhis_out_0@max_error: 1e-5
-              fhis_out_0@mean: 1e-5
-              fhis_out_0@std: 0
-              fhis_out_1@max_error: 1e-5
-              fhis_out_1@mean: 1e-5
-              fhis_out_1@std: 0
-              fhis_out_2@max_error: 1e-5
-              fhis_out_2@mean: 1e-5
-              fhis_out_2@std: 0
-              fhis_out_3@max_error: 1e-5
-              fhis_out_3@mean: 1e-5
-              fhis_out_3@std: 0
+              mean: 0.05
+              std: 0.1