
Commit

fused the convolution 3d and 1d into the new implementation.
zihaomu committed Dec 10, 2022
1 parent 281b790 commit 54cf528
Showing 6 changed files with 285 additions and 2,363 deletions.
839 changes: 10 additions & 829 deletions modules/dnn/src/layers/convolution_layer.cpp

Large diffs are not rendered by default.

100 changes: 56 additions & 44 deletions modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
@@ -16,7 +16,7 @@ namespace cv { namespace dnn {

static void depthWiseBlock(const float *inptr, float *outptr, const float *weights, float biasval, int *ofstab, int *yxtab,
float minval, float maxval, int Hi, int Wi, int H0, int W0, int ksize, int pad_top, int pad_left,
int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop,
int dilation_h, int stride_w, int stride_h, int inner_xleft, int inner_xright, int inner_ytop,
int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
{
#if CV_SIMD128
@@ -46,11 +46,11 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
for (int y0 = 0; y0 < H0; y0 += dy0, outptr += W0 * dy0)
{
#if CV_SIMD128
dy0 = inner_ytop <= y0 && y0 + 3 < inner_ybottom && is3x3 && stride_y == 1 && dilation_y == 1
dy0 = inner_ytop <= y0 && y0 + 3 < inner_ybottom && is3x3 && stride_h == 1 && dilation_h == 1
? 3 : 1;
#endif
int x0 = 0, x1 = y0 >= inner_ytop && y0 < inner_ybottom ? inner_xleft : W0;
int yi_ = y0 * stride_y - pad_top;
int yi_ = y0 * stride_h - pad_top;

for (;;)
{
@@ -59,7 +59,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
{
for (; x0 < x1; x0++)
{
int xi_ = x0 * stride_x - pad_left;
int xi_ = x0 * stride_w - pad_left;
s_0 = s_1 = s_2 = biasval;
for (int k = 0; k < ksize; k++)
{
@@ -87,7 +87,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
{
for (; x0 < x1; x0++)
{
int xi_ = x0 * stride_x - pad_left;
int xi_ = x0 * stride_w - pad_left;
s_0 = biasval;
for (int k = 0; k < ksize; k++) {
int dy = yxtab[k * 2];
@@ -113,7 +113,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
{
for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
{
int xi_ = x0 * stride_x - pad_left;
int xi_ = x0 * stride_w - pad_left;
const float *inptr_xi = inptr + Wi * yi_ + xi_;

v_float32x4 s0, s1, s2;
@@ -189,7 +189,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
{
for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
{
int xi_ = x0 * stride_x - pad_left;
int xi_ = x0 * stride_w - pad_left;
const float *inptr_xi = inptr + Wi * yi_ + xi_;
v_float32x4 s0 = v_fma(v_load(inptr_xi + ofstab[0]), w0, vbias);
v_float32x4 s1 = v_load(inptr_xi + ofstab[1]) * w1;
@@ -214,7 +214,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
{
for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
{
int xi_ = x0 * stride_x - pad_left, k = 0;
int xi_ = x0 * stride_w - pad_left, k = 0;
const float *inptr_xi = inptr + Wi * yi_ + xi_;
v_float32x4 s0 = vbias;
for (; k <= ksize - 4; k += 4)
@@ -248,7 +248,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
{
for (; x0 < x1; x0++)
{
int xi_ = x0 * stride_x - pad_left;
int xi_ = x0 * stride_w - pad_left;
const float *inptr_xi = inptr + W0 * yi_ + xi_;
s_0 = s_1 = s_2 = biasval;
for (int k = 0; k < ksize; k++)
@@ -275,7 +275,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
{
for (; x0 < x1; x0++)
{
int xi_ = x0 * stride_x - pad_left;
int xi_ = x0 * stride_w - pad_left;
const float *inptr_xi = inptr + Wi * yi_ + xi_;
s_0 = biasval;
for (int k = 0; k < ksize; k++)
@@ -293,24 +293,37 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
}
}

void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct) {
void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv>& conv, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct) {
Mat input = _input.getMat();
Mat output = _output.getMat();
MatShape inputShape = shape(input);
MatShape outputShape = shape(output);
CV_Assert(inputShape.size() == 4 && outputShape.size() == 4);

int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3]; // [N, C, H, W]
// Only Conv1D and Conv2D are supported for Depth-wise convolution.
CV_Assert(inputShape.size() == 3 || inputShape.size() == 4);
CV_Assert(inputShape.size() == outputShape.size());

int conv_dim = conv->conv_dim;
CV_Assert(conv_dim == CONV_2D || conv_dim == CONV_1D);

int N = inputShape[0], C = inputShape[1];

int Hi = conv_dim == CONV_1D ? 1 : inputShape[inputShape.size() - 2];
int Wi = inputShape[inputShape.size() - 1];

int K = conv->K, Hk = conv->Hk, Wk = conv->Wk;
int H0 = outputShape[2], W0 = outputShape[3], ngroups = conv->ngroups;

int H0 = conv_dim == CONV_1D ? 1 : outputShape[outputShape.size() - 2];
int W0 = outputShape[outputShape.size() - 1];
int ngroups = conv->ngroups;

const size_t inp_planesize = (size_t) Hi * Wi;
const size_t out_planesize = (size_t) H0 * W0;

CV_Assert(ngroups > 1 && ngroups == K && ngroups == C);

int stride_y = conv->stride_y, stride_x = conv->stride_x;
int dilation_y = conv->dilation_y, dilation_x = conv->dilation_x;
int stride_h = conv->stride_h, stride_w = conv->stride_w;
int dilation_h = conv->dilation_h, dilation_w = conv->dilation_w;

int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom;
int pad_left = conv->pad_left, pad_right = conv->pad_right;
@@ -333,22 +333,22 @@ void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>&
{
int y = k < ksize ? k / Wk : 0;
int x = k < ksize ? k % Wk : 0;
int dy = y * dilation_y, dx = x * dilation_x;
int dy = y * dilation_h, dx = x * dilation_w;
yxtab[k * 2] = dy;
yxtab[k * 2 + 1] = dx;
ofstab[k] = dy * Wi + dx;
}

const float *weights0 = conv->weightsBufPtr, *bias = conv->biasBuf.data();
int inner_ytop = (pad_bottom + stride_y - 1) / stride_y, inner_ybottom = 3;
int inner_xleft = (pad_left + stride_x - 1) / stride_x, inner_xright = 4;
int inner_ytop = (pad_bottom + stride_h - 1) / stride_h, inner_ybottom = 3;
int inner_xleft = (pad_left + stride_w - 1) / stride_w, inner_xright = 4;

CV_Assert(ksize > 1 || (pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0));

inner_xright = (Wi - (Wk - 1) * dilation_x + pad_left) / stride_x;
inner_xright += inner_xright * stride_x - pad_left + (Wk - 1) * dilation_x < Wi;
inner_ybottom = (Hi - (Hk - 1) * dilation_y + pad_top) / stride_y;
inner_ybottom += inner_ybottom * stride_y - pad_top + (Hk - 1) * dilation_y < Hi;
inner_xright = (Wi - (Wk - 1) * dilation_w + pad_left) / stride_w;
inner_xright += inner_xright * stride_w - pad_left + (Wk - 1) * dilation_w < Wi;
inner_ybottom = (Hi - (Hk - 1) * dilation_h + pad_top) / stride_h;
inner_ybottom += inner_ybottom * stride_h - pad_top + (Hk - 1) * dilation_h < Hi;

if (inner_xleft >= inner_xright || inner_ytop >= inner_ybottom)
{
@@ -358,34 +358,33 @@ void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>&

inner_ybottom = inner_ybottom < H0 ? inner_ybottom : H0;

bool useSIMD = stride_x == 1 && inner_xleft < W0;
bool useSIMD = stride_w == 1 && inner_xleft < W0;
bool is3x3 = Hk == 3 && Wk == 3;

parallel_for_(Range(0, N * C), [&](const Range &r0) {
for (int nc = r0.start; nc < r0.end; nc++)
{
int c = nc % C;
const float *inptr = inp + inp_planesize * nc;
float *outptr0 = out + out_planesize * nc;
for (int nc = r0.start; nc < r0.end; nc++)
{
int c = nc % C;
const float *inptr = inp + inp_planesize * nc;
float *outptr0 = out + out_planesize * nc;

float biasval = bias[c];
const float *weights = weights0 + c * padded_ksize;
float biasval = bias[c];
const float *weights = weights0 + c * padded_ksize;

#if CV_TRY_AVX2
if (conv->useAVX2)
opt_AVX2::depthWiseBlock_AVX2(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize,
pad_top, pad_left, dilation_y, stride_x, stride_y, inner_xleft, inner_xright, inner_ytop,
inner_ybottom, ifMinMaxAct, useSIMD, is3x3);
else
if (conv->useAVX2)
opt_AVX2::depthWiseBlock_AVX2(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize,
pad_top, pad_left, dilation_h, stride_w, stride_h, inner_xleft, inner_xright, inner_ytop,
inner_ybottom, ifMinMaxAct, useSIMD, is3x3);
else
#endif
depthWiseBlock(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize,
pad_top, pad_left, dilation_y, stride_x, stride_y, inner_xleft, inner_xright, inner_ytop,
inner_ybottom, ifMinMaxAct, useSIMD, is3x3);
depthWiseBlock(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize,
pad_top, pad_left, dilation_h, stride_w, stride_h, inner_xleft, inner_xright, inner_ytop,
inner_ybottom, ifMinMaxAct, useSIMD, is3x3);

if (activ)
activ->forwardSlice(outptr0, outptr0, (int) out_planesize, out_planesize, c, c+1);
}
});
if (activ)
activ->forwardSlice(outptr0, outptr0, (int) out_planesize, out_planesize, c, c+1);
}});
}

}} // namespace cv::dnn
}} // namespace cv::dnn
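A note on how the 1-D case is folded into the 2-D depthwise path (a minimal sketch, not part of the commit's diff): runDepthwise now accepts 3-D [N, C, W] input as well as 4-D [N, C, H, W], and for CONV_1D it treats the height as 1 so the same depthWiseBlock kernel serves both layouts. The sketch below mirrors the shape handling added above; std::vector<int> stands in for MatShape, and the helper name planeGeometry is illustrative, not OpenCV API.

#include <vector>

enum ConvDim { CONV_1D, CONV_2D };            // mirrors conv->conv_dim in the patch

struct PlaneGeometry { int Hi, Wi, H0, W0; };

// For CONV_1D the tensors are [N, C, W]: only the innermost dimension is spatial,
// so the height is fixed to 1. For CONV_2D the last two dimensions are H and W.
static PlaneGeometry planeGeometry(const std::vector<int>& inputShape,
                                   const std::vector<int>& outputShape,
                                   ConvDim conv_dim)
{
    PlaneGeometry g;
    g.Hi = conv_dim == CONV_1D ? 1 : inputShape[inputShape.size() - 2];
    g.Wi = inputShape[inputShape.size() - 1];
    g.H0 = conv_dim == CONV_1D ? 1 : outputShape[outputShape.size() - 2];
    g.W0 = outputShape[outputShape.size() - 1];
    return g;
}

With Hi = H0 = 1, the per-channel plane sizes inp_planesize = Hi * Wi and out_planesize = H0 * W0 reduce to the 1-D lengths, and the rest of the depthwise loop (padding, strides, the SIMD inner region) runs unchanged.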
