optimized conv1d, no regression in all conv1d cases.
zihaomu committed Dec 20, 2022
1 parent 13bede9 commit 5084ff5
Showing 2 changed files with 310 additions and 110 deletions.
116 changes: 90 additions & 26 deletions modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
@@ -75,10 +75,9 @@ Ptr<FastConv> initFastConv(

if (ifRunDepthWise)
{
if (conv_dim == CONV_1D)
if (conv_dim == CONV_1D || conv_dim == CONV_3D)
{
ifRunDepthWise &= Hk == 1 && Wk == 3 && (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
&& max(stride_w, dilation_w) >= conv->pad_left && conv->pad_left <= 1;
ifRunDepthWise = false;
}
else if (conv_dim == CONV_2D)
{
@@ -87,7 +86,7 @@ Ptr<FastConv> initFastConv(
&& conv->pad_left <= 1 && conv->pad_top <= 1;
}

if (!ifRunDepthWise || conv_dim == CONV_3D)
if (!ifRunDepthWise)
{
ifRunDepthWise = false;
ifRunDepthWiseRemain = true;
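
After this hunk, 1-D and 3-D depth-wise convolutions no longer qualify for the dedicated depth-wise kernel; they drop into the generic per-channel ("depth-wise remain") path instead. A self-contained illustration of that routing is sketched below; the variable names mirror initFastConv, but the values and the stripped-down 2-D condition are assumptions made for the example, not code from the patch.

#include <cstdio>

enum ConvDim { CONV_1D, CONV_2D, CONV_3D };

int main()
{
    ConvDim conv_dim = CONV_1D;        // assumption for the example
    bool ifRunDepthWise = true;        // the layer is already known to be depth-wise
    bool ifRunDepthWiseRemain = false;

    // 1-D and 3-D cases no longer use the dedicated depth-wise kernel.
    if (conv_dim == CONV_1D || conv_dim == CONV_3D)
        ifRunDepthWise = false;
    // (the CONV_2D branch keeps its 3x3 / stride / padding checks, elided here)

    if (!ifRunDepthWise)
        ifRunDepthWiseRemain = true;   // generic per-channel path, CONV_MR == 1

    std::printf("depthwise=%d remain=%d\n", (int)ifRunDepthWise, (int)ifRunDepthWiseRemain);
    return 0;
}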
@@ -417,7 +416,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
}
}

int MAX_STRIPES = 32; // (56 + CONV_NR - 1)/CONV_NR;
int MAX_STRIPES = (56 + CONV_NR - 1)/CONV_NR;

// Friendly to L1 cache
const int K_BLOCK_SIZE = conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN ? 1 : 32;
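
In the hunk above, the hard-coded MAX_STRIPES = 32 becomes a ceiling division of the 56-wide output block by the micro-kernel width CONV_NR, presumably so that per-task buffers are sized to what a stripe pass can actually use. A minimal check of the arithmetic, using the CONV_NR values quoted in a comment further down in this diff (28, 24, 12); the little helper below is only for illustration:

#include <cstdio>

// Integer ceiling division, the same form as (56 + CONV_NR - 1) / CONV_NR.
static int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main()
{
    // CONV_NR values mentioned in a comment later in this diff: 28, 24, 12.
    const int conv_nr_values[] = {28, 24, 12};
    for (int conv_nr : conv_nr_values)
        std::printf("CONV_NR=%2d -> MAX_STRIPES=%d\n", conv_nr, ceil_div(56, conv_nr));
    // Prints 2, 3 and 5, well below the previous fixed value of 32.
    return 0;
}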
@@ -435,7 +434,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
else
Kg_nblocks = 1;

bool bigKernelDepthPack = conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN && ksize > 25; // Spacial data pack branch for big kernel depth-wise.
bool bigKernelDepthPack = conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN && (ksize > 25 || conv_dim == CONV_1D); // Spacial data pack branch for big kernel depth-wise.

int Kstripes = Kg_nblocks*stripes_per_sample;
int nsubtasks = N*ngroups*Kstripes;
@@ -553,7 +552,58 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
int z0 = zyx0 / HW0, yx0 = zyx0 - z0 * HW0;
int y0 = yx0 / W0, x0 = yx0 - y0 * W0;

if (conv_dim == CONV_1D || conv_dim == CONV_2D)
if (conv_dim == CONV_1D)
{
for (int slice_i = 0; slice_i < slice_len; y0++, x0=0)
{
int delta = std::min(slice_len - slice_i, W0 - x0);
int x1 = x0 + delta;

int in_w = x0 * stride_w - pad_left;
float* inptrIn = inptr + in_w;

int s0 = slice_i;

for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w)
{
if (x0 + 2 <= x1 && 0 <= in_w &&
in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w)
{
float* inpbufC = inpbuf + s0;
float* inptrInC = inptrIn;

for (int k = 0; k < ksize; k++)
{
int k1 = ofstab[k];
float v0 = inptrInC[k1];
float v1 = inptrInC[k1 + stride_w];
inpbufC[k*CONV_NRCg] = v0;
inpbufC[k*CONV_NRCg+1] = v1;
}

x0++;
s0++;
inptrIn += stride_w;
in_w += stride_w;
}
else
{
int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w);
int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w);

float* inpbufC = inpbuf + s0;
float* inptrInC = inptrIn;
for (int w = w0; w < w1; w++)
{
int imgofs = w*dilation_w;
inpbufC[w*CONV_NRCg] = inptrInC[imgofs];
}
}
}
slice_i += delta;
}
}
if (conv_dim == CONV_2D)
{
for (int slice_i = 0; slice_i < slice_len; y0++, x0=0)
{
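
The new CONV_1D branch above is an im2row-style packing step: for each output position it gathers the Wk taps (offsets taken from ofstab) into inpbuf with a stride of CONV_NRCg, so that neighbouring output positions occupy adjacent lanes for the micro-kernel; when two consecutive outputs are fully inside the input row it packs both in one iteration, and near the borders it clips the tap range to [w0, w1) and leaves the remaining lanes untouched. A simplified, standalone sketch of the same idea follows; pack_row_1d, the toy sizes, and the pre-zeroed buffer are assumptions for illustration, not code from the patch.

#include <algorithm>
#include <cstdio>
#include <vector>

// Simplified im2row packing for one 1-D stripe: lane `s` of the packed buffer
// holds the Wk taps of output position x0 + s, stored with stride `lanes`
// (CONV_NR in the real code). Out-of-range taps stay zero.
static void pack_row_1d(const float* in, int Wi, float* packed, int lanes,
                        int x0, int count, int stride_w, int dilation_w,
                        int pad_left, int Wk)
{
    for (int s = 0; s < count; s++)
    {
        int in_w = (x0 + s) * stride_w - pad_left;   // leftmost tap of this output
        int w0 = std::max(0, (-in_w + dilation_w - 1) / dilation_w);
        int w1 = std::min(Wk, (Wi - in_w + dilation_w - 1) / dilation_w);
        for (int w = w0; w < w1; w++)
            packed[w * lanes + s] = in[in_w + w * dilation_w];
    }
}

int main()
{
    const int Wi = 8, Wk = 3, lanes = 4;             // toy sizes
    std::vector<float> in = {1, 2, 3, 4, 5, 6, 7, 8};
    std::vector<float> packed(Wk * lanes, 0.f);      // pre-zeroed, like inpbuf
    pack_row_1d(in.data(), Wi, packed.data(), lanes,
                /*x0=*/0, /*count=*/4, /*stride_w=*/1, /*dilation_w=*/1,
                /*pad_left=*/1, Wk);
    for (int k = 0; k < Wk; k++, std::printf("\n"))
        for (int s = 0; s < lanes; s++)
            std::printf("%4.0f", packed[k * lanes + s]);   // tap k of outputs 0..3
    return 0;
}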
@@ -799,28 +849,31 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co

// spacial branch for depth-wise convolution implemented using generic convolution.
// In this case, CONV_MR is 1, and CONV_NR is the same.
if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN) {
if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN)
{
size_t outofs = (n * ngroups + g) * out_planesize + zyx0;
float *cptr0 = cbuf_task;
float *weights = conv->weightsBufPtr + g * padded_ksize;
int out_width = zyx_block_limit - zyx0;
float *outptr = out + outofs;
const float biasVal = *(conv->biasBuf.data() + g);
for (int stripe = 0; stripe < nstripes; stripe++) {
for (int stripe = 0; stripe < nstripes; stripe++)
{
const float *inptr = inpbuf_task + stripe * stripesize;
const int outLen = out_width - stripe * CONV_NR;
const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR);
bool ifBuffer = outLen < CONV_NR;
float *cptr = outptr + stripe * CONV_NR;
if (ifBuffer) {
if (ifBuffer)
{
memcpy(cptr0, cptr, outLen * sizeof(cptr[0]));
cptr = cptr0;
}
#if CV_TRY_AVX2
if (conv->useAVX2)
if (conv->useAVX2 && outLen > CONV_NR/3)
opt_AVX2::convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct);
else
else
#endif
convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct);
convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen);

if (ifBuffer)
{
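
In the depth-wise fallback above, outLen is now clamped to CONV_NR per stripe, passed down to convBlockMR1, and the AVX2 kernel is only dispatched when the stripe is wide enough (outLen > CONV_NR/3); short tails go through a scratch block so the micro-kernel can still write a full CONV_NR-wide row. A toy version of that tail pattern is sketched below; microkernel_full, the CONV_NR value of 8 and the bias handling are placeholders, not the real convBlockMR1.

#include <algorithm>
#include <cstdio>
#include <cstring>

constexpr int CONV_NR = 8;   // toy micro-kernel width; the real values are 28/24/12

// Placeholder micro-kernel: always produces a full CONV_NR-wide block.
static void microkernel_full(float* c, float bias)
{
    for (int i = 0; i < CONV_NR; i++)
        c[i] = bias + float(i);
}

int main()
{
    float out[19];                       // output row that is not a multiple of CONV_NR
    std::memset(out, 0, sizeof(out));
    const int out_width = 19;
    float scratch[CONV_NR];              // plays the role of cbuf_task / cptr0

    const int nstripes = (out_width + CONV_NR - 1) / CONV_NR;
    for (int stripe = 0; stripe < nstripes; stripe++)
    {
        const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR);
        float* cptr = out + stripe * CONV_NR;
        const bool tail = outLen < CONV_NR;
        if (tail)
        {
            std::memcpy(scratch, cptr, outLen * sizeof(float)); // keep existing values
            cptr = scratch;                          // kernel may write CONV_NR floats
        }
        microkernel_full(cptr, /*bias=*/1.f);
        if (tail)
            std::memcpy(out + stripe * CONV_NR, scratch, outLen * sizeof(float));
    }
    std::printf("last value: %f\n", out[out_width - 1]);
    return 0;
}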
@@ -837,33 +890,44 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
int ldc = nstripes * CONV_NR;

// 2. do convolution, compute Kg x (zyx_block_limit - zyx0) part of the output tensor
for (int k0_block = k0; k0_block < k1; k0_block += K_BLOCK_SIZE) {
int out_width = zyx_block_limit - zyx0;
for (int k0_block = k0; k0_block < k1; k0_block += K_BLOCK_SIZE)
{
int k1_block = k0_block + K_BLOCK_SIZE < k1 ? k0_block + K_BLOCK_SIZE : k1;
for (int c0 = 0; c0 < DkHkWkCg; c0 += C_BLOCK_SIZE) {
for (int c0 = 0; c0 < DkHkWkCg; c0 += C_BLOCK_SIZE)
{
int c1 = c0 + C_BLOCK_SIZE < DkHkWkCg ? c0 + C_BLOCK_SIZE : DkHkWkCg;
for (int stripe = 0; stripe < nstripes; stripe++) {
for (int stripe = 0; stripe < nstripes; stripe++)
{
const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR);

#if CV_TRY_AVX2 || CV_TRY_NEON
// The possible CONV_NR is 28, 24, 12, so the possible CONV_NR/3 is 9, 8, 4.
bool runOpt = outLen > std::min(8, CONV_NR/3);
#endif
float *wptr = weights + k0_block * DkHkWkCg + c0 * CONV_MR;
const float *inptr = inpbuf_task + stripe * stripesize + c0 * CONV_NR;
float *cptr = cbuf_task + stripe * CONV_NR;
for (int k = k0_block; k < k1_block; k += CONV_MR,
wptr += DkHkWkCg * CONV_MR, cptr += CONV_MR * ldc) {
wptr += DkHkWkCg * CONV_MR, cptr += CONV_MR * ldc)
{
#if CV_TRY_AVX2
if (conv->useAVX2)
opt_AVX2::convBlock_AVX2(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0);
else
if (conv->useAVX2 && runOpt)
opt_AVX2::convBlock_AVX2(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0);
else
#endif
#if CV_TRY_NEON
if (conv->useNEON)
opt_NEON::convBlock_NEON(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0);
else
if (conv->useNEON && runOpt)
opt_NEON::convBlock_NEON(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0);
else
#endif
convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0);
// The possible outLen range is 24 or 8~1.
convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, outLen);
}
}
}

size_t outofs = ((n * ngroups + g) * Kg + k0_block) * out_planesize + zyx0;
int out_width = zyx_block_limit - zyx0;
const float *cptr = cbuf_task;

float *outptr = out + outofs;
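
The generic path in this last hunk gets the same treatment: outLen is computed per stripe, convBlock_AVX2 / convBlock_NEON are only used when outLen > std::min(8, CONV_NR/3), and the scalar convBlock now receives outLen so it can stop at the valid width. A compact dispatch sketch under those assumptions follows; the threshold is copied from the diff, while run_stripe and the kernel bodies are made up for the example.

#include <algorithm>
#include <cstdio>

constexpr int CONV_NR = 24;  // one of the widths mentioned in the diff comments

// Placeholder kernels; the real ones are convBlock_AVX2 / convBlock_NEON / convBlock.
static void convBlock_simd(int n, float* c)   { std::printf("simd   n=%d\n", n); (void)c; }
static void convBlock_scalar(int n, float* c) { std::printf("scalar n=%d\n", n); (void)c; }

// Dispatch one stripe: use the vectorized kernel only when the valid width
// is large enough to amortize it, mirroring outLen > std::min(8, CONV_NR/3).
static void run_stripe(int out_width, int stripe, float* c, bool haveSimd)
{
    const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR);
    const bool runOpt = haveSimd && outLen > std::min(8, CONV_NR / 3);
    if (runOpt)
        convBlock_simd(outLen, c);    // full-width kernel; extra lanes stay in scratch
    else
        convBlock_scalar(outLen, c);  // scalar kernel stops at outLen
}

int main()
{
    float c[CONV_NR] = {};
    run_stripe(/*out_width=*/50, /*stripe=*/0, c, /*haveSimd=*/true);  // outLen=24 -> simd
    run_stripe(/*out_width=*/50, /*stripe=*/2, c, /*haveSimd=*/true);  // outLen=2  -> scalar
    return 0;
}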
