optimized conv1d, no regression in all conv1d cases.
zihaomu committed Dec 20, 2022
1 parent 13bede9 commit 5084ff5
Showing 2 changed files with 310 additions and 110 deletions.
116 changes: 90 additions & 26 deletions modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
@@ -75,10 +75,9 @@ Ptr<FastConv> initFastConv(

if (ifRunDepthWise)
{
if (conv_dim == CONV_1D)
if (conv_dim == CONV_1D || conv_dim == CONV_3D)
{
ifRunDepthWise &= Hk == 1 && Wk == 3 && (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
&& max(stride_w, dilation_w) >= conv->pad_left && conv->pad_left <= 1;
ifRunDepthWise = false;
}
else if (conv_dim == CONV_2D)
{
@@ -87,7 +86,7 @@ Ptr<FastConv> initFastConv(
&& conv->pad_left <= 1 && conv->pad_top <= 1;
}

if (!ifRunDepthWise || conv_dim == CONV_3D)
if (!ifRunDepthWise)
{
ifRunDepthWise = false;
ifRunDepthWiseRemain = true;
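
After this hunk, 1-D and 3-D depth-wise convolutions no longer qualify for the dedicated depth-wise kernel; they drop into the generic per-channel ("depth-wise remain") path instead. A self-contained illustration of that routing is sketched below; the variable names mirror initFastConv, but the values and the stripped-down 2-D condition are assumptions made for the example, not code from the patch.

#include <cstdio>

enum ConvDim { CONV_1D, CONV_2D, CONV_3D };

int main()
{
    ConvDim conv_dim = CONV_1D;        // assumption for the example
    bool ifRunDepthWise = true;        // the layer is already known to be depth-wise
    bool ifRunDepthWiseRemain = false;

    // 1-D and 3-D cases no longer use the dedicated depth-wise kernel.
    if (conv_dim == CONV_1D || conv_dim == CONV_3D)
        ifRunDepthWise = false;
    // (the CONV_2D branch keeps its 3x3 / stride / padding checks, elided here)

    if (!ifRunDepthWise)
        ifRunDepthWiseRemain = true;   // generic per-channel path, CONV_MR == 1

    std::printf("depthwise=%d remain=%d\n", (int)ifRunDepthWise, (int)ifRunDepthWiseRemain);
    return 0;
}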
@@ -417,7 +416,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
}
}

int MAX_STRIPES = 32; // (56 + CONV_NR - 1)/CONV_NR;
int MAX_STRIPES = (56 + CONV_NR - 1)/CONV_NR;

// Friendly to L1 cache
const int K_BLOCK_SIZE = conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN ? 1 : 32;
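
In the hunk above, the hard-coded MAX_STRIPES = 32 becomes a ceiling division of the 56-wide output block by the micro-kernel width CONV_NR, presumably so that per-task buffers are sized to what a stripe pass can actually use. A minimal check of the arithmetic, using the CONV_NR values quoted in a comment further down in this diff (28, 24, 12); the little helper below is only for illustration:

#include <cstdio>

// Integer ceiling division, the same form as (56 + CONV_NR - 1) / CONV_NR.
static int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main()
{
    // CONV_NR values mentioned in a comment later in this diff: 28, 24, 12.
    const int conv_nr_values[] = {28, 24, 12};
    for (int conv_nr : conv_nr_values)
        std::printf("CONV_NR=%2d -> MAX_STRIPES=%d\n", conv_nr, ceil_div(56, conv_nr));
    // Prints 2, 3 and 5, well below the previous fixed value of 32.
    return 0;
}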
@@ -435,7 +434,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
else
Kg_nblocks = 1;

bool bigKernelDepthPack = conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN && ksize > 25; // Spacial data pack branch for big kernel depth-wise.
bool bigKernelDepthPack = conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN && (ksize > 25 || conv_dim == CONV_1D); // Spacial data pack branch for big kernel depth-wise.

int Kstripes = Kg_nblocks*stripes_per_sample;
int nsubtasks = N*ngroups*Kstripes;
@@ -553,7 +552,58 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
int z0 = zyx0 / HW0, yx0 = zyx0 - z0 * HW0;
int y0 = yx0 / W0, x0 = yx0 - y0 * W0;

if (conv_dim == CONV_1D || conv_dim == CONV_2D)
if (conv_dim == CONV_1D)
{
for (int slice_i = 0; slice_i < slice_len; y0++, x0=0)
{
int delta = std::min(slice_len - slice_i, W0 - x0);
int x1 = x0 + delta;

int in_w = x0 * stride_w - pad_left;
float* inptrIn = inptr + in_w;

int s0 = slice_i;

for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w)
{
if (x0 + 2 <= x1 && 0 <= in_w &&
in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w)
{
float* inpbufC = inpbuf + s0;
float* inptrInC = inptrIn;

for (int k = 0; k < ksize; k++)
{
int k1 = ofstab[k];
float v0 = inptrInC[k1];
float v1 = inptrInC[k1 + stride_w];
inpbufC[k*CONV_NRCg] = v0;
inpbufC[k*CONV_NRCg+1] = v1;
}

x0++;
s0++;
inptrIn += stride_w;
in_w += stride_w;
}
else
{
int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w);
int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w);

float* inpbufC = inpbuf + s0;
float* inptrInC = inptrIn;
for (int w = w0; w < w1; w++)
{
int imgofs = w*dilation_w;
inpbufC[w*CONV_NRCg] = inptrInC[imgofs];
}
}
}
slice_i += delta;
}
}
if (conv_dim == CONV_2D)
{
for (int slice_i = 0; slice_i < slice_len; y0++, x0=0)
{
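
The new CONV_1D branch above is an im2row-style packing step: for each output position it gathers the Wk taps (offsets taken from ofstab) into inpbuf with a stride of CONV_NRCg, so that neighbouring output positions occupy adjacent lanes for the micro-kernel; when two consecutive outputs are fully inside the input row it packs both in one iteration, and near the borders it clips the tap range to [w0, w1) and leaves the remaining lanes untouched. A simplified, standalone sketch of the same idea follows; pack_row_1d, the toy sizes, and the pre-zeroed buffer are assumptions for illustration, not code from the patch.

#include <algorithm>
#include <cstdio>
#include <vector>

// Simplified im2row packing for one 1-D stripe: lane `s` of the packed buffer
// holds the Wk taps of output position x0 + s, stored with stride `lanes`
// (CONV_NR in the real code). Out-of-range taps stay zero.
static void pack_row_1d(const float* in, int Wi, float* packed, int lanes,
                        int x0, int count, int stride_w, int dilation_w,
                        int pad_left, int Wk)
{
    for (int s = 0; s < count; s++)
    {
        int in_w = (x0 + s) * stride_w - pad_left;   // leftmost tap of this output
        int w0 = std::max(0, (-in_w + dilation_w - 1) / dilation_w);
        int w1 = std::min(Wk, (Wi - in_w + dilation_w - 1) / dilation_w);
        for (int w = w0; w < w1; w++)
            packed[w * lanes + s] = in[in_w + w * dilation_w];
    }
}

int main()
{
    const int Wi = 8, Wk = 3, lanes = 4;             // toy sizes
    std::vector<float> in = {1, 2, 3, 4, 5, 6, 7, 8};
    std::vector<float> packed(Wk * lanes, 0.f);      // pre-zeroed, like inpbuf
    pack_row_1d(in.data(), Wi, packed.data(), lanes,
                /*x0=*/0, /*count=*/4, /*stride_w=*/1, /*dilation_w=*/1,
                /*pad_left=*/1, Wk);
    for (int k = 0; k < Wk; k++, std::printf("\n"))
        for (int s = 0; s < lanes; s++)
            std::printf("%4.0f", packed[k * lanes + s]);   // tap k of outputs 0..3
    return 0;
}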
@@ -799,28 +849,31 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co

// spacial branch for depth-wise convolution implemented using generic convolution.
// In this case, CONV_MR is 1, and CONV_NR is the same.
if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN) {
if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN)
{
size_t outofs = (n * ngroups + g) * out_planesize + zyx0;
float *cptr0 = cbuf_task;
float *weights = conv->weightsBufPtr + g * padded_ksize;
int out_width = zyx_block_limit - zyx0;
float *outptr = out + outofs;
const float biasVal = *(conv->biasBuf.data() + g);
for (int stripe = 0; stripe < nstripes; stripe++) {
for (int stripe = 0; stripe < nstripes; stripe++)
{
const float *inptr = inpbuf_task + stripe * stripesize;
const int outLen = out_width - stripe * CONV_NR;
const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR);
bool ifBuffer = outLen < CONV_NR;
float *cptr = outptr + stripe * CONV_NR;
if (ifBuffer) {
if (ifBuffer)
{
memcpy(cptr0, cptr, outLen * sizeof(cptr[0]));
cptr = cptr0;
}
#if CV_TRY_AVX2
if (conv->useAVX2)
if (conv->useAVX2 && outLen > CONV_NR/3)
opt_AVX2::convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct);
else
else
#endif
convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct);
convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen);

if (ifBuffer)
{
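
In the depth-wise fallback above, outLen is now clamped to CONV_NR per stripe, passed down to convBlockMR1, and the AVX2 kernel is only dispatched when the stripe is wide enough (outLen > CONV_NR/3); short tails go through a scratch block so the micro-kernel can still write a full CONV_NR-wide row. A toy version of that tail pattern is sketched below; microkernel_full, the CONV_NR value of 8 and the bias handling are placeholders, not the real convBlockMR1.

#include <algorithm>
#include <cstdio>
#include <cstring>

constexpr int CONV_NR = 8;   // toy micro-kernel width; the real values are 28/24/12

// Placeholder micro-kernel: always produces a full CONV_NR-wide block.
static void microkernel_full(float* c, float bias)
{
    for (int i = 0; i < CONV_NR; i++)
        c[i] = bias + float(i);
}

int main()
{
    float out[19];                       // output row that is not a multiple of CONV_NR
    std::memset(out, 0, sizeof(out));
    const int out_width = 19;
    float scratch[CONV_NR];              // plays the role of cbuf_task / cptr0

    const int nstripes = (out_width + CONV_NR - 1) / CONV_NR;
    for (int stripe = 0; stripe < nstripes; stripe++)
    {
        const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR);
        float* cptr = out + stripe * CONV_NR;
        const bool tail = outLen < CONV_NR;
        if (tail)
        {
            std::memcpy(scratch, cptr, outLen * sizeof(float)); // keep existing values
            cptr = scratch;                          // kernel may write CONV_NR floats
        }
        microkernel_full(cptr, /*bias=*/1.f);
        if (tail)
            std::memcpy(out + stripe * CONV_NR, scratch, outLen * sizeof(float));
    }
    std::printf("last value: %f\n", out[out_width - 1]);
    return 0;
}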
@@ -837,33 +890,44 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
int ldc = nstripes * CONV_NR;

// 2. do convolution, compute Kg x (zyx_block_limit - zyx0) part of the output tensor
for (int k0_block = k0; k0_block < k1; k0_block += K_BLOCK_SIZE) {
int out_width = zyx_block_limit - zyx0;
for (int k0_block = k0; k0_block < k1; k0_block += K_BLOCK_SIZE)
{
int k1_block = k0_block + K_BLOCK_SIZE < k1 ? k0_block + K_BLOCK_SIZE : k1;
for (int c0 = 0; c0 < DkHkWkCg; c0 += C_BLOCK_SIZE) {
for (int c0 = 0; c0 < DkHkWkCg; c0 += C_BLOCK_SIZE)
{
int c1 = c0 + C_BLOCK_SIZE < DkHkWkCg ? c0 + C_BLOCK_SIZE : DkHkWkCg;
for (int stripe = 0; stripe < nstripes; stripe++) {
for (int stripe = 0; stripe < nstripes; stripe++)
{
const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR);

#if CV_TRY_AVX2 || CV_TRY_NEON
// The possible CONV_NR is 28, 24, 12, so the possible CONV_NR/3 is 9, 8, 4.
bool runOpt = outLen > std::min(8, CONV_NR/3);
#endif
float *wptr = weights + k0_block * DkHkWkCg + c0 * CONV_MR;
const float *inptr = inpbuf_task + stripe * stripesize + c0 * CONV_NR;
float *cptr = cbuf_task + stripe * CONV_NR;
for (int k = k0_block; k < k1_block; k += CONV_MR,
wptr += DkHkWkCg * CONV_MR, cptr += CONV_MR * ldc) {
wptr += DkHkWkCg * CONV_MR, cptr += CONV_MR * ldc)
{
#if CV_TRY_AVX2
if (conv->useAVX2)
opt_AVX2::convBlock_AVX2(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0);
else
if (conv->useAVX2 && runOpt)
opt_AVX2::convBlock_AVX2(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0);
else
#endif
#if CV_TRY_NEON
if (conv->useNEON)
opt_NEON::convBlock_NEON(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0);
else
if (conv->useNEON && runOpt)
opt_NEON::convBlock_NEON(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0);
else
#endif
convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0);
// The possible outLen range is 24 or 8~1.
convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, outLen);
}
}
}

size_t outofs = ((n * ngroups + g) * Kg + k0_block) * out_planesize + zyx0;
int out_width = zyx_block_limit - zyx0;
const float *cptr = cbuf_task;

float *outptr = out + outofs;
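
The generic path in this last hunk gets the same treatment: outLen is computed per stripe, convBlock_AVX2 / convBlock_NEON are only used when outLen > std::min(8, CONV_NR/3), and the scalar convBlock now receives outLen so it can stop at the valid width. A compact dispatch sketch under those assumptions follows; the threshold is copied from the diff, while run_stripe and the kernel bodies are made up for the example.

#include <algorithm>
#include <cstdio>

constexpr int CONV_NR = 24;  // one of the widths mentioned in the diff comments

// Placeholder kernels; the real ones are convBlock_AVX2 / convBlock_NEON / convBlock.
static void convBlock_simd(int n, float* c)   { std::printf("simd   n=%d\n", n); (void)c; }
static void convBlock_scalar(int n, float* c) { std::printf("scalar n=%d\n", n); (void)c; }

// Dispatch one stripe: use the vectorized kernel only when the valid width
// is large enough to amortize it, mirroring outLen > std::min(8, CONV_NR/3).
static void run_stripe(int out_width, int stripe, float* c, bool haveSimd)
{
    const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR);
    const bool runOpt = haveSimd && outLen > std::min(8, CONV_NR / 3);
    if (runOpt)
        convBlock_simd(outLen, c);    // full-width kernel; extra lanes stay in scratch
    else
        convBlock_scalar(outLen, c);  // scalar kernel stops at outLen
}

int main()
{
    float c[CONV_NR] = {};
    run_stripe(/*out_width=*/50, /*stripe=*/0, c, /*haveSimd=*/true);  // outLen=24 -> simd
    run_stripe(/*out_width=*/50, /*stripe=*/2, c, /*haveSimd=*/true);  // outLen=2  -> scalar
    return 0;
}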
