optimized conv1d and conv3d
zihaomu committed Dec 20, 2022
1 parent 13bede9 commit 8f44578
Showing 2 changed files with 422 additions and 129 deletions.
247 changes: 202 additions & 45 deletions modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
@@ -77,8 +77,12 @@ Ptr<FastConv> initFastConv(
{
if (conv_dim == CONV_1D)
{
#if CV_NEON
ifRunDepthWise = false;
#else
ifRunDepthWise &= Hk == 1 && Wk == 3 && (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
&& max(stride_w, dilation_w) >= conv->pad_left && conv->pad_left <= 1;
#endif
}
else if (conv_dim == CONV_2D)
{
@@ -267,6 +271,83 @@ Ptr<FastConv> initFastConv(
return conv;
}

static void packData8(float*& inpbuf, float*& inptrIn, int& in_w, int& x0, int& s0, const int* ofstab,
const int stride_w, const int ksize)
{
float* inpbufC = inpbuf + s0;
float* inptrInC = inptrIn;

if (stride_w == 1)
for (int k = 0; k < ksize; k++)
{
int k1 = ofstab[k];
float v0 = inptrInC[k1];
float v1 = inptrInC[k1 + 1];
float v2 = inptrInC[k1 + 2];
float v3 = inptrInC[k1 + 3];
float v4 = inptrInC[k1 + 4];
float v5 = inptrInC[k1 + 5];
float v6 = inptrInC[k1 + 6];
float v7 = inptrInC[k1 + 7];

inpbufC[k*CONV_NR] = v0;
inpbufC[k*CONV_NR+1] = v1;
inpbufC[k*CONV_NR+2] = v2;
inpbufC[k*CONV_NR+3] = v3;
inpbufC[k*CONV_NR+4] = v4;
inpbufC[k*CONV_NR+5] = v5;
inpbufC[k*CONV_NR+6] = v6;
inpbufC[k*CONV_NR+7] = v7;
}
else
for (int k = 0; k < ksize; k++)
{
int k1 = ofstab[k];
float v0 = inptrInC[k1];
float v1 = inptrInC[k1 + stride_w];
float v2 = inptrInC[k1 + 2*stride_w];
float v3 = inptrInC[k1 + 3*stride_w];
float v4 = inptrInC[k1 + 4*stride_w];
float v5 = inptrInC[k1 + 5*stride_w];
float v6 = inptrInC[k1 + 6*stride_w];
float v7 = inptrInC[k1 + 7*stride_w];

inpbufC[k*CONV_NR] = v0;
inpbufC[k*CONV_NR+1] = v1;
inpbufC[k*CONV_NR+2] = v2;
inpbufC[k*CONV_NR+3] = v3;
inpbufC[k*CONV_NR+4] = v4;
inpbufC[k*CONV_NR+5] = v5;
inpbufC[k*CONV_NR+6] = v6;
inpbufC[k*CONV_NR+7] = v7;
}
x0+=7;
s0+=7;
inptrIn += 7*stride_w;
in_w += 7*stride_w;
}

static void packData2(float*& inpbuf, float*& inptrIn, int& in_w, int& x0, int& s0, const int* ofstab,
const int stride_w, const int ksize)
{
float* inpbufC = inpbuf + s0;
float* inptrInC = inptrIn;

for (int k = 0; k < ksize; k++)
{
int k1 = ofstab[k];
float v0 = inptrInC[k1];
float v1 = inptrInC[k1 + stride_w];
inpbufC[k*CONV_NR] = v0;
inpbufC[k*CONV_NR+1] = v1;
}

x0++;
s0++;
inptrIn += stride_w;
in_w += stride_w;
}
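
For context, the two helpers above interleave the input samples of several consecutive output columns into the packed buffer with a stride of CONV_NR: row k of the packed block holds the samples seen by kernel tap k. A minimal standalone sketch of that layout, not part of this commit, assuming CONV_NR = 8, a single 1-D input row and ofstab[k] = k*dilation_w:

#include <cstdio>
#include <vector>

int main()
{
    // Assumed toy parameters; the real code takes these from the FastConv descriptor.
    const int CONV_NR = 8, ksize = 3, stride_w = 1, dilation_w = 1;

    std::vector<float> input(32);
    for (int i = 0; i < (int)input.size(); i++)
        input[i] = (float)i;

    // Offset table: input offset of each kernel tap relative to the anchor position.
    std::vector<int> ofstab(ksize);
    for (int k = 0; k < ksize; k++)
        ofstab[k] = k*dilation_w;

    // Pack 8 consecutive output columns (what one packData8 call does):
    // inpbuf[k*CONV_NR + j] = sample used by tap k for output column x0 + j.
    std::vector<float> inpbuf(ksize*CONV_NR, 0.f);
    const float* inptrIn = input.data();   // input position of output column x0 = 0
    for (int k = 0; k < ksize; k++)
        for (int j = 0; j < 8; j++)
            inpbuf[k*CONV_NR + j] = inptrIn[ofstab[k] + j*stride_w];

    for (int k = 0; k < ksize; k++)
    {
        for (int j = 0; j < CONV_NR; j++)
            printf("%4.0f", inpbuf[k*CONV_NR + j]);
        printf("\n");
    }
    return 0;
}

With this layout the micro-kernel can read CONV_NR contiguous lanes per tap, which is why the stride-1 and strided cases above are worth special-casing.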

void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& conv, int ntasks,
const Ptr<ActivationLayer>& actLayer, const std::vector<float>& reluslope, bool fusedAdd)
{
@@ -417,7 +498,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
}
}

int MAX_STRIPES = 32; // (56 + CONV_NR - 1)/CONV_NR;
int MAX_STRIPES = (56 + CONV_NR - 1)/CONV_NR;

// Friendly to L1 cache
const int K_BLOCK_SIZE = conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN ? 1 : 32;
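
The new bound replaces the fixed 32 with a ceiling division of the 56-column work unit by CONV_NR. A quick standalone check, assuming the CONV_NR values of 28, 24 and 12 mentioned later in this diff:

#include <cstdio>
#include <initializer_list>

int main()
{
    // MAX_STRIPES = ceil(56 / CONV_NR), written as (56 + CONV_NR - 1) / CONV_NR
    for (int CONV_NR : {28, 24, 12})
        printf("CONV_NR = %2d -> MAX_STRIPES = %d\n", CONV_NR, (56 + CONV_NR - 1) / CONV_NR);
    return 0;
}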
@@ -435,7 +516,8 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
else
Kg_nblocks = 1;

bool bigKernelDepthPack = conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN && ksize > 25; // Spacial data pack branch for big kernel depth-wise.
// Special data pack branch for big kernel depth-wise.
bool bigKernelDepthPack = conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN && (conv_dim == CONV_3D || conv_dim == CONV_1D);

int Kstripes = Kg_nblocks*stripes_per_sample;
int nsubtasks = N*ngroups*Kstripes;
@@ -539,7 +621,6 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
else if (bigKernelDepthPack)
{
CV_Assert(Cg == 1);
const int CONV_NRCg = CONV_NR * Cg;
const int HW0 = H0 * W0;
const int HWi = Hi * Wi;
int slice_len = std::min(zyx_block_limit - zyx0, CONV_NR);
@@ -553,28 +634,28 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
int z0 = zyx0 / HW0, yx0 = zyx0 - z0 * HW0;
int y0 = yx0 / W0, x0 = yx0 - y0 * W0;

if (conv_dim == CONV_1D || conv_dim == CONV_2D)
if (conv_dim == CONV_1D)
{
for (int slice_i = 0; slice_i < slice_len; y0++, x0=0)
{
int delta = std::min(slice_len - slice_i, W0 - x0);
int x1 = x0 + delta;

int in_h = y0 * stride_h - pad_top;
int in_w = x0 * stride_w - pad_left;

float* inptrIn = inptr + in_h*Wi + in_w;

bool ok_i = 0 <= in_h && in_h < Hi - (Hk-1)*dilation_h;
int h0 = std::max(0, (-in_h + dilation_h-1)/dilation_h);
int h1 = std::min(Hk, (Hi - in_h + dilation_h-1)/dilation_h);
float* inptrIn = inptr + in_w;

int s0 = slice_i;

for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w)
{
if (ok_i && x0 + 2 <= x1 && 0 <= in_w &&
in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w)
// Pack 8
if (x0 + 8 <= x1 && 0 <= in_w &&
in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w)
{
packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w,ksize);
}
else if (x0 + 2 <= x1 && 0 <= in_w &&
in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w)
{
float* inpbufC = inpbuf + s0;
float* inptrInC = inptrIn;
@@ -584,8 +665,8 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
int k1 = ofstab[k];
float v0 = inptrInC[k1];
float v1 = inptrInC[k1 + stride_w];
inpbufC[k*CONV_NRCg] = v0;
inpbufC[k*CONV_NRCg+1] = v1;
inpbufC[k*CONV_NR] = v0;
inpbufC[k*CONV_NR+1] = v1;
}

x0++;
@@ -598,6 +679,53 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w);
int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w);

float* inpbufC = inpbuf + s0;
float* inptrInC = inptrIn;
for (int w = w0; w < w1; w++)
{
int imgofs = w*dilation_w;
inpbufC[w*CONV_NR] = inptrInC[imgofs];
}
}
}
slice_i += delta;
}
}
if (conv_dim == CONV_2D)
{
for (int slice_i = 0; slice_i < slice_len; y0++, x0=0)
{
int delta = std::min(slice_len - slice_i, W0 - x0);
int x1 = x0 + delta;

int in_h = y0 * stride_h - pad_top;
int in_w = x0 * stride_w - pad_left;

float* inptrIn = inptr + in_h*Wi + in_w;

bool ok_i = 0 <= in_h && in_h < Hi - (Hk-1)*dilation_h;
int h0 = std::max(0, (-in_h + dilation_h-1)/dilation_h);
int h1 = std::min(Hk, (Hi - in_h + dilation_h-1)/dilation_h);

int s0 = slice_i;
for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w)
{
// Pack 8
if (ok_i && x0 + 8 <= x1 && 0 <= in_w &&
in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w)
{
packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w,ksize);
}
else if (ok_i && x0 + 2 <= x1 && 0 <= in_w &&
in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w)
{
packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w,ksize);
}
else
{
int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w);
int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w);

float* inpbufC = inpbuf + s0;
float* inptrInC = inptrIn;

@@ -606,7 +734,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
for (int w = w0; w < w1; w++)
{
int imgofs = h*(dilation_h*Wi) + w*dilation_w;
inpbufC[(h*Wk + w)*CONV_NRCg] = inptrInC[imgofs];
inpbufC[(h*Wk + w)*CONV_NR] = inptrInC[imgofs];
}
}
}
@@ -630,26 +758,41 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
int d0 = std::max(0, (-in_d + dilation_d - 1) / dilation_d);
int d1 = std::min(Dk, (Di - in_d + dilation_d - 1) / dilation_d);

bool ok_i = 0 <= in_h && in_h < Hi - (Hk-1)*dilation_h;
int h0 = std::max(0, (-in_h + dilation_h-1)/dilation_h);
int h1 = std::min(Hk, (Hi - in_h + dilation_h-1)/dilation_h);

int s0 = slice_i;
for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w)
{
int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w);
int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w);
// Pack 8
if (ok_i && x0 + 8 <= x1 && 0 <= in_w &&
in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w)
{
packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w,ksize);
}
else if (ok_i && x0 + 2 <= x1 && 0 <= in_w &&
in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w)
{
packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w,ksize);
}
else
{
int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w);
int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w);

float* inpbufC = inpbuf + s0;
float* inptrInC = inptrIn;
float* inpbufC = inpbuf + s0;
float* inptrInC = inptrIn;

for ( int d = d0; d < d1; d++)
{
for (int h = h0; h < h1; h++)
for ( int d = d0; d < d1; d++)
{
for (int w = w0; w < w1; w++)
for (int h = h0; h < h1; h++)
{
int imgofs = d*dilation_d*HWi + h*(dilation_h*Wi) + w*dilation_w;
inpbufC[((d*Hk + h)*Wk + w)*CONV_NRCg] = inptrInC[imgofs];
for (int w = w0; w < w1; w++)
{
int imgofs = d*dilation_d*HWi + h*(dilation_h*Wi) + w*dilation_w;
inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = inptrInC[imgofs];
}
}
}
}
@@ -799,28 +942,31 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co

// special branch for depth-wise convolution implemented using generic convolution.
// In this case, CONV_MR is 1, and CONV_NR is the same.
if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN) {
if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN)
{
size_t outofs = (n * ngroups + g) * out_planesize + zyx0;
float *cptr0 = cbuf_task;
float *weights = conv->weightsBufPtr + g * padded_ksize;
int out_width = zyx_block_limit - zyx0;
float *outptr = out + outofs;
const float biasVal = *(conv->biasBuf.data() + g);
for (int stripe = 0; stripe < nstripes; stripe++) {
for (int stripe = 0; stripe < nstripes; stripe++)
{
const float *inptr = inpbuf_task + stripe * stripesize;
const int outLen = out_width - stripe * CONV_NR;
const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR);
bool ifBuffer = outLen < CONV_NR;
float *cptr = outptr + stripe * CONV_NR;
if (ifBuffer) {
if (ifBuffer)
{
memcpy(cptr0, cptr, outLen * sizeof(cptr[0]));
cptr = cptr0;
}
#if CV_TRY_AVX2
if (conv->useAVX2)
if (conv->useAVX2 && outLen > CONV_NR/3)
opt_AVX2::convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct);
else
else
#endif
convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct);
convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen);

if (ifBuffer)
{
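
The reworked depth-wise branch above clamps outLen to CONV_NR and, when a stripe is shorter than CONV_NR, runs the kernel on a scratch buffer so the full-width stores cannot spill past the output row, presumably copying only outLen values back afterwards (that part of the branch is collapsed in this view); the kernel also receives outLen so it can skip unused lanes. A minimal sketch of the scratch-buffer part, with an assumed CONV_NR of 8 and a stand-in kernel, not OpenCV's convBlockMR1:

#include <algorithm>
#include <cstdio>
#include <cstring>

static const int CONV_NR = 8;

// Stand-in for a SIMD kernel that always writes CONV_NR contiguous floats.
static void fullWidthKernel(float* dst)
{
    for (int i = 0; i < CONV_NR; i++)
        dst[i] += 1.0f;
}

int main()
{
    const int out_width = 13;               // last stripe has outLen = 5
    float out[out_width] = {0};
    float scratch[CONV_NR] = {0};

    for (int stripe = 0; stripe*CONV_NR < out_width; stripe++)
    {
        int outLen = std::min(out_width - stripe*CONV_NR, CONV_NR);
        float* cptr = out + stripe*CONV_NR;
        bool useScratch = outLen < CONV_NR;  // tail stripe: protect out-of-range lanes
        if (useScratch)
        {
            std::memcpy(scratch, cptr, outLen*sizeof(float));
            cptr = scratch;
        }
        fullWidthKernel(cptr);
        if (useScratch)
            std::memcpy(out + stripe*CONV_NR, cptr, outLen*sizeof(float));
    }
    printf("%.1f %.1f\n", out[0], out[out_width - 1]);  // both lanes were written exactly once
    return 0;
}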
@@ -837,33 +983,44 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
int ldc = nstripes * CONV_NR;

// 2. do convolution, compute Kg x (zyx_block_limit - zyx0) part of the output tensor
for (int k0_block = k0; k0_block < k1; k0_block += K_BLOCK_SIZE) {
int out_width = zyx_block_limit - zyx0;
for (int k0_block = k0; k0_block < k1; k0_block += K_BLOCK_SIZE)
{
int k1_block = k0_block + K_BLOCK_SIZE < k1 ? k0_block + K_BLOCK_SIZE : k1;
for (int c0 = 0; c0 < DkHkWkCg; c0 += C_BLOCK_SIZE) {
for (int c0 = 0; c0 < DkHkWkCg; c0 += C_BLOCK_SIZE)
{
int c1 = c0 + C_BLOCK_SIZE < DkHkWkCg ? c0 + C_BLOCK_SIZE : DkHkWkCg;
for (int stripe = 0; stripe < nstripes; stripe++) {
for (int stripe = 0; stripe < nstripes; stripe++)
{
const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR);

#if CV_TRY_AVX2 || CV_TRY_NEON
// The possible CONV_NR is 28, 24, 12, so the possible CONV_NR/3 is 9, 8, 4.
bool runOpt = outLen > std::min(8, CONV_NR/3);
#endif
float *wptr = weights + k0_block * DkHkWkCg + c0 * CONV_MR;
const float *inptr = inpbuf_task + stripe * stripesize + c0 * CONV_NR;
float *cptr = cbuf_task + stripe * CONV_NR;
for (int k = k0_block; k < k1_block; k += CONV_MR,
wptr += DkHkWkCg * CONV_MR, cptr += CONV_MR * ldc) {
wptr += DkHkWkCg * CONV_MR, cptr += CONV_MR * ldc)
{
#if CV_TRY_AVX2
if (conv->useAVX2)
opt_AVX2::convBlock_AVX2(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0);
else
if (conv->useAVX2 && runOpt)
opt_AVX2::convBlock_AVX2(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0);
else
#endif
#if CV_TRY_NEON
if (conv->useNEON)
opt_NEON::convBlock_NEON(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0);
else
if (conv->useNEON && runOpt)
opt_NEON::convBlock_NEON(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0);
else
#endif
convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0);
// The possible outLen range is 24 or 8~1.
convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, outLen);
}
}
}

size_t outofs = ((n * ngroups + g) * Kg + k0_block) * out_planesize + zyx0;
int out_width = zyx_block_limit - zyx0;
const float *cptr = cbuf_task;

float *outptr = out + outofs;
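
In the generic path above, runOpt keeps the AVX2/NEON block kernels for stripes longer than min(8, CONV_NR/3) and otherwise falls back to the generic convBlock, which now takes outLen so it only stores the valid columns. As a point of reference, a scalar sketch of an MR x NR register block that accumulates np packed products and stores only the first outLen columns, with assumed MR/NR values, not OpenCV's convBlock:

#include <cstdio>

enum { MR = 4, NR = 8 };  // assumed register-block shape; the real values depend on the target ISA

// Scalar reference: accumulate np outer products of an MR-wide weight column
// and an NR-wide packed-input row, writing only the first outLen output columns.
static void convBlockRef(int np, const float* w, const float* inp, float* c,
                         int ldc, bool init_c, int outLen)
{
    float acc[MR][NR] = {{0.f}};
    for (int p = 0; p < np; p++, w += MR, inp += NR)
        for (int i = 0; i < MR; i++)
            for (int j = 0; j < NR; j++)
                acc[i][j] += w[i]*inp[j];

    for (int i = 0; i < MR; i++)
        for (int j = 0; j < outLen; j++)   // tail-safe store, like passing outLen above
            c[i*ldc + j] = (init_c ? 0.f : c[i*ldc + j]) + acc[i][j];
}

int main()
{
    float w[2*MR], inp[2*NR], c[MR*NR] = {0.f};
    for (int i = 0; i < 2*MR; i++) w[i] = 1.f;
    for (int i = 0; i < 2*NR; i++) inp[i] = 1.f;

    convBlockRef(/*np=*/2, w, inp, c, /*ldc=*/NR, /*init_c=*/true, /*outLen=*/5);
    printf("%.1f %.1f %.1f\n", c[0], c[4], c[5]);  // prints 2.0 2.0 0.0
    return 0;
}

Here init_c mirrors the c0 == 0 flag above: the first channel block overwrites the accumulator tile, later blocks add to it.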
