
Commit

fused the convolution 3d and 1d into the new implementation.
zihaomu committed Dec 10, 2022
1 parent 281b790 commit 54cf528
Showing 6 changed files with 285 additions and 2,363 deletions.
839 changes: 10 additions & 829 deletions modules/dnn/src/layers/convolution_layer.cpp

Large diffs are not rendered by default.

100 changes: 56 additions & 44 deletions modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
@@ -16,7 +16,7 @@ namespace cv { namespace dnn {

static void depthWiseBlock(const float *inptr, float *outptr, const float *weights, float biasval, int *ofstab, int *yxtab,
float minval, float maxval, int Hi, int Wi, int H0, int W0, int ksize, int pad_top, int pad_left,
int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop,
int dilation_h, int stride_w, int stride_h, int inner_xleft, int inner_xright, int inner_ytop,
int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
{
#if CV_SIMD128
@@ -46,11 +46,11 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
for (int y0 = 0; y0 < H0; y0 += dy0, outptr += W0 * dy0)
{
#if CV_SIMD128
dy0 = inner_ytop <= y0 && y0 + 3 < inner_ybottom && is3x3 && stride_y == 1 && dilation_y == 1
dy0 = inner_ytop <= y0 && y0 + 3 < inner_ybottom && is3x3 && stride_h == 1 && dilation_h == 1
? 3 : 1;
#endif
int x0 = 0, x1 = y0 >= inner_ytop && y0 < inner_ybottom ? inner_xleft : W0;
int yi_ = y0 * stride_y - pad_top;
int yi_ = y0 * stride_h - pad_top;

for (;;)
{
@@ -59,7 +59,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
{
for (; x0 < x1; x0++)
{
int xi_ = x0 * stride_x - pad_left;
int xi_ = x0 * stride_w - pad_left;
s_0 = s_1 = s_2 = biasval;
for (int k = 0; k < ksize; k++)
{
@@ -87,7 +87,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
{
for (; x0 < x1; x0++)
{
int xi_ = x0 * stride_x - pad_left;
int xi_ = x0 * stride_w - pad_left;
s_0 = biasval;
for (int k = 0; k < ksize; k++) {
int dy = yxtab[k * 2];
@@ -113,7 +113,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
{
for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
{
int xi_ = x0 * stride_x - pad_left;
int xi_ = x0 * stride_w - pad_left;
const float *inptr_xi = inptr + Wi * yi_ + xi_;

v_float32x4 s0, s1, s2;
@@ -189,7 +189,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
{
for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
{
int xi_ = x0 * stride_x - pad_left;
int xi_ = x0 * stride_w - pad_left;
const float *inptr_xi = inptr + Wi * yi_ + xi_;
v_float32x4 s0 = v_fma(v_load(inptr_xi + ofstab[0]), w0, vbias);
v_float32x4 s1 = v_load(inptr_xi + ofstab[1]) * w1;
@@ -214,7 +214,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
{
for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
{
int xi_ = x0 * stride_x - pad_left, k = 0;
int xi_ = x0 * stride_w - pad_left, k = 0;
const float *inptr_xi = inptr + Wi * yi_ + xi_;
v_float32x4 s0 = vbias;
for (; k <= ksize - 4; k += 4)
@@ -248,7 +248,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
{
for (; x0 < x1; x0++)
{
int xi_ = x0 * stride_x - pad_left;
int xi_ = x0 * stride_w - pad_left;
const float *inptr_xi = inptr + W0 * yi_ + xi_;
s_0 = s_1 = s_2 = biasval;
for (int k = 0; k < ksize; k++)
@@ -275,7 +275,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
{
for (; x0 < x1; x0++)
{
int xi_ = x0 * stride_x - pad_left;
int xi_ = x0 * stride_w - pad_left;
const float *inptr_xi = inptr + Wi * yi_ + xi_;
s_0 = biasval;
for (int k = 0; k < ksize; k++)
@@ -293,24 +293,37 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
}
}

void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct) {
void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv>& conv, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct) {
Mat input = _input.getMat();
Mat output = _output.getMat();
MatShape inputShape = shape(input);
MatShape outputShape = shape(output);
CV_Assert(inputShape.size() == 4 && outputShape.size() == 4);

int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3]; // [N, C, H, W]
// Only Conv1D and Conv2D are supported for Depth-wise convolution.
CV_Assert(inputShape.size() == 3 || inputShape.size() == 4);
CV_Assert(inputShape.size() == outputShape.size());

int conv_dim = conv->conv_dim;
CV_Assert(conv_dim == CONV_2D || conv_dim == CONV_1D);

int N = inputShape[0], C = inputShape[1];

int Hi = conv_dim == CONV_1D ? 1 : inputShape[inputShape.size() - 2];
int Wi = inputShape[inputShape.size() - 1];

int K = conv->K, Hk = conv->Hk, Wk = conv->Wk;
int H0 = outputShape[2], W0 = outputShape[3], ngroups = conv->ngroups;

int H0 = conv_dim == CONV_1D ? 1 : outputShape[outputShape.size() - 2];
int W0 = outputShape[outputShape.size() - 1];
int ngroups = conv->ngroups;

const size_t inp_planesize = (size_t) Hi * Wi;
const size_t out_planesize = (size_t) H0 * W0;

CV_Assert(ngroups > 1 && ngroups == K && ngroups == C);

int stride_y = conv->stride_y, stride_x = conv->stride_x;
int dilation_y = conv->dilation_y, dilation_x = conv->dilation_x;
int stride_h = conv->stride_h, stride_w = conv->stride_w;
int dilation_h = conv->dilation_h, dilation_w = conv->dilation_w;

int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom;
int pad_left = conv->pad_left, pad_right = conv->pad_right;
@@ -333,22 +333,22 @@ void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>&
{
int y = k < ksize ? k / Wk : 0;
int x = k < ksize ? k % Wk : 0;
int dy = y * dilation_y, dx = x * dilation_x;
int dy = y * dilation_h, dx = x * dilation_w;
yxtab[k * 2] = dy;
yxtab[k * 2 + 1] = dx;
ofstab[k] = dy * Wi + dx;
}

const float *weights0 = conv->weightsBufPtr, *bias = conv->biasBuf.data();
int inner_ytop = (pad_bottom + stride_y - 1) / stride_y, inner_ybottom = 3;
int inner_xleft = (pad_left + stride_x - 1) / stride_x, inner_xright = 4;
int inner_ytop = (pad_bottom + stride_h - 1) / stride_h, inner_ybottom = 3;
int inner_xleft = (pad_left + stride_w - 1) / stride_w, inner_xright = 4;

CV_Assert(ksize > 1 || (pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0));

inner_xright = (Wi - (Wk - 1) * dilation_x + pad_left) / stride_x;
inner_xright += inner_xright * stride_x - pad_left + (Wk - 1) * dilation_x < Wi;
inner_ybottom = (Hi - (Hk - 1) * dilation_y + pad_top) / stride_y;
inner_ybottom += inner_ybottom * stride_y - pad_top + (Hk - 1) * dilation_y < Hi;
inner_xright = (Wi - (Wk - 1) * dilation_w + pad_left) / stride_w;
inner_xright += inner_xright * stride_w - pad_left + (Wk - 1) * dilation_w < Wi;
inner_ybottom = (Hi - (Hk - 1) * dilation_h + pad_top) / stride_h;
inner_ybottom += inner_ybottom * stride_h - pad_top + (Hk - 1) * dilation_h < Hi;

if (inner_xleft >= inner_xright || inner_ytop >= inner_ybottom)
{
@@ -358,34 +358,33 @@ void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>&

inner_ybottom = inner_ybottom < H0 ? inner_ybottom : H0;

bool useSIMD = stride_x == 1 && inner_xleft < W0;
bool useSIMD = stride_w == 1 && inner_xleft < W0;
bool is3x3 = Hk == 3 && Wk == 3;

parallel_for_(Range(0, N * C), [&](const Range &r0) {
for (int nc = r0.start; nc < r0.end; nc++)
{
int c = nc % C;
const float *inptr = inp + inp_planesize * nc;
float *outptr0 = out + out_planesize * nc;
for (int nc = r0.start; nc < r0.end; nc++)
{
int c = nc % C;
const float *inptr = inp + inp_planesize * nc;
float *outptr0 = out + out_planesize * nc;

float biasval = bias[c];
const float *weights = weights0 + c * padded_ksize;
float biasval = bias[c];
const float *weights = weights0 + c * padded_ksize;

#if CV_TRY_AVX2
if (conv->useAVX2)
opt_AVX2::depthWiseBlock_AVX2(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize,
pad_top, pad_left, dilation_y, stride_x, stride_y, inner_xleft, inner_xright, inner_ytop,
inner_ybottom, ifMinMaxAct, useSIMD, is3x3);
else
if (conv->useAVX2)
opt_AVX2::depthWiseBlock_AVX2(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize,
pad_top, pad_left, dilation_h, stride_w, stride_h, inner_xleft, inner_xright, inner_ytop,
inner_ybottom, ifMinMaxAct, useSIMD, is3x3);
else
#endif
depthWiseBlock(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize,
pad_top, pad_left, dilation_y, stride_x, stride_y, inner_xleft, inner_xright, inner_ytop,
inner_ybottom, ifMinMaxAct, useSIMD, is3x3);
depthWiseBlock(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize,
pad_top, pad_left, dilation_h, stride_w, stride_h, inner_xleft, inner_xright, inner_ytop,
inner_ybottom, ifMinMaxAct, useSIMD, is3x3);

if (activ)
activ->forwardSlice(outptr0, outptr0, (int) out_planesize, out_planesize, c, c+1);
}
});
if (activ)
activ->forwardSlice(outptr0, outptr0, (int) out_planesize, out_planesize, c, c+1);
}});
}

}} // namespace cv::dnn
}} // namespace cv::dnn
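A note on how the 1-D case is folded into the 2-D depthwise path (a minimal sketch, not part of the commit's diff): runDepthwise now accepts 3-D [N, C, W] input as well as 4-D [N, C, H, W], and for CONV_1D it treats the height as 1 so the same depthWiseBlock kernel serves both layouts. The sketch below mirrors the shape handling added above; std::vector<int> stands in for MatShape, and the helper name planeGeometry is illustrative, not OpenCV API.

#include <vector>

enum ConvDim { CONV_1D, CONV_2D };            // mirrors conv->conv_dim in the patch

struct PlaneGeometry { int Hi, Wi, H0, W0; };

// For CONV_1D the tensors are [N, C, W]: only the innermost dimension is spatial,
// so the height is fixed to 1. For CONV_2D the last two dimensions are H and W.
static PlaneGeometry planeGeometry(const std::vector<int>& inputShape,
                                   const std::vector<int>& outputShape,
                                   ConvDim conv_dim)
{
    PlaneGeometry g;
    g.Hi = conv_dim == CONV_1D ? 1 : inputShape[inputShape.size() - 2];
    g.Wi = inputShape[inputShape.size() - 1];
    g.H0 = conv_dim == CONV_1D ? 1 : outputShape[outputShape.size() - 2];
    g.W0 = outputShape[outputShape.size() - 1];
    return g;
}

With Hi = H0 = 1, the per-channel plane sizes inp_planesize = Hi * Wi and out_planesize = H0 * W0 reduce to the 1-D lengths, and the rest of the depthwise loop (padding, strides, the SIMD inner region) runs unchanged.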
