
Improve and refactor softmax layer #24466

Merged: 11 commits, Nov 6, 2023
51 changes: 51 additions & 0 deletions modules/dnn/perf/perf_layer.cpp
@@ -707,4 +707,55 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_FullyConnected, Combine(
dnnBackendsAndTargets()
));

typedef TestBaseWithParam<tuple<std::vector<int>, int, tuple<Backend, Target> > > Layer_Softmax;
PERF_TEST_P_(Layer_Softmax, softmax_3d) {
std::vector<int> shape = get<0>(GetParam());
int axis = get<1>(GetParam());
int backendId = get<0>(get<2>(GetParam()));
int targetId = get<1>(get<2>(GetParam()));

Mat data(shape, CV_32FC1);
Scalar mean = 0.f;
Scalar std = 1.f;
randn(data, mean, std);

Net net;
LayerParams lp;
lp.type = "Softmax";
lp.name = "testLayer";
lp.set("axis", axis);

net.addLayerToPrev(lp.name, lp.type, lp);
// warmup
{
net.setInput(data);
net.setPreferableBackend(backendId);
net.setPreferableTarget(targetId);
Mat out = net.forward();
}

TEST_CYCLE() {
Mat res = net.forward();
}

SANITY_CHECK_NOTHING();
}

INSTANTIATE_TEST_CASE_P(/**/, Layer_Softmax, Combine(
Values( // input size
std::vector<int>({16, 50, 50}),
std::vector<int>({16, 197, 197}),
std::vector<int>({16, 1024, 1024})
),
Values(0, 1, 2), // axis
dnnBackendsAndTargets(/* withInferenceEngine= */ false,
/* withHalide= */ false,
/* withCpuOCV= */ true,
/* withVkCom= */ false,
/* withCUDA= */ false,
/* withNgraph= */ false,
/* withWebnn= */ false,
/* withCann= */ false) // only test on CPU
));

} // namespace
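
For reference, the new case can be run on its own with the DNN performance binary (opencv_perf_dnn in a default OpenCV build; the exact path depends on the build tree) using a gtest filter such as --gtest_filter=*Layer_Softmax*. Each of the three input shapes is measured with the softmax taken over axis 0, 1, and 2, on the CPU backend only.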
159 changes: 159 additions & 0 deletions modules/dnn/src/layers/cpu_kernels/softmax.cpp
@@ -0,0 +1,159 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpNN.fx).
// Here is the original license:
/*
This file is a part of ficus language project.
See ficus/LICENSE for the licensing terms
*/

#include "../../precomp.hpp"
#include "softmax.hpp"

#ifdef CV_SIMD
#define _VEXP_INIT() \
v_float32 _vexp_lo = vx_setall_f32(-88.3762626647949f); \
v_float32 _vexp_hi = vx_setall_f32(88.3762626647949f); \
v_float32 _vexp_half = vx_setall_f32(0.5f); \
v_float32 _vexp_one = vx_setall_f32(1.f); \
v_float32 _vexp_LOG2EF = vx_setall_f32(1.44269504088896341f); \
v_float32 _vexp_C1 = vx_setall_f32(-0.693359375f); \
v_float32 _vexp_C2 = vx_setall_f32(2.12194440e-4f); \
v_float32 _vexp_p0 = vx_setall_f32(1.9875691500E-4f); \
v_float32 _vexp_p1 = vx_setall_f32(1.3981999507E-3f); \
v_float32 _vexp_p2 = vx_setall_f32(8.3334519073E-3f); \
v_float32 _vexp_p3 = vx_setall_f32(4.1665795894E-2f); \
v_float32 _vexp_p4 = vx_setall_f32(1.6666665459E-1f); \
v_float32 _vexp_p5 = vx_setall_f32(5.0000001201E-1f)

#define _VEXP_COMPUTE(x, y) { \
v_float32 _vexp_, _vexp_x, _vexp_y, _vexp_z; \
_vexp_x = v_min(x, _vexp_hi); \
_vexp_x = v_max(_vexp_x, _vexp_lo); \
_vexp_ = v_fma(_vexp_x, _vexp_LOG2EF, _vexp_half); \
v_int32 _vexp_mm = v_floor(_vexp_); \
_vexp_ = v_cvt_f32(_vexp_mm); \
_vexp_mm = v_add(_vexp_mm, vx_setall_s32(0x7f)); \
_vexp_mm = v_shl(_vexp_mm, 23); \
_vexp_x = v_fma(_vexp_, _vexp_C1, _vexp_x); \
_vexp_x = v_fma(_vexp_, _vexp_C2, _vexp_x); \
_vexp_z = v_mul(_vexp_x, _vexp_x); \
_vexp_y = v_fma(_vexp_x, _vexp_p0, _vexp_p1); \
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2); \
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3); \
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4); \
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5); \
_vexp_y = v_fma(_vexp_y, _vexp_z, _vexp_x); \
_vexp_y = v_add(_vexp_y, _vexp_one); \
y = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm)); \
}
#endif

namespace cv { namespace dnn {

void softmax(Mat &dst, const Mat &src, int axis, int axisBias, int axisStep){
CV_Assert(src.type() == CV_32F);
CV_Assert(src.isContinuous() && dst.isContinuous());
CV_Assert(src.size == dst.size);
axis = normalize_axis(axis, src.dims);

size_t outerSize = src.total(0, axis),
innerSize = src.total(axis + 1);

const float *srcPtr = src.ptr<float>();
float *dstPtr = dst.ptr<float>();

size_t outerStep = src.total(axis);
size_t cnStep = src.total(axis + 1);

// multi-threaded execution over all (outer, inner) pairs
size_t totalTasks = outerSize * innerSize;
double nstripes = (double) totalTasks / 1024.0;
// per-thread buffer size: the softmax axis length rounded up to a multiple of 8
size_t channelAxis = (axisStep + 7) & -8;

#ifdef CV_SIMD
const int nlanes = VTraits<v_float32>::vlanes();
// number of padding elements needed to fill axisStep up to a whole number of vector lanes
// (equals nlanes when axisStep is already a multiple of nlanes)
size_t redundantDim = nlanes - axisStep % nlanes;
#endif

parallel_for_(Range(0, (int) totalTasks), [&](const Range &range) {
AutoBuffer<float> axisBuf_(channelAxis);
float *axisBuf = axisBuf_.data();

for (size_t i = range.start; i < range.end; i++) {
size_t outerDim = i / innerSize;
size_t innerDim = i % innerSize;
size_t srcOffset = outerDim * outerStep + innerDim;
// copy the slice along the softmax axis into a contiguous buffer, since its stride in src may be larger than 1
for (size_t cnDim = 0; cnDim < axisStep; cnDim++)
axisBuf[cnDim] = srcPtr[srcOffset + (cnDim + axisBias) * cnStep];

float s = 0.f;
#ifdef CV_SIMD
_VEXP_INIT();
// fill the padding elements with -FLT_MAX so they do not affect the maximum
if (redundantDim != nlanes) {
for (size_t j = axisStep; j < axisStep + redundantDim; j++)
axisBuf[j] = -FLT_MAX;
}
// calculate the max value along the axis
v_float32 vmax = vx_load(axisBuf);
for (size_t cnDim = nlanes; cnDim < axisStep; cnDim += nlanes) {
v_float32 val = vx_load(axisBuf + cnDim);
vmax = v_max(vmax, val);
}
float maxVal = v_reduce_max(vmax);

// calculate the exp value along the axis
v_float32 vs = vx_setzero_f32();
vmax = vx_setall_f32(maxVal);
v_float32 val;
for (size_t cnDim = 0; cnDim < axisStep; cnDim += nlanes) {
val = vx_load(axisBuf + cnDim);
val = v_sub(val, vmax);
_VEXP_COMPUTE(val, val);
vs = v_add(vs, val);
v_store(axisBuf + cnDim, val);
}

s = v_reduce_sum(vs);
// remove the padding elements' contribution from the sum
if (redundantDim != nlanes) {
float* _val = new float[nlanes];
v_store(_val, val);
for (size_t j = nlanes - redundantDim; j < nlanes; j++)
s -= _val[j];
delete[] _val;
}
#else
float maxVal = axisBuf[0];
for (size_t cnDim = 1; cnDim < axisStep; cnDim++) {
maxVal = std::max(maxVal, axisBuf[cnDim]);
}
for (size_t j = 0; j < axisStep; j++) {
axisBuf[j] = expf(axisBuf[j] - maxVal);
s += axisBuf[j];
}
#endif
s = 1.f / s;

// write the normalized values back to dst along the axis
for (size_t cnDim = 0; cnDim < axisStep; cnDim++)
dstPtr[srcOffset + (cnDim + axisBias) * cnStep] = axisBuf[cnDim] * s;
}
}, nstripes);
}

void softmax(Mat &dst, const Mat &src, int axis) {
axis = normalize_axis(axis, src.dims);
softmax(dst, src, axis, 0, src.size[axis]);
}

void logSoftmax(Mat &dst, const Mat &src, int axis) {
softmax(dst, src, axis);
log(dst, dst);
}

}} // cv::dnn
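
The _VEXP_INIT/_VEXP_COMPUTE macros above vectorize a Cephes-style exp() approximation: the argument is clamped, split as x = m*ln(2) + r, exp(r) is evaluated with a 6-coefficient polynomial, and the result is scaled by 2^m by writing the float exponent bits directly. A scalar sketch of the same scheme, using the same constants (illustrative only; approx_expf is a hypothetical helper, not part of this patch):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>

static float approx_expf(float x)
{
    // clamp to the range where float exp() is representable
    x = std::min(std::max(x, -88.3762626647949f), 88.3762626647949f);
    // split x = m*ln(2) + r with m = round(x / ln(2))
    float fm = std::floor(x * 1.44269504088896341f + 0.5f);
    int   m  = (int)fm;
    float r  = x + fm * (-0.693359375f) + fm * (2.12194440e-4f); // r = x - m*ln(2), split constant
    // 6-coefficient polynomial (~ Taylor series of exp(r)) on [-ln(2)/2, ln(2)/2]
    float p = 1.9875691500E-4f;
    p = p * r + 1.3981999507E-3f;
    p = p * r + 8.3334519073E-3f;
    p = p * r + 4.1665795894E-2f;
    p = p * r + 1.6666665459E-1f;
    p = p * r + 5.0000001201E-1f;
    float y = p * r * r + r + 1.f;
    // multiply by 2^m by constructing the float exponent, as the SIMD code does
    int32_t bits = (m + 127) << 23;
    float scale;
    std::memcpy(&scale, &bits, sizeof(scale));
    return y * scale;
}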
28 changes: 28 additions & 0 deletions modules/dnn/src/layers/cpu_kernels/softmax.hpp
@@ -0,0 +1,28 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpNN.fx).
// Here is the original license:
/*
This file is a part of ficus language project.
See ficus/LICENSE for the licensing terms
*/

#ifndef OPENCV_DNN_SOFTMAX_HPP
#define OPENCV_DNN_SOFTMAX_HPP

#include "opencv2/core/hal/intrin.hpp"
#include <opencv2/dnn/shape_utils.hpp>

namespace cv { namespace dnn {

void softmax(Mat &dst, const Mat &src, int axis, int axisBias, int axisStep);

void softmax(Mat &dst, const Mat &src, int axis);

void logSoftmax(Mat &dst, const Mat &src, int axis);

}} // cv::dnn

#endif // OPENCV_DNN_SOFTMAX_HPP
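
A minimal usage sketch of the interface declared here (hypothetical caller code inside the dnn module, assuming a dense CV_32F blob; not part of the patch):

#include <opencv2/core.hpp>
#include "cpu_kernels/softmax.hpp"

void runSoftmax(const cv::Mat &logits)            // e.g. a 16x197x197 CV_32F blob
{
    cv::Mat probs(logits.dims, logits.size.p, CV_32F);
    int axis = logits.dims - 1;
    cv::dnn::softmax(probs, logits, axis);        // softmax over the last axis
    cv::dnn::logSoftmax(probs, logits, axis);     // or log-softmax, overwriting probs
    // The five-argument overload restricts the reduction to axisStep entries
    // starting at axisBias along the axis, e.g. only the class scores of a
    // YOLOv2 cell: cv::dnn::softmax(probs, logits, -1, 5, numClasses);
}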
7 changes: 3 additions & 4 deletions modules/dnn/src/layers/region_layer.cpp
@@ -45,6 +45,7 @@
#include <opencv2/dnn/shape_utils.hpp>
#include <opencv2/dnn/all_layers.hpp>
#include "../nms.inl.hpp"
#include "cpu_kernels/softmax.hpp"

#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
@@ -280,10 +281,8 @@ class RegionLayerImpl CV_FINAL : public RegionLayer
}

if (useSoftmax) { // Yolo v2
for (int i = 0; i < batch_size*rows*cols*anchors; ++i) {
int index = cell_size*i;
softmax_activate(srcData + index + 5, classes, 1, dstData + index + 5);
}
Mat _inpBlob = inpBlob.reshape(0, outBlob.dims, outBlob.size);
softmax(outBlob, _inpBlob, -1, 5, classes);
}
else if (useLogistic) { // Yolo v3
for (int i = 0; i < batch_size*rows*cols*anchors; ++i){
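
In the YOLOv2 path each cell stores [x, y, w, h, objectness, class_0 ... class_{N-1}], so the single call softmax(outBlob, _inpBlob, -1, 5, classes) applies the softmax over the last axis while skipping the first 5 box/objectness entries and normalizing only the class scores, exactly as the removed per-cell loop over softmax_activate did with its +5 offset.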
83 changes: 5 additions & 78 deletions modules/dnn/src/layers/softmax_layer.cpp
@@ -52,6 +52,7 @@
#include <algorithm>
#include <stdlib.h>
#include <opencv2/core/utils/logger.hpp>
#include "cpu_kernels/softmax.hpp"
using std::max;

#ifdef HAVE_OPENCL
@@ -225,89 +226,15 @@ class SoftMaxLayerImpl CV_FINAL : public SoftmaxLayer
std::vector<Mat> inputs, outputs, internals;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
internals_arr.getMatVector(internals);

const Mat &src = inputs[0];
Mat &dst = outputs[0];

int axis = normalize_axis(axisRaw, src.dims);
size_t outerSize = src.total(0, axis), channels = src.size[axis],
innerSize = src.total(axis + 1);

CV_Assert(src.type() == CV_32F);
CV_Assert(src.isContinuous() && dst.isContinuous());

const float *srcPtr = src.ptr<float>();
float *dstPtr = dst.ptr<float>();
float *bufPtr = internals[0].ptr<float>();

size_t outerStep = src.total(axis);
size_t cnStep = src.total(axis + 1);

//compute max along axis
for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
{
size_t srcOffset = outerDim * outerStep;
size_t bufOffset = outerDim * cnStep;

memcpy(bufPtr + bufOffset, srcPtr + srcOffset, innerSize * sizeof(float));

for (size_t cnDim = 1; cnDim < channels; cnDim++)
{
for (size_t i = 0; i < innerSize; i++)
bufPtr[bufOffset + i] = std::max(bufPtr[bufOffset + i], srcPtr[srcOffset + cnDim * cnStep + i]);
}
}

//subtract max
for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
{
size_t srcOffset = outerDim * outerStep;
size_t bufOffset = outerDim * cnStep;

for (size_t cnDim = 0; cnDim < channels; cnDim++)
{
const int offset = srcOffset + cnDim * cnStep;
for (size_t i = 0; i < innerSize; i++)
dstPtr[offset + i] = srcPtr[offset + i] - bufPtr[bufOffset + i];
}
}

cv::exp(dst, dst);

for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
{
size_t srcOffset = outerDim * outerStep;
size_t bufOffset = outerDim * cnStep;

//sum exp along axis
for (size_t i = 0; i < innerSize; i++)
bufPtr[bufOffset + i] = 0.f;

for (size_t cnDim = 0; cnDim < channels; cnDim++)
{
const int offset = srcOffset + cnDim * cnStep;
for (size_t i = 0; i < innerSize; i++)
bufPtr[bufOffset + i] += dstPtr[offset + i];
}

//divide by computed sum
for (size_t cnDim = 0; cnDim < channels; cnDim++)
{
const int offset = srcOffset + cnDim * cnStep;
for (size_t i = 0; i < innerSize; i++)
dstPtr[offset + i] /= bufPtr[bufOffset + i];
}
if (logSoftMax)
{
for (size_t cnDim = 0; cnDim < channels; cnDim++)
{
const int offset = srcOffset + cnDim * cnStep;
for (size_t i = 0; i < innerSize; i++)
dstPtr[offset + i] = log(dstPtr[offset + i]);
}
}
}
if (logSoftMax)
logSoftmax(dst, src, axis);
else
softmax(dst, src, axis);
}

#ifdef HAVE_CUDA
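
For reference, both the removed per-channel loops and the new CPU kernel evaluate the numerically stable form, subtracting the per-slice maximum before exponentiating so that exp() cannot overflow:

softmax(x)_i = exp(x_i - max_j x_j) / sum_k exp(x_k - max_j x_j)
logSoftmax(x)_i = log(softmax(x)_i)

with logSoftmax computed in the kernel as a softmax followed by an element-wise cv::log.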
7 changes: 7 additions & 0 deletions modules/dnn/src/onnx/onnx_importer.cpp
@@ -2871,6 +2871,13 @@ void ONNXImporter::parseUpsample(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
void ONNXImporter::parseSoftMax(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
{
const std::string& layer_type = node_proto.op_type();
int axis;
if (layerParams.has("opset") && layerParams.get<int>("opset") > 11) {
axis = layerParams.get<int>("axis", -1);
} else {
axis = layerParams.get<int>("axis", 1);
}
layerParams.set<int>("axis", axis);
layerParams.type = "Softmax";
layerParams.set("log_softmax", layer_type == "LogSoftmax");
addLayer(layerParams, node_proto);
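
This matches the ONNX specification: Softmax and LogSoftmax default to axis = 1 up to opset 11, where the operator flattens the input into a 2-D matrix around the axis, while opset 13 changes the default to axis = -1 and applies the softmax along a single axis, which is why the importer selects the default from the model's opset.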