Commit 9004e85: add fc acl executor

allnes committed May 14, 2024
1 parent a2cd7be commit 9004e85
Showing 12 changed files with 530 additions and 122 deletions.
@@ -220,18 +220,22 @@ bool CpuBlockedMemoryDesc::isBlockedCFormat(size_t blk_size) const {
}

bool CpuBlockedMemoryDesc::isTailCFormat() const {
#if !defined(OPENVINO_ARCH_ARM) && !defined(OPENVINO_ARCH_ARM64)
if (shape.getRank() < 3) {
return false;
}
#endif
if (shape.getRank() != order.size()) {
return false;
}
if (!std::is_sorted(order.begin(), --order.end())) {
return false;
}
#if !defined(OPENVINO_ARCH_ARM) && !defined(OPENVINO_ARCH_ARM64)
if (order.back() != 1) {
return false;
}
#endif
return true;
}
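Note: the new guards relax this check on ARM. ARM/ARM64 builds now skip both the rank >= 3 requirement and the channels-last (order.back() == 1) requirement, so lower-rank and non-channels-last layouts also report as tail-C there. A standalone sketch of the relaxed predicate (not the plugin code; order is the dimension permutation, e.g. {0, 2, 3, 1} for NHWC):

#include <algorithm>
#include <iterator>
#include <vector>

bool isTailCFormatSketch(const std::vector<size_t>& order, bool arm_build) {
    // (the real code also requires shape rank == order.size())
    if (order.empty())
        return false;
    if (!arm_build && order.size() < 3)    // rank check is now x86-only
        return false;
    if (!std::is_sorted(order.begin(), std::prev(order.end())))
        return false;
    if (!arm_build && order.back() != 1)   // channels-last check is now x86-only
        return false;
    return true;
}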

121 changes: 19 additions & 102 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
@@ -361,66 +361,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
return acl_op;
};
break;
case Algorithm::EltwiseRelu:
if (aclEltwiseAttrs.alpha == 0) {
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
ActivationLayerInfo::ActivationFunction::RELU))
return false;
} else {
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
{ActivationLayerInfo::ActivationFunction::LEAKY_RELU, aclEltwiseAttrs.alpha}))
return false;
}
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
if (aclEltwiseAttrs.alpha == 0) {
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::RELU);
} else {
acl_op->configure(&srcTensors[0], &dstTensors[0],
{ActivationLayerInfo::ActivationFunction::LEAKY_RELU, aclEltwiseAttrs.alpha});
}
return acl_op;
};
break;
case Algorithm::EltwiseGeluErf:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::GELU))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::GELU);
return acl_op;
};
break;
case Algorithm::EltwiseElu:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
{ActivationLayerInfo::ActivationFunction::ELU, aclEltwiseAttrs.alpha}))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], {ActivationLayerInfo::ActivationFunction::ELU, aclEltwiseAttrs.alpha});
return acl_op;
};
break;
case Algorithm::EltwiseTanh:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
{ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f}))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0],
{ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f});
return acl_op;
};
break;
case Algorithm::EltwiseSigmoid:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::LOGISTIC))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::LOGISTIC);
return acl_op;
};
break;
case Algorithm::EltwiseAbs:
if (!NEAbsLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0]))
return false;
@@ -430,24 +370,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
return acl_op;
};
break;
case Algorithm::EltwiseSqrt:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::SQRT))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::SQRT);
return acl_op;
};
break;
case Algorithm::EltwiseSoftRelu:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::SOFT_RELU))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::SOFT_RELU);
return acl_op;
};
break;
case Algorithm::EltwiseExp:
if (!NEExpLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0]))
return false;
@@ -457,28 +379,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
return acl_op;
};
break;
case Algorithm::EltwiseClamp:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, aclEltwiseAttrs.beta, aclEltwiseAttrs.alpha}))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0],
{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, aclEltwiseAttrs.beta, aclEltwiseAttrs.alpha});
return acl_op;
};
break;
case Algorithm::EltwiseSwish:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
{ActivationLayerInfo::ActivationFunction::SWISH, aclEltwiseAttrs.alpha}))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0],
{ActivationLayerInfo::ActivationFunction::SWISH, aclEltwiseAttrs.alpha});
return acl_op;
};
break;
case Algorithm::EltwisePrelu:
if (!NEPReluLayer::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0]))
return false;
@@ -488,12 +388,29 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
return acl_op;
};
break;
+ case Algorithm::EltwiseRelu:
+ case Algorithm::EltwiseGeluErf:
+ case Algorithm::EltwiseElu:
+ case Algorithm::EltwiseTanh:
+ case Algorithm::EltwiseSigmoid:
+ case Algorithm::EltwiseSqrt:
+ case Algorithm::EltwiseSoftRelu:
+ case Algorithm::EltwiseClamp:
+ case Algorithm::EltwiseSwish:
case Algorithm::EltwiseHswish:
- if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::HARD_SWISH))
+ if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
+                                  getActivationLayerInfo(aclEltwiseAttrs.algorithm,
+                                                         aclEltwiseAttrs.alpha,
+                                                         aclEltwiseAttrs.beta,
+                                                         aclEltwiseAttrs.gamma)))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
- acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::HARD_SWISH);
+ acl_op->configure(&srcTensors[0], &dstTensors[0],
+                   getActivationLayerInfo(aclEltwiseAttrs.algorithm,
+                                          aclEltwiseAttrs.alpha,
+                                          aclEltwiseAttrs.beta,
+                                          aclEltwiseAttrs.gamma));
return acl_op;
};
break;
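The ten per-algorithm activation blocks removed above are folded into one case that obtains its ActivationLayerInfo from getActivationLayerInfo. The helper's body is not part of this diff; a hypothetical reconstruction of the mapping, based on the parameters the deleted blocks passed:

using AF = arm_compute::ActivationLayerInfo::ActivationFunction;

// Hypothetical sketch; the real helper is defined elsewhere in the plugin.
arm_compute::ActivationLayerInfo getActivationLayerInfoSketch(Algorithm algo,
                                                              float alpha,
                                                              float beta,
                                                              float gamma) {
    (void)gamma;  // unused by the mappings the removed blocks performed
    switch (algo) {
        case Algorithm::EltwiseRelu:
            return alpha == 0 ? arm_compute::ActivationLayerInfo{AF::RELU}
                              : arm_compute::ActivationLayerInfo{AF::LEAKY_RELU, alpha};
        case Algorithm::EltwiseGeluErf:  return {AF::GELU};
        case Algorithm::EltwiseElu:      return {AF::ELU, alpha};
        case Algorithm::EltwiseTanh:     return {AF::TANH, 1.f, 1.f};
        case Algorithm::EltwiseSigmoid:  return {AF::LOGISTIC};
        case Algorithm::EltwiseSqrt:     return {AF::SQRT};
        case Algorithm::EltwiseSoftRelu: return {AF::SOFT_RELU};
        case Algorithm::EltwiseClamp:    return {AF::LU_BOUNDED_RELU, beta, alpha};
        case Algorithm::EltwiseSwish:    return {AF::SWISH, alpha};
        case Algorithm::EltwiseHswish:   return {AF::HARD_SWISH};
        default:                         return {};
    }
}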
64 changes: 64 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_executor.cpp
@@ -0,0 +1,64 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_executor.hpp"
#include "acl_utils.hpp"
#include "nodes/executors/executor.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "utils/debug_capabilities.h"

namespace ov {
namespace intel_cpu {

bool ACLCommonExecutor::update(const MemoryArgs &memory) {
std::unordered_map<int, arm_compute::DataType> acl_tensors_types_list;
std::unordered_map<int, arm_compute::DataLayout> acl_tensors_layouts_list;
for (auto& cpu_mem_ptr : memory) {
acl_tensors_types_list[cpu_mem_ptr.first] = precisionToAclDataType(cpu_mem_ptr.second->getPrecision());
acl_tensors_layouts_list[cpu_mem_ptr.first] = getAclDataLayoutByMemoryDesc(cpu_mem_ptr.second->getDescPtr());
}

for (auto& cpu_mem_ptr : memory) {
if (acl_tensors_types_list[cpu_mem_ptr.first] == arm_compute::DataType::UNKNOWN) {
list_acl_tensors_infos[cpu_mem_ptr.first] = arm_compute::TensorInfo();
continue;
}

auto collapsed_dims = collapse_dims_to_max_rank(cpu_mem_ptr.second->getStaticDims(),
aclTensorAttrs.maxDimsShape);
auto acl_tensor_shape = shapeCast(collapsed_dims);
if (aclTensorAttrs.enableNHWCReshape) {
changeLayoutToNH_C({&acl_tensor_shape});
}
list_acl_tensors_infos[cpu_mem_ptr.first] = arm_compute::TensorInfo(acl_tensor_shape, 1,
acl_tensors_types_list[cpu_mem_ptr.first],
acl_tensors_layouts_list[cpu_mem_ptr.first]);
}

auto status = prepare_tensors_info();
if (!status) {
DEBUG_LOG("ACL operator validation was failed: ", status.error_description());
return false;
}

for (auto& acl_tensor_info : list_acl_tensors_infos) {
list_acl_tensors[acl_tensor_info.first].allocator()->init(acl_tensor_info.second);
}

// ACL configure() is not thread-safe, so serialize function construction
configureThreadSafe([&] { ifunc = configure_function(); });
return true;
}

void ACLCommonExecutor::execute(const MemoryArgs &memory) {
for (auto& acl_tensor : list_acl_tensors) {
acl_tensor.second.allocator()->import_memory(memory.at(acl_tensor.first)->getData());
}
ifunc->run();
for (auto& acl_tensor : list_acl_tensors) {
acl_tensor.second.allocator()->free();
}
}
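The run loop relies on ACL's external-memory import: import_memory() points each tensor at the plugin-owned buffer without copying, and free() merely detaches that pointer after run(). A minimal standalone illustration of the pattern, with a hypothetical buffer and shape:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include <vector>

int main() {
    std::vector<float> buf(8, 1.f);  // stands in for plugin-owned memory
    arm_compute::Tensor t;
    t.allocator()->init(arm_compute::TensorInfo(arm_compute::TensorShape(8u), 1,
                                                arm_compute::DataType::F32));
    t.allocator()->import_memory(buf.data());  // zero-copy: t now views buf
    // ... run an ACL function that reads/writes t ...
    t.allocator()->free();  // detaches the imported pointer; buf stays valid
}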

} // namespace intel_cpu
} // namespace ov
38 changes: 38 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_executor.hpp
@@ -0,0 +1,38 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "cpu_memory.h"
#include "nodes/executors/executor.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"

namespace ov {
namespace intel_cpu {

struct ACLTensorAttrs {
bool enableNHWCReshape = false;
size_t maxDimsShape = arm_compute::MAX_DIMS;
};

class ACLCommonExecutor : public Executor {
public:
virtual arm_compute::Status prepare_tensors_info() = 0;
virtual std::unique_ptr<arm_compute::IFunction> configure_function() = 0;

protected:
std::unique_ptr<arm_compute::IFunction> ifunc = nullptr;
std::unordered_map<int, arm_compute::Tensor> list_acl_tensors;
std::unordered_map<int, arm_compute::TensorInfo> list_acl_tensors_infos;
ACLTensorAttrs aclTensorAttrs;

private:
void execute(const MemoryArgs& memory) override;
bool update(const MemoryArgs& memory) override;
};

using ACLCommonExecutorPtr = std::shared_ptr<ACLCommonExecutor>;

} // namespace intel_cpu
} // namespace ov
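To make the contract concrete, a hypothetical minimal subclass (not part of this commit): a derived executor supplies only tensor-info validation and function construction, while the base class drives the validate/init/import/run/free sequence. ARG_SRC and ARG_DST come from memory_arguments.hpp:

// Sketch; assumes acl_executor.hpp and nodes/executors/memory_arguments.hpp are included.
class ACLReluExecutor : public ACLCommonExecutor {
public:
    arm_compute::Status prepare_tensors_info() override {
        // Validate against the TensorInfo objects the base class prepared in update()
        return arm_compute::NEActivationLayer::validate(
            &list_acl_tensors_infos.at(ARG_SRC),
            &list_acl_tensors_infos.at(ARG_DST),
            arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
    }

    std::unique_ptr<arm_compute::IFunction> configure_function() override {
        auto relu = std::make_unique<arm_compute::NEActivationLayer>();
        relu->configure(&list_acl_tensors.at(ARG_SRC),
                        &list_acl_tensors.at(ARG_DST),
                        arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
        return relu;
    }
};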
114 changes: 114 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp
@@ -0,0 +1,114 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_fullyconnected.hpp"
#include "acl_utils.hpp"
#include "nodes/executors/executor.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "utils/debug_capabilities.h"

namespace ov {
namespace intel_cpu {

ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs &attrs, const PostOps &postOps,
const MemoryArgs &memory,
const ExecutorContext::CPtr context) : withBias(attrs.withBias) {
aclTensorAttrs.enableNHWCReshape = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc);
fullyConnectedLayerInfo.weights_trained_layout = getAclDataLayoutByMemoryDesc(memory.at(ARG_WEI)->getDescPtr());
fullyConnectedLayerInfo.transpose_weights = !attrs.weightsNonTransposed;
if (memory.at(ARG_SRC)->getPrecision() == ov::element::f16) {
fullyConnectedLayerInfo.fp_mixed_precision = true;
}

// Fuse a single activation post-op into the FC layer info
if (postOps.size() == 1) {
if (const auto activation = std::dynamic_pointer_cast<ActivationPostOp>(postOps[0])) {
fullyConnectedLayerInfo.activation_info = getActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()),
activation->alpha(),
activation->beta(),
activation->gamma());
}
}
}

bool ACLFullyConnectedExecutor::supports(const FCConfig &config) {
if (config.postOps.size() > 1) {
DEBUG_LOG("ACLFullyConnectedExecutor supports at most one post-op");
return false;
}

const auto& srcDesc = config.descs.at(ARG_SRC);
if (!one_of(srcDesc->getShape().getDims().size(), 2, 3, 4)) {
DEBUG_LOG("ACLFullyConnectedExecutor supports only 2, 3 or 4 dimensions for inputs");
return false;
}

const auto& weiDesc = config.descs.at(ARG_WEI);
if (!one_of(weiDesc->getShape().getDims().size(), 2, 3)) {
DEBUG_LOG("ACLFullyConnectedExecutor supports only 2 or 3 dimensions for weights");
return false;
}
return true;
}

arm_compute::Status ACLFullyConnectedExecutor::prepare_tensors_info() {
auto wei_shape = list_acl_tensors_infos.at(ARG_WEI).tensor_shape();
if (wei_shape.num_dimensions() == 3) {
list_acl_tensors_infos.at(ARG_WEI).set_tensor_shape({wei_shape[0], wei_shape[1] * wei_shape[2]});
wei_shape = list_acl_tensors_infos.at(ARG_WEI).tensor_shape();
}

auto src_shape = list_acl_tensors_infos.at(ARG_SRC).tensor_shape();
if (one_of(src_shape.num_dimensions(), 3, 4)) {
list_acl_tensors_infos.at(ARG_SRC).set_tensor_shape({wei_shape[0], src_shape.total_size() / wei_shape[0]});
src_shape = list_acl_tensors_infos.at(ARG_SRC).tensor_shape();
}

if (one_of(list_acl_tensors_infos.at(ARG_DST).tensor_shape().num_dimensions(), 3, 4)) {
list_acl_tensors_infos.at(ARG_DST).set_tensor_shape({wei_shape[1], src_shape[1]});
}

auto expected_weight_format = arm_compute::WeightFormat::ANY;
weightsInfo = arm_compute::WeightsInfo(false, 1, 1,
list_acl_tensors_infos.at(ARG_WEI).tensor_shape().total_size(),
false, expected_weight_format);

auto opt_impl_status = arm_compute::NEFullyConnectedLayer::has_opt_impl(
expected_weight_format,
&list_acl_tensors_infos.at(ARG_SRC),
&list_acl_tensors_infos.at(ARG_WEI),
withBias ? &list_acl_tensors_infos.at(ARG_BIAS) : nullptr,
&list_acl_tensors_infos.at(ARG_DST),
fullyConnectedLayerInfo,
weightsInfo);
if (!opt_impl_status) { return opt_impl_status; }
fullyConnectedLayerInfo.enable_fast_math = arm_compute::is_fixed_format_fast_math(expected_weight_format);

if (!fullyConnectedLayerInfo.transpose_weights) {
arm_compute::TensorShape temp_weights_shape = list_acl_tensors_infos.at(ARG_WEI).tensor_shape();
std::swap(temp_weights_shape[0], temp_weights_shape[1]);
list_acl_tensors_infos.at(ARG_WEI).set_tensor_shape(temp_weights_shape);
}

return arm_compute::NEFullyConnectedLayer::validate(&list_acl_tensors_infos.at(ARG_SRC),
&list_acl_tensors_infos.at(ARG_WEI),
withBias ? &list_acl_tensors_infos.at(ARG_BIAS) : nullptr,
&list_acl_tensors_infos.at(ARG_DST),
fullyConnectedLayerInfo,
weightsInfo);
}
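A worked example of the rank collapsing above, with hypothetical sizes. ACL's TensorShape is innermost-dimension-first, so an OpenVINO source of [B, T, C] arrives as (C, T, B) and weights of [O, C] as (C, O):

#include "arm_compute/core/TensorShape.h"
#include <iostream>

int main() {
    arm_compute::TensorShape src{16u, 8u, 2u};  // [B=2, T=8, C=16], reversed
    arm_compute::TensorShape wei{16u, 32u};     // [O=32, C=16], reversed
    // 3D/4D source collapses to 2D exactly as in prepare_tensors_info():
    arm_compute::TensorShape src2d{wei[0], src.total_size() / wei[0]};  // (C, B*T) = (16, 16)
    arm_compute::TensorShape dst2d{wei[1], src2d[1]};                   // (O, B*T) = (32, 16)
    std::cout << src2d[0] << "x" << src2d[1] << " -> "
              << dst2d[0] << "x" << dst2d[1] << std::endl;
}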

std::unique_ptr<arm_compute::IFunction> ACLFullyConnectedExecutor::configure_function() {
auto fc_func = std::make_unique<arm_compute::NEFullyConnectedLayer>();
fc_func->configure(&list_acl_tensors.at(ARG_SRC),
&list_acl_tensors.at(ARG_WEI),
withBias ? &list_acl_tensors.at(ARG_BIAS) : nullptr,
&list_acl_tensors.at(ARG_DST),
fullyConnectedLayerInfo,
weightsInfo);
return fc_func;
}

} // namespace intel_cpu
} // namespace ov
(Diffs for the remaining changed files are not shown.)