Commit 9004e85: add fc acl executor

allnes committed May 14, 2024
1 parent a2cd7be commit 9004e85
Showing 12 changed files with 530 additions and 122 deletions.
@@ -220,18 +220,22 @@ bool CpuBlockedMemoryDesc::isBlockedCFormat(size_t blk_size) const {
}

bool CpuBlockedMemoryDesc::isTailCFormat() const {
#if !defined(OPENVINO_ARCH_ARM) && !defined(OPENVINO_ARCH_ARM64)
if (shape.getRank() < 3) {
return false;
}
#endif
if (shape.getRank() != order.size()) {
return false;
}
if (!std::is_sorted(order.begin(), --order.end())) {
return false;
}
#if !defined(OPENVINO_ARCH_ARM) && !defined(OPENVINO_ARCH_ARM64)
if (order.back() != 1) {
return false;
}
#endif
return true;
}
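Note: the new guards relax this check on ARM. ARM/ARM64 builds now skip both the rank >= 3 requirement and the channels-last (order.back() == 1) requirement, so lower-rank and non-channels-last layouts also report as tail-C there. A standalone sketch of the relaxed predicate (not the plugin code; order is the dimension permutation, e.g. {0, 2, 3, 1} for NHWC):

#include <algorithm>
#include <iterator>
#include <vector>

bool isTailCFormatSketch(const std::vector<size_t>& order, bool arm_build) {
    // (the real code also requires shape rank == order.size())
    if (order.empty())
        return false;
    if (!arm_build && order.size() < 3)    // rank check is now x86-only
        return false;
    if (!std::is_sorted(order.begin(), std::prev(order.end())))
        return false;
    if (!arm_build && order.back() != 1)   // channels-last check is now x86-only
        return false;
    return true;
}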

121 changes: 19 additions & 102 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
@@ -361,66 +361,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
return acl_op;
};
break;
case Algorithm::EltwiseRelu:
if (aclEltwiseAttrs.alpha == 0) {
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
ActivationLayerInfo::ActivationFunction::RELU))
return false;
} else {
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
{ActivationLayerInfo::ActivationFunction::LEAKY_RELU, aclEltwiseAttrs.alpha}))
return false;
}
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
if (aclEltwiseAttrs.alpha == 0) {
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::RELU);
} else {
acl_op->configure(&srcTensors[0], &dstTensors[0],
{ActivationLayerInfo::ActivationFunction::LEAKY_RELU, aclEltwiseAttrs.alpha});
}
return acl_op;
};
break;
case Algorithm::EltwiseGeluErf:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::GELU))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::GELU);
return acl_op;
};
break;
case Algorithm::EltwiseElu:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
{ActivationLayerInfo::ActivationFunction::ELU, aclEltwiseAttrs.alpha}))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], {ActivationLayerInfo::ActivationFunction::ELU, aclEltwiseAttrs.alpha});
return acl_op;
};
break;
case Algorithm::EltwiseTanh:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
{ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f}))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0],
{ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f});
return acl_op;
};
break;
case Algorithm::EltwiseSigmoid:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::LOGISTIC))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::LOGISTIC);
return acl_op;
};
break;
case Algorithm::EltwiseAbs:
if (!NEAbsLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0]))
return false;
@@ -430,24 +370,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
return acl_op;
};
break;
case Algorithm::EltwiseSqrt:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::SQRT))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::SQRT);
return acl_op;
};
break;
case Algorithm::EltwiseSoftRelu:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::SOFT_RELU))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::SOFT_RELU);
return acl_op;
};
break;
case Algorithm::EltwiseExp:
if (!NEExpLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0]))
return false;
@@ -457,28 +379,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
return acl_op;
};
break;
case Algorithm::EltwiseClamp:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, aclEltwiseAttrs.beta, aclEltwiseAttrs.alpha}))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0],
{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, aclEltwiseAttrs.beta, aclEltwiseAttrs.alpha});
return acl_op;
};
break;
case Algorithm::EltwiseSwish:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
{ActivationLayerInfo::ActivationFunction::SWISH, aclEltwiseAttrs.alpha}))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0],
{ActivationLayerInfo::ActivationFunction::SWISH, aclEltwiseAttrs.alpha});
return acl_op;
};
break;
case Algorithm::EltwisePrelu:
if (!NEPReluLayer::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0]))
return false;
@@ -488,12 +388,29 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
return acl_op;
};
break;
+ case Algorithm::EltwiseRelu:
+ case Algorithm::EltwiseGeluErf:
+ case Algorithm::EltwiseElu:
+ case Algorithm::EltwiseTanh:
+ case Algorithm::EltwiseSigmoid:
+ case Algorithm::EltwiseSqrt:
+ case Algorithm::EltwiseSoftRelu:
+ case Algorithm::EltwiseClamp:
+ case Algorithm::EltwiseSwish:
case Algorithm::EltwiseHswish:
- if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::HARD_SWISH))
+ if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
+                                  getActivationLayerInfo(aclEltwiseAttrs.algorithm,
+                                                         aclEltwiseAttrs.alpha,
+                                                         aclEltwiseAttrs.beta,
+                                                         aclEltwiseAttrs.gamma)))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
- acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::HARD_SWISH);
+ acl_op->configure(&srcTensors[0], &dstTensors[0],
+                   getActivationLayerInfo(aclEltwiseAttrs.algorithm,
+                                          aclEltwiseAttrs.alpha,
+                                          aclEltwiseAttrs.beta,
+                                          aclEltwiseAttrs.gamma));
return acl_op;
};
break;
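The ten per-algorithm activation blocks removed above are folded into one case that obtains its ActivationLayerInfo from getActivationLayerInfo. The helper's body is not part of this diff; a hypothetical reconstruction of the mapping, based on the parameters the deleted blocks passed:

using AF = arm_compute::ActivationLayerInfo::ActivationFunction;

// Hypothetical sketch; the real helper is defined elsewhere in the plugin.
arm_compute::ActivationLayerInfo getActivationLayerInfoSketch(Algorithm algo,
                                                              float alpha,
                                                              float beta,
                                                              float gamma) {
    (void)gamma;  // unused by the mappings the removed blocks performed
    switch (algo) {
        case Algorithm::EltwiseRelu:
            return alpha == 0 ? arm_compute::ActivationLayerInfo{AF::RELU}
                              : arm_compute::ActivationLayerInfo{AF::LEAKY_RELU, alpha};
        case Algorithm::EltwiseGeluErf:  return {AF::GELU};
        case Algorithm::EltwiseElu:      return {AF::ELU, alpha};
        case Algorithm::EltwiseTanh:     return {AF::TANH, 1.f, 1.f};
        case Algorithm::EltwiseSigmoid:  return {AF::LOGISTIC};
        case Algorithm::EltwiseSqrt:     return {AF::SQRT};
        case Algorithm::EltwiseSoftRelu: return {AF::SOFT_RELU};
        case Algorithm::EltwiseClamp:    return {AF::LU_BOUNDED_RELU, beta, alpha};
        case Algorithm::EltwiseSwish:    return {AF::SWISH, alpha};
        case Algorithm::EltwiseHswish:   return {AF::HARD_SWISH};
        default:                         return {};
    }
}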
64 changes: 64 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_executor.cpp
@@ -0,0 +1,64 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_executor.hpp"
#include "acl_utils.hpp"
#include "nodes/executors/executor.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "utils/debug_capabilities.h"

namespace ov {
namespace intel_cpu {

bool ACLCommonExecutor::update(const MemoryArgs &memory) {
std::unordered_map<int, arm_compute::DataType> acl_tensors_types_list;
std::unordered_map<int, arm_compute::DataLayout> acl_tensors_layouts_list;
for (auto& cpu_mem_ptr : memory) {
acl_tensors_types_list[cpu_mem_ptr.first] = precisionToAclDataType(cpu_mem_ptr.second->getPrecision());
acl_tensors_layouts_list[cpu_mem_ptr.first] = getAclDataLayoutByMemoryDesc(cpu_mem_ptr.second->getDescPtr());
}

for (auto& cpu_mem_ptr : memory) {
if (acl_tensors_types_list[cpu_mem_ptr.first] == arm_compute::DataType::UNKNOWN) {
list_acl_tensors_infos[cpu_mem_ptr.first] = arm_compute::TensorInfo();
continue;
}

auto collapsed_dims = collapse_dims_to_max_rank(cpu_mem_ptr.second->getStaticDims(),
aclTensorAttrs.maxDimsShape);
auto acl_tensor_shape = shapeCast(collapsed_dims);
if (aclTensorAttrs.enableNHWCReshape) {
changeLayoutToNH_C({&acl_tensor_shape});
}
list_acl_tensors_infos[cpu_mem_ptr.first] = arm_compute::TensorInfo(acl_tensor_shape, 1,
acl_tensors_types_list[cpu_mem_ptr.first],
acl_tensors_layouts_list[cpu_mem_ptr.first]);
}

auto status = prepare_tensors_info();
if (!status) {
DEBUG_LOG("ACL operator validation was failed: ", status.error_description());
return false;
}

for (auto& acl_tensor_info : list_acl_tensors_infos) {
list_acl_tensors[acl_tensor_info.first].allocator()->init(acl_tensor_info.second);
}

// ACL configure() is not thread-safe, so serialize function construction
configureThreadSafe([&] { ifunc = configure_function(); });
return true;
}

void ACLCommonExecutor::execute(const MemoryArgs &memory) {
for (auto& acl_tensor : list_acl_tensors) {
acl_tensor.second.allocator()->import_memory(memory.at(acl_tensor.first)->getData());
}
ifunc->run();
for (auto& acl_tensor : list_acl_tensors) {
acl_tensor.second.allocator()->free();
}
}
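The run loop relies on ACL's external-memory import: import_memory() points each tensor at the plugin-owned buffer without copying, and free() merely detaches that pointer after run(). A minimal standalone illustration of the pattern, with a hypothetical buffer and shape:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include <vector>

int main() {
    std::vector<float> buf(8, 1.f);  // stands in for plugin-owned memory
    arm_compute::Tensor t;
    t.allocator()->init(arm_compute::TensorInfo(arm_compute::TensorShape(8u), 1,
                                                arm_compute::DataType::F32));
    t.allocator()->import_memory(buf.data());  // zero-copy: t now views buf
    // ... run an ACL function that reads/writes t ...
    t.allocator()->free();  // detaches the imported pointer; buf stays valid
}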

} // namespace intel_cpu
} // namespace ov
38 changes: 38 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_executor.hpp
@@ -0,0 +1,38 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "cpu_memory.h"
#include "nodes/executors/executor.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"

namespace ov {
namespace intel_cpu {

struct ACLTensorAttrs {
bool enableNHWCReshape = false;
size_t maxDimsShape = arm_compute::MAX_DIMS;
};

class ACLCommonExecutor : public Executor {
public:
virtual arm_compute::Status prepare_tensors_info() = 0;
virtual std::unique_ptr<arm_compute::IFunction> configure_function() = 0;

protected:
std::unique_ptr<arm_compute::IFunction> ifunc = nullptr;
std::unordered_map<int, arm_compute::Tensor> list_acl_tensors;
std::unordered_map<int, arm_compute::TensorInfo> list_acl_tensors_infos;
ACLTensorAttrs aclTensorAttrs;

private:
void execute(const MemoryArgs& memory) override;
bool update(const MemoryArgs& memory) override;
};

using ACLCommonExecutorPtr = std::shared_ptr<ACLCommonExecutor>;

} // namespace intel_cpu
} // namespace ov
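To make the contract concrete, a hypothetical minimal subclass (not part of this commit): a derived executor supplies only tensor-info validation and function construction, while the base class drives the validate/init/import/run/free sequence. ARG_SRC and ARG_DST come from memory_arguments.hpp:

// Sketch; assumes acl_executor.hpp and nodes/executors/memory_arguments.hpp are included.
class ACLReluExecutor : public ACLCommonExecutor {
public:
    arm_compute::Status prepare_tensors_info() override {
        // Validate against the TensorInfo objects the base class prepared in update()
        return arm_compute::NEActivationLayer::validate(
            &list_acl_tensors_infos.at(ARG_SRC),
            &list_acl_tensors_infos.at(ARG_DST),
            arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
    }

    std::unique_ptr<arm_compute::IFunction> configure_function() override {
        auto relu = std::make_unique<arm_compute::NEActivationLayer>();
        relu->configure(&list_acl_tensors.at(ARG_SRC),
                        &list_acl_tensors.at(ARG_DST),
                        arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
        return relu;
    }
};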
114 changes: 114 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp
@@ -0,0 +1,114 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_fullyconnected.hpp"
#include "acl_utils.hpp"
#include "nodes/executors/executor.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "utils/debug_capabilities.h"

namespace ov {
namespace intel_cpu {

ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs &attrs, const PostOps &postOps,
const MemoryArgs &memory,
const ExecutorContext::CPtr context) : withBias(attrs.withBias) {
aclTensorAttrs.enableNHWCReshape = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc);
fullyConnectedLayerInfo.weights_trained_layout = getAclDataLayoutByMemoryDesc(memory.at(ARG_WEI)->getDescPtr());
fullyConnectedLayerInfo.transpose_weights = !attrs.weightsNonTransposed;
if (memory.at(ARG_SRC)->getPrecision() == ov::element::f16) {
fullyConnectedLayerInfo.fp_mixed_precision = true;
}

// Fuse a single activation post-op into the FC layer info
if (postOps.size() == 1) {
if (const auto activation = std::dynamic_pointer_cast<ActivationPostOp>(postOps[0])) {
fullyConnectedLayerInfo.activation_info = getActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()),
activation->alpha(),
activation->beta(),
activation->gamma());
}
}
}

bool ACLFullyConnectedExecutor::supports(const FCConfig &config) {
if (config.postOps.size() > 1) {
DEBUG_LOG("ACLFullyConnectedExecutor supports at most one post-op");
return false;
}

const auto& srcDesc = config.descs.at(ARG_SRC);
if (!one_of(srcDesc->getShape().getDims().size(), 2, 3, 4)) {
DEBUG_LOG("ACLFullyConnectedExecutor supports only 2, 3 or 4 dimensions for inputs");
return false;
}

const auto& weiDesc = config.descs.at(ARG_WEI);
if (!one_of(weiDesc->getShape().getDims().size(), 2, 3)) {
DEBUG_LOG("ACLFullyConnectedExecutor supports only 2 or 3 dimensions for weights");
return false;
}
return true;
}

arm_compute::Status ACLFullyConnectedExecutor::prepare_tensors_info() {
auto wei_shape = list_acl_tensors_infos.at(ARG_WEI).tensor_shape();
if (wei_shape.num_dimensions() == 3) {
list_acl_tensors_infos.at(ARG_WEI).set_tensor_shape({wei_shape[0], wei_shape[1] * wei_shape[2]});
wei_shape = list_acl_tensors_infos.at(ARG_WEI).tensor_shape();
}

auto src_shape = list_acl_tensors_infos.at(ARG_SRC).tensor_shape();
if (one_of(src_shape.num_dimensions(), 3, 4)) {
list_acl_tensors_infos.at(ARG_SRC).set_tensor_shape({wei_shape[0], src_shape.total_size() / wei_shape[0]});
src_shape = list_acl_tensors_infos.at(ARG_SRC).tensor_shape();
}

if (one_of(list_acl_tensors_infos.at(ARG_DST).tensor_shape().num_dimensions(), 3, 4)) {
list_acl_tensors_infos.at(ARG_DST).set_tensor_shape({wei_shape[1], src_shape[1]});
}

auto expected_weight_format = arm_compute::WeightFormat::ANY;
weightsInfo = arm_compute::WeightsInfo(false, 1, 1,
list_acl_tensors_infos.at(ARG_WEI).tensor_shape().total_size(),
false, expected_weight_format);

auto opt_impl_status = arm_compute::NEFullyConnectedLayer::has_opt_impl(
expected_weight_format,
&list_acl_tensors_infos.at(ARG_SRC),
&list_acl_tensors_infos.at(ARG_WEI),
withBias ? &list_acl_tensors_infos.at(ARG_BIAS) : nullptr,
&list_acl_tensors_infos.at(ARG_DST),
fullyConnectedLayerInfo,
weightsInfo);
if (!opt_impl_status) { return opt_impl_status; }
fullyConnectedLayerInfo.enable_fast_math = arm_compute::is_fixed_format_fast_math(expected_weight_format);

if (!fullyConnectedLayerInfo.transpose_weights) {
arm_compute::TensorShape temp_weights_shape = list_acl_tensors_infos.at(ARG_WEI).tensor_shape();
std::swap(temp_weights_shape[0], temp_weights_shape[1]);
list_acl_tensors_infos.at(ARG_WEI).set_tensor_shape(temp_weights_shape);
}

return arm_compute::NEFullyConnectedLayer::validate(&list_acl_tensors_infos.at(ARG_SRC),
&list_acl_tensors_infos.at(ARG_WEI),
withBias ? &list_acl_tensors_infos.at(ARG_BIAS) : nullptr,
&list_acl_tensors_infos.at(ARG_DST),
fullyConnectedLayerInfo,
weightsInfo);
}
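A worked example of the rank collapsing above, with hypothetical sizes. ACL's TensorShape is innermost-dimension-first, so an OpenVINO source of [B, T, C] arrives as (C, T, B) and weights of [O, C] as (C, O):

#include "arm_compute/core/TensorShape.h"
#include <iostream>

int main() {
    arm_compute::TensorShape src{16u, 8u, 2u};  // [B=2, T=8, C=16], reversed
    arm_compute::TensorShape wei{16u, 32u};     // [O=32, C=16], reversed
    // 3D/4D source collapses to 2D exactly as in prepare_tensors_info():
    arm_compute::TensorShape src2d{wei[0], src.total_size() / wei[0]};  // (C, B*T) = (16, 16)
    arm_compute::TensorShape dst2d{wei[1], src2d[1]};                   // (O, B*T) = (32, 16)
    std::cout << src2d[0] << "x" << src2d[1] << " -> "
              << dst2d[0] << "x" << dst2d[1] << std::endl;
}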

std::unique_ptr<arm_compute::IFunction> ACLFullyConnectedExecutor::configure_function() {
auto fc_func = std::make_unique<arm_compute::NEFullyConnectedLayer>();
fc_func->configure(&list_acl_tensors.at(ARG_SRC),
&list_acl_tensors.at(ARG_WEI),
withBias ? &list_acl_tensors.at(ARG_BIAS) : nullptr,
&list_acl_tensors.at(ARG_DST),
fullyConnectedLayerInfo,
weightsInfo);
return fc_func;
}

} // namespace intel_cpu
} // namespace ov
(Diffs for the remaining changed files are not shown.)