Add FullyConnected ACL executor
allnes committed Jun 21, 2024
1 parent 6a7c442 commit 3a13983
Showing 12 changed files with 538 additions and 118 deletions.
121 changes: 19 additions & 102 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
@@ -361,66 +361,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
return acl_op;
};
break;
case Algorithm::EltwiseRelu:
if (aclEltwiseAttrs.alpha == 0) {
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
ActivationLayerInfo::ActivationFunction::RELU))
return false;
} else {
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
{ActivationLayerInfo::ActivationFunction::LEAKY_RELU, aclEltwiseAttrs.alpha}))
return false;
}
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
if (aclEltwiseAttrs.alpha == 0) {
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::RELU);
} else {
acl_op->configure(&srcTensors[0], &dstTensors[0],
{ActivationLayerInfo::ActivationFunction::LEAKY_RELU, aclEltwiseAttrs.alpha});
}
return acl_op;
};
break;
case Algorithm::EltwiseGeluErf:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::GELU))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::GELU);
return acl_op;
};
break;
case Algorithm::EltwiseElu:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
{ActivationLayerInfo::ActivationFunction::ELU, aclEltwiseAttrs.alpha}))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], {ActivationLayerInfo::ActivationFunction::ELU, aclEltwiseAttrs.alpha});
return acl_op;
};
break;
case Algorithm::EltwiseTanh:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
{ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f}))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0],
{ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f});
return acl_op;
};
break;
case Algorithm::EltwiseSigmoid:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::LOGISTIC))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::LOGISTIC);
return acl_op;
};
break;
case Algorithm::EltwiseAbs:
if (!NEAbsLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0]))
return false;
@@ -430,24 +370,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
return acl_op;
};
break;
case Algorithm::EltwiseSqrt:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::SQRT))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::SQRT);
return acl_op;
};
break;
case Algorithm::EltwiseSoftRelu:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::SOFT_RELU))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::SOFT_RELU);
return acl_op;
};
break;
case Algorithm::EltwiseExp:
if (!NEExpLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0]))
return false;
@@ -457,28 +379,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
return acl_op;
};
break;
case Algorithm::EltwiseClamp:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, aclEltwiseAttrs.beta, aclEltwiseAttrs.alpha}))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0],
{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, aclEltwiseAttrs.beta, aclEltwiseAttrs.alpha});
return acl_op;
};
break;
case Algorithm::EltwiseSwish:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
{ActivationLayerInfo::ActivationFunction::SWISH, aclEltwiseAttrs.alpha}))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0],
{ActivationLayerInfo::ActivationFunction::SWISH, aclEltwiseAttrs.alpha});
return acl_op;
};
break;
case Algorithm::EltwisePrelu:
if (!NEPReluLayer::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0]))
return false;
@@ -488,12 +388,29 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
return acl_op;
};
break;
case Algorithm::EltwiseRelu:
case Algorithm::EltwiseGeluErf:
case Algorithm::EltwiseElu:
case Algorithm::EltwiseTanh:
case Algorithm::EltwiseSigmoid:
case Algorithm::EltwiseSqrt:
case Algorithm::EltwiseSoftRelu:
case Algorithm::EltwiseClamp:
case Algorithm::EltwiseSwish:
case Algorithm::EltwiseHswish:
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
getActivationLayerInfo(aclEltwiseAttrs.algorithm,
aclEltwiseAttrs.alpha,
aclEltwiseAttrs.beta,
aclEltwiseAttrs.gamma)))
return false;
exec_func = [this]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEActivationLayer>();
acl_op->configure(&srcTensors[0], &dstTensors[0],
getActivationLayerInfo(aclEltwiseAttrs.algorithm,
aclEltwiseAttrs.alpha,
aclEltwiseAttrs.beta,
aclEltwiseAttrs.gamma));
return acl_op;
};
break;
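Note: the nine activation branches removed above differed only in the ActivationLayerInfo they handed to NEActivationLayer, so the new fall-through case funnels all of them into one validate/configure pair built by a helper. The helper itself is not part of this hunk; below is a hedged sketch of such a mapping, reconstructed from the deleted cases (the gamma parameter is accepted for signature parity but unused by these mappings):

// Sketch only: reconstructed from the deleted cases above, not the actual
// helper shipped in acl_utils. Assumes ACL's Types.h and OpenVINO's
// Algorithm enum are in scope.
inline arm_compute::ActivationLayerInfo getActivationLayerInfo(Algorithm algorithm,
                                                               float alpha,
                                                               float beta,
                                                               float gamma) {
    using AF = arm_compute::ActivationLayerInfo::ActivationFunction;
    (void)gamma;  // none of the mappings below consume gamma
    switch (algorithm) {
    case Algorithm::EltwiseRelu:
        // alpha == 0 selects plain ReLU; otherwise leaky ReLU with slope alpha
        return alpha == 0 ? arm_compute::ActivationLayerInfo{AF::RELU}
                          : arm_compute::ActivationLayerInfo{AF::LEAKY_RELU, alpha};
    case Algorithm::EltwiseGeluErf:  return {AF::GELU};
    case Algorithm::EltwiseElu:      return {AF::ELU, alpha};
    case Algorithm::EltwiseTanh:     return {AF::TANH, 1.f, 1.f};
    case Algorithm::EltwiseSigmoid:  return {AF::LOGISTIC};
    case Algorithm::EltwiseSqrt:     return {AF::SQRT};
    case Algorithm::EltwiseSoftRelu: return {AF::SOFT_RELU};
    case Algorithm::EltwiseClamp:    return {AF::LU_BOUNDED_RELU, beta, alpha};  // upper, lower
    case Algorithm::EltwiseSwish:    return {AF::SWISH, alpha};
    case Algorithm::EltwiseHswish:   return {AF::HARD_SWISH};
    default:
        OPENVINO_THROW("Unsupported algorithm for ACL activation mapping");
    }
}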
81 changes: 81 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_executor.cpp
@@ -0,0 +1,81 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_executor.hpp"
#include "acl_utils.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "utils/debug_capabilities.h"

namespace ov {
namespace intel_cpu {

ACLMemoryInfo ACLCommonExecutor::initTensorInfo(const MemoryPtr& memoryPtr, ACLTensorAttrs attrs) {
auto acl_tensor_type = precisionToAclDataType(memoryPtr->getPrecision());
auto acl_tensor_layout = getAclDataLayoutByMemoryDesc(memoryPtr->getDescPtr());

ACLMemoryInfo aclMemoryInfo = nullptr;
if (acl_tensor_type != arm_compute::DataType::UNKNOWN) {
auto collapsed_dims = collapse_dims_to_max_rank(memoryPtr->getStaticDims(), attrs.maxDimsShape);
auto acl_tensor_shape = shapeCast(collapsed_dims);
if (attrs.hasLayoutTypeNHWC) {
changeLayoutToNH_C({&acl_tensor_shape});
}
aclMemoryInfo = std::make_shared<arm_compute::TensorInfo>(
acl_tensor_shape, 1,
acl_tensor_type,
acl_tensor_layout);
}
return aclMemoryInfo;
}

ACLMemory ACLCommonExecutor::initTensor(const ACLMemoryInfo& aclMemoryInfo) {
ACLMemory aclMemory = nullptr;
if (aclMemoryInfo) {
aclMemory = std::make_shared<arm_compute::Tensor>();
aclMemory->allocator()->init(*aclMemoryInfo);
}
return aclMemory;
}

bool ACLCommonExecutor::update(const MemoryArgs &memory) {
for (auto& cpu_mem_ptr : memory) {
// Initialize arm_compute::TensorInfo object
auto aclTensorInfo = initTensorInfo(cpu_mem_ptr.second, aclTensorAttrs);
// Initialize arm_compute::Tensor object
aclMemoryMap[cpu_mem_ptr.first] = initTensor(aclTensorInfo);
}

// Update arm_compute::TensorInfo objects for specific ACL function
auto tensorsInfoValidateStatus = updateTensorsInfo(aclMemoryMap);
if (!tensorsInfoValidateStatus) {
DEBUG_LOG("ACL operator validation failed: ", tensorsInfoValidateStatus.error_description());
return false;
}

// Configure arm_compute::IFunction object
configureThreadSafe([&] {
iFunction = configureFunction(aclMemoryMap);
});
return true;
}

void ACLCommonExecutor::execute(const MemoryArgs &memory) {
for (auto& acl_tensor : aclMemoryMap) {
if (acl_tensor.second) {
acl_tensor.second->allocator()->import_memory(memory.at(acl_tensor.first)->getData());
}
}
iFunction->run();
}

ACLCommonExecutor::~ACLCommonExecutor() {
for (auto& acl_tensor : aclMemoryMap) {
if (acl_tensor.second) {
acl_tensor.second->allocator()->free();
}
}
}

} // namespace intel_cpu
} // namespace ov
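Taken together, update() and execute() define the executor lifecycle: update() rebuilds the TensorInfo/Tensor pairs for the current shapes, lets the derived class validate them and configure the ACL function, and execute() imports the CPU buffers into the ACL tensors before running it. A minimal usage sketch under assumed names (srcMem, weiMem, biasMem, dstMem, attrs, postOps and context are hypothetical placeholders, not taken from this PR):

// Hedged usage sketch of the lifecycle; all variables are placeholders for
// objects the CPU plugin node provides.
MemoryArgs memory{{ARG_SRC, srcMem}, {ARG_WEI, weiMem}, {ARG_BIAS, biasMem}, {ARG_DST, dstMem}};
auto executor = std::make_shared<ACLFullyConnectedExecutor>(attrs, postOps, memory, context);
if (executor->update(memory)) {   // validate shapes and configure the arm_compute::IFunction
    executor->execute(memory);    // import_memory() per tensor, then IFunction::run()
}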
52 changes: 52 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_executor.hpp
@@ -0,0 +1,52 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "cpu_memory.h"
#include "nodes/executors/executor.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"

namespace ov {
namespace intel_cpu {

using ACLMemoryInfo = std::shared_ptr<arm_compute::TensorInfo>;
using ACLMemory = std::shared_ptr<arm_compute::Tensor>;
using ACLMemoryMap = std::unordered_map<int, ACLMemory>;
using ACLFunction = std::unique_ptr<arm_compute::IFunction>;

struct ACLTensorAttrs {
bool hasLayoutTypeNHWC = false;
size_t maxDimsShape = arm_compute::MAX_DIMS;
};

class ACLCommonExecutor : public Executor {
public:
virtual arm_compute::Status updateTensorsInfo(const ACLMemoryMap& acl_memory) {
OPENVINO_THROW_NOT_IMPLEMENTED("This version of the 'updateTensorsInfo' method is not implemented by executor");
}
virtual ACLFunction configureFunction(const ACLMemoryMap& acl_memory) {
OPENVINO_THROW_NOT_IMPLEMENTED("This version of the 'configureFunction' method is not implemented by executor");
}
impl_desc_type implType() const override {
return impl_desc_type::acl;
}
void execute(const MemoryArgs& memory) final;
bool update(const MemoryArgs& memory) final;
~ACLCommonExecutor();

protected:
ACLTensorAttrs aclTensorAttrs;

private:
ACLMemoryMap aclMemoryMap;
ACLFunction iFunction = nullptr;
static ACLMemoryInfo initTensorInfo(const MemoryPtr& memoryPtr, ACLTensorAttrs attrs);
static ACLMemory initTensor(const ACLMemoryInfo& aclMemoryInfo);
};

using ACLCommonExecutorPtr = std::shared_ptr<ACLCommonExecutor>;

} // namespace intel_cpu
} // namespace ov
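A concrete executor only has to override the two hooks: updateTensorsInfo() to validate the prepared TensorInfo set and configureFunction() to build the IFunction. A hypothetical minimal subclass for illustration (MyReluExecutor is not part of the commit; it assumes ARG_SRC/ARG_DST from memory_arguments.hpp):

// Illustrative sketch only: a ReLU executor on top of ACLCommonExecutor.
class MyReluExecutor : public ACLCommonExecutor {
public:
    arm_compute::Status updateTensorsInfo(const ACLMemoryMap& acl_memory) override {
        return arm_compute::NEActivationLayer::validate(
            acl_memory.at(ARG_SRC)->info(),
            acl_memory.at(ARG_DST)->info(),
            arm_compute::ActivationLayerInfo(
                arm_compute::ActivationLayerInfo::ActivationFunction::RELU));
    }
    ACLFunction configureFunction(const ACLMemoryMap& acl_memory) override {
        auto relu = std::make_unique<arm_compute::NEActivationLayer>();
        relu->configure(acl_memory.at(ARG_SRC).get(),
                        acl_memory.at(ARG_DST).get(),
                        arm_compute::ActivationLayerInfo(
                            arm_compute::ActivationLayerInfo::ActivationFunction::RELU));
        return relu;
    }
};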
92 changes: 92 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp
@@ -0,0 +1,92 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_fullyconnected.hpp"
#include "acl_utils.hpp"
#include "nodes/executors/executor.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "utils/debug_capabilities.h"
#include "nodes/executors/debug_messages.hpp"
#include "nodes/executors/implementation_utils.hpp"

namespace ov {
namespace intel_cpu {

ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs &attrs, const PostOps &postOps,
const MemoryArgs &memory,
const ExecutorContext::CPtr context) {
aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc);
fullyConnectedLayerInfo.weights_trained_layout = getAclDataLayoutByMemoryDesc(memory.at(ARG_WEI)->getDescPtr());
fullyConnectedLayerInfo.transpose_weights = !attrs.weightsNonTransposed;
if (memory.at(ARG_SRC)->getPrecision() == ov::element::f16) {
fullyConnectedLayerInfo.fp_mixed_precision = true;
}

// Add postops
if (postOps.size() == 1) {
if (const auto activation = std::dynamic_pointer_cast<ActivationPostOp>(postOps[0])) {
fullyConnectedLayerInfo.activation_info = getActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()),
activation->alpha(),
activation->beta(),
activation->gamma());
}
}
}

bool ACLFullyConnectedExecutor::supports(const FCConfig &config) {
VERIFY(one_of(srcType(config), ov::element::f16, ov::element::f32), UNSUPPORTED_SRC_PRECISIONS);
VERIFY(postOpsNumbers(config) < 2, UNSUPPORTED_NUMBER_OF_POSTOPS);
VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK);
VERIFY(one_of(weiRank(config), 2U, 3U), UNSUPPORTED_SRC_RANK);
return true;
}

arm_compute::Status ACLFullyConnectedExecutor::updateTensorsInfo(const ACLMemoryMap& acl_memory) {
auto wei_shape = acl_memory.at(ARG_WEI)->info()->tensor_shape();
if (wei_shape.num_dimensions() == 3U) {
acl_memory.at(ARG_WEI)->info()->set_tensor_shape({wei_shape[0] * wei_shape[1], wei_shape[2]});
}

auto src_shape = acl_memory.at(ARG_SRC)->info()->tensor_shape();
if (one_of(src_shape.num_dimensions(), 3U, 4U)) {
acl_memory.at(ARG_SRC)->info()->set_tensor_shape({
acl_memory.at(ARG_WEI)->info()->tensor_shape()[0],
src_shape.total_size() / acl_memory.at(ARG_WEI)->info()->tensor_shape()[0]});
}

if (one_of(acl_memory.at(ARG_DST)->info()->tensor_shape().num_dimensions(), 3U, 4U)) {
acl_memory.at(ARG_DST)->info()->set_tensor_shape({
acl_memory.at(ARG_WEI)->info()->tensor_shape()[1],
acl_memory.at(ARG_SRC)->info()->tensor_shape()[1]});
}

if (!fullyConnectedLayerInfo.transpose_weights) {
arm_compute::TensorShape temp_weights_shape = acl_memory.at(ARG_WEI)->info()->tensor_shape();
std::swap(temp_weights_shape[0], temp_weights_shape[1]);
acl_memory.at(ARG_WEI)->info()->set_tensor_shape(temp_weights_shape);
}

return arm_compute::NEFullyConnectedLayer::validate(
acl_memory.at(ARG_SRC)->info(),
acl_memory.at(ARG_WEI)->info(),
acl_memory.at(ARG_BIAS) ? acl_memory.at(ARG_BIAS)->info() : nullptr,
acl_memory.at(ARG_DST)->info(),
fullyConnectedLayerInfo,
weightsInfo);
}

ACLFunction ACLFullyConnectedExecutor::configureFunction(const ACLMemoryMap& acl_memory) {
auto neFC = std::make_unique<arm_compute::NEFullyConnectedLayer>();
neFC->configure(
acl_memory.at(ARG_SRC).get(),
acl_memory.at(ARG_WEI).get(),
acl_memory.at(ARG_BIAS).get(),
acl_memory.at(ARG_DST).get(),
fullyConnectedLayerInfo,
weightsInfo);
return neFC;
}

} // namespace intel_cpu
} // namespace ov
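The reshaping in updateTensorsInfo() flattens 3-D/4-D cases into the 2-D problem NEFullyConnectedLayer expects. A worked example in comments, under the assumption that shapeCast yields ACL's reversed dimension order (index 0 is the innermost axis); the concrete numbers are illustrative only:

// Assumed: src = [B=2, T=8, C=64], wei = [O=32, C=64], transpose_weights == true.
//  - wei is already 2-D: tensor_shape() == {64, 32}                  // {C, O}
//  - src is 3-D, so it collapses to {wei[0], total / wei[0]}
//    = {64, (2 * 8 * 64) / 64} = {64, 16}                            // {C, B*T}
//  - dst follows as {wei[1], src[1]} = {32, 16}                      // {O, B*T}
// i.e. NEFullyConnectedLayer::validate() sees a plain [16 x 64] * [64 x 32] GEMM.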