[CPU][ARM] Weights compression f32->f16 is moved to CPU Plug-in side
antonvor committed Dec 19, 2023
1 parent 63cb89f commit 7681bf9
Showing 12 changed files with 276 additions and 9 deletions.
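In short: with inference_precision=f16 on ARM, the common transformation pipeline previously converted MatMul weight Constants to f16 (and constant-folded them) before the model reached the CPU plugin. After this change the weights are kept unfolded in f32 — via the new KeepConstFP32Unfolded pass and a new Compression rt_info mark on pipeline-inserted Converts — and the CPU plugin performs the f32->f16 weights compression itself by fusing the weight Convert into FullyConnected.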
@@ -14,6 +14,7 @@ namespace pass {
class TRANSFORMATIONS_API EnableDecompressionConvertConstantFolding;
class TRANSFORMATIONS_API DisableDecompressionConvertConstantFolding;
class TRANSFORMATIONS_API KeepConstAndDecompression;
class TRANSFORMATIONS_API KeepConstFP32Unfolded;
class TRANSFORMATIONS_API KeepConstantsPrecisionAndAddConverts;

} // namespace pass
@@ -49,6 +50,12 @@ class ov::pass::KeepConstAndDecompression : public MatcherPass {
KeepConstAndDecompression();
};

class ov::pass::KeepConstFP32Unfolded : public MatcherPass {
public:
OPENVINO_RTTI("KeepConstFP32Unfolded", "0");
KeepConstFP32Unfolded();
};

/**
* @ingroup ie_transformation_common_api
* @brief Prevents Consts precision conversion and adds Convert with disabled ConstantFolding
@@ -23,6 +23,12 @@ TRANSFORMATIONS_API void unmark_as_decompression(const std::shared_ptr<Node>& node);

TRANSFORMATIONS_API bool is_decompression(const std::shared_ptr<Node>& node);

TRANSFORMATIONS_API void mark_as_compression(const std::shared_ptr<Node>& node);

TRANSFORMATIONS_API void unmark_as_compression(const std::shared_ptr<Node>& node);

TRANSFORMATIONS_API bool is_compression(const std::shared_ptr<Node>& node);

/**
* @ingroup ie_runtime_attr_api
* @brief Decompression class represents runtime info attribute that marks operation
@@ -43,4 +49,19 @@ class TRANSFORMATIONS_API Decompression : public RuntimeAttribute {
}
};

class TRANSFORMATIONS_API Compression : public RuntimeAttribute {
public:
OPENVINO_RTTI("Compression", "0");

Compression() = default;

bool visit_attributes(AttributeVisitor& visitor) override {
return true;
}

bool is_copyable() const override {
return false;
}
};

} // namespace ov
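
A minimal usage sketch for the new attribute (hypothetical standalone snippet, assuming the OpenVINO transformations headers are on the include path; the helpers themselves are the ones declared above):

#include <memory>
#include "openvino/op/constant.hpp"
#include "openvino/op/convert.hpp"
#include "transformations/rt_info/decompression.hpp"

int main() {
    // A weight Constant kept in f32, followed by an explicit cast to f16.
    auto weights = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{2, 2}, {1.f, 2.f, 3.f, 4.f});
    auto convert = std::make_shared<ov::op::v0::Convert>(weights, ov::element::f16);

    ov::mark_as_compression(convert);                 // attaches the Compression attribute to rt_info
    const bool marked = ov::is_compression(convert);  // -> true
    ov::unmark_as_compression(convert);               // erases it again
    return marked ? 0 : 1;
}

Note that is_copyable() returns false, so the mark deliberately does not propagate to copies of the node.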
@@ -10,6 +10,7 @@
#include "openvino/op/result.hpp"
#include "openvino/op/util/precision_sensitive_attribute.hpp"
#include "openvino/pass/constant_folding.hpp"
#include "transformations/rt_info/decompression.hpp"
#include "transformations/rt_info/disable_fp16_compression.hpp"

using namespace ov;
@@ -48,6 +49,7 @@ bool ov::pass::AlignMixedFP32FP16Types::run_on_model(const std::shared_ptr<ov::Model>& model) {
copy_runtime_info(incoming_node, convert);
input.replace_source_output(convert);
disable_fp16_compression(convert);
mark_as_compression(convert);
pass::disable_constant_folding(convert);
is_changed = true;
}
@@ -76,6 +78,7 @@ bool ov::pass::AlignMixedFP32FP16Types::run_on_model(const std::shared_ptr<ov::Model>& model) {
auto init_name = node->get_friendly_name() + "_compressed_to_f16";
convert->set_friendly_name(generate_uniq_name(init_name));
out_inputs.replace_source_output(convert);
mark_as_compression(convert);
pass::disable_constant_folding(convert);
is_changed = true;
}
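
Both Converts that AlignMixedFP32FP16Types inserts around a precision-sensitive (fp32-kept) region — the upcast on its inputs and the f32->f16 downcast on its outputs — now carry the Compression mark in addition to having constant folding disabled, so later plugin passes can tell these pipeline-inserted casts apart from Converts present in the user's original model.
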
@@ -77,6 +77,32 @@ pass::KeepConstAndDecompression::KeepConstAndDecompression() {
register_matcher(m, callback);
}

pass::KeepConstFP32Unfolded::KeepConstFP32Unfolded() {
MATCHER_SCOPE(KeepConstFP32Unfolded);

auto node_pattern = pattern::wrap_type<ov::op::v0::MatMul>();

matcher_pass_callback callback = [=](pattern::Matcher& m) {
auto node = m.get_match_root();

if (transformation_callback(node)) {
return false;
}

auto constNode = node->get_input_node_shared_ptr(1);
if (!is_type<ov::op::v0::Constant>(constNode) || constNode->get_output_element_type(0) != element::f32)
return false;

disable_constant_folding(constNode);
enable_keep_const_precision(constNode);
disable_fp16_compression(constNode);

return false;
};
auto m = std::make_shared<pattern::Matcher>(node_pattern, matcher_name);
register_matcher(m, callback);
}
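
Note the callback returns false unconditionally: the pass only annotates the weight Constant, it does not rewrite the graph. For context, a sketch of how the new pass can be applied before any f32->f16 conversion runs (hypothetical standalone example; the include path for the pass header is assumed):

#include <memory>
#include "openvino/core/model.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/pass/manager.hpp"
#include "transformations/fp16_compression/mark_decompression_convert_constant_folding.hpp" // assumed location

int main() {
    auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{1, 3});
    auto weights = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{3, 4}, std::vector<float>(12, 1.0f));
    auto matmul = std::make_shared<ov::op::v0::MatMul>(input, weights, false, false);
    auto model = std::make_shared<ov::Model>(ov::OutputVector{matmul}, ov::ParameterVector{input});

    ov::pass::Manager manager;
    manager.register_pass<ov::pass::KeepConstFP32Unfolded>();
    manager.run_passes(model);
    // The weight Constant now has constant folding disabled and both
    // keep-const-precision and disable-fp16-compression set, so a later
    // ConvertPrecision(f32 -> f16) leaves it in f32 for the plugin to handle.
    return 0;
}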

pass::KeepConstantsPrecisionAndAddConverts::KeepConstantsPrecisionAndAddConverts() {
MATCHER_SCOPE(KeepConstantsPrecisionAndAddConverts);
auto const_pattern = pattern::wrap_type<ov::op::v0::Constant>();
@@ -18,3 +18,18 @@ bool ov::is_decompression(const std::shared_ptr<Node>& node) {
const auto& rt_info = node->get_rt_info();
return rt_info.count(Decompression::get_type_info_static());
}

void ov::mark_as_compression(const std::shared_ptr<Node>& node) {
auto& rt_info = node->get_rt_info();
rt_info[Compression::get_type_info_static()] = Compression();
}

void ov::unmark_as_compression(const std::shared_ptr<Node>& node) {
auto& rt_info = node->get_rt_info();
rt_info.erase(Compression::get_type_info_static());
}

bool ov::is_compression(const std::shared_ptr<Node>& node) {
const auto& rt_info = node->get_rt_info();
return rt_info.count(Compression::get_type_info_static());
}
4 changes: 2 additions & 2 deletions src/plugins/intel_cpu/src/graph_optimizer.cpp
@@ -949,8 +949,8 @@ void GraphOptimizer::FuseFCAndConvertOnWeights(Graph& graph) {
&& parent->getChildEdges().size() == 1
&& parent->getChildEdgeAt(0)->getOutputNum() == 1
&& parent->getChildEdgeAt(0)->getChild()->getType() == Type::FullyConnected
-          && one_of(parent->getOriginalInputPrecisionAtPort(0), ov::element::f16)
-          && one_of(parent->getOriginalOutputPrecisionAtPort(0), ov::element::f32, ov::element::bf16)
+          && one_of(parent->getOriginalInputPrecisionAtPort(0), ov::element::f32, ov::element::bf16, ov::element::f16)
+          && one_of(parent->getOriginalOutputPrecisionAtPort(0), ov::element::f32, ov::element::bf16, ov::element::f16)
&& parent->isConstant();
return res;
};
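
The widened check means weight Converts in any direction between f32, bf16 and f16 are now fusable, covering the new plugin-side f32->f16 compression as well as the old f16->f32 decompression. An illustrative restatement of the predicate (hypothetical helpers, not the plugin's actual code):

#include <initializer_list>
#include "openvino/core/type/element_type.hpp"

template <typename T>
static bool one_of(const T& value, std::initializer_list<T> candidates) {
    for (const auto& c : candidates)
        if (value == c)
            return true;
    return false;
}

static bool weight_convert_precisions_ok(const ov::element::Type& in, const ov::element::Type& out) {
    // Before: the input had to be f16 and the output f32/bf16 (pure decompression).
    // After: any f32/bf16/f16 combination qualifies, e.g. f32 -> f16 compression.
    return one_of(in,  {ov::element::f32, ov::element::bf16, ov::element::f16}) &&
           one_of(out, {ov::element::f32, ov::element::bf16, ov::element::f16});
}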
@@ -37,7 +37,7 @@ ov::intel_cpu::ConvertMatMulToFC::ConvertMatMulToFC() {
auto fc_input_b = pattern_map.at(weights_m);
bool is_convert = false;
if (auto convert_node = std::dynamic_pointer_cast<ov::op::v0::Convert>(fc_input_b.get_node_shared_ptr())) {
-            if (is_decompression(convert_node)) {
+            if (is_decompression(convert_node) || fp16_compression_is_disabled(convert_node) || is_compression(convert_node)) {
is_convert = true;
fc_input_b = convert_node->get_input_node_shared_ptr(0);
} else {
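
ConvertMatMulToFC now also peels the weight-side Convert when fp16 compression is disabled on it or when it carries the new Compression mark, not only for decompression Converts. A sketch of the combined predicate (hypothetical helper name; the rt_info accessors are the real ones from the transformations headers):

#include <memory>
#include "openvino/core/node.hpp"
#include "transformations/rt_info/decompression.hpp"
#include "transformations/rt_info/disable_fp16_compression.hpp"

static bool convert_is_plugin_peelable(const std::shared_ptr<ov::Node>& convert_node) {
    return ov::is_decompression(convert_node) ||
           ov::fp16_compression_is_disabled(convert_node) ||
           ov::is_compression(convert_node);
}
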
@@ -300,6 +300,7 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecisions
// It cannot be static data, because it may differ for different inferencePrecision values
const auto precisions = get_convert_precisions();
if (inferencePrecision == ov::element::f16) {
CPU_REGISTER_PASS_ARM(manager, ov::pass::KeepConstFP32Unfolded);
precisions_map fp_convert_precision_map = {{ov::element::f32, ov::element::f16}};
type_to_fuse_map empty_fuse_map = {};
const bool keep_precision_sensitive_in_fp32 = true;
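
Registering KeepConstFP32Unfolded (ARM only, via CPU_REGISTER_PASS_ARM) right before the f32->f16 precision-map conversion is what keeps MatMul weight Constants out of the global fp16 conversion: the pass pins them to f32 and disables folding, and the plugin then performs the actual f32->f16 weights compression itself when it fuses the Convert into FullyConnected.
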
2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/tests/functional/CMakeLists.txt
@@ -37,7 +37,7 @@ else()
file(GLOB_RECURSE TMP_LIST_OF_TEST_CLASSES ${CMAKE_CURRENT_SOURCE_DIR}/single_layer_tests/classes/*.cpp)
file(GLOB_RECURSE TMP_LIST_OF_COMMON_TEST_INSTANCES ${CMAKE_CURRENT_SOURCE_DIR}/single_layer_tests/instances/common/*.cpp)
file(GLOB_RECURSE TMP_LIST_OF_ARM_TEST_INSTANCES ${CMAKE_CURRENT_SOURCE_DIR}/single_layer_tests/instances/arm/*.cpp)
-    file(GLOB_RECURSE TMP_LIST_OF_ARM_SUBGRAPH_TESTS ${CMAKE_CURRENT_SOURCE_DIR}/subgraph_tests/arm/*.cpp)
+    file(GLOB_RECURSE TMP_LIST_OF_ARM_SUBGRAPH_TESTS ${CMAKE_CURRENT_SOURCE_DIR}/subgraph_tests/src/arm/*.cpp)
list(APPEND TMP_LIST_OF_EXPLICITLY_ENABLED_TESTS
${TMP_LIST_OF_TEST_CLASSES} ${TMP_LIST_OF_COMMON_TEST_INSTANCES} ${TMP_LIST_OF_ARM_TEST_INSTANCES} ${TMP_LIST_OF_ARM_SUBGRAPH_TESTS})
set(TMP_EXPLICITLY_ENABLED_TESTS "${TMP_LIST_OF_EXPLICITLY_ENABLED_TESTS}")
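
The corrected glob (subgraph_tests/src/arm instead of subgraph_tests/arm) matches where the ARM subgraph tests actually live, so the new test file added below — shown without its path in this diff view — is compiled into the explicitly enabled test list.
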
@@ -0,0 +1,194 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <tuple>
#include <string>
#include <vector>
#include <memory>
#include <debug.h>
#include <shared_test_classes/base/ov_subgraph.hpp>
#include <ov_models/builders.hpp>
#include "common_test_utils/common_utils.hpp"
#include <common_test_utils/ov_tensor_utils.hpp>
#include "functional_test_utils/skip_tests_config.hpp"
#include "test_utils/cpu_test_utils.hpp"
#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp"

#include "test_utils/cpu_test_utils.hpp"
#include "test_utils/convolution_params.hpp"

using namespace CPUTestUtils;

namespace ov {
namespace test {

using MatMulCompressConvertParams = std::tuple<
std::vector<InputShape>, // input shapes
std::pair<bool, bool>, // transposeA, transposeB
element::Type, // inference precision
CPUSpecificParams
>;

class MatMulCompressConvertCPUTest: public testing::WithParamInterface<MatMulCompressConvertParams>,
virtual public SubgraphBaseTest, public CPUTestsBase {
public:
static std::string getTestCaseName(testing::TestParamInfo<MatMulCompressConvertParams> obj) {
std::vector<InputShape> inputShapes;
std::pair<bool, bool> transpose;
element::Type inferPrecision;
CPUSpecificParams cpuParams;

std::tie(inputShapes, transpose, inferPrecision, cpuParams) = obj.param;

std::ostringstream result;
for (const auto& shape : inputShapes) {
result << ov::test::utils::partialShape2str({shape.first}) << "_";
}
result << "TS=";
for (const auto& shape : inputShapes) {
result << "(";
if (!shape.second.empty()) {
auto itr = shape.second.begin();
do {
result << ov::test::utils::vec2str(*itr);
} while (++itr != shape.second.end() && result << "_");
}
result << ")_";
}
result << "transpose_a=" << transpose.first << "_";
result << "transpose_b=" << transpose.second << "_";

result << "infer_precision=" << inferPrecision << "_";

result << CPUTestsBase::getTestCaseName(cpuParams);

return result.str();
}

protected:
template<typename T>
void transposeShape(T& shape) {
IE_ASSERT(shape.size() > 1);
std::swap(*(shape.end() - 1), *(shape.end() - 2));
}

void CheckFCWeightsPrecision(element::Type expectedWeiElemType) const {
auto getExecValue = [](const ov::Node::RTMap& rtInfo, const std::string &paramName) -> std::string {
auto it = rtInfo.find(paramName);
IE_ASSERT(rtInfo.end() != it);
return it->second.as<std::string>();
};

const auto execFunction = compiledModel.get_runtime_model();
ASSERT_NE(nullptr, execFunction);
for (const auto &fcNode : execFunction->get_ops()) {
if (getExecValue(fcNode->get_rt_info(), ExecGraphInfoSerialization::LAYER_TYPE) == "FullyConnected") {
const auto &constNode = fcNode->get_input_node_shared_ptr(1);
element::Type expectedType(getExecValue(constNode->get_rt_info(), ExecGraphInfoSerialization::OUTPUT_PRECISIONS));
ASSERT_EQ(expectedType, expectedWeiElemType);
}
}
}

void SetUp() override {
targetDevice = ov::test::utils::DEVICE_CPU;

std::vector<InputShape> inputShapes;
std::pair<bool, bool> transpose;
element::Type inferPrecision;
CPUSpecificParams cpuParams;

std::tie(inputShapes, transpose, inferPrecision, cpuParams) = this->GetParam();
std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;

init_input_shapes(inputShapes);

bool transpA = transpose.first;
bool transpB = transpose.second;

if (transpA) {
transposeShape(inputDynamicShapes[0]);
for (auto& shapes : targetStaticShapes) {
transposeShape(shapes[0]);
}
}
if (transpB) {
transposeShape(inputDynamicShapes[1]);
for (auto& shapes : targetStaticShapes) {
transposeShape(shapes[1]);
}
}

if (inferPrecision == element::f16) {
convertCount = 2; // convert f32->f16 on the activation input and convert f16->f32 on the output
}

const auto& inShapeA = inputDynamicShapes[0];
const auto& inShapeB = inputDynamicShapes[1];

configuration.emplace(ov::hint::inference_precision(inferPrecision));

element::Type netType = element::f32;
inType = outType = netType;

std::string cpuNodeType = "FullyConnected";
selectedType = makeSelectedTypeStr(selectedType, outType);
ov::ParameterVector params{std::make_shared<ov::op::v0::Parameter>(inType, inShapeA)};
auto tensor = ov::test::utils::create_and_fill_tensor(element::f32, inShapeB.get_shape());
std::shared_ptr<Node> inputB = std::make_shared<ov::op::v0::Constant>(tensor);

auto matMul = std::make_shared<ov::op::v0::MatMul>(params[0], inputB, transpA, transpB);

function = CPUTestsBase::makeNgraphFunction(netType, params, matMul, cpuNodeType);
}

void CheckExecutionGraph() {
CheckNumberOfNodesWithType(compiledModel, "FullyConnected", 1);
CheckNumberOfNodesWithType(compiledModel, "Convert", convertCount);
CheckFCWeightsPrecision(element::f32);
}

size_t convertCount = 0;
};

TEST_P(MatMulCompressConvertCPUTest, CompareWithRefs) {
SKIP_IF_CURRENT_TEST_IS_DISABLED();
run();
CheckExecutionGraph();
}

namespace {

const std::vector<std::pair<bool, bool>> transposeParams = {
{false, true},
};

const std::vector<std::vector<InputShape>> inputShapes2D = {
static_shapes_to_test_representation({{2, 3}, {3, 4}}),
{
{{-1, -1}, {{2, 3}, {5, 3}}},
{{3, 4}, {{3, 4}, {3, 4}}}
},
};

const std::vector<element::Type> inferPrecisions = {
element::f32,
#if defined(OV_CPU_ARM_ENABLE_FP16)
element::f16,
#endif
};

const auto testParams2D_ARM_smoke = ::testing::Combine(
::testing::ValuesIn(inputShapes2D),
::testing::ValuesIn(transposeParams),
::testing::ValuesIn(inferPrecisions),
::testing::Values(CPUSpecificParams{}));

INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_ARM, MatMulCompressConvertCPUTest, testParams2D_ARM_smoke,
MatMulCompressConvertCPUTest::getTestCaseName);

} // namespace

} // namespace test
} // namespace ov
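
The test builds an f32 MatMul with constant weights, compiles it with inference_precision set to f32 or f16, and then checks the execution graph: exactly one FullyConnected node, the expected number of Convert nodes (two in the f16 case: activations f32->f16 and output f16->f32), and, crucially, that the FullyConnected weights input still reports f32 — i.e. the weights were not pre-converted by the common transformations. It can presumably be run through the CPU functional test binary with a --gtest_filter on smoke_FC_2D_ARM (exact binary name not shown in this diff).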
8 changes: 4 additions & 4 deletions src/plugins/intel_cpu/thirdparty/ACLConfig.cmake
@@ -166,10 +166,10 @@ elseif(NOT TARGET arm_compute::arm_compute)
list(APPEND ARM_COMPUTE_OPTIONS --jobs=${ARM_COMPUTE_SCONS_JOBS})
endif()

-    set(ARM_COMPUTE_DEBUG_OPTIONS
-        debug=1
-        asserts=1
-        logging=1)
+    # set(ARM_COMPUTE_DEBUG_OPTIONS
+    #     debug=1
+    #     asserts=1
+    #     logging=1)

# cmake older 3.20 does not support generator expressions in add_custom_command
# https://cmake.org/cmake/help/latest/command/add_custom_command.html#examples-generating-files
2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/thirdparty/onednn (submodule revision update; the new commit hash is not shown in this view)
