diff --git a/llvm/include/llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h b/llvm/include/llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h
new file mode 100644
index 0000000000000..9426dc8d16482
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h
@@ -0,0 +1,57 @@
+//===- ARMCommonInstCombineIntrinsic.h -
+// instCombineIntrinsic opts for both ARM and AArch64 -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains optimizations for ARM and AArch64 intrinsics that
+/// are shared between both architectures. These functions can be called from:
+/// - ARM TTI's instCombineIntrinsic (for arm_neon_* intrinsics)
+/// - AArch64 TTI's instCombineIntrinsic (for aarch64_neon_* and aarch64_sve_*
+///   intrinsics)
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_ARMCOMMONINSTCOMBINEINTRINSIC_H
+#define LLVM_TRANSFORMS_UTILS_ARMCOMMONINSTCOMBINEINTRINSIC_H
+
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
+
+namespace llvm {
+
+namespace ARMCommon {
+
+/// Convert a table lookup to shufflevector if the mask is constant.
+/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
+/// which case we could lower the shufflevector with rev64 instructions
+/// as it's actually a byte reverse.
+Instruction *simplifyNeonTbl1(IntrinsicInst &II, InstCombiner &IC);
+
+/// Simplify NEON multiply-long intrinsics (smull, umull).
+/// These intrinsics perform widening multiplies: they multiply two vectors of
+/// narrow integers and produce a vector of wider integers. This function
+/// performs algebraic simplifications:
+/// 1. Multiply by zero => zero vector
+/// 2. Multiply by one => zero/sign-extend the non-one operand
+/// 3. Both operands constant => regular multiply that can be constant-folded
+///    later
+Instruction *simplifyNeonMultiply(IntrinsicInst &II, InstCombiner &IC,
+                                  bool IsSigned);
+
+/// Simplify AES encryption/decryption intrinsics (AESE, AESD).
+///
+/// ARM's AES instructions (AESE/AESD) XOR the data and the key, provided as
+/// separate arguments, before performing the encryption/decryption operation.
+/// We can fold that "internal" XOR with a previous one.
+Instruction *simplifyAES(IntrinsicInst &II, InstCombiner &IC);
+
+} // namespace ARMCommon
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_UTILS_ARMCOMMONINSTCOMBINEINTRINSIC_H
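
To make the first of these folds concrete, here is the tbl1 case in IR terms — a hand-written sketch rather than a test from this patch (the function name is invented; the mask is the byte-reverse case called out in the comment above):

    ; before: vtbl1 with the constant mask { 7,6,5,4,3,2,1,0 }
    declare <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8>, <8 x i8>)
    define <8 x i8> @byte_reverse(<8 x i8> %v) {
      %r = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %v, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
      ret <8 x i8> %r
    }
    ; after: the call becomes a shuffle the backend can lower to rev64
    ;   %r = shufflevector <8 x i8> %v, <8 x i8> zeroinitializer,
    ;        <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
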
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0bae00bafee3c..0d0c0970091d6 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/TargetParser/AArch64TargetParser.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
+#include "llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 #include <algorithm>
@@ -2856,6 +2857,18 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
   case Intrinsic::aarch64_neon_fmaxnm:
   case Intrinsic::aarch64_neon_fminnm:
     return instCombineMaxMinNM(IC, II);
+  case Intrinsic::aarch64_neon_tbl1:
+    return ARMCommon::simplifyNeonTbl1(II, IC);
+  case Intrinsic::aarch64_neon_smull:
+  case Intrinsic::aarch64_neon_umull: {
+    bool IsSigned = IID == Intrinsic::aarch64_neon_smull;
+    return ARMCommon::simplifyNeonMultiply(II, IC, IsSigned);
+  }
+  case Intrinsic::aarch64_crypto_aesd:
+  case Intrinsic::aarch64_crypto_aese:
+  case Intrinsic::aarch64_sve_aesd:
+  case Intrinsic::aarch64_sve_aese:
+    return ARMCommon::simplifyAES(II, IC);
   case Intrinsic::aarch64_sve_convert_from_svbool:
     return instCombineConvertFromSVBool(IC, II);
   case Intrinsic::aarch64_sve_dup:
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index fdb0ec40cb41f..c93b2fbf419fe 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -31,6 +31,7 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/TargetParser/SubtargetFeature.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
+#include "llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
@@ -182,6 +183,19 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     break;
   }
 
+  case Intrinsic::arm_neon_vtbl1:
+    return ARMCommon::simplifyNeonTbl1(II, IC);
+
+  case Intrinsic::arm_neon_vmulls:
+  case Intrinsic::arm_neon_vmullu: {
+    bool IsSigned = IID == Intrinsic::arm_neon_vmulls;
+    return ARMCommon::simplifyNeonMultiply(II, IC, IsSigned);
+  }
+
+  case Intrinsic::arm_neon_aesd:
+  case Intrinsic::arm_neon_aese:
+    return ARMCommon::simplifyAES(II, IC);
+
   case Intrinsic::arm_mve_pred_i2v: {
     Value *Arg = II.getArgOperand(0);
     Value *ArgArg;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 8e4edefec42fd..8a54c0dde6be6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -737,44 +737,6 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) {
   return nullptr;
 }
 
-/// Convert a table lookup to shufflevector if the mask is constant.
-/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
-/// which case we could lower the shufflevector with rev64 instructions
-/// as it's actually a byte reverse.
-static Value *simplifyNeonTbl1(const IntrinsicInst &II,
-                               InstCombiner::BuilderTy &Builder) {
-  // Bail out if the mask is not a constant.
-  auto *C = dyn_cast<Constant>(II.getArgOperand(1));
-  if (!C)
-    return nullptr;
-
-  auto *VecTy = cast<FixedVectorType>(II.getType());
-  unsigned NumElts = VecTy->getNumElements();
-
-  // Only perform this transformation for <8 x i8> vector types.
-  if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
-    return nullptr;
-
-  int Indexes[8];
-
-  for (unsigned I = 0; I < NumElts; ++I) {
-    Constant *COp = C->getAggregateElement(I);
-
-    if (!COp || !isa<ConstantInt>(COp))
-      return nullptr;
-
-    Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
-
-    // Make sure the mask indices are in range.
-    if ((unsigned)Indexes[I] >= NumElts)
-      return nullptr;
-  }
-
-  auto *V1 = II.getArgOperand(0);
-  auto *V2 = Constant::getNullValue(V1->getType());
-  return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes));
-}
-
 // Returns true iff the 2 intrinsics have the same operands, limiting the
 // comparison to the first NumOperands.
 static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
@@ -3155,72 +3117,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     Intrinsic::getOrInsertDeclaration(II->getModule(), NewIntrin);
     return CallInst::Create(NewFn, CallArgs);
   }
-  case Intrinsic::arm_neon_vtbl1:
-  case Intrinsic::aarch64_neon_tbl1:
-    if (Value *V = simplifyNeonTbl1(*II, Builder))
-      return replaceInstUsesWith(*II, V);
-    break;
-
-  case Intrinsic::arm_neon_vmulls:
-  case Intrinsic::arm_neon_vmullu:
-  case Intrinsic::aarch64_neon_smull:
-  case Intrinsic::aarch64_neon_umull: {
-    Value *Arg0 = II->getArgOperand(0);
-    Value *Arg1 = II->getArgOperand(1);
-
-    // Handle mul by zero first:
-    if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
-      return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType()));
-    }
-
-    // Check for constant LHS & RHS - in this case we just simplify.
-    bool Zext = (IID == Intrinsic::arm_neon_vmullu ||
-                 IID == Intrinsic::aarch64_neon_umull);
-    VectorType *NewVT = cast<VectorType>(II->getType());
-    if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
-      if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
-        Value *V0 = Builder.CreateIntCast(CV0, NewVT, /*isSigned=*/!Zext);
-        Value *V1 = Builder.CreateIntCast(CV1, NewVT, /*isSigned=*/!Zext);
-        return replaceInstUsesWith(CI, Builder.CreateMul(V0, V1));
-      }
-
-      // Couldn't simplify - canonicalize constant to the RHS.
-      std::swap(Arg0, Arg1);
-    }
-
-    // Handle mul by one:
-    if (Constant *CV1 = dyn_cast<Constant>(Arg1))
-      if (ConstantInt *Splat =
-              dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
-        if (Splat->isOne())
-          return CastInst::CreateIntegerCast(Arg0, II->getType(),
-                                             /*isSigned=*/!Zext);
-
-    break;
-  }
-  case Intrinsic::arm_neon_aesd:
-  case Intrinsic::arm_neon_aese:
-  case Intrinsic::aarch64_crypto_aesd:
-  case Intrinsic::aarch64_crypto_aese:
-  case Intrinsic::aarch64_sve_aesd:
-  case Intrinsic::aarch64_sve_aese: {
-    Value *DataArg = II->getArgOperand(0);
-    Value *KeyArg = II->getArgOperand(1);
-
-    // Accept zero on either operand.
-    if (!match(KeyArg, m_ZeroInt()))
-      std::swap(KeyArg, DataArg);
-
-    // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
-    Value *Data, *Key;
-    if (match(KeyArg, m_ZeroInt()) &&
-        match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
-      replaceOperand(*II, 0, Data);
-      replaceOperand(*II, 1, Key);
-      return II;
-    }
-    break;
-  }
   case Intrinsic::hexagon_V6_vandvrt:
   case Intrinsic::hexagon_V6_vandvrt_128B: {
     // Simplify Q -> V -> Q conversion.
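
The multiply-long folds being moved here, in the same sketch form — hand-written, function name invented; the behavior matches the existing 2012-04-23-Neon-Intrinsics.ll tests:

    ; umull by a splat of 1 leaves only the widening cast
    declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>)
    define <4 x i32> @mull_by_one(<4 x i16> %x) {
      %r = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %x, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
      ret <4 x i32> %r
    }
    ; after instcombine:  %r = zext <4 x i16> %x to <4 x i32>
    ; The signed variants (vmulls/smull) produce sext instead, and a
    ; zeroinitializer operand folds the whole call to zeroinitializer.
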
diff --git a/llvm/lib/Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp b/llvm/lib/Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp
new file mode 100644
index 0000000000000..fe192596b6f2b
--- /dev/null
+++ b/llvm/lib/Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp
@@ -0,0 +1,136 @@
+//===- ARMCommonInstCombineIntrinsic.cpp -
+// instCombineIntrinsic opts for both ARM and AArch64 ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains optimizations for ARM and AArch64 intrinsics that
+/// are shared between both architectures. These functions can be called from:
+/// - ARM TTI's instCombineIntrinsic (for arm_neon_* intrinsics)
+/// - AArch64 TTI's instCombineIntrinsic (for aarch64_neon_* and aarch64_sve_*
+///   intrinsics)
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+namespace llvm {
+namespace ARMCommon {
+
+/// Convert a table lookup to shufflevector if the mask is constant.
+/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
+/// which case we could lower the shufflevector with rev64 instructions
+/// as it's actually a byte reverse.
+Instruction *simplifyNeonTbl1(IntrinsicInst &II, InstCombiner &IC) {
+  // Bail out if the mask is not a constant.
+  auto *C = dyn_cast<Constant>(II.getArgOperand(1));
+  if (!C)
+    return nullptr;
+
+  auto *VecTy = cast<FixedVectorType>(II.getType());
+  unsigned NumElts = VecTy->getNumElements();
+
+  // Only perform this transformation for <8 x i8> vector types.
+  if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
+    return nullptr;
+
+  int Indexes[8];
+
+  for (unsigned I = 0; I < NumElts; ++I) {
+    Constant *COp = C->getAggregateElement(I);
+
+    if (!COp || !isa<ConstantInt>(COp))
+      return nullptr;
+
+    Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
+
+    // Make sure the mask indices are in range.
+    if ((unsigned)Indexes[I] >= NumElts)
+      return nullptr;
+  }
+
+  auto *V1 = II.getArgOperand(0);
+  auto *V2 = Constant::getNullValue(V1->getType());
+  Value *Shuf = IC.Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes));
+  return IC.replaceInstUsesWith(II, Shuf);
+}
+
+/// Simplify NEON multiply-long intrinsics (smull, umull).
+/// These intrinsics perform widening multiplies: they multiply two vectors of
+/// narrow integers and produce a vector of wider integers. This function
+/// performs algebraic simplifications:
+/// 1. Multiply by zero => zero vector
+/// 2. Multiply by one => zero/sign-extend the non-one operand
+/// 3. Both operands constant => regular multiply that can be constant-folded
+///    later
+Instruction *simplifyNeonMultiply(IntrinsicInst &II, InstCombiner &IC,
+                                  bool IsSigned) {
+  Value *Arg0 = II.getArgOperand(0);
+  Value *Arg1 = II.getArgOperand(1);
+
+  // Handle mul by zero first:
+  if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
+    return IC.replaceInstUsesWith(II, ConstantAggregateZero::get(II.getType()));
+  }
+
+  // Check for constant LHS & RHS - in this case we just simplify.
+  VectorType *NewVT = cast<VectorType>(II.getType());
+  if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
+    if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
+      Value *V0 = IC.Builder.CreateIntCast(CV0, NewVT, IsSigned);
+      Value *V1 = IC.Builder.CreateIntCast(CV1, NewVT, IsSigned);
+      return IC.replaceInstUsesWith(II, IC.Builder.CreateMul(V0, V1));
+    }
+
+    // Couldn't simplify - canonicalize constant to the RHS.
+    std::swap(Arg0, Arg1);
+  }
+
+  // Handle mul by one:
+  if (Constant *CV1 = dyn_cast<Constant>(Arg1))
+    if (ConstantInt *Splat =
+            dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
+      if (Splat->isOne())
+        return CastInst::CreateIntegerCast(Arg0, II.getType(), IsSigned);
+
+  return nullptr;
+}
+
+/// Simplify AES encryption/decryption intrinsics (AESE, AESD).
+///
+/// ARM's AES instructions (AESE/AESD) XOR the data and the key, provided as
+/// separate arguments, before performing the encryption/decryption operation.
+/// We can fold that "internal" XOR with a previous one.
+Instruction *simplifyAES(IntrinsicInst &II, InstCombiner &IC) {
+  Value *DataArg = II.getArgOperand(0);
+  Value *KeyArg = II.getArgOperand(1);
+
+  // Accept zero on either operand.
+  if (!match(KeyArg, m_ZeroInt()))
+    std::swap(KeyArg, DataArg);
+
+  // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
+  Value *Data, *Key;
+  if (match(KeyArg, m_ZeroInt()) &&
+      match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
+    IC.replaceOperand(II, 0, Data);
+    IC.replaceOperand(II, 1, Key);
+    return &II;
+  }
+
+  return nullptr;
+}
+
+} // namespace ARMCommon
+} // namespace llvm
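
And the AES fold: when the key operand is zero and the data operand is an xor, the xor is absorbed into the instruction's built-in one. Another hand-written sketch (function name invented; the checked-in aes-intrinsics.ll tests cover this same pattern):

    declare <16 x i8> @llvm.arm.neon.aese(<16 x i8>, <16 x i8>)
    define <16 x i8> @aese_absorbs_xor(<16 x i8> %data, <16 x i8> %key) {
      %xored = xor <16 x i8> %data, %key
      %r = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %xored, <16 x i8> zeroinitializer)
      ret <16 x i8> %r
    }
    ; after instcombine:
    ;   %r = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %data, <16 x i8> %key)
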
diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt
index f367ca2fdf56b..a1c25fd9ccd2b 100644
--- a/llvm/lib/Transforms/Utils/CMakeLists.txt
+++ b/llvm/lib/Transforms/Utils/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_llvm_component_library(LLVMTransformUtils
   AddDiscriminators.cpp
   AMDGPUEmitPrintf.cpp
+  ARMCommonInstCombineIntrinsic.cpp
   ASanStackFrameLayout.cpp
   AssumeBundleBuilder.cpp
   BasicBlockUtils.cpp
diff --git a/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll b/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll
index 8c69d0721b738..fdc628bb59cb0 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+; RUN: opt --mtriple=aarch64 -S -passes=instcombine < %s | FileCheck %s
 
 ; ARM64 AES intrinsic variants
 define <16 x i8> @combineXorAeseZeroARM64(<16 x i8> %data, <16 x i8> %key) {
diff --git a/llvm/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll b/llvm/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll
index 5fc5709ff8897..9ba4b418cb8e5 100644
--- a/llvm/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+; RUN: opt -mtriple=arm -S -passes=instcombine < %s | FileCheck %s
 
 define <4 x i32> @mulByZero(<4 x i16> %x) nounwind readnone ssp {
 ; CHECK-LABEL: define <4 x i32> @mulByZero(
diff --git a/llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll b/llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll
index 0056d872ff9e3..10175096035ec 100644
--- a/llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+; RUN: opt -mtriple=arm -S -passes=instcombine < %s | FileCheck %s
 
 ; ARM AES intrinsic variants
 define <16 x i8> @combineXorAeseZeroARM(<16 x i8> %data, <16 x i8> %key) {
diff --git a/llvm/test/Transforms/InstCombine/ARM/tbl1.ll b/llvm/test/Transforms/InstCombine/ARM/tbl1.ll
index fbec1a2bb7a07..ceeac8648ec51 100644
--- a/llvm/test/Transforms/InstCombine/ARM/tbl1.ll
+++ b/llvm/test/Transforms/InstCombine/ARM/tbl1.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm -passes=instcombine -S | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "armv8-arm-none-eabi"
diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn
index 186d2ef96c19b..dae641537b43c 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn
@@ -8,6 +8,7 @@ static_library("Utils") {
   ]
   sources = [
     "AMDGPUEmitPrintf.cpp",
+    "ARMCommonInstCombineIntrinsic.cpp",
    "ASanStackFrameLayout.cpp",
     "AddDiscriminators.cpp",
     "AssumeBundleBuilder.cpp",
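
A note on why every RUN line above gains a triple: the folds now sit behind the targets' instCombineIntrinsic hooks rather than in the generic InstCombine pass, so they fire only when opt knows the target. A minimal hand-written test in the same style as those above (contents invented, not part of this patch) would look like:

    ; Without -mtriple the call below stays as-is; with -mtriple=arm it folds.
    ; RUN: opt -mtriple=arm -S -passes=instcombine < %s | FileCheck %s
    declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>)
    define <4 x i32> @needs_triple(<4 x i16> %x) {
      %r = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %x, <4 x i16> zeroinitializer)
      ret <4 x i32> %r
    }
    ; CHECK-LABEL: @needs_triple(
    ; CHECK: ret <4 x i32> zeroinitializer
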