diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0bae00bafee3c..4a53e5bd49c70 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64TargetTransformInfo.h"
+#include "../ARMCommon/ARMCommonInstCombineIntrinsic.h"
 #include "AArch64ExpandImm.h"
 #include "AArch64PerfectShuffle.h"
 #include "AArch64SMEAttributes.h"
@@ -2856,6 +2857,26 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
   case Intrinsic::aarch64_neon_fmaxnm:
   case Intrinsic::aarch64_neon_fminnm:
     return instCombineMaxMinNM(IC, II);
+  case Intrinsic::aarch64_neon_tbl1:
+  case Intrinsic::aarch64_neon_tbl2:
+  case Intrinsic::aarch64_neon_tbl3:
+  case Intrinsic::aarch64_neon_tbl4:
+    return ARMCommon::simplifyNeonTbl(II, IC, /*IsExtension=*/false);
+  case Intrinsic::aarch64_neon_tbx1:
+  case Intrinsic::aarch64_neon_tbx2:
+  case Intrinsic::aarch64_neon_tbx3:
+  case Intrinsic::aarch64_neon_tbx4:
+    return ARMCommon::simplifyNeonTbl(II, IC, /*IsExtension=*/true);
+  case Intrinsic::aarch64_neon_smull:
+  case Intrinsic::aarch64_neon_umull: {
+    bool IsSigned = IID == Intrinsic::aarch64_neon_smull;
+    return ARMCommon::simplifyNeonMultiply(II, IC, IsSigned);
+  }
+  case Intrinsic::aarch64_crypto_aesd:
+  case Intrinsic::aarch64_crypto_aese:
+  case Intrinsic::aarch64_sve_aesd:
+  case Intrinsic::aarch64_sve_aese:
+    return ARMCommon::simplifyAES(II, IC);
   case Intrinsic::aarch64_sve_convert_from_svbool:
     return instCombineConvertFromSVBool(IC, II);
   case Intrinsic::aarch64_sve_dup:
diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt
index 285d646293eb7..d27a698ee9e4a 100644
--- a/llvm/lib/Target/AArch64/CMakeLists.txt
+++ b/llvm/lib/Target/AArch64/CMakeLists.txt
@@ -101,6 +101,7 @@ add_llvm_target(AArch64CodeGen
   AArch64Desc
   AArch64Info
   AArch64Utils
+  ARMCommon
   Analysis
   AsmPrinter
   CFGuard
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index fdb0ec40cb41f..99d57b00315b1 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARMTargetTransformInfo.h"
+#include "../ARMCommon/ARMCommonInstCombineIntrinsic.h"
 #include "ARMSubtarget.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/ADT/APInt.h"
@@ -182,6 +183,28 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     break;
   }
 
+  case Intrinsic::arm_neon_vtbl1:
+  case Intrinsic::arm_neon_vtbl2:
+  case Intrinsic::arm_neon_vtbl3:
+  case Intrinsic::arm_neon_vtbl4:
+    return ARMCommon::simplifyNeonTbl(II, IC, /*IsExtension=*/false);
+
+  case Intrinsic::arm_neon_vtbx1:
+  case Intrinsic::arm_neon_vtbx2:
+  case Intrinsic::arm_neon_vtbx3:
+  case Intrinsic::arm_neon_vtbx4:
+    return ARMCommon::simplifyNeonTbl(II, IC, /*IsExtension=*/true);
+
+  case Intrinsic::arm_neon_vmulls:
+  case Intrinsic::arm_neon_vmullu: {
+    bool IsSigned = IID == Intrinsic::arm_neon_vmulls;
+    return ARMCommon::simplifyNeonMultiply(II, IC, IsSigned);
+  }
+
+  case Intrinsic::arm_neon_aesd:
+  case Intrinsic::arm_neon_aese:
+    return ARMCommon::simplifyAES(II, IC);
+
   case Intrinsic::arm_mve_pred_i2v: {
     Value *Arg = II.getArgOperand(0);
     Value *ArgArg;
diff --git a/llvm/lib/Target/ARM/CMakeLists.txt b/llvm/lib/Target/ARM/CMakeLists.txt
index eb3ad01a54fb2..9fc9bc134e5cc 100644
--- a/llvm/lib/Target/ARM/CMakeLists.txt
+++ b/llvm/lib/Target/ARM/CMakeLists.txt
@@ -73,6 +73,7 @@ add_llvm_target(ARMCodeGen
   Thumb2SizeReduction.cpp
 
   LINK_COMPONENTS
+  ARMCommon
   ARMDesc
   ARMInfo
   ARMUtils
diff --git a/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp b/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp
new file mode 100644
index 0000000000000..b524916146ae5
--- /dev/null
+++ b/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp
@@ -0,0 +1,226 @@
+//===- ARMCommonInstCombineIntrinsic.cpp - ARM/AArch64 combines -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains optimizations for ARM and AArch64 intrinsics that
+/// are shared between both architectures. These functions can be called from:
+/// - ARM TTI's instCombineIntrinsic (for arm_neon_* intrinsics)
+/// - AArch64 TTI's instCombineIntrinsic (for aarch64_neon_* and aarch64_sve_*
+///   intrinsics)
+///
+//===----------------------------------------------------------------------===//
+
+#include "ARMCommonInstCombineIntrinsic.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+namespace llvm {
+namespace ARMCommon {
+
+/// Convert `tbl`/`tbx` intrinsics to shufflevector if the mask is constant, and
+/// at most two source operands are actually referenced.
+///
+/// \p IsExtension is true for `tbx`, whose first operand is the fallback
+/// vector used for out-of-range indices; for `tbl` those lanes are zero.
+/// \returns nullptr if the call is left unchanged.
+Instruction *simplifyNeonTbl(IntrinsicInst &II, InstCombiner &IC,
+                             bool IsExtension) {
+  // Bail out if the mask is not a constant.
+  auto *C = dyn_cast<Constant>(II.getArgOperand(II.arg_size() - 1));
+  if (!C)
+    return nullptr;
+
+  auto *RetTy = cast<FixedVectorType>(II.getType());
+  unsigned NumIndexes = RetTy->getNumElements();
+
+  // Only perform this transformation for <8 x i8> and <16 x i8> vector types.
+  if (!(RetTy->getElementType()->isIntegerTy(8) &&
+        (NumIndexes == 8 || NumIndexes == 16)))
+    return nullptr;
+
+  // For tbx instructions, the first argument is the "fallback" vector, which
+  // has the same length as the mask and return type.
+  unsigned StartIndex = IsExtension ? 1 : 0;
+  auto *SourceTy =
+      cast<FixedVectorType>(II.getArgOperand(StartIndex)->getType());
+  // Note that the element count of each source vector does *not* need to be the
+  // same as the element count of the return type and mask! All source vectors
+  // must have the same element count as each other, though.
+  unsigned NumElementsPerSource = SourceTy->getNumElements();
+
+  // There are no tbl/tbx intrinsics for which the destination size exceeds the
+  // source size. However, our definitions of the intrinsics, at least in
+  // IntrinsicsAArch64.td, allow for arbitrary destination vector sizes, so it
+  // *could* technically happen.
+  if (NumIndexes > NumElementsPerSource) {
+    return nullptr;
+  }
+
+  // The tbl/tbx intrinsics take several source operands followed by a mask
+  // operand.
+  unsigned NumSourceOperands = II.arg_size() - 1 - StartIndex;
+
+  // Map input operands to shuffle indices. This also helpfully deduplicates the
+  // input arguments, in case the same value is passed as an argument multiple
+  // times.
+  SmallDenseMap<Value *, unsigned, 2> ValueToShuffleSlot;
+  Value *ShuffleOperands[2] = {PoisonValue::get(SourceTy),
+                               PoisonValue::get(SourceTy)};
+
+  int Indexes[16];
+  for (unsigned I = 0; I < NumIndexes; ++I) {
+    Constant *COp = C->getAggregateElement(I);
+
+    if (!COp || (!isa<PoisonValue>(COp) && !isa<ConstantInt>(COp)))
+      return nullptr;
+
+    // A poison mask element yields a poison result element (index -1).
+    if (isa<PoisonValue>(COp)) {
+      Indexes[I] = -1;
+      continue;
+    }
+
+    uint64_t Index = cast<ConstantInt>(COp)->getZExtValue();
+    // The index of the input argument that this index references (0 = first
+    // source argument, etc).
+    unsigned SourceOperandIndex = Index / NumElementsPerSource;
+    // The index of the element at that source operand.
+    unsigned SourceOperandElementIndex = Index % NumElementsPerSource;
+
+    Value *SourceOperand;
+    if (SourceOperandIndex >= NumSourceOperands) {
+      // This index is out of bounds. Map it to index into either the fallback
+      // vector (tbx) or vector of zeroes (tbl).
+      if (IsExtension) {
+        // For out-of-bounds indices in tbx, choose the `I`th element of the
+        // fallback.
+        SourceOperand = II.getArgOperand(0);
+        SourceOperandElementIndex = I;
+      } else {
+        // Otherwise, choose some element from the dummy vector of zeroes (we'll
+        // always choose the first).
+        SourceOperand = Constant::getNullValue(SourceTy);
+        SourceOperandElementIndex = 0;
+      }
+    } else {
+      SourceOperand = II.getArgOperand(SourceOperandIndex + StartIndex);
+    }
+
+    // The source operand may be the fallback vector, which may not have the
+    // same number of elements as the source vector. In that case, we *could*
+    // choose to extend its length with another shufflevector, but it's simpler
+    // to just bail instead.
+    if (cast<FixedVectorType>(SourceOperand->getType())->getNumElements() !=
+        NumElementsPerSource) {
+      return nullptr;
+    }
+
+    // We now know the source operand referenced by this index. Make it a
+    // shufflevector operand, if it isn't already.
+    unsigned NumSlots = ValueToShuffleSlot.size();
+    // This shuffle references more than two sources, and hence cannot be
+    // represented as a shufflevector.
+    if (NumSlots == 2 && !ValueToShuffleSlot.contains(SourceOperand)) {
+      return nullptr;
+    }
+    auto [It, Inserted] =
+        ValueToShuffleSlot.try_emplace(SourceOperand, NumSlots);
+    if (Inserted) {
+      ShuffleOperands[It->getSecond()] = SourceOperand;
+    }
+
+    unsigned RemappedIndex =
+        (It->getSecond() * NumElementsPerSource) + SourceOperandElementIndex;
+    Indexes[I] = RemappedIndex;
+  }
+
+  Value *Shuf = IC.Builder.CreateShuffleVector(
+      ShuffleOperands[0], ShuffleOperands[1], ArrayRef(Indexes, NumIndexes));
+  return IC.replaceInstUsesWith(II, Shuf);
+}
+
+/// Simplify NEON multiply-long intrinsics (smull, umull).
+/// These intrinsics perform widening multiplies: they multiply two vectors of
+/// narrow integers and produce a vector of wider integers. This function
+/// performs algebraic simplifications:
+/// 1. Multiply by zero => zero vector
+/// 2. Multiply by one => zero/sign-extend the non-one operand
+/// 3. Both operands constant => regular multiply that can be constant-folded
+///    later
+///
+/// \returns nullptr if none of the simplifications applies.
+Instruction *simplifyNeonMultiply(IntrinsicInst &II, InstCombiner &IC,
+                                  bool IsSigned) {
+  Value *Arg0 = II.getArgOperand(0);
+  Value *Arg1 = II.getArgOperand(1);
+
+  // Handle mul by zero first:
+  if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
+    return IC.replaceInstUsesWith(II, ConstantAggregateZero::get(II.getType()));
+  }
+
+  // Check for constant LHS & RHS - in this case we just simplify.
+  VectorType *NewVT = cast<VectorType>(II.getType());
+  if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
+    if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
+      Value *V0 = IC.Builder.CreateIntCast(CV0, NewVT, IsSigned);
+      Value *V1 = IC.Builder.CreateIntCast(CV1, NewVT, IsSigned);
+      return IC.replaceInstUsesWith(II, IC.Builder.CreateMul(V0, V1));
+    }
+
+    // Couldn't simplify - canonicalize constant to the RHS.
+    std::swap(Arg0, Arg1);
+  }
+
+  // Handle mul by one:
+  if (Constant *CV1 = dyn_cast<Constant>(Arg1))
+    if (ConstantInt *Splat =
+            dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
+      if (Splat->isOne())
+        return CastInst::CreateIntegerCast(Arg0, II.getType(), IsSigned);
+
+  return nullptr;
+}
+
+/// Simplify AES encryption/decryption intrinsics (AESE, AESD).
+///
+/// ARM's AES instructions (AESE/AESD) XOR the data and the key, provided as
+/// separate arguments, before performing the encryption/decryption operation.
+/// We can fold that "internal" XOR with a previous one.
+///
+/// \returns \p II with its operands rewritten, or nullptr if nothing changed.
+Instruction *simplifyAES(IntrinsicInst &II, InstCombiner &IC) {
+  Value *DataArg = II.getArgOperand(0);
+  Value *KeyArg = II.getArgOperand(1);
+
+  // Accept zero on either operand.
+  if (!match(KeyArg, m_ZeroInt()))
+    std::swap(KeyArg, DataArg);
+
+  // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
+  Value *Data, *Key;
+  if (match(KeyArg, m_ZeroInt()) &&
+      match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
+    IC.replaceOperand(II, 0, Data);
+    IC.replaceOperand(II, 1, Key);
+    return &II;
+  }
+
+  return nullptr;
+}
+
+} // namespace ARMCommon
+} // namespace llvm
diff --git a/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h b/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h
new file mode 100644
index 0000000000000..319aee48ccb0d
--- /dev/null
+++ b/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h
@@ -0,0 +1,63 @@
+//===- ARMCommonInstCombineIntrinsic.h - ARM/AArch64 combines ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains optimizations for ARM and AArch64 intrinsics that
+/// are shared between both architectures. These functions can be called from:
+/// - ARM TTI's instCombineIntrinsic (for arm_neon_* intrinsics)
+/// - AArch64 TTI's instCombineIntrinsic (for aarch64_neon_* and aarch64_sve_*
+///   intrinsics)
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARMCOMMON_ARMCOMMONINSTCOMBINEINTRINSIC_H
+#define LLVM_LIB_TARGET_ARMCOMMON_ARMCOMMONINSTCOMBINEINTRINSIC_H
+
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
+
+namespace llvm {
+
+namespace ARMCommon {
+
+/// Convert `tbl`/`tbx` intrinsics to shufflevector if the mask is constant, and
+/// at most two source operands are actually referenced.
+///
+/// \p IsExtension is true for `tbx`, whose first operand is the fallback
+/// vector used for out-of-range indices; for `tbl` those lanes are zero.
+/// \returns nullptr if the call is left unchanged.
+Instruction *simplifyNeonTbl(IntrinsicInst &II, InstCombiner &IC,
+                             bool IsExtension);
+
+/// Simplify NEON multiply-long intrinsics (smull, umull).
+/// These intrinsics perform widening multiplies: they multiply two vectors of
+/// narrow integers and produce a vector of wider integers. This function
+/// performs algebraic simplifications:
+/// 1. Multiply by zero => zero vector
+/// 2. Multiply by one => zero/sign-extend the non-one operand
+/// 3. Both operands constant => regular multiply that can be constant-folded
+///    later
+///
+/// \returns nullptr if none of the simplifications applies.
+Instruction *simplifyNeonMultiply(IntrinsicInst &II, InstCombiner &IC,
+                                  bool IsSigned);
+
+/// Simplify AES encryption/decryption intrinsics (AESE, AESD).
+///
+/// ARM's AES instructions (AESE/AESD) XOR the data and the key, provided as
+/// separate arguments, before performing the encryption/decryption operation.
+/// We can fold that "internal" XOR with a previous one.
+///
+/// \returns \p II with its operands rewritten, or nullptr if nothing changed.
+Instruction *simplifyAES(IntrinsicInst &II, InstCombiner &IC);
+
+} // namespace ARMCommon
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_ARMCOMMON_ARMCOMMONINSTCOMBINEINTRINSIC_H
diff --git a/llvm/lib/Target/ARMCommon/CMakeLists.txt b/llvm/lib/Target/ARMCommon/CMakeLists.txt
new file mode 100644
index 0000000000000..1805a5df2f053
--- /dev/null
+++ b/llvm/lib/Target/ARMCommon/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_llvm_component_library(LLVMARMCommon
+  ARMCommonInstCombineIntrinsic.cpp
+
+  LINK_COMPONENTS
+  Core
+  Support
+  TransformUtils
+  )
diff --git a/llvm/lib/Target/CMakeLists.txt b/llvm/lib/Target/CMakeLists.txt
index bcc13f942bf96..e3528014a4be2 100644
--- a/llvm/lib/Target/CMakeLists.txt
+++ b/llvm/lib/Target/CMakeLists.txt
@@ -31,6 +31,11 @@ if (NOT BUILD_SHARED_LIBS AND NOT APPLE AND
   set(CMAKE_CXX_VISIBILITY_PRESET hidden)
 endif()
 
+# Add shared ARM/AArch64 utilities if either target is being built
+if("ARM" IN_LIST LLVM_TARGETS_TO_BUILD OR "AArch64" IN_LIST LLVM_TARGETS_TO_BUILD)
+  add_subdirectory(ARMCommon)
+endif()
+
 foreach(t ${LLVM_TARGETS_TO_BUILD})
   message(STATUS "Targeting ${t}")
   add_subdirectory(${t})
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 8e4edefec42fd..8a54c0dde6be6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -737,44 +737,6 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) {
   return nullptr;
 }
 
-/// Convert a table lookup to shufflevector if the mask is constant.
-/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
-/// which case we could lower the shufflevector with rev64 instructions
-/// as it's actually a byte reverse.
-static Value *simplifyNeonTbl1(const IntrinsicInst &II,
-                               InstCombiner::BuilderTy &Builder) {
-  // Bail out if the mask is not a constant.
-  auto *C = dyn_cast<Constant>(II.getArgOperand(1));
-  if (!C)
-    return nullptr;
-
-  auto *VecTy = cast<FixedVectorType>(II.getType());
-  unsigned NumElts = VecTy->getNumElements();
-
-  // Only perform this transformation for <8 x i8> vector types.
-  if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
-    return nullptr;
-
-  int Indexes[8];
-
-  for (unsigned I = 0; I < NumElts; ++I) {
-    Constant *COp = C->getAggregateElement(I);
-
-    if (!COp || !isa<ConstantInt>(COp))
-      return nullptr;
-
-    Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
-
-    // Make sure the mask indices are in range.
-    if ((unsigned)Indexes[I] >= NumElts)
-      return nullptr;
-  }
-
-  auto *V1 = II.getArgOperand(0);
-  auto *V2 = Constant::getNullValue(V1->getType());
-  return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes));
-}
-
 // Returns true iff the 2 intrinsics have the same operands, limiting the
 // comparison to the first NumOperands.
 static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
@@ -3155,72 +3117,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
         Intrinsic::getOrInsertDeclaration(II->getModule(), NewIntrin);
     return CallInst::Create(NewFn, CallArgs);
   }
-  case Intrinsic::arm_neon_vtbl1:
-  case Intrinsic::aarch64_neon_tbl1:
-    if (Value *V = simplifyNeonTbl1(*II, Builder))
-      return replaceInstUsesWith(*II, V);
-    break;
-
-  case Intrinsic::arm_neon_vmulls:
-  case Intrinsic::arm_neon_vmullu:
-  case Intrinsic::aarch64_neon_smull:
-  case Intrinsic::aarch64_neon_umull: {
-    Value *Arg0 = II->getArgOperand(0);
-    Value *Arg1 = II->getArgOperand(1);
-
-    // Handle mul by zero first:
-    if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
-      return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType()));
-    }
-
-    // Check for constant LHS & RHS - in this case we just simplify.
-    bool Zext = (IID == Intrinsic::arm_neon_vmullu ||
-                 IID == Intrinsic::aarch64_neon_umull);
-    VectorType *NewVT = cast<VectorType>(II->getType());
-    if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
-      if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
-        Value *V0 = Builder.CreateIntCast(CV0, NewVT, /*isSigned=*/!Zext);
-        Value *V1 = Builder.CreateIntCast(CV1, NewVT, /*isSigned=*/!Zext);
-        return replaceInstUsesWith(CI, Builder.CreateMul(V0, V1));
-      }
-
-      // Couldn't simplify - canonicalize constant to the RHS.
-      std::swap(Arg0, Arg1);
-    }
-
-    // Handle mul by one:
-    if (Constant *CV1 = dyn_cast<Constant>(Arg1))
-      if (ConstantInt *Splat =
-              dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
-        if (Splat->isOne())
-          return CastInst::CreateIntegerCast(Arg0, II->getType(),
-                                             /*isSigned=*/!Zext);
-
-    break;
-  }
-  case Intrinsic::arm_neon_aesd:
-  case Intrinsic::arm_neon_aese:
-  case Intrinsic::aarch64_crypto_aesd:
-  case Intrinsic::aarch64_crypto_aese:
-  case Intrinsic::aarch64_sve_aesd:
-  case Intrinsic::aarch64_sve_aese: {
-    Value *DataArg = II->getArgOperand(0);
-    Value *KeyArg = II->getArgOperand(1);
-
-    // Accept zero on either operand.
-    if (!match(KeyArg, m_ZeroInt()))
-      std::swap(KeyArg, DataArg);
-
-    // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
-    Value *Data, *Key;
-    if (match(KeyArg, m_ZeroInt()) &&
-        match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
-      replaceOperand(*II, 0, Data);
-      replaceOperand(*II, 1, Key);
-      return II;
-    }
-    break;
-  }
   case Intrinsic::hexagon_V6_vandvrt:
   case Intrinsic::hexagon_V6_vandvrt_128B: {
     // Simplify Q -> V -> Q conversion.
diff --git a/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll b/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll index 8c69d0721b738..fdc628bb59cb0 100644 --- a/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S -passes=instcombine < %s | FileCheck %s +; RUN: opt --mtriple=aarch64 -S -passes=instcombine < %s | FileCheck %s ; ARM64 AES intrinsic variants define <16 x i8> @combineXorAeseZeroARM64(<16 x i8> %data, <16 x i8> %key) { diff --git a/llvm/test/Transforms/InstCombine/AArch64/tbl.ll b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll new file mode 100644 index 0000000000000..f747f44a7ab9f --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll @@ -0,0 +1,269 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +; We can turn a tbl/tbx intrinsic into a shufflevector instruction if the mask +; is constant and references 2 or fewer operands. + +; Basic tbl1 with all in-bounds indices should optimize to shufflevector. +define <16 x i8> @tbl1_basic(<16 x i8> %a) { +; CHECK-LABEL: @tbl1_basic( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl2 with both operands the same should optimize (1 unique source). 
+define <16 x i8> @tbl2_duplicate_operands(<16 x i8> %a) { +; CHECK-LABEL: @tbl2_duplicate_operands( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl4 with alternating duplicate operands should optimize (2 unique sources). +define <16 x i8> @tbl4_duplicate_operands(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: @tbl4_duplicate_operands( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl4 where mask only references first two operands should optimize. +define <16 x i8> @tbl4_unused_operands(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { +; CHECK-LABEL: @tbl4_unused_operands( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl4 where mask only references one operand should optimize. +define <16 x i8> @tbl4_single_operand_used(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { +; CHECK-LABEL: @tbl4_single_operand_used( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl1 with some OOB indices should optimize (1 source + zero vector = 2 sources). 
+define <16 x i8> @tbl1_with_oob(<16 x i8> %a) { +; CHECK-LABEL: @tbl1_with_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> , <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl2 with duplicate operands and OOB should optimize (1 unique source + zero vector = 2 sources). +define <16 x i8> @tbl2_duplicate_with_oob(<16 x i8> %a) { +; CHECK-LABEL: @tbl2_duplicate_with_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> , <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl2 with OOB indices should NOT optimize (2 sources + zero vector = 3 sources). +define <16 x i8> @tbl2_with_oob_bail(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: @tbl2_with_oob_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl1 with all OOB indices should optimize to zero vector. +define <16 x i8> @tbl1_all_oob(<16 x i8> %a) { +; CHECK-LABEL: @tbl1_all_oob( +; CHECK-NEXT: ret <16 x i8> zeroinitializer +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl3 referencing all 3 operands should NOT optimize. 
+define <16 x i8> @tbl3_three_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: @tbl3_three_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl4 referencing 3 unique operands should NOT optimize. +define <16 x i8> @tbl4_three_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: @tbl4_three_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[A]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl4 referencing all 4 unique operands should NOT optimize. +define <16 x i8> @tbl4_four_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { +; CHECK-LABEL: @tbl4_four_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbx1 with no OOB should optimize. 
+define <16 x i8> @tbx1_no_oob(<16 x i8> %fallback, <16 x i8> %a) { +; CHECK-LABEL: @tbx1_no_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbx +} + +; tbx2 where fallback == second source operand should optimize (deduplicated). +define <16 x i8> @tbx2_fallback_equals_second_source(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: @tbx2_fallback_equals_second_source( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbx = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %b, <16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %tbx +} + +; tbx1 with OOB where fallback == source should optimize (deduplicated). +define <16 x i8> @tbx1_oob_fallback_same_as_source(<16 x i8> %a) { +; CHECK-LABEL: @tbx1_oob_fallback_same_as_source( +; CHECK-NEXT: [[A:%.*]] = shufflevector <16 x i8> [[A1:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[A]] +; + %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbx +} + +; tbx2 with OOB should NOT optimize (2 sources + fallback = 3 sources). +define <16 x i8> @tbx2_with_oob_bail(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: @tbx2_with_oob_bail( +; CHECK-NEXT: [[TBX:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TBX]] +; + %tbx = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %tbx +} + +; tbx1 with all OOB indices should optimize to fallback. 
+define <16 x i8> @tbx1_all_oob(<16 x i8> %fallback, <16 x i8> %a) { +; CHECK-LABEL: @tbx1_all_oob( +; CHECK-NEXT: ret <16 x i8> [[FALLBACK:%.*]] +; + %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbx +} + +; tbx1 with OOB and mismatched fallback/source sizes should NOT optimize. +define <8 x i8> @tbx1_fallback_size_mismatch(<8 x i8> %fallback, <16 x i8> %a) { +; CHECK-LABEL: @tbx1_fallback_size_mismatch( +; CHECK-NEXT: [[TBX:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBX]] +; + %tbx = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %fallback, <16 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbx +} + +; tbx1 with no OOB and mismatched fallback/source sizes should optimize. +define <8 x i8> @tbx1_fallback_size_mismatch_no_oob(<8 x i8> %fallback, <16 x i8> %a) { +; CHECK-LABEL: @tbx1_fallback_size_mismatch_no_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbx = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %fallback, <16 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbx +} + +; tbl1 with non-i8 element type should NOT optimize. +define <8 x i16> @tbl1_8x16(<16 x i8> %vec) { +; CHECK-LABEL: @tbl1_8x16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TBL1:%.*]] = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> [[VEC:%.*]], <8 x i16> ) +; CHECK-NEXT: ret <8 x i16> [[TBL1]] +; +entry: + %tbl1 = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> %vec, <8 x i16> ) + ret <8 x i16> %tbl1 +} +declare <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8>, <8 x i16>) + +; tbl1 with non-8/16 element count should NOT optimize. 
+define <12 x i8> @tbl1_16x8(<16 x i8> %vec) { +; CHECK-LABEL: @tbl1_16x8( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TBL1:%.*]] = call <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8> [[VEC:%.*]], <12 x i8> ) +; CHECK-NEXT: ret <12 x i8> [[TBL1]] +; +entry: + %tbl1 = call <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8> %vec, <12 x i8> ) + ret <12 x i8> %tbl1 +} +declare <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8>, <12 x i8>) + +; Non-constant mask should NOT optimize. +define <16 x i8> @tbl1_non_constant_mask(<16 x i8> %a, <16 x i8> %mask) { +; CHECK-LABEL: @tbl1_non_constant_mask( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[MASK:%.*]]) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %mask) + ret <16 x i8> %tbl +} + +; Mask with some poison elements should optimize, with poison propagating to output. +define <16 x i8> @tbl1_poison_mask_elements(<16 x i8> %a) { +; CHECK-LABEL: @tbl1_poison_mask_elements( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; Mask with all poison elements should optimize to poison. 
+define <16 x i8> @tbl1_all_poison_mask(<16 x i8> %a) { +; CHECK-LABEL: @tbl1_all_poison_mask( +; CHECK-NEXT: ret <16 x i8> poison +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> poison) + ret <16 x i8> %tbl +} + +; "Real" declarations +declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) 
nounwind readnone diff --git a/llvm/test/Transforms/InstCombine/AArch64/tbl1.ll b/llvm/test/Transforms/InstCombine/AArch64/tbl1.ll deleted file mode 100644 index 362cc0f6c4493..0000000000000 --- a/llvm/test/Transforms/InstCombine/AArch64/tbl1.ll +++ /dev/null @@ -1,65 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=instcombine -S | FileCheck %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64" - -; Turning a table lookup intrinsic into a shuffle vector instruction -; can be beneficial. If the mask used for the lookup is the constant -; vector {7,6,5,4,3,2,1,0}, then the back-end generates rev64 -; instructions instead. - -define <8 x i8> @tbl1_8x8(<16 x i8> %vec) { -; CHECK-LABEL: @tbl1_8x8( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[VEC:%.*]], <16 x i8> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i8> [[TMP0]] -; -entry: - %tbl1 = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %vec, <8 x i8> ) - ret <8 x i8> %tbl1 -} - -; Bail the optimization if a mask index is out of range. -define <8 x i8> @tbl1_8x8_out_of_range(<16 x i8> %vec) { -; CHECK-LABEL: @tbl1_8x8_out_of_range( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TBL1:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VEC:%.*]], <8 x i8> ) -; CHECK-NEXT: ret <8 x i8> [[TBL1]] -; -entry: - %tbl1 = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %vec, <8 x i8> ) - ret <8 x i8> %tbl1 -} - -; Bail the optimization if the size of the return vector is not 8 elements. 
-define <16 x i8> @tbl1_16x8(<16 x i8> %vec) { -; CHECK-LABEL: @tbl1_16x8( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TBL1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[VEC:%.*]], <16 x i8> ) -; CHECK-NEXT: ret <16 x i8> [[TBL1]] -; -entry: - %tbl1 = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %vec, <16 x i8> ) - ret <16 x i8> %tbl1 -} - -; Bail the optimization if the elements of the return vector are not of type i8. -define <8 x i16> @tbl1_8x16(<16 x i8> %vec) { -; CHECK-LABEL: @tbl1_8x16( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TBL1:%.*]] = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> [[VEC:%.*]], <8 x i16> ) -; CHECK-NEXT: ret <8 x i16> [[TBL1]] -; -entry: - %tbl1 = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> %vec, <8 x i16> ) - ret <8 x i16> %tbl1 -} - -; The type <8 x i16> is not a valid return type for this intrinsic, -; but we want to test that the optimization won't trigger for vector -; elements of type different than i8. -declare <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8>, <8 x i16>) - -declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) -declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) diff --git a/llvm/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll b/llvm/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll index 5fc5709ff8897..9ba4b418cb8e5 100644 --- a/llvm/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S -passes=instcombine < %s | FileCheck %s +; RUN: opt -mtriple=arm -S -passes=instcombine < %s | FileCheck %s define <4 x i32> @mulByZero(<4 x i16> %x) nounwind readnone ssp { ; CHECK-LABEL: define <4 x i32> @mulByZero( diff --git a/llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll 
b/llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll index 0056d872ff9e3..10175096035ec 100644 --- a/llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S -passes=instcombine < %s | FileCheck %s +; RUN: opt -mtriple=arm -S -passes=instcombine < %s | FileCheck %s ; ARM AES intrinsic variants define <16 x i8> @combineXorAeseZeroARM(<16 x i8> %data, <16 x i8> %key) { diff --git a/llvm/test/Transforms/InstCombine/ARM/tbl.ll b/llvm/test/Transforms/InstCombine/ARM/tbl.ll new file mode 100644 index 0000000000000..05e6573eb76c1 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/ARM/tbl.ll @@ -0,0 +1,215 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=arm -passes=instcombine -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv8-arm-none-eabi" + +; We can turn a vtbl/vtbx intrinsic into a shufflevector instruction if the mask +; is constant and references 2 or fewer operands. + +; Basic vtbl1 with all in-bounds indices should optimize to shufflevector. +define <8 x i8> @vtbl1_basic(<8 x i8> %a) { +; CHECK-LABEL: @vtbl1_basic( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl2 with both operands the same should be optimized (1 unique source). 
+define <8 x i8> @vtbl2_duplicate_operands(<8 x i8> %a) { +; CHECK-LABEL: @vtbl2_duplicate_operands( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl4 with alternating duplicate operands should optimize (2 unique sources). +define <8 x i8> @vtbl4_duplicate_operands(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @vtbl4_duplicate_operands( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %a, <8 x i8> %b, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl4 where mask only references first two operands should optimize. +define <8 x i8> @vtbl4_unused_operands(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { +; CHECK-LABEL: @vtbl4_unused_operands( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl4 where mask only references one operand should optimize. +define <8 x i8> @vtbl4_single_operand_used(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { +; CHECK-LABEL: @vtbl4_single_operand_used( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl1 with some OOB indices should optimize (1 source + zero vector = 2 sources). 
+define <8 x i8> @vtbl1_with_oob(<8 x i8> %a) { +; CHECK-LABEL: @vtbl1_with_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> , <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl2 with duplicate operands and OOB should optimize (1 unique source + zero vector = 2 sources). +define <8 x i8> @vtbl2_duplicate_with_oob(<8 x i8> %a) { +; CHECK-LABEL: @vtbl2_duplicate_with_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> , <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl2 with OOB indices should NOT optimize (2 sources + zero vector = 3 sources). +define <8 x i8> @vtbl2_with_oob_bail(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @vtbl2_with_oob_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %b, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl1 with all OOB indices should optimize to zero vector. +define <8 x i8> @vtbl1_all_oob(<8 x i8> %a) { +; CHECK-LABEL: @vtbl1_all_oob( +; CHECK-NEXT: ret <8 x i8> zeroinitializer +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl3 referencing all 3 operands should NOT optimize. +define <8 x i8> @vtbl3_three_sources_bail(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { +; CHECK-LABEL: @vtbl3_three_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl4 referencing 3 unique operands should NOT optimize. 
+define <8 x i8> @vtbl4_three_sources_bail(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { +; CHECK-LABEL: @vtbl4_three_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[A]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl4 referencing all 4 unique operands should NOT optimize. +define <8 x i8> @vtbl4_four_sources_bail(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { +; CHECK-LABEL: @vtbl4_four_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbx1 with no OOB should optimize. +define <8 x i8> @vtbx1_no_oob(<8 x i8> %fallback, <8 x i8> %a) { +; CHECK-LABEL: @vtbx1_no_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbx +} + +; vtbx2 where fallback == second source operand should optimize (deduplicated). +define <8 x i8> @vtbx2_fallback_equals_second_source(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @vtbx2_fallback_equals_second_source( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbx = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %b, <8 x i8> %a, <8 x i8> %b, <8 x i8> ) + ret <8 x i8> %tbx +} + +; vtbx1 with OOB where fallback == source should optimize (deduplicated). 
+define <8 x i8> @vtbx1_oob_fallback_same_as_source(<8 x i8> %a) { +; CHECK-LABEL: @vtbx1_oob_fallback_same_as_source( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbx +} + +; vtbx2 with OOB should NOT optimize (2 sources + fallback = 3 sources). +define <8 x i8> @vtbx2_with_oob_bail(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @vtbx2_with_oob_bail( +; CHECK-NEXT: [[TBX:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[FALLBACK:%.*]], <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBX]] +; + %tbx = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> %b, <8 x i8> ) + ret <8 x i8> %tbx +} + +; vtbx1 with all OOB indices should optimize to fallback. +define <8 x i8> @vtbx1_all_oob(<8 x i8> %fallback, <8 x i8> %a) { +; CHECK-LABEL: @vtbx1_all_oob( +; CHECK-NEXT: ret <8 x i8> [[FALLBACK:%.*]] +; + %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbx +} + +; Non-constant mask should NOT optimize. +define <8 x i8> @vtbl1_non_constant_mask(<8 x i8> %a, <8 x i8> %mask) { +; CHECK-LABEL: @vtbl1_non_constant_mask( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> [[MASK:%.*]]) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %mask) + ret <8 x i8> %tbl +} + +; Mask with some poison elements should optimize, with poison propagating to output. 
+define <8 x i8> @vtbl1_poison_mask_elements(<8 x i8> %a) { +; CHECK-LABEL: @vtbl1_poison_mask_elements( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; Mask with all poison elements should optimize to poison. +define <8 x i8> @vtbl1_all_poison_mask(<8 x i8> %a) { +; CHECK-LABEL: @vtbl1_all_poison_mask( +; CHECK-NEXT: ret <8 x i8> poison +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> poison) + ret <8 x i8> %tbl +} + +; Declarations +declare <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8>, <8 x i8>) nounwind readnone +declare <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone +declare <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone +declare <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone +declare <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone +declare <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone +declare <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone +declare <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone diff --git a/llvm/test/Transforms/InstCombine/ARM/tbl1.ll b/llvm/test/Transforms/InstCombine/ARM/tbl1.ll deleted file mode 100644 index fbec1a2bb7a07..0000000000000 --- a/llvm/test/Transforms/InstCombine/ARM/tbl1.ll +++ /dev/null @@ -1,35 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=instcombine -S | FileCheck %s - -target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" -target triple = "armv8-arm-none-eabi" - -; Turning a table lookup intrinsic into a shuffle vector instruction -; can be beneficial. 
If the mask used for the lookup is the constant -; vector {7,6,5,4,3,2,1,0}, then the back-end generates rev64 -; instructions instead. - -define <8 x i8> @tbl1_8x8(<8 x i8> %vec) { -; CHECK-LABEL: @tbl1_8x8( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC:%.*]], <8 x i8> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i8> [[TMP0]] -; -entry: - %vtbl1 = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %vec, <8 x i8> ) - ret <8 x i8> %vtbl1 -} - -; Bail the optimization if a mask index is out of range. -define <8 x i8> @tbl1_8x8_out_of_range(<8 x i8> %vec) { -; CHECK-LABEL: @tbl1_8x8_out_of_range( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[VTBL1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[VEC:%.*]], <8 x i8> ) -; CHECK-NEXT: ret <8 x i8> [[VTBL1]] -; -entry: - %vtbl1 = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %vec, <8 x i8> ) - ret <8 x i8> %vtbl1 -} - -declare <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8>, <8 x i8>)