diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 16996300e7bb93..6bdc620fc18f66 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1474,13 +1474,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { break; case Intrinsic::cttz: // FIXME: If necessary, this should go in target-specific overrides. - if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz()) + if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy)) return TargetTransformInfo::TCC_Basic; break; case Intrinsic::ctlz: // FIXME: If necessary, this should go in target-specific overrides. - if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz()) + if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy)) return TargetTransformInfo::TCC_Basic; break; diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 94fafcc11aaf2b..bf4e85881e778e 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -621,12 +621,12 @@ class TargetLoweringBase { } /// Return true if it is cheap to speculate a call to intrinsic cttz. - virtual bool isCheapToSpeculateCttz() const { + virtual bool isCheapToSpeculateCttz(Type *Ty) const { return false; } /// Return true if it is cheap to speculate a call to intrinsic ctlz. - virtual bool isCheapToSpeculateCtlz() const { + virtual bool isCheapToSpeculateCtlz(Type *Ty) const { return false; } diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 88c01c32f4a719..4bd3ba3cc98b9e 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -2045,13 +2045,13 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros, return false; // If it's cheap to speculate, there's nothing to do. 
+ Type *Ty = CountZeros->getType(); auto IntrinsicID = CountZeros->getIntrinsicID(); - if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()) || - (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz())) + if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz(Ty)) || + (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz(Ty))) return false; // Only handle legal scalar cases. Anything else requires too much work. - Type *Ty = CountZeros->getType(); unsigned SizeInBits = Ty->getScalarSizeInBits(); if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits()) return false; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 004f4a520736dd..673c151619faa0 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -742,11 +742,11 @@ class AArch64TargetLowering : public TargetLowering { return true; } - bool isCheapToSpeculateCttz() const override { + bool isCheapToSpeculateCttz(Type *) const override { return true; } - bool isCheapToSpeculateCtlz() const override { + bool isCheapToSpeculateCtlz(Type *) const override { return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 3f16367add784e..4b0dd9f1e64850 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -692,11 +692,11 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy, // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also // profitable with the expansion for 64-bit since it's generally good to // speculate things. 
-bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const { +bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const { return true; } -bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const { +bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const { return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 73081483f1c3d0..11ee9f9ff0dd56 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -193,8 +193,8 @@ class AMDGPUTargetLowering : public TargetLowering { unsigned NumElem, unsigned AS) const override; bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override; - bool isCheapToSpeculateCttz() const override; - bool isCheapToSpeculateCtlz() const override; + bool isCheapToSpeculateCttz(Type *Ty) const override; + bool isCheapToSpeculateCtlz(Type *Ty) const override; bool isSDNodeAlwaysUniform(const SDNode *N) const override; static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 2306193f04297e..9da2cf2a9f94b1 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21157,11 +21157,11 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, return false; } -bool ARMTargetLowering::isCheapToSpeculateCttz() const { +bool ARMTargetLowering::isCheapToSpeculateCttz(Type *Ty) const { return Subtarget->hasV6T2Ops(); } -bool ARMTargetLowering::isCheapToSpeculateCtlz() const { +bool ARMTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const { return Subtarget->hasV6T2Ops(); } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 8947c4add327f5..9ff920f230e22c 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -679,8 +679,8 @@ class VectorType; 
return (MemVT.getSizeInBits() <= 32); } - bool isCheapToSpeculateCttz() const override; - bool isCheapToSpeculateCtlz() const override; + bool isCheapToSpeculateCttz(Type *Ty) const override; + bool isCheapToSpeculateCtlz(Type *Ty) const override; bool convertSetCCLogicToBitwiseLogic(EVT VT) const override { return VT.isScalarInteger(); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h index 1dc6a4cb9c894f..8ef7606cda1959 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -129,8 +129,8 @@ class HexagonTargetLowering : public TargetLowering { bool isTruncateFree(Type *Ty1, Type *Ty2) const override; bool isTruncateFree(EVT VT1, EVT VT2) const override; - bool isCheapToSpeculateCttz() const override { return true; } - bool isCheapToSpeculateCtlz() const override { return true; } + bool isCheapToSpeculateCttz(Type *) const override { return true; } + bool isCheapToSpeculateCtlz(Type *) const override { return true; } bool isCtlzFast() const override { return true; } bool hasBitTest(SDValue X, SDValue Y) const override; diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index 05f411c9e4cd39..3aee67653c32e6 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -1172,11 +1172,11 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) return SDValue(); } -bool MipsTargetLowering::isCheapToSpeculateCttz() const { +bool MipsTargetLowering::isCheapToSpeculateCttz(Type *Ty) const { return Subtarget.hasMips32(); } -bool MipsTargetLowering::isCheapToSpeculateCtlz() const { +bool MipsTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const { return Subtarget.hasMips32(); } diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h index 1f921fbe94916f..723be3b31dce2c 100644 --- 
a/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/llvm/lib/Target/Mips/MipsISelLowering.h @@ -283,8 +283,8 @@ class TargetRegisterClass; EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType) const override; - bool isCheapToSpeculateCttz() const override; - bool isCheapToSpeculateCtlz() const override; + bool isCheapToSpeculateCttz(Type *Ty) const override; + bool isCheapToSpeculateCtlz(Type *Ty) const override; bool hasBitTest(SDValue X, SDValue Y) const override; bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index fb09f99a019d0e..ae66816548f9f4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -559,7 +559,7 @@ class NVPTXTargetLowering : public TargetLowering { // x == 0 is not undefined behavior) into a branch that checks whether x is 0 // and avoids calling ctlz in that case. We have a dedicated ctlz // instruction, so we say that ctlz is cheap to speculate. 
- bool isCheapToSpeculateCtlz() const override { return true; } + bool isCheapToSpeculateCtlz(Type *Ty) const override { return true; } AtomicExpansionKind shouldCastAtomicLoadInIR(LoadInst *LI) const override { return AtomicExpansionKind::None; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 4a08cc42fa9d79..efceab2180e7e5 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -790,11 +790,11 @@ namespace llvm { return MVT::i32; } - bool isCheapToSpeculateCttz() const override { + bool isCheapToSpeculateCttz(Type *Ty) const override { return true; } - bool isCheapToSpeculateCtlz() const override { + bool isCheapToSpeculateCtlz(Type *Ty) const override { return true; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 48b9d283b9638a..0a405f1b5e0de8 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1155,11 +1155,11 @@ bool RISCVTargetLowering::signExtendConstant(const ConstantInt *CI) const { return Subtarget.is64Bit() && CI->getType()->isIntegerTy(32); } -bool RISCVTargetLowering::isCheapToSpeculateCttz() const { +bool RISCVTargetLowering::isCheapToSpeculateCttz(Type *Ty) const { return Subtarget.hasStdExtZbb(); } -bool RISCVTargetLowering::isCheapToSpeculateCtlz() const { +bool RISCVTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const { return Subtarget.hasStdExtZbb(); } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 5a1acde4a5b260..dcaa7f24b4c8d6 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -370,8 +370,8 @@ class RISCVTargetLowering : public TargetLowering { bool isZExtFree(SDValue Val, EVT VT2) const override; bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override; bool signExtendConstant(const ConstantInt *CI) 
const override; - bool isCheapToSpeculateCttz() const override; - bool isCheapToSpeculateCtlz() const override; + bool isCheapToSpeculateCttz(Type *Ty) const override; + bool isCheapToSpeculateCtlz(Type *Ty) const override; bool hasAndNotCompare(SDValue Y) const override; bool hasBitTest(SDValue X, SDValue Y) const override; bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index b9c95274f62b86..66f0fa20f3e320 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -423,7 +423,7 @@ class SystemZTargetLowering : public TargetLowering { return 1; return TargetLowering::getNumRegisters(Context, VT); } - bool isCheapToSpeculateCtlz() const override { return true; } + bool isCheapToSpeculateCtlz(Type *) const override { return true; } bool preferZeroCompareBranch() const override { return true; } bool hasBitPreservingFPLogic(EVT VT) const override { EVT ScVT = VT.getScalarType(); diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h index 087b0e215407c1..b9a29e4362d642 100644 --- a/llvm/lib/Target/VE/VEISelLowering.h +++ b/llvm/lib/Target/VE/VEISelLowering.h @@ -236,7 +236,7 @@ class VETargetLowering : public TargetLowering { // VE doesn't have rem. bool hasStandaloneRem(EVT) const override { return false; } // VE LDZ instruction returns 64 if the input is zero. - bool isCheapToSpeculateCtlz() const override { return true; } + bool isCheapToSpeculateCtlz(Type *) const override { return true; } // VE LDZ instruction is fast. bool isCtlzFast() const override { return true; } // VE has NND instruction. 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 84823218216b65..2c60b0c223a4fe 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -751,12 +751,12 @@ WebAssemblyTargetLowering::getRegForInlineAsmConstraint( return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } -bool WebAssemblyTargetLowering::isCheapToSpeculateCttz() const { +bool WebAssemblyTargetLowering::isCheapToSpeculateCttz(Type *Ty) const { // Assume ctz is a relatively cheap operation. return true; } -bool WebAssemblyTargetLowering::isCheapToSpeculateCtlz() const { +bool WebAssemblyTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const { // Assume clz is a relatively cheap operation. return true; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index d86f2e59e3d2c4..15b251c613d7be 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -65,8 +65,8 @@ class WebAssemblyTargetLowering final : public TargetLowering { std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; - bool isCheapToSpeculateCttz() const override; - bool isCheapToSpeculateCtlz() const override; + bool isCheapToSpeculateCttz(Type *Ty) const override; + bool isCheapToSpeculateCtlz(Type *Ty) const override; bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I = nullptr) const override; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 431c2c8ca48fd6..69f25d19f49bc1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -5820,12 +5820,13 @@ bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, 
EVT VT, return VT.isSimple() || !isOperationExpand(Opcode, VT); } -bool X86TargetLowering::isCheapToSpeculateCttz() const { - // Speculate cttz only if we can directly use TZCNT. - return Subtarget.hasBMI(); +bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const { + // Speculate cttz only if we can directly use TZCNT or can promote to i32. + return Subtarget.hasBMI() || + (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32); } -bool X86TargetLowering::isCheapToSpeculateCtlz() const { +bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const { // Speculate ctlz only if we can directly use LZCNT. return Subtarget.hasLZCNT(); } @@ -28877,6 +28878,10 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, SDVTList VTs = DAG.getVTList(VT, MVT::i32); Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0); + // If src is known never zero we can skip the CMOV. + if (DAG.isKnownNeverZero(N0)) + return Op; + // If src is zero (i.e. bsf sets ZF), returns NumBits. SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT), DAG.getTargetConstant(X86::COND_E, dl, MVT::i8), diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index b5cfcda519de49..184f53a7210355 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1033,9 +1033,9 @@ namespace llvm { bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override; - bool isCheapToSpeculateCttz() const override; + bool isCheapToSpeculateCttz(Type *Ty) const override; - bool isCheapToSpeculateCtlz() const override; + bool isCheapToSpeculateCtlz(Type *Ty) const override; bool isCtlzFast() const override; diff --git a/llvm/test/Analysis/CostModel/X86/cttz.ll b/llvm/test/Analysis/CostModel/X86/cttz.ll index 0d6bff16aa536c..e456ecd4caee4d 100644 --- a/llvm/test/Analysis/CostModel/X86/cttz.ll +++ b/llvm/test/Analysis/CostModel/X86/cttz.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_analyze_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=-bmi,+sse2 | FileCheck %s -check-prefixes=SSE2,NOBMI -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+sse2 | FileCheck %s -check-prefixes=SSE2,BMI -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+sse4.2 | FileCheck %s -check-prefixes=BMI,SSE42 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx | FileCheck %s -check-prefixes=BMI,AVX1 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx2 | FileCheck %s -check-prefixes=BMI,AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512f | FileCheck %s -check-prefixes=BMI,AVX512,AVX512F -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512bw,+avx512dq | FileCheck %s -check-prefixes=BMI,AVX512,AVX512BW -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512bw,+avx512dq | FileCheck %s -check-prefixes=BMI,AVX512,AVX512BW -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512vpopcntdq | FileCheck %s -check-prefixes=BMI,AVX512,AVX512VPOPCNT -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512bitalg | FileCheck %s -check-prefixes=BMI,AVX512,AVX512BITALG +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=-bmi,+sse2 | FileCheck %s -check-prefixes=CHECK,SSE2,NOBMI +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+sse2 | FileCheck %s -check-prefixes=CHECK,SSE2,BMI +; RUN: opt < %s 
-mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+sse4.2 | FileCheck %s -check-prefixes=CHECK,BMI,SSE42 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx | FileCheck %s -check-prefixes=CHECK,BMI,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx2 | FileCheck %s -check-prefixes=CHECK,BMI,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512f | FileCheck %s -check-prefixes=CHECK,BMI,AVX512,AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512bw,+avx512dq | FileCheck %s -check-prefixes=CHECK,BMI,AVX512,AVX512BW +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512bw,+avx512dq | FileCheck %s -check-prefixes=CHECK,BMI,AVX512,AVX512BW +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512vpopcntdq | FileCheck %s -check-prefixes=CHECK,BMI,AVX512,AVX512VPOPCNT +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+bmi,+avx512vl,+avx512bitalg | FileCheck %s -check-prefixes=CHECK,BMI,AVX512,AVX512BITALG ; Verify the cost of scalar trailing zero count instructions. 
@@ -70,52 +70,36 @@ define i32 @var_cttz_i32u(i32 %a) { } define i16 @var_cttz_i16(i16 %a) { -; NOBMI-LABEL: 'var_cttz_i16' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false) -; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz -; -; BMI-LABEL: 'var_cttz_i16' -; BMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false) -; BMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz +; CHECK-LABEL: 'var_cttz_i16' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz ; %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 0) ret i16 %cttz } define i16 @var_cttz_i16u(i16 %a) { -; NOBMI-LABEL: 'var_cttz_i16u' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 true) -; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz -; -; BMI-LABEL: 'var_cttz_i16u' -; BMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 true) -; BMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz +; CHECK-LABEL: 'var_cttz_i16u' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz ; %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 1) ret i16 %cttz } define i8 @var_cttz_i8(i8 %a) { -; NOBMI-LABEL: 'var_cttz_i8' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false) -; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz -; -; BMI-LABEL: 'var_cttz_i8' -; 
BMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false) -; BMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz +; CHECK-LABEL: 'var_cttz_i8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz ; %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 0) ret i8 %cttz } define i8 @var_cttz_i8u(i8 %a) { -; NOBMI-LABEL: 'var_cttz_i8u' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 true) -; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz -; -; BMI-LABEL: 'var_cttz_i8u' -; BMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 true) -; BMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz +; CHECK-LABEL: 'var_cttz_i8u' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz ; %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 1) ret i8 %cttz diff --git a/llvm/test/CodeGen/X86/clz.ll b/llvm/test/CodeGen/X86/clz.ll index aeb14820f00ed2..ce9e3118f1ebdf 100644 --- a/llvm/test/CodeGen/X86/clz.ll +++ b/llvm/test/CodeGen/X86/clz.ll @@ -510,34 +510,20 @@ define i64 @ctlz_i64_zero_test(i64 %n) { ret i64 %tmp1 } -; Generate a test and branch to handle zero inputs because bsr/bsf are very slow. +; Promote i8 cttz to i32 and set bit 8 to prevent the (slow) zero-src bsf case. 
define i8 @cttz_i8_zero_test(i8 %n) { ; X86-LABEL: cttz_i8_zero_test: ; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testb %al, %al -; X86-NEXT: je .LBB12_1 -; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl $256, %eax # imm = 0x100 +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: rep bsfl %eax, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl -; X86-NEXT: .LBB12_1: -; X86-NEXT: movb $8, %al -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl ; ; X64-LABEL: cttz_i8_zero_test: ; X64: # %bb.0: -; X64-NEXT: testb %dil, %dil -; X64-NEXT: je .LBB12_1 -; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: rep bsfl %eax, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: retq -; X64-NEXT: .LBB12_1: -; X64-NEXT: movb $8, %al +; X64-NEXT: orl $256, %edi # imm = 0x100 +; X64-NEXT: rep bsfl %edi, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; @@ -559,34 +545,22 @@ define i8 @cttz_i8_zero_test(i8 %n) { ret i8 %tmp1 } -; Generate a test and branch to handle zero inputs because bsr/bsf are very slow. +; Promote i16 cttz to i32 and set bit 16 to prevent the (slow) zero-src bsf case. 
define i16 @cttz_i16_zero_test(i16 %n) { ; X86-LABEL: cttz_i16_zero_test: ; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testw %ax, %ax -; X86-NEXT: je .LBB13_1 -; X86-NEXT: # %bb.2: # %cond.false +; X86-NEXT: movl $65536, %eax # imm = 0x10000 +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: rep bsfl %eax, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl -; X86-NEXT: .LBB13_1: -; X86-NEXT: movw $16, %ax -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: retl ; ; X64-LABEL: cttz_i16_zero_test: ; X64: # %bb.0: -; X64-NEXT: testw %di, %di -; X64-NEXT: je .LBB13_1 -; X64-NEXT: # %bb.2: # %cond.false +; X64-NEXT: orl $65536, %edi # imm = 0x10000 ; X64-NEXT: rep bsfl %edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq -; X64-NEXT: .LBB13_1: -; X64-NEXT: movw $16, %ax -; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: retq ; ; X86-CLZ-LABEL: cttz_i16_zero_test: ; X86-CLZ: # %bb.0: