diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index 8bd38a317b797..4e95791aaa14e 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -420,6 +420,17 @@ bool CannotBeOrderedLessThanZero(const Value *V, const TargetLibraryInfo *TLI);
 bool isKnownNeverInfinity(const Value *V, const TargetLibraryInfo *TLI,
                           unsigned Depth = 0);
 
+/// Return true if the floating-point value can never contain a NaN or infinity.
+inline bool isKnownNeverInfOrNaN(
+    const Value *V, const DataLayout &DL, const TargetLibraryInfo *TLI,
+    unsigned Depth = 0, AssumptionCache *AC = nullptr,
+    const Instruction *CtxI = nullptr, const DominatorTree *DT = nullptr,
+    OptimizationRemarkEmitter *ORE = nullptr, bool UseInstrInfo = true) {
+  KnownFPClass Known = computeKnownFPClass(V, DL, fcInf | fcNan, Depth, TLI, AC,
+                                           CtxI, DT, ORE, UseInstrInfo);
+  return Known.isKnownNeverNaN() && Known.isKnownNeverInfinity();
+}
+
 /// Return true if the floating-point scalar value is not a NaN or if the
 /// floating-point vector value has no NaN elements. Return false if a value
 /// could ever be NaN.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 44ad131bd7eff..48a6fde657094 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -328,7 +328,8 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
       });
 }
 
-bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
+bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
+                                           const Value *Op0, const Value *Op1,
                                            InstCombiner &IC) const {
   // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
   // infinity, gives +0.0. If we can prove we don't have one of the special
@@ -340,9 +341,14 @@ bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
     // One operand is not zero or infinity or NaN.
     return true;
   }
+
   auto *TLI = &IC.getTargetLibraryInfo();
-  if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
-      isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
+  if (isKnownNeverInfOrNaN(Op0, IC.getDataLayout(), TLI, 0,
+                           &IC.getAssumptionCache(), &I, &IC.getDominatorTree(),
+                           &IC.getOptimizationRemarkEmitter()) &&
+      isKnownNeverInfOrNaN(Op1, IC.getDataLayout(), TLI, 0,
+                           &IC.getAssumptionCache(), &I, &IC.getDominatorTree(),
+                           &IC.getOptimizationRemarkEmitter())) {
     // Neither operand is infinity or NaN.
     return true;
   }
@@ -1005,7 +1011,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
 
     // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
-    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
+    if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
@@ -1032,7 +1038,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
 
     // If we can prove we don't have one of the special cases then we can use a
     // normal fma instead.
-    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
+    if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
       II.setCalledOperand(Intrinsic::getDeclaration(
           II.getModule(), Intrinsic::fma, II.getType()));
       return &II;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 972ea8cf52ea0..cdd76861335fb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -209,8 +209,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                           Value *NewV) const;
 
-  bool canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
-                                 InstCombiner &IC) const;
+  bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
+                                 const Value *Op1, InstCombiner &IC) const;
   std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                     IntrinsicInst &II) const;
   std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/fma_legacy.ll b/llvm/test/Transforms/InstCombine/AMDGPU/fma_legacy.ll
index 89979bcedf814..72bffe62fbb14 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/fma_legacy.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/fma_legacy.ll
@@ -83,4 +83,28 @@ define float @test_finite(i32 %x, i32 %y, float %z) {
   ret float %call
 }
 
+; Combine to fma because neither argument can be infinity or NaN based on assumptions
+define float @test_finite_assumed(float %x, float %y, float %z) {
+; CHECK-LABEL: @test_finite_assumed(
+; CHECK-NEXT:    [[FABS_X:%.*]] = call float @llvm.fabs.f32(float [[X:%.*]])
+; CHECK-NEXT:    [[IS_FINITE_X:%.*]] = fcmp one float [[FABS_X]], 0x7FF0000000000000
+; CHECK-NEXT:    [[FABS_Y:%.*]] = call float @llvm.fabs.f32(float [[Y:%.*]])
+; CHECK-NEXT:    [[IS_FINITE_Y:%.*]] = fcmp one float [[FABS_Y]], 0x7FF0000000000000
+; CHECK-NEXT:    call void @llvm.assume(i1 [[IS_FINITE_X]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[IS_FINITE_Y]])
+; CHECK-NEXT:    [[CALL:%.*]] = call float @llvm.fma.f32(float [[X]], float [[Y]], float [[Z:%.*]])
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %fabs.x = call float @llvm.fabs.f32(float %x)
+  %is.finite.x = fcmp one float %fabs.x, 0x7FF0000000000000
+  %fabs.y = call float @llvm.fabs.f32(float %y)
+  %is.finite.y = fcmp one float %fabs.y, 0x7FF0000000000000
+  call void @llvm.assume(i1 %is.finite.x)
+  call void @llvm.assume(i1 %is.finite.y)
+  %call = call float @llvm.amdgcn.fma.legacy(float %x, float %y, float %z)
+  ret float %call
+}
+
 declare float @llvm.amdgcn.fma.legacy(float, float, float)
+declare float @llvm.fabs.f32(float)
+declare void @llvm.assume(i1 noundef)
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/fmul_legacy.ll b/llvm/test/Transforms/InstCombine/AMDGPU/fmul_legacy.ll
index 439bedbe60dd8..d58470d6b12ad 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/fmul_legacy.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/fmul_legacy.ll
@@ -53,4 +53,28 @@ define float @test_finite(i32 %x, i32 %y) {
   ret float %call
 }
 
+; Combine to fmul because neither argument can be infinity or NaN based on assumptions
+define float @test_finite_assumed(float %x, float %y) {
+; CHECK-LABEL: @test_finite_assumed(
+; CHECK-NEXT:    [[FABS_X:%.*]] = call float @llvm.fabs.f32(float [[X:%.*]])
+; CHECK-NEXT:    [[IS_FINITE_X:%.*]] = fcmp one float [[FABS_X]], 0x7FF0000000000000
+; CHECK-NEXT:    [[FABS_Y:%.*]] = call float @llvm.fabs.f32(float [[Y:%.*]])
+; CHECK-NEXT:    [[IS_FINITE_Y:%.*]] = fcmp one float [[FABS_Y]], 0x7FF0000000000000
+; CHECK-NEXT:    call void @llvm.assume(i1 [[IS_FINITE_X]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[IS_FINITE_Y]])
+; CHECK-NEXT:    [[CALL:%.*]] = fmul float [[X]], [[Y]]
+; CHECK-NEXT:    ret float [[CALL]]
+;
+  %fabs.x = call float @llvm.fabs.f32(float %x)
+  %is.finite.x = fcmp one float %fabs.x, 0x7FF0000000000000
+  %fabs.y = call float @llvm.fabs.f32(float %y)
+  %is.finite.y = fcmp one float %fabs.y, 0x7FF0000000000000
+  call void @llvm.assume(i1 %is.finite.x)
+  call void @llvm.assume(i1 %is.finite.y)
+  %call = call float @llvm.amdgcn.fmul.legacy(float %x, float %y)
+  ret float %call
+}
+
 declare float @llvm.amdgcn.fmul.legacy(float, float)
+declare float @llvm.fabs.f32(float)
+declare void @llvm.assume(i1 noundef)
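
For reference, a minimal usage sketch (not part of the patch) of how code other than the AMDGPU InstCombine could query the new isKnownNeverInfOrNaN helper against ValueTracking.h as modified above, relying on its defaulted Depth/AssumptionCache/context/DominatorTree/ORE parameters. The wrapper name isOperandFiniteNonNaN is hypothetical and only illustrates the call shape.

// Sketch assuming the isKnownNeverInfOrNaN signature introduced in this patch.
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"

using namespace llvm;

// Returns true when Op is known to be neither NaN nor +/-infinity, using only
// the required arguments; Depth, AC, CtxI, DT and ORE fall back to their
// defaults, so no context instruction or analyses need to be threaded through.
static bool isOperandFiniteNonNaN(const Value *Op, const DataLayout &DL,
                                  const TargetLibraryInfo *TLI) {
  return isKnownNeverInfOrNaN(Op, DL, TLI);
}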