From aab5e41bd8374daed3563503aed35acfb95fb5af Mon Sep 17 00:00:00 2001 From: Yi Qian Date: Tue, 30 Sep 2025 20:34:15 +0000 Subject: [PATCH 1/3] [AMDGPU] Add a target option to disable aggressive FMA fusion --- llvm/lib/Target/AMDGPU/AMDGPU.td | 7 +++++++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 5 +++++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 1 + llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 4 ++++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 ++++++-- 5 files changed, 23 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index eaa1870f4be28..5a08e7d6db347 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1238,6 +1238,13 @@ def FeatureSetPrioIncWgInst : SubtargetFeature<"setprio-inc-wg-inst", // Subtarget Features (options and debugging) //===------------------------------------------------------------===// +def FeatureDisableAggressiveFMAFusion : SubtargetFeature< + "disable-aggressive-fma-fusion", + "DisableAggressiveFMAFusion", + "true", + "Do not fold fmul and fadd/fsub into fma." +>; + // Ugly hack to accomodate assembling modules with mixed // wavesizes. Ideally we would have a mapping symbol in assembly which // would keep track of which sections of code should be treated as diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index ed03ef21b6dda..0c380a7e4dc84 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -76,6 +76,7 @@ class AMDGPUSubtarget { bool EnablePromoteAlloca = false; bool HasTrigReducedRange = false; bool FastFMAF32 = false; + bool DisableAggressiveFMAFusion = false; unsigned EUsPerCU = 4; unsigned MaxWavesPerEU = 10; unsigned LocalMemorySize = 0; @@ -303,6 +304,10 @@ class AMDGPUSubtarget { return FastFMAF32; } + bool hasDisableAggressiveFMAFusion() const { + return DisableAggressiveFMAFusion; + } + bool isPromoteAllocaEnabled() const { return EnablePromoteAlloca; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 03d16fdd54c42..554549063dbcc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -288,6 +288,7 @@ const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = { AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal, AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess, AMDGPU::FeatureUnalignedAccessMode, + AMDGPU::FeatureDisableAggressiveFMAFusion, AMDGPU::FeatureAutoWaitcntBeforeBarrier, diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 7b94ea3ffbf1f..b7473e5ea4759 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -85,6 +85,10 @@ GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, FullFS += "-wavefrontsize64,"; } + // GFX9 enables fast-fmaf by default + if (GPU.contains_insensitive("gfx9") && !FS.contains_insensitive("fast-fmaf")) + FullFS += "+fast-fmaf"; + FullFS += FS; ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 16530087444d2..59fcf9fb6da39 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6502,10 +6502,14 @@ bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { // When fma is quarter rate, for f64 where add / sub are at best half rate, // most of these combines appear to be cycle neutral but save on instruction // count / code size. - return true; + return Subtarget->hasFastFMAF32() && + !Subtarget->hasDisableAggressiveFMAFusion(); } -bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; } +bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { + return Subtarget->hasFastFMAF32() && + !Subtarget->hasDisableAggressiveFMAFusion(); +} EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const { From 9a47421e5871e979c047e6777be5868445f13375 Mon Sep 17 00:00:00 2001 From: Yi Qian Date: Thu, 2 Oct 2025 05:00:47 +0000 Subject: [PATCH 2/3] Remove an old change --- llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index b7473e5ea4759..7b94ea3ffbf1f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -85,10 +85,6 @@ GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, FullFS += "-wavefrontsize64,"; } - // GFX9 enables fast-fmaf by default - if (GPU.contains_insensitive("gfx9") && !FS.contains_insensitive("fast-fmaf")) - FullFS += "+fast-fmaf"; - FullFS += FS; ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS); From ada0bb92c0bacc0a3bde5680f82d17a4a7d9dc36 Mon Sep 17 00:00:00 2001 From: Yi Qian Date: Thu, 2 Oct 2025 05:52:35 +0000 Subject: [PATCH 3/3] Remove an unneeded check --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 59fcf9fb6da39..f0ac08c3b69f9 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6502,13 +6502,11 @@ bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { // When fma is quarter rate, for f64 where add / sub are at best half rate, // most of these combines appear to be cycle neutral but save on instruction // count / code size. - return Subtarget->hasFastFMAF32() && - !Subtarget->hasDisableAggressiveFMAFusion(); + return !Subtarget->hasDisableAggressiveFMAFusion(); } bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { - return Subtarget->hasFastFMAF32() && - !Subtarget->hasDisableAggressiveFMAFusion(); + return !Subtarget->hasDisableAggressiveFMAFusion(); } EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,