-
Notifications
You must be signed in to change notification settings - Fork 14.8k
[WebAssembly] Lower fmuladd to madd and nmadd #161355
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-webassembly Author: Sam Parker (sparker-arm) Changes: Lower v8f16, v4f32 and v2f64 fmuladd calls to fma, when we have relaxed simd. Full diff: https://github.com/llvm/llvm-project/pull/161355.diff 4 Files Affected:
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 163bf9ba5b089..b88bdd662e123 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -158,6 +158,15 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setTruncStoreAction(T, MVT::f16, Expand);
}
+ // Overwrite settings for FMA, when we have relaxed simd.
+ if (Subtarget->hasRelaxedSIMD()) {
+ setOperationAction(ISD::FMA, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMA, MVT::v2f64, Legal);
+ if (Subtarget->hasFP16()) {
+ setOperationAction(ISD::FMA, MVT::v8f16, Legal);
+ }
+ }
+
// Expand unavailable integer operations.
for (auto Op :
{ISD::BSWAP, ISD::SMUL_LOHI, ISD::UMUL_LOHI, ISD::MULHS, ISD::MULHU,
@@ -992,6 +1001,28 @@ bool WebAssemblyTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
(ExtT == MVT::v2i64 && MemT == MVT::v2i32);
}
+bool WebAssemblyTargetLowering::isFMAFasterThanFMulAndFAdd(
+ const MachineFunction &MF, EVT VT) const {
+ if (!Subtarget->hasRelaxedSIMD() || !VT.isVector())
+ return false;
+
+ VT = VT.getScalarType();
+ if (!VT.isSimple())
+ return false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ break;
+ case MVT::f16:
+ return Subtarget->hasFP16();
+ case MVT::f32:
+ case MVT::f64:
+ return true;
+ }
+
+ return false;
+}
+
bool WebAssemblyTargetLowering::isOffsetFoldingLegal(
const GlobalAddressSDNode *GA) const {
// Wasm doesn't support function addresses with offsets
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index b33a8530310be..bfa968fdd2fac 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -67,6 +67,8 @@ class WebAssemblyTargetLowering final : public TargetLowering {
unsigned *Fast) const override;
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
+ bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const override;
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 130602650d34e..a3f007d960960 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1600,6 +1600,9 @@ multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate>
def : Pat<(fadd_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))),
(!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
+ def : Pat<(fma (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)),
+ (!cast<Instruction>("MADD_"#vec) V128:$c, V128:$a, V128:$b)>, Requires<[HasRelaxedSIMD]>;
+
def : Pat<(fsub_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))),
(!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
index e065de38951b1..7a2929f2f41a3 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
@@ -61,6 +61,39 @@ define <8 x half> @fadd_fmul_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x h
ret <8 x half> %add
}
+define <8 x half> @fmuladd_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fmuladd_8xf16:
+; RELAXED: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f16x8.relaxed_madd $push0=, $2, $0, $1
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_8xf16:
+; STRICT: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f16x8.mul $push0=, $0, $1
+; STRICT-NEXT: f16x8.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+ %fma = call <8 x half> @llvm.fmuladd(<8 x half> %a, <8 x half> %b, <8 x half> %c)
+ ret <8 x half> %fma
+}
+
+define <8 x half> @fmuladd_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fmuladd_contract_8xf16:
+; RELAXED: .functype fmuladd_contract_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f16x8.relaxed_madd $push0=, $2, $0, $1
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_contract_8xf16:
+; STRICT: .functype fmuladd_contract_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f16x8.mul $push0=, $0, $1
+; STRICT-NEXT: f16x8.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+ %fma = call contract <8 x half> @llvm.fmuladd(<8 x half> %a, <8 x half> %b, <8 x half> %c)
+ ret <8 x half> %fma
+}
define <4 x float> @fadd_fmul_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; RELAXED-LABEL: fadd_fmul_4xf32:
@@ -103,9 +136,8 @@ define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c
; RELAXED-LABEL: fmuladd_4xf32:
; RELAXED: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.mul $push0=, $0, $1
-; RELAXED-NEXT: f32x4.add $push1=, $pop0, $2
-; RELAXED-NEXT: return $pop1
+; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $2, $0, $1
+; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fmuladd_4xf32:
; STRICT: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
@@ -121,27 +153,8 @@ define <4 x float> @fma_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; RELAXED-LABEL: fma_4xf32:
; RELAXED: .functype fma_4xf32 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.extract_lane $push2=, $0, 0
-; RELAXED-NEXT: f32x4.extract_lane $push1=, $1, 0
-; RELAXED-NEXT: f32x4.extract_lane $push0=, $2, 0
-; RELAXED-NEXT: call $push3=, fmaf, $pop2, $pop1, $pop0
-; RELAXED-NEXT: f32x4.splat $push4=, $pop3
-; RELAXED-NEXT: f32x4.extract_lane $push7=, $0, 1
-; RELAXED-NEXT: f32x4.extract_lane $push6=, $1, 1
-; RELAXED-NEXT: f32x4.extract_lane $push5=, $2, 1
-; RELAXED-NEXT: call $push8=, fmaf, $pop7, $pop6, $pop5
-; RELAXED-NEXT: f32x4.replace_lane $push9=, $pop4, 1, $pop8
-; RELAXED-NEXT: f32x4.extract_lane $push12=, $0, 2
-; RELAXED-NEXT: f32x4.extract_lane $push11=, $1, 2
-; RELAXED-NEXT: f32x4.extract_lane $push10=, $2, 2
-; RELAXED-NEXT: call $push13=, fmaf, $pop12, $pop11, $pop10
-; RELAXED-NEXT: f32x4.replace_lane $push14=, $pop9, 2, $pop13
-; RELAXED-NEXT: f32x4.extract_lane $push17=, $0, 3
-; RELAXED-NEXT: f32x4.extract_lane $push16=, $1, 3
-; RELAXED-NEXT: f32x4.extract_lane $push15=, $2, 3
-; RELAXED-NEXT: call $push18=, fmaf, $pop17, $pop16, $pop15
-; RELAXED-NEXT: f32x4.replace_lane $push19=, $pop14, 3, $pop18
-; RELAXED-NEXT: return $pop19
+; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $2, $0, $1
+; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fma_4xf32:
; STRICT: .functype fma_4xf32 (v128, v128, v128) -> (v128)
|
The pattern matching confused me yesterday and, after looking with fresh eyes, I'm pretty sure that's because the operands were ordered incorrectly in the original patterns. AFAICT the wasm instructions and ISD nodes both use the same operand order for FMA. @badumbatish Could you please take a look and confirm? edit: and changing this fixed a NaN error I was encountering with a workload. |
Performance wise, this looks really good for ML. Running NCNN, via V8 on AArch64, I observe a mean execution time reduction of ~15% over >30 workloads. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The pattern matching confused me yesterday and, after looking with fresh eyes, I'm pretty sure that's because the operands were ordered incorrectly in the original patterns. AFAICT the wasm instructions and ISD nodes both use the same operand order for FMA. @badumbatish Could you please take a look and confirm?
edit: and changing this fixed a NaN error I was encountering with a workload.
Can confirm it was an ordering mistake. Both wasm fmadd and fma should follow IEEE 754.
; RELAXED-NEXT: f32x4.mul $push0=, $0, $1 | ||
; RELAXED-NEXT: f32x4.add $push1=, $pop0, $2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
im curious is there a reason why we remove the fmul_contract and fsub_contract?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You're talking about the change in TargetSelectionDAG.td, right? We don't need those patterns anymore since during DAG (lower/legalize/combine?) the fmuladd will, with relaxed-simd, no longer be broken into its constituent parts and instead it will be lowered to the FMA node.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oh i see, i'm also not quite sure how to reconcile with the relaxed-simd madd's optional round once or twice. if we're going from fma (i assume is just rounding once) to relaxed madd (optionally rounding once or twice), and the platform that runs relaxed madd can only support rounding twice, isn't that somewhat dangerous?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, the ISD FMA follows the IEEE convention of rounding only once, so I understand the concern.
The key thing here is that we (backend developers) follow the semantics of the LLVM langref, which allows the fmuladd to either be lowered to a fused op or not, and obey the user.
Whether a fmuladd call is generated in the first place is determined by the language semantics, and frontend options provided by the user.
As noted by @dschuff here, for C/C++ the operations can be fused in a single statement, without providing additional flags to relax FP constraints.
WebAssembly adds a further restriction though because we only allow the fusing in the presence of relaxed-simd, and this is something that the user has to manually opt in to. The user needs to understand that the precision of these operations is no longer deterministic when using relaxed-simd.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My reading of the LLVM FMA intrinsic is that it only allows double-rounding if the 'afn' flag is set, and it looks like the ISD FMA is the same. That means that the wasm relaxed madd is not actually a valid implementation of that, because it might round once or twice. So in principle, this pattern should only match when the fast math flag is set on the ISD node. (And unfortunately I think that just because the user enabled relaxed-simd doesn't necessarily mean that they are opting in to all FMA operations being allowed to double round. All the feature-enablement flags just control what proposal is allowed in the output, rather than semantics).
So relaxed-simd or no, we have the general problem that if single-rounding is required, there really is no guaranteed valid implementation, short of some kind of slow software emulation.
If the FMAs are generated from e.g. C++ where double or single rounding is allowed, the mul + add or whatever fused operations are generated should already have the appropriate flags and things should just work. (or we could in principle fix them so they do). But something like std::fma requires single rounding, and I don't think we can claim to correctly implement that with relaxed simd without some way for the user to opt-in to the nondeterminism. Which, probably we should go ahead and add somehow, because as you found above, the performance benefit seems pretty substantial.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually I guess we don't know how much of the performance benefit above comes from fusing mul+add operations that would be valid either way according to C++, and how much of it is from using the wasm instruction for FMAs that are in the source and are required to be fused, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
AFAICT, the code base is just relying on the C/C++ semantics. The IR I noticed included `fmuladd` and not `fma`, and there are no special constraints attached to the `fmuladd` call.
I will look into an alternative route of lowering the `fmuladd`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay, I've now introduced a FMULADD node that we can perform pattern match on directly.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I like this approach.
is there some sort of generic automated test suite that you're running that you can share with me? for llvm-test-suite, it's a bit hard since cmake can't build the targets (missing libs) so it requires some manual scripting |
I've developed my own suite over the last few years... which I've just started the process of open sourcing. I will let you know when that happens, but I highly doubt it will happen this year. EDIT: Running the aforementioned NCNN is rather easy though. I build it with wasi-sdk and use the following cmake command:
|
b3d5aa2
to
3ffead4
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
Results still look good:
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM overall i just have a question on the truncate and extending for f16 based operations, we should wait for another reviewer
} | ||
|
||
if (Subtarget->hasRelaxedSIMD()) { | ||
if (Subtarget->hasFP16()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@brendandahl Does the FP16 proposal have a relaxed FMA?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, though they don't currently have the `relaxed` prefix in the spec. WebAssembly/half-precision#5
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, great. if you look at the tests below, do all of those conversion libcalls look like something we should be able to eventually lower to instructions?
not necessarily needed for this PR, but something for the future.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think so. IIRC, since we don't have fp16 registers, some of the instructions weren't considered legal and were expanded.
; RELAXED-NEXT: f32x4.mul $push0=, $0, $1 | ||
; RELAXED-NEXT: f32x4.add $push1=, $pop0, $2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I like this approach.
defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>; | ||
defm "" : SIMDMADD<F64x2, 0x107, 0x108, [HasRelaxedSIMD]>; | ||
defm "" : SIMDMADD<F16x8, 0x14e, 0x14f, [HasFP16]>; | ||
defm "" : SIMDMADD<F16x8, 0x14e, 0x14f, [HasRelaxedSIMD, HasFP16]>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The relaxed fp16 instructions are not a separate feature. HasFp16 implies the relaxed instructions are available.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oh, interesting. Do you happen to know why? since the spec already has a deterministic profile, this would preclude FP16 from being used in that profile unless we break that dependency (or carve out a deterministic-profile subset of FP16, but that sounds like basically the same thing).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unfortunately, my chat history is gone for this topic... Now that I'm thinking about it more though, I think Ilya mentioned the instructions are actually deterministic (and also why relaxed was not in the name).
If the hardware supports fp16 then you have an fma instruction. If the hardware doesn't support fp16, you do the math as fp32. Then in both cases you only end up with a single rounding.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
HasFp16 implies the relaxed instructions are available.
That seems like an odd assumption to make. And, unfortunately, the FP16 proposal doesn't seem to have much information in it... but if it is a fused operation, then we can do lowering differently and use the FMA. I'll make the change.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The proposal is still fairly early in the process. We'll certainly revisit the LLVM as we go.
/// FMULADD - Performs a * b + c, with, or without, intermediate rounding. | ||
FMULADD, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't understand why you would need this. It makes sense to have the fmuladd in the IR, but in the backend from the context you know the exact choice and should just go to it
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh...wasm has its own fmuladd. Sigh.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add comment here explaining the use case here? Unless the underlying target directly has an fmuladd, nothing else should be using it and continue creating fma or the split sequence based on target preference
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, this looks good.
On FP16 it's good to get this baseline. I think it probably makes sense to defer further optimization work until we get the proposal to phase 2 and have a better idea of exactly which operations it will include (unless we want to do more benchmarking work to inform that decision of course).
Introduce an fmuladd ISD node as an equivalent to the LLVM intrinsic and lower this to madd and nmadd, when we have relaxed-simd.
bec19e5
to
a8c36a8
Compare
This reverts commit a4eb7ea.
Lower v4f32 and v2f64 fmuladd calls to relaxed_madd instructions. If we have FP16, then lower v8f16 fmuladds to FMA. I've introduced an ISD node for fmuladd to maintain the rounding ambiguity through legalization / combine / isel.
Reverts #161355 Looks like I've broken some intrinsic code generation.
…63171) Reverts llvm/llvm-project#161355 Looks like I've broken some intrinsic code generation.
Lower v8f16, v4f32 and v2f64 fmuladd calls to fma, when we have relaxed simd.