-
Notifications
You must be signed in to change notification settings - Fork 14.8k
[WebAssembly] Lower fmuladd to madd and nmadd #161355
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-webassembly Author: Sam Parker (sparker-arm) Changes: Lower v8f16, v4f32 and v2f64 fmuladd calls to fma, when we have relaxed simd. Full diff: https://github.com/llvm/llvm-project/pull/161355.diff 4 Files Affected:
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 163bf9ba5b089..b88bdd662e123 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -158,6 +158,15 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setTruncStoreAction(T, MVT::f16, Expand);
}
+ // Overwrite settings for FMA, when we have relaxed simd.
+ if (Subtarget->hasRelaxedSIMD()) {
+ setOperationAction(ISD::FMA, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMA, MVT::v2f64, Legal);
+ if (Subtarget->hasFP16()) {
+ setOperationAction(ISD::FMA, MVT::v8f16, Legal);
+ }
+ }
+
// Expand unavailable integer operations.
for (auto Op :
{ISD::BSWAP, ISD::SMUL_LOHI, ISD::UMUL_LOHI, ISD::MULHS, ISD::MULHU,
@@ -992,6 +1001,28 @@ bool WebAssemblyTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
(ExtT == MVT::v2i64 && MemT == MVT::v2i32);
}
+bool WebAssemblyTargetLowering::isFMAFasterThanFMulAndFAdd(
+ const MachineFunction &MF, EVT VT) const {
+ if (!Subtarget->hasRelaxedSIMD() || !VT.isVector())
+ return false;
+
+ VT = VT.getScalarType();
+ if (!VT.isSimple())
+ return false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ break;
+ case MVT::f16:
+ return Subtarget->hasFP16();
+ case MVT::f32:
+ case MVT::f64:
+ return true;
+ }
+
+ return false;
+}
+
bool WebAssemblyTargetLowering::isOffsetFoldingLegal(
const GlobalAddressSDNode *GA) const {
// Wasm doesn't support function addresses with offsets
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index b33a8530310be..bfa968fdd2fac 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -67,6 +67,8 @@ class WebAssemblyTargetLowering final : public TargetLowering {
unsigned *Fast) const override;
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
+ bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const override;
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 130602650d34e..a3f007d960960 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1600,6 +1600,9 @@ multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate>
def : Pat<(fadd_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))),
(!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
+ def : Pat<(fma (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)),
+ (!cast<Instruction>("MADD_"#vec) V128:$c, V128:$a, V128:$b)>, Requires<[HasRelaxedSIMD]>;
+
def : Pat<(fsub_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))),
(!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
index e065de38951b1..7a2929f2f41a3 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
@@ -61,6 +61,39 @@ define <8 x half> @fadd_fmul_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x h
ret <8 x half> %add
}
+define <8 x half> @fmuladd_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fmuladd_8xf16:
+; RELAXED: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f16x8.relaxed_madd $push0=, $2, $0, $1
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_8xf16:
+; STRICT: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f16x8.mul $push0=, $0, $1
+; STRICT-NEXT: f16x8.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+ %fma = call <8 x half> @llvm.fmuladd(<8 x half> %a, <8 x half> %b, <8 x half> %c)
+ ret <8 x half> %fma
+}
+
+define <8 x half> @fmuladd_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fmuladd_contract_8xf16:
+; RELAXED: .functype fmuladd_contract_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f16x8.relaxed_madd $push0=, $2, $0, $1
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_contract_8xf16:
+; STRICT: .functype fmuladd_contract_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f16x8.mul $push0=, $0, $1
+; STRICT-NEXT: f16x8.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+ %fma = call contract <8 x half> @llvm.fmuladd(<8 x half> %a, <8 x half> %b, <8 x half> %c)
+ ret <8 x half> %fma
+}
define <4 x float> @fadd_fmul_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; RELAXED-LABEL: fadd_fmul_4xf32:
@@ -103,9 +136,8 @@ define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c
; RELAXED-LABEL: fmuladd_4xf32:
; RELAXED: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.mul $push0=, $0, $1
-; RELAXED-NEXT: f32x4.add $push1=, $pop0, $2
-; RELAXED-NEXT: return $pop1
+; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $2, $0, $1
+; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fmuladd_4xf32:
; STRICT: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
@@ -121,27 +153,8 @@ define <4 x float> @fma_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; RELAXED-LABEL: fma_4xf32:
; RELAXED: .functype fma_4xf32 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.extract_lane $push2=, $0, 0
-; RELAXED-NEXT: f32x4.extract_lane $push1=, $1, 0
-; RELAXED-NEXT: f32x4.extract_lane $push0=, $2, 0
-; RELAXED-NEXT: call $push3=, fmaf, $pop2, $pop1, $pop0
-; RELAXED-NEXT: f32x4.splat $push4=, $pop3
-; RELAXED-NEXT: f32x4.extract_lane $push7=, $0, 1
-; RELAXED-NEXT: f32x4.extract_lane $push6=, $1, 1
-; RELAXED-NEXT: f32x4.extract_lane $push5=, $2, 1
-; RELAXED-NEXT: call $push8=, fmaf, $pop7, $pop6, $pop5
-; RELAXED-NEXT: f32x4.replace_lane $push9=, $pop4, 1, $pop8
-; RELAXED-NEXT: f32x4.extract_lane $push12=, $0, 2
-; RELAXED-NEXT: f32x4.extract_lane $push11=, $1, 2
-; RELAXED-NEXT: f32x4.extract_lane $push10=, $2, 2
-; RELAXED-NEXT: call $push13=, fmaf, $pop12, $pop11, $pop10
-; RELAXED-NEXT: f32x4.replace_lane $push14=, $pop9, 2, $pop13
-; RELAXED-NEXT: f32x4.extract_lane $push17=, $0, 3
-; RELAXED-NEXT: f32x4.extract_lane $push16=, $1, 3
-; RELAXED-NEXT: f32x4.extract_lane $push15=, $2, 3
-; RELAXED-NEXT: call $push18=, fmaf, $pop17, $pop16, $pop15
-; RELAXED-NEXT: f32x4.replace_lane $push19=, $pop14, 3, $pop18
-; RELAXED-NEXT: return $pop19
+; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $2, $0, $1
+; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fma_4xf32:
; STRICT: .functype fma_4xf32 (v128, v128, v128) -> (v128)
|
The pattern matching confused me yesterday and, after looking with fresh eyes, I'm pretty sure that's because the operands were ordered incorrectly in the original patterns. AFAICT the wasm instructions and ISD nodes both use the same operand order for FMA. @badumbatish Could you please take a look and confirm? edit: and changing this fixed a NaN error I was encountering with a workload. |
Performance wise, this looks really good for ML. Running NCNN, via V8 on AArch64, I observe a mean execution time reduction of ~15% over >30 workloads. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The pattern matching confused me yesterday and, after looking with fresh eyes, I'm pretty sure that's because the operands were ordered incorrectly in the original patterns. AFAICT the wasm instructions and ISD nodes both use the same operand order for FMA. @badumbatish Could you please take a look and confirm?
edit: and changing this fixed a NaN error I was encountering with a workload.
Can confirm it was an ordering mistake. Both wasm fmadd and fma should follow IEEE 754.
; RELAXED-NEXT: f32x4.mul $push0=, $0, $1 | ||
; RELAXED-NEXT: f32x4.add $push1=, $pop0, $2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
im curious is there a reason why we remove the fmul_contract and fsub_contract?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You're talking about the change in TargetSelectionDAG.td, right? We don't need those patterns anymore since during DAG (lower/legalize/combine?) the fmuladd will, with relaxed-simd, no longer be broken into its constituent parts and instead it will be lowered to the FMA node.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oh i see, i'm also not quite sure how to reconcile with the relaxed-simd madd's optional round once or twice. if we're going from fma (i assume is just rounding once) to relaxed madd (optionally rounding once or twice), and the platform that runs relaxed madd can only support rounding twice, isn't that somewhat dangerous?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, the ISD FMA follows the IEEE convention of rounding only once, so I understand the concern.
The key thing here is that we (backend developers) follow the semantics of the LLVM langref, which allows the fmuladd to either be lowered to a fused op or not, and obey the user.
Whether a fmuladd call is generated in the first place is determined by the language semantics, and frontend options provided by the user.
As noted by @dschuff here, for C/C++ the operations can be fused in a single statement, without providing additional flags to relax FP constraints.
WebAssembly adds a further restriction though because we only allow the fusing in the presence of relaxed-simd, and this is something that the user has to manually opt in to. The user needs to understand that the precision of these operations is no longer deterministic when using relaxed-simd.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My reading of the LLVM FMA intrinsic is that it only allows double-rounding if the 'afn' flag is set, and it looks like the ISD FMA is the same. That means that the wasm relaxed madd is not actually a valid implementation of that, because it might round once or twice. So in principle, this pattern should only match when the fast math flag is set on the ISD node. (And unfortunately I think that just because the user enabled relaxed-simd doesn't necessarily mean that they are opting in to all FMA operations being allowed to double round. All the feature-enablement flags just control what proposal is allowed in the output, rather than semantics).
So relaxed-simd or no, we have the general problem that if single-rounding is required, there really is no guaranteed valid implementation, short of some kind of slow software emulation.
If the FMAs are generated from e.g. C++ where double or single rounding is allowed, the mul + add or whatever fused operations are generated should already have the appropriate flags and things should just work. (or we could in principle fix them so they do). But something like std::fma requires single rounding, and I don't think we can claim to correctly implement that with relaxed simd without some way for the user to opt-in to the nondeterminism. Which, probably we should go ahead and add somehow, because as you found above, the performance benefit seems pretty substantial.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually I guess we don't know how much of the performance benefit above comes from fusing mul+add operations that would be valid either way according to C++, and how much of it is from using the wasm instruction for FMAs that are in the source and are required to be fused, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
AFAICT, the code base is just relying on the C/C++ semantics. The IR I noticed included `fmuladd` and not `fma`, and there are no special constraints attached to the `fmuladd` call.
I will look into an alternative route of lowering the `fmuladd`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay, I've now introduced a FMULADD node that we can perform pattern match on directly.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I like this approach.
is there some sort of generic automated test suite that you're running that you can share with me? for llvm-test-suite, it's a bit hard since cmake can't build the targets (missing libs) so it requires some manual scripting |
I've developed my own suite over the last few years... which I've just started the process of open sourcing. I will let you know when that happens, but I highly doubt it will happen this year. EDIT: Running the aforementioned NCNN is rather easy though. I build it with wasi-sdk and use the following cmake command:
|
b3d5aa2
to
3ffead4
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
Results still look good:
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM overall i just have a question on the truncate and extending for f16 based operations, we should wait for another reviewer
} | ||
|
||
if (Subtarget->hasRelaxedSIMD()) { | ||
if (Subtarget->hasFP16()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@brendandahl Does the FP16 proposal have a relaxed FMA?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, though they don't currently have the `relaxed` prefix in the spec. WebAssembly/half-precision#5
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, great. if you look at the tests below, do all of those conversion libcalls look like something we should be able to eventually lower to instructions?
not necessarily needed for this PR, but something for the future.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think so. IIRC, since we don't have fp16 registers, some of the instructions weren't considered legal and were expanded.
; RELAXED-NEXT: f32x4.mul $push0=, $0, $1 | ||
; RELAXED-NEXT: f32x4.add $push1=, $pop0, $2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I like this approach.
defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>; | ||
defm "" : SIMDMADD<F64x2, 0x107, 0x108, [HasRelaxedSIMD]>; | ||
defm "" : SIMDMADD<F16x8, 0x14e, 0x14f, [HasFP16]>; | ||
defm "" : SIMDMADD<F16x8, 0x14e, 0x14f, [HasRelaxedSIMD, HasFP16]>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The relaxed fp16 instructions are not a separate feature. HasFp16 implies the relaxed instructions are available.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oh, interesting. Do you happen to know why? since the spec already has a deterministic profile, this would preclude FP16 from being used in that profile unless we break that dependency (or carve out a deterministic-profile subset of FP16, but that sounds like basically the same thing).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unfortunately, my chat history is gone for this topic... Now that I'm thinking about it more though, I think Ilya mentioned the instructions are actually deterministic (and also why relaxed was not in the name).
If the hardware supports fp16 then you have an fma instruction. If the hardware doesn't support fp16, you do the math as fp32. Then in both cases you only end up with a single rounding.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
HasFp16 implies the relaxed instructions are available.
That seems like an odd assumption to make. And, unfortunately, the FP16 proposal doesn't seem to have much information in it... but if it is a fused operation, then we can do lowering differently and use the FMA. I'll make the change.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The proposal is still fairly early in the process. We'll certainly revisit the LLVM as we go.
/// FMULADD - Performs a * b + c, with, or without, intermediate rounding. | ||
FMULADD, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't understand why you would need this. It makes sense to have the fmuladd in the IR, but in the backend from the context you know the exact choice and should just go to it
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh...wasm has its own fmuladd. Sigh.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add comment here explaining the use case here? Unless the underlying target directly has an fmuladd, nothing else should be using it and continue creating fma or the split sequence based on target preference
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, this looks good.
On FP16 it's good to get this baseline. I think it probably makes sense to defer further optimization work until we get the proposal to phase 2 and have a better idea of exactly which operations it will include (unless we want to do more benchmarking work to inform that decision of course).
Introduce an fmuladd ISD node as an equivalent to the LLVM intrinsic and lower this to madd and nmadd, when we have relaxed-simd.
bec19e5
to
a8c36a8
Compare
This reverts commit a4eb7ea.
Lower v4f32 and v2f64 fmuladd calls to relaxed_madd instructions. If we have FP16, then lower v8f16 fmuladds to FMA. I've introduced an ISD node for fmuladd to maintain the rounding ambiguity through legalization / combine / isel.
Reverts #161355 Looks like I've broken some intrinsic code generation.
…63171) Reverts llvm/llvm-project#161355 Looks like I've broken some intrinsic code generation.
Lower v8f16, v4f32 and v2f64 fmuladd calls to fma, when we have relaxed simd.