From aefd5bdadab5cd4c8a83d1952592b612b195bc24 Mon Sep 17 00:00:00 2001 From: Kai Lin Date: Fri, 14 Nov 2025 18:12:31 +0800 Subject: [PATCH 1/2] [RVV] Add test for missed VWMACC combine Add a minimal reproducer for consecutive vwmacc-like operations to illustrate that the previous DAG combine logic may miss combining mul+add chains into a single vwmacc.vx instruction. --- .../CodeGen/RISCV/rvv/combine-vl-vw-macc.ll | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll b/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll new file mode 100644 index 0000000000000..663359bb5b4da --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 + +define void @matmul_min(ptr %vptr, ptr %scalars, ptr %acc0_ptr, ptr %acc1_ptr) { +; CHECK-LABEL: matmul_min: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a4, 64 +; CHECK-NEXT: li a5, 32 +; CHECK-NEXT: vsetvli zero, a4, e8, m4, ta, ma +; CHECK-NEXT: vle8.v v8, (a2) +; CHECK-NEXT: vsetvli zero, a5, e8, m2, ta, ma +; CHECK-NEXT: vle8.v v20, (a0) +; CHECK-NEXT: lb a0, 0(a1) +; CHECK-NEXT: lb a1, 1(a1) +; CHECK-NEXT: vsetvli zero, a4, e8, m4, ta, ma +; CHECK-NEXT: vle8.v v12, (a3) +; CHECK-NEXT: vsetvli zero, a5, e8, m2, ta, ma +; CHECK-NEXT: vwmacc.vx v8, a0, v20 +; CHECK-NEXT: vwmul.vx v16, v20, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vadd.vv v12, v16, v12 +; CHECK-NEXT: vsetvli zero, a4, e8, m4, ta, ma +; CHECK-NEXT: vse8.v v8, (a2) +; CHECK-NEXT: vse8.v v12, (a3) +; CHECK-NEXT: ret +entry: + %acc0 = load <32 x i16>, ptr %acc0_ptr, 
align 1 + %acc1 = load <32 x i16>, ptr %acc1_ptr, align 1 + + %v8 = load <32 x i8>, ptr %vptr, align 1 + %v16 = sext <32 x i8> %v8 to <32 x i16> + + %s0_ptr = getelementptr i8, ptr %scalars, i32 0 + %s0_i8 = load i8, ptr %s0_ptr, align 1 + %s0_i16 = sext i8 %s0_i8 to i16 + %tmp0 = insertelement <32 x i16> poison, i16 %s0_i16, i32 0 + %splat0 = shufflevector <32 x i16> %tmp0, <32 x i16> poison, <32 x i32> zeroinitializer + %mul0 = mul <32 x i16> %splat0, %v16 + %add0 = add <32 x i16> %mul0, %acc0 + + %s1_ptr = getelementptr i8, ptr %scalars, i32 1 + %s1_i8 = load i8, ptr %s1_ptr, align 1 + %s1_i16 = sext i8 %s1_i8 to i16 + %tmp1 = insertelement <32 x i16> poison, i16 %s1_i16, i32 0 + %splat1 = shufflevector <32 x i16> %tmp1, <32 x i16> poison, <32 x i32> zeroinitializer + %mul1 = mul <32 x i16> %splat1, %v16 + %add1 = add <32 x i16> %mul1, %acc1 + + store <32 x i16> %add0, ptr %acc0_ptr, align 1 + store <32 x i16> %add1, ptr %acc1_ptr, align 1 + + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} From 16d98bcce9449c2f8c2af217b51c2485b8697305 Mon Sep 17 00:00:00 2001 From: Kai Lin Date: Sat, 15 Nov 2025 16:30:52 +0800 Subject: [PATCH 2/2] [RISCV][DAGCombiner] Fix missed combines in combineOp_VLToVWOp_VL The previous implementation of combineOp_VLToVWOp_VL manually replaced old nodes with newly created widened nodes, but only added the new node itself to the DAGCombiner worklist. Since the users of the new node were not added, some combine opportunities could be missed when external DAGCombiner passes expected those users to be reconsidered. This patch replaces the custom replacement logic with a call to DCI.CombineTo(), which performs node replacement in a way consistent with DAGCombiner::Run: - Replace all uses of the old node. - Add the new node and its users to the worklist. - Clean up unused nodes when appropriate. 
Using CombineTo ensures that combineOp_VLToVWOp_VL behaves consistently with the standard DAGCombiner update model, avoiding discrepancies between the private worklist inside this routine and the global worklist managed by the combiner. This resolves missed combine cases involving VL -> VW operator widening. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 3 +-- llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll | 11 ++++------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 38cce26e44af4..8ba1215561dc3 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -18300,8 +18300,7 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N, } } for (std::pair<SDValue, SDValue> OldNewValues : ValuesToReplace) { - DAG.ReplaceAllUsesOfValueWith(OldNewValues.first, OldNewValues.second); - DCI.AddToWorklist(OldNewValues.second.getNode()); + DCI.CombineTo(OldNewValues.first.getNode(), OldNewValues.second); } return InputRootReplacement; } diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll b/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll index 663359bb5b4da..943d8d2409ffd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll @@ -7,19 +7,16 @@ define void @matmul_min(ptr %vptr, ptr %scalars, ptr %acc0_ptr, ptr %acc1_ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a4, 64 ; CHECK-NEXT: li a5, 32 -; CHECK-NEXT: vsetvli zero, a4, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v8, (a2) ; CHECK-NEXT: vsetvli zero, a5, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v20, (a0) +; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: lb a0, 0(a1) ; CHECK-NEXT: lb a1, 1(a1) ; CHECK-NEXT: vsetvli zero, a4, e8, m4, ta, ma +; CHECK-NEXT: vle8.v v8, (a2) ; CHECK-NEXT: vle8.v v12, (a3) ; CHECK-NEXT: vsetvli zero, a5, e8, m2, ta, ma -; CHECK-NEXT: vwmacc.vx v8, a0, v20 -; CHECK-NEXT: vwmul.vx v16, v20, a1 -; 
CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vadd.vv v12, v16, v12 +; CHECK-NEXT: vwmacc.vx v8, a0, v16 +; CHECK-NEXT: vwmacc.vx v12, a1, v16 ; CHECK-NEXT: vsetvli zero, a4, e8, m4, ta, ma ; CHECK-NEXT: vse8.v v8, (a2) ; CHECK-NEXT: vse8.v v12, (a3)