From d264d2421d94becadcc76469b57d87523bc909ad Mon Sep 17 00:00:00 2001 From: Julian Nagele Date: Wed, 5 Nov 2025 19:22:16 +0000 Subject: [PATCH] [LoopUnroll] Introduce parallel accumulators when unrolling reductions with vector instructions. --- llvm/lib/Analysis/IVDescriptors.cpp | 6 +- .../LoopUnroll/partial-unroll-reductions.ll | 54 +++++++++++++++ .../LoopUnroll/runtime-unroll-reductions.ll | 67 +++++++++++++++++++ 3 files changed, 125 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 9f8ac6e8e2e0b..3dd04405a53da 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -270,10 +270,12 @@ bool RecurrenceDescriptor::AddReductionVar( // resulting from the type promotion performed by InstCombine. Vector // operations are not limited to the legal integer widths, so we may be able // to evaluate the reduction in the narrower width. - if (RecurrenceType->isFloatingPointTy()) { + // Check the scalar type to handle both scalar and vector types. + Type *ScalarTy = RecurrenceType->getScalarType(); + if (ScalarTy->isFloatingPointTy()) { if (!isFloatingPointRecurrenceKind(Kind)) return false; - } else if (RecurrenceType->isIntegerTy()) { + } else if (ScalarTy->isIntegerTy()) { if (!isIntegerRecurrenceKind(Kind)) return false; if (!isMinMaxRecurrenceKind(Kind)) diff --git a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll index 2d48d20ba9c5c..220a4a29a3041 100644 --- a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll +++ b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll @@ -358,6 +358,7 @@ loop: exit: ret float %rdx.next } + define i32 @test_smin(ptr %src, i64 %n) { ; CHECK-LABEL: define i32 @test_smin( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { @@ -623,3 +624,56 @@ loop: exit: ret i32 %rdx.next } + +define <4 x i32> @test_vector_add(ptr %p, i64 %n, <4 x i32> %start) { +; CHECK-LABEL: define <4 x i32> @test_vector_add( +; CHECK-SAME: ptr [[P:%.*]], i64 [[N:%.*]], <4 x i32> [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_2:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_3:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi <4 x i32> [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[P]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load <4 x i32>, ptr [[GEP]], align 16 +; CHECK-NEXT: [[RDX_NEXT]] = add <4 x i32> [[RDX]], [[L]] +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[P]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[L_1:%.*]] = load <4 x i32>, ptr [[GEP_1]], align 16 +; CHECK-NEXT: [[RDX_NEXT_1]] = add <4 x i32> [[RDX_1]], [[L_1]] +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[P]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[L_2:%.*]] = load <4 x i32>, ptr [[GEP_2]], align 16 +; CHECK-NEXT: [[RDX_NEXT_2]] = add <4 x i32> [[RDX_2]], [[L_2]] +; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[P]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[L_3:%.*]] = load <4 x i32>, ptr [[GEP_3]], align 16 +; CHECK-NEXT: [[RDX_NEXT_3]] = add <4 x i32> [[RDX_3]], [[L_3]] +; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 +; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi <4 x i32> [ [[RDX_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[RDX_NEXT_1]], [[RDX_NEXT]] +; CHECK-NEXT: [[BIN_RDX1:%.*]] = add <4 x i32> [[RDX_NEXT_2]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[RDX_NEXT_3]], [[BIN_RDX1]] +; CHECK-NEXT: ret <4 x i32> [[BIN_RDX2]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi <4 x i32> [ %start, %entry ], [ %rdx.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep = getelementptr inbounds nuw <4 x i32>, ptr %p, i64 %iv + %l = load <4 x i32>, ptr %gep, align 16 + %rdx.next = add <4 x i32> %rdx, %l + %ec = icmp ne i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + ret <4 x i32> %rdx.next +} diff --git a/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll index a5ac2cf46653d..fb1f2fcf5c190 100644 --- a/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll @@ -220,6 +220,72 @@ exit: ret i32 %res } +define <4 x i32> @test_vector_add_reduction(ptr %a, i64 %n) { +; CHECK-LABEL: define <4 x i32> @test_vector_add_reduction( +; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1 +; CHECK-NEXT: br i1 [[TMP1]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[ENTRY_NEW:.*]] +; CHECK: [[ENTRY_NEW]]: +; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY_NEW]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[GEP_A]], align 16 +; CHECK-NEXT: [[RDX_NEXT]] = add <4 x i32> [[RDX]], [[TMP2]] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[A]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[GEP_A_1]], align 16 +; CHECK-NEXT: [[RDX_NEXT_1]] = add <4 x i32> [[RDX_1]], [[TMP3]] +; CHECK-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2 +; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]] +; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[EXIT_UNR_LCSSA]]: +; CHECK-NEXT: [[RES_PH:%.*]] = phi <4 x i32> [ [[RDX_NEXT_1]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_UNR:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_UNR:%.*]] = phi <4 x i32> [ [[RDX_NEXT_1]], %[[LOOP]] ] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[RDX_NEXT_1]], [[RDX_NEXT]] +; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER]], label %[[EXIT:.*]] +; CHECK: [[LOOP_EPIL_PREHEADER]]: +; CHECK-NEXT: [[IV_EPIL_INIT:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR]], %[[EXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[RDX_EPIL_INIT:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[BIN_RDX]], %[[EXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[LCMP_MOD2:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[LCMP_MOD2]]) +; CHECK-NEXT: br label %[[LOOP_EPIL:.*]] +; CHECK: [[LOOP_EPIL]]: +; CHECK-NEXT: [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[A]], i64 [[IV_EPIL_INIT]] +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[GEP_A_EPIL]], align 16 +; CHECK-NEXT: [[RDX_NEXT_EPIL:%.*]] = add <4 x i32> [[RDX_EPIL_INIT]], [[TMP4]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi <4 x i32> [ [[BIN_RDX]], %[[EXIT_UNR_LCSSA]] ], [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ] +; CHECK-NEXT: ret <4 x i32> [[RES]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi <4 x i32> [ zeroinitializer, %entry ], [ %rdx.next, %loop ] + %gep.a = getelementptr inbounds nuw <4 x i32>, ptr %a, i64 %iv + %1 = load <4 x i32>, ptr %gep.a, align 16 + %rdx.next = add <4 x i32> %rdx, %1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop, !llvm.loop !0 + +exit: + %res = phi <4 x i32> [ %rdx.next, %loop ] + ret <4 x i32> %res +} !0 = distinct !{!0, !1} @@ -234,4 +300,5 @@ exit: ; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]} ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} ;.