Skip to content

Commit d264d24

Browse files
committed
[LoopUnroll] Introduce parallel accumulators when unrolling reductions with vector instructions.
1 parent bb14b83 commit d264d24

File tree

3 files changed

+125
-2
lines changed

3 files changed

+125
-2
lines changed

llvm/lib/Analysis/IVDescriptors.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -270,10 +270,12 @@ bool RecurrenceDescriptor::AddReductionVar(
270270
// resulting from the type promotion performed by InstCombine. Vector
271271
// operations are not limited to the legal integer widths, so we may be able
272272
// to evaluate the reduction in the narrower width.
273-
if (RecurrenceType->isFloatingPointTy()) {
273+
// Classify by the element type so that vector recurrences (e.g. <4 x i32>) are checked the same way as scalar ones.
274+
Type *ScalarTy = RecurrenceType->getScalarType();
275+
if (ScalarTy->isFloatingPointTy()) {
274276
if (!isFloatingPointRecurrenceKind(Kind))
275277
return false;
276-
} else if (RecurrenceType->isIntegerTy()) {
278+
} else if (ScalarTy->isIntegerTy()) {
277279
if (!isIntegerRecurrenceKind(Kind))
278280
return false;
279281
if (!isMinMaxRecurrenceKind(Kind))

llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,7 @@ loop:
358358
exit:
359359
ret float %rdx.next
360360
}
361+
361362
define i32 @test_smin(ptr %src, i64 %n) {
362363
; CHECK-LABEL: define i32 @test_smin(
363364
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
@@ -623,3 +624,56 @@ loop:
623624
exit:
624625
ret i32 %rdx.next
625626
}
627+
628+
define <4 x i32> @test_vector_add(ptr %p, i64 %n, <4 x i32> %start) {
629+
; CHECK-LABEL: define <4 x i32> @test_vector_add(
630+
; CHECK-SAME: ptr [[P:%.*]], i64 [[N:%.*]], <4 x i32> [[START:%.*]]) {
631+
; CHECK-NEXT: [[ENTRY:.*]]:
632+
; CHECK-NEXT: br label %[[LOOP:.*]]
633+
; CHECK: [[LOOP]]:
634+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
635+
; CHECK-NEXT: [[RDX_1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ]
636+
; CHECK-NEXT: [[RDX_2:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ]
637+
; CHECK-NEXT: [[RDX_3:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
638+
; CHECK-NEXT: [[RDX:%.*]] = phi <4 x i32> [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
639+
; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
640+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[P]], i64 [[IV]]
641+
; CHECK-NEXT: [[L:%.*]] = load <4 x i32>, ptr [[GEP]], align 16
642+
; CHECK-NEXT: [[RDX_NEXT]] = add <4 x i32> [[RDX]], [[L]]
643+
; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
644+
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[P]], i64 [[IV_NEXT]]
645+
; CHECK-NEXT: [[L_1:%.*]] = load <4 x i32>, ptr [[GEP_1]], align 16
646+
; CHECK-NEXT: [[RDX_NEXT_1]] = add <4 x i32> [[RDX_1]], [[L_1]]
647+
; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
648+
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[P]], i64 [[IV_NEXT_1]]
649+
; CHECK-NEXT: [[L_2:%.*]] = load <4 x i32>, ptr [[GEP_2]], align 16
650+
; CHECK-NEXT: [[RDX_NEXT_2]] = add <4 x i32> [[RDX_2]], [[L_2]]
651+
; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
652+
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[P]], i64 [[IV_NEXT_2]]
653+
; CHECK-NEXT: [[L_3:%.*]] = load <4 x i32>, ptr [[GEP_3]], align 16
654+
; CHECK-NEXT: [[RDX_NEXT_3]] = add <4 x i32> [[RDX_3]], [[L_3]]
655+
; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
656+
; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
657+
; CHECK: [[EXIT]]:
658+
; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi <4 x i32> [ [[RDX_NEXT_3]], %[[LOOP]] ]
659+
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[RDX_NEXT_1]], [[RDX_NEXT]]
660+
; CHECK-NEXT: [[BIN_RDX1:%.*]] = add <4 x i32> [[RDX_NEXT_2]], [[BIN_RDX]]
661+
; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[RDX_NEXT_3]], [[BIN_RDX1]]
662+
; CHECK-NEXT: ret <4 x i32> [[BIN_RDX2]]
663+
;
664+
entry:
665+
br label %loop
666+
667+
loop:
668+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
669+
%rdx = phi <4 x i32> [ %start, %entry ], [ %rdx.next, %loop ]
670+
%iv.next = add i64 %iv, 1
671+
%gep = getelementptr inbounds nuw <4 x i32>, ptr %p, i64 %iv
672+
%l = load <4 x i32>, ptr %gep, align 16
673+
%rdx.next = add <4 x i32> %rdx, %l
674+
%ec = icmp ne i64 %iv.next, 1000
675+
br i1 %ec, label %loop, label %exit
676+
677+
exit:
678+
ret <4 x i32> %rdx.next
679+
}

llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,72 @@ exit:
220220
ret i32 %res
221221
}
222222

223+
define <4 x i32> @test_vector_add_reduction(ptr %a, i64 %n) {
224+
; CHECK-LABEL: define <4 x i32> @test_vector_add_reduction(
225+
; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
226+
; CHECK-NEXT: [[ENTRY:.*]]:
227+
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
228+
; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1
229+
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
230+
; CHECK-NEXT: br i1 [[TMP1]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[ENTRY_NEW:.*]]
231+
; CHECK: [[ENTRY_NEW]]:
232+
; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
233+
; CHECK-NEXT: br label %[[LOOP:.*]]
234+
; CHECK: [[LOOP]]:
235+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ]
236+
; CHECK-NEXT: [[RDX_1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ]
237+
; CHECK-NEXT: [[RDX:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY_NEW]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
238+
; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ]
239+
; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[A]], i64 [[IV]]
240+
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[GEP_A]], align 16
241+
; CHECK-NEXT: [[RDX_NEXT]] = add <4 x i32> [[RDX]], [[TMP2]]
242+
; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
243+
; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[A]], i64 [[IV_NEXT]]
244+
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[GEP_A_1]], align 16
245+
; CHECK-NEXT: [[RDX_NEXT_1]] = add <4 x i32> [[RDX_1]], [[TMP3]]
246+
; CHECK-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
247+
; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2
248+
; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
249+
; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
250+
; CHECK: [[EXIT_UNR_LCSSA]]:
251+
; CHECK-NEXT: [[RES_PH:%.*]] = phi <4 x i32> [ [[RDX_NEXT_1]], %[[LOOP]] ]
252+
; CHECK-NEXT: [[IV_UNR:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ]
253+
; CHECK-NEXT: [[RDX_UNR:%.*]] = phi <4 x i32> [ [[RDX_NEXT_1]], %[[LOOP]] ]
254+
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[RDX_NEXT_1]], [[RDX_NEXT]]
255+
; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
256+
; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER]], label %[[EXIT:.*]]
257+
; CHECK: [[LOOP_EPIL_PREHEADER]]:
258+
; CHECK-NEXT: [[IV_EPIL_INIT:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR]], %[[EXIT_UNR_LCSSA]] ]
259+
; CHECK-NEXT: [[RDX_EPIL_INIT:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[BIN_RDX]], %[[EXIT_UNR_LCSSA]] ]
260+
; CHECK-NEXT: [[LCMP_MOD2:%.*]] = icmp ne i64 [[XTRAITER]], 0
261+
; CHECK-NEXT: call void @llvm.assume(i1 [[LCMP_MOD2]])
262+
; CHECK-NEXT: br label %[[LOOP_EPIL:.*]]
263+
; CHECK: [[LOOP_EPIL]]:
264+
; CHECK-NEXT: [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[A]], i64 [[IV_EPIL_INIT]]
265+
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[GEP_A_EPIL]], align 16
266+
; CHECK-NEXT: [[RDX_NEXT_EPIL:%.*]] = add <4 x i32> [[RDX_EPIL_INIT]], [[TMP4]]
267+
; CHECK-NEXT: br label %[[EXIT]]
268+
; CHECK: [[EXIT]]:
269+
; CHECK-NEXT: [[RES:%.*]] = phi <4 x i32> [ [[BIN_RDX]], %[[EXIT_UNR_LCSSA]] ], [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ]
270+
; CHECK-NEXT: ret <4 x i32> [[RES]]
271+
;
272+
entry:
273+
br label %loop
274+
275+
loop:
276+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
277+
%rdx = phi <4 x i32> [ zeroinitializer, %entry ], [ %rdx.next, %loop ]
278+
%gep.a = getelementptr inbounds nuw <4 x i32>, ptr %a, i64 %iv
279+
%1 = load <4 x i32>, ptr %gep.a, align 16
280+
%rdx.next = add <4 x i32> %rdx, %1
281+
%iv.next = add nuw nsw i64 %iv, 1
282+
%ec = icmp eq i64 %iv.next, %n
283+
br i1 %ec, label %exit, label %loop, !llvm.loop !0
284+
285+
exit:
286+
%res = phi <4 x i32> [ %rdx.next, %loop ]
287+
ret <4 x i32> %res
288+
}
223289

224290

225291
!0 = distinct !{!0, !1}
@@ -234,4 +300,5 @@ exit:
234300
; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
235301
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
236302
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
303+
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
237304
;.

0 commit comments

Comments
 (0)