diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 83844287adc5f..9f0d6fcb237ef 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8395,7 +8395,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // ---------------------------------------------------------------------------
   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
                                 Builder, BlockMaskCache, LVer);
-  RecipeBuilder.collectScaledReductions(Range);
+  // TODO: Handle partial reductions with EVL tail folding.
+  if (!CM.foldTailWithEVL())
+    RecipeBuilder.collectScaledReductions(Range);
 
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
index 61e3a1848ceed..8d3026e63748a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
@@ -3,6 +3,7 @@
 ; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -prefer-predicate-over-epilogue=scalar-epilogue -S < %s | FileCheck %s --check-prefixes=CHECK,ZVQDOTQ
 ; RUN: opt -passes=loop-vectorize -mattr=+v -scalable-vectorization=off -prefer-predicate-over-epilogue=scalar-epilogue -S < %s | FileCheck %s --check-prefixes=FIXED,FIXED-V
 ; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -scalable-vectorization=off -prefer-predicate-over-epilogue=scalar-epilogue -S < %s | FileCheck %s --check-prefixes=FIXED,FIXED-ZVQDOTQ
+; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -S < %s | FileCheck %s --check-prefixes=CHECK,TAILFOLD
 
 ; TODO: Remove -prefer-predicate-over-epilogue=scalar-epilogue when partial reductions with EVL tail folding is supported.
 
@@ -147,6 +148,37 @@ define i32 @vqdot(ptr %a, ptr %b) #0 {
 ; FIXED-ZVQDOTQ:       for.exit:
 ; FIXED-ZVQDOTQ-NEXT:    ret i32 [[TMP13]]
 ;
+; TAILFOLD-LABEL: define i32 @vqdot(
+; TAILFOLD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; TAILFOLD-NEXT:  entry:
+; TAILFOLD-NEXT:    br label [[VECTOR_PH:%.*]]
+; TAILFOLD:       vector.ph:
+; TAILFOLD-NEXT:    br label [[VECTOR_BODY:%.*]]
+; TAILFOLD:       vector.body:
+; TAILFOLD-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; TAILFOLD-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP2:%.*]] = sext <vscale x 4 x i8> [[VP_OP_LOAD]] to <vscale x 4 x i32>
+; TAILFOLD-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP4:%.*]] = sext <vscale x 4 x i8> [[VP_OP_LOAD1]] to <vscale x 4 x i32>
+; TAILFOLD-NEXT:    [[TMP5:%.*]] = mul <vscale x 4 x i32> [[TMP4]], [[TMP2]]
+; TAILFOLD-NEXT:    [[TMP6:%.*]] = add <vscale x 4 x i32> [[TMP5]], [[VEC_PHI]]
+; TAILFOLD-NEXT:    [[TMP7]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP0]] to i64
+; TAILFOLD-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
+; TAILFOLD-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; TAILFOLD-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; TAILFOLD:       middle.block:
+; TAILFOLD-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]])
+; TAILFOLD-NEXT:    br label [[FOR_EXIT:%.*]]
+; TAILFOLD:       for.exit:
+; TAILFOLD-NEXT:    ret i32 [[TMP10]]
+;
 entry:
   br label %for.body
 
@@ -309,6 +341,37 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 {
 ; FIXED-ZVQDOTQ:       for.exit:
 ; FIXED-ZVQDOTQ-NEXT:    ret i32 [[TMP13]]
 ;
+; TAILFOLD-LABEL: define i32 @vqdotu(
+; TAILFOLD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; TAILFOLD-NEXT:  entry:
+; TAILFOLD-NEXT:    br label [[VECTOR_PH:%.*]]
+; TAILFOLD:       vector.ph:
+; TAILFOLD-NEXT:    br label [[VECTOR_BODY:%.*]]
+; TAILFOLD:       vector.body:
+; TAILFOLD-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; TAILFOLD-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP2:%.*]] = zext <vscale x 4 x i8> [[VP_OP_LOAD]] to <vscale x 4 x i32>
+; TAILFOLD-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP4:%.*]] = zext <vscale x 4 x i8> [[VP_OP_LOAD1]] to <vscale x 4 x i32>
+; TAILFOLD-NEXT:    [[TMP5:%.*]] = mul <vscale x 4 x i32> [[TMP4]], [[TMP2]]
+; TAILFOLD-NEXT:    [[TMP6:%.*]] = add <vscale x 4 x i32> [[TMP5]], [[VEC_PHI]]
+; TAILFOLD-NEXT:    [[TMP7]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP0]] to i64
+; TAILFOLD-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
+; TAILFOLD-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; TAILFOLD-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; TAILFOLD:       middle.block:
+; TAILFOLD-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]])
+; TAILFOLD-NEXT:    br label [[FOR_EXIT:%.*]]
+; TAILFOLD:       for.exit:
+; TAILFOLD-NEXT:    ret i32 [[TMP10]]
+;
 entry:
   br label %for.body
 
@@ -471,6 +534,37 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 {
 ; FIXED-ZVQDOTQ:       for.exit:
 ; FIXED-ZVQDOTQ-NEXT:    ret i32 [[TMP13]]
 ;
+; TAILFOLD-LABEL: define i32 @vqdotsu(
+; TAILFOLD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; TAILFOLD-NEXT:  entry:
+; TAILFOLD-NEXT:    br label [[VECTOR_PH:%.*]]
+; TAILFOLD:       vector.ph:
+; TAILFOLD-NEXT:    br label [[VECTOR_BODY:%.*]]
+; TAILFOLD:       vector.body:
+; TAILFOLD-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; TAILFOLD-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP2:%.*]] = zext <vscale x 4 x i8> [[VP_OP_LOAD]] to <vscale x 4 x i32>
+; TAILFOLD-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP4:%.*]] = sext <vscale x 4 x i8> [[VP_OP_LOAD1]] to <vscale x 4 x i32>
+; TAILFOLD-NEXT:    [[TMP5:%.*]] = mul <vscale x 4 x i32> [[TMP4]], [[TMP2]]
+; TAILFOLD-NEXT:    [[TMP6:%.*]] = add <vscale x 4 x i32> [[TMP5]], [[VEC_PHI]]
+; TAILFOLD-NEXT:    [[TMP7]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP0]] to i64
+; TAILFOLD-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
+; TAILFOLD-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; TAILFOLD-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; TAILFOLD:       middle.block:
+; TAILFOLD-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]])
+; TAILFOLD-NEXT:    br label [[FOR_EXIT:%.*]]
+; TAILFOLD:       for.exit:
+; TAILFOLD-NEXT:    ret i32 [[TMP10]]
+;
 entry:
   br label %for.body
 
@@ -632,6 +726,37 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 {
 ; FIXED-ZVQDOTQ:       for.exit:
 ; FIXED-ZVQDOTQ-NEXT:    ret i32 [[TMP13]]
 ;
+; TAILFOLD-LABEL: define i32 @vqdotsu2(
+; TAILFOLD-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; TAILFOLD-NEXT:  entry:
+; TAILFOLD-NEXT:    br label [[VECTOR_PH:%.*]]
+; TAILFOLD:       vector.ph:
+; TAILFOLD-NEXT:    br label [[VECTOR_BODY:%.*]]
+; TAILFOLD:       vector.body:
+; TAILFOLD-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; TAILFOLD-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; TAILFOLD-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP2:%.*]] = sext <vscale x 4 x i8> [[VP_OP_LOAD]] to <vscale x 4 x i32>
+; TAILFOLD-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP4:%.*]] = zext <vscale x 4 x i8> [[VP_OP_LOAD1]] to <vscale x 4 x i32>
+; TAILFOLD-NEXT:    [[TMP5:%.*]] = mul <vscale x 4 x i32> [[TMP4]], [[TMP2]]
+; TAILFOLD-NEXT:    [[TMP6:%.*]] = add <vscale x 4 x i32> [[TMP5]], [[VEC_PHI]]
+; TAILFOLD-NEXT:    [[TMP7]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP0]])
+; TAILFOLD-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP0]] to i64
+; TAILFOLD-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]]
+; TAILFOLD-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
+; TAILFOLD-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; TAILFOLD-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; TAILFOLD:       middle.block:
+; TAILFOLD-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]])
+; TAILFOLD-NEXT:    br label [[FOR_EXIT:%.*]]
+; TAILFOLD:       for.exit:
+; TAILFOLD-NEXT:    ret i32 [[TMP10]]
+;
 entry:
   br label %for.body