diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 8b1b0e5c98103..10801c0119e62 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -113,12 +113,12 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
   return TypeSwitch<const VPRecipeBase *, bool>(R)
       .Case<VPDerivedIVRecipe>([](const auto *R) { return true; })
       .Case<VPReplicateRecipe>([](const auto *R) {
-        // Loads and stores that are uniform across VF lanes are handled by
-        // VPReplicateRecipe.IsUniform. They are also uniform across UF parts if
-        // all their operands are invariant.
-        // TODO: Further relax the restrictions.
+        // Be conservative about side-effects, except for the
+        // known-side-effecting assumes and stores, which we know will be
+        // uniform.
         return R->isSingleScalar() &&
-               (isa<LoadInst, StoreInst>(R->getUnderlyingValue())) &&
+               (!R->mayHaveSideEffects() ||
+                isa<AssumeInst, StoreInst>(R->getUnderlyingInstr())) &&
                all_of(R->operands(), isUniformAcrossVFsAndUFs);
       })
       .Case<VPInstruction>([](const auto *VPI) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
index 488098d1bdfe2..7f345133f51dd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
@@ -66,8 +66,9 @@ define void @replicating_load_used_as_store_addr_2(ptr noalias %invar.dst, ptr n
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; CHECK-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    br [[EXIT:label %.*]]
-; CHECK:       [[SCALAR_PH:.*:]]
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -107,15 +108,15 @@ define void @replicating_load_used_as_store_addr_3(ptr noalias %src, ptr noalias
 ; CHECK-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
 ; CHECK-NEXT:    store i8 0, ptr [[TMP7]], align 1
-; CHECK-NEXT:    store i8 0, ptr [[TMP7]], align 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP5]] to i8
 ; CHECK-NEXT:    store i8 [[TMP8]], ptr [[INVAR_DST]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; CHECK-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    br [[EXIT:label %.*]]
-; CHECK:       [[SCALAR_PH:.*:]]
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -466,21 +467,21 @@ define void @test_prefer_vector_addressing(ptr %start, ptr %ms, ptr noalias %src
 ; CHECK-NEXT:    [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr [[NEXT_GEP]], align 1, !tbaa [[LONG_LONG_TBAA14:![0-9]+]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i64, ptr [[NEXT_GEP3]], align 1, !tbaa [[LONG_LONG_TBAA14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[NEXT_GEP4]], align 1, !tbaa [[LONG_LONG_TBAA14]]
-; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[NEXT_GEP5]], align 1, !tbaa [[LONG_LONG_TBAA14]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr [[NEXT_GEP]], align 1, !tbaa [[LONG_LONG_TBAA12:![0-9]+]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i64, ptr [[NEXT_GEP3]], align 1, !tbaa [[LONG_LONG_TBAA12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[NEXT_GEP4]], align 1, !tbaa [[LONG_LONG_TBAA12]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[NEXT_GEP5]], align 1, !tbaa [[LONG_LONG_TBAA12]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP16]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
-; CHECK-NEXT:    store i32 0, ptr [[TMP18]], align 4, !tbaa [[INT_TBAA19:![0-9]+]]
-; CHECK-NEXT:    store i32 0, ptr [[TMP19]], align 4, !tbaa [[INT_TBAA19]]
-; CHECK-NEXT:    store i32 0, ptr [[TMP20]], align 4, !tbaa [[INT_TBAA19]]
-; CHECK-NEXT:    store i32 0, ptr [[TMP21]], align 4, !tbaa [[INT_TBAA19]]
+; CHECK-NEXT:    store i32 0, ptr [[TMP18]], align 4, !tbaa [[INT_TBAA17:![0-9]+]]
+; CHECK-NEXT:    store i32 0, ptr [[TMP19]], align 4, !tbaa [[INT_TBAA17]]
+; CHECK-NEXT:    store i32 0, ptr [[TMP20]], align 4, !tbaa [[INT_TBAA17]]
+; CHECK-NEXT:    store i32 0, ptr [[TMP21]], align 4, !tbaa [[INT_TBAA17]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
@@ -581,10 +582,11 @@ define double @test_scalarization_cost_for_load_of_address(ptr %src.0, ptr %src.
 ; CHECK-NEXT:    [[TMP20:%.*]] = fmul <2 x double> [[TMP9]], [[TMP19]]
 ; CHECK-NEXT:    [[TMP21]] = call double @llvm.vector.reduce.fadd.v2f64(double [[VEC_PHI]], <2 x double> [[TMP20]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    br [[EXIT:label %.*]]
-; CHECK:       [[SCALAR_PH:.*:]]
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret double [[TMP21]]
 ;
 entry:
   br label %loop
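
Illustrative sketch (hypothetical IR, not one of the tests in this patch): the
effect of the VPlanUtils.cpp change is visible in replicating_load_used_as_store_addr_3
above, where the second copy of "store i8 0, ptr [[TMP7]]" disappears because a
single-scalar store with operands that are uniform across VF and UF no longer
needs to be repeated per unrolled part. A minimal loop of that shape might look
like:

  define void @store_to_invariant_addr(ptr noalias %dst, ptr noalias %src) {
  entry:
    br label %loop

  loop:
    %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
    ; A loop-invariant value stored to a loop-invariant address: the replicated
    ; store is uniform across VF lanes and UF parts, so one copy suffices.
    %val = load i8, ptr %src, align 1
    store i8 %val, ptr %dst, align 1
    %iv.next = add i64 %iv, 1
    %ec = icmp eq i64 %iv.next, 100
    br i1 %ec, label %exit, label %loop

  exit:
    ret void
  }

All unrolled copies of such a store write the same value to the same address,
so keeping a single copy is sound; the same reasoning applies to replicated
assumes, which is why AssumeInst and StoreInst are carved out of the
mayHaveSideEffects bail-out.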