Skip to content

Commit 4348cd4

Browse files
committed
[LV] Drop integer poison-generating flags from instructions that need predication
This patch fixes PR52111. The problem is that LV propagates poison-generating flags (`nuw`/`nsw`, `exact` and `inbounds`) on instructions that contribute to the address computation of widened loads/stores that are guarded by a condition. When the code is vectorized and the control flow within the loop is linearized, these flags may produce a poison value that is then effectively used as the base address of the widened load/store. The fix drops all the integer poison-generating flags from instructions that contribute to the address computation of a widened load/store whose original instruction was in a basic block that needed predication and that is not predicated after vectorization. Reviewed By: fhahn, spatel, nlopes. Differential Revision: https://reviews.llvm.org/D111846
1 parent 789c88e commit 4348cd4

15 files changed

+618
-494
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 147 additions & 29 deletions
Large diffs are not rendered by default.

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ class Value;
5959
class VPBasicBlock;
6060
class VPRegionBlock;
6161
class VPlan;
62+
class VPReplicateRecipe;
6263
class VPlanSlp;
6364

6465
/// Returns a calculation for the total number of elements for a given \p VF.
@@ -346,6 +347,10 @@ struct VPTransformState {
346347

347348
/// Pointer to the VPlan code is generated for.
348349
VPlan *Plan;
350+
351+
/// Holds recipes that may generate a poison value that is used after
352+
/// vectorization, even when their operands are not poison.
353+
SmallPtrSet<VPRecipeBase *, 16> MayGeneratePoisonRecipes;
349354
};
350355

351356
/// VPUsers instance used by VPBlockBase to manage CondBit and the block
@@ -1511,7 +1516,7 @@ class VPPredInstPHIRecipe : public VPRecipeBase, public VPValue {
15111516
/// - For store: Address, stored value, optional mask
15121517
/// TODO: We currently execute only per-part unless a specific instance is
15131518
/// provided.
1514-
class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
1519+
class VPWidenMemoryInstructionRecipe : public VPRecipeBase, public VPValue {
15151520
Instruction &Ingredient;
15161521

15171522
// Whether the loaded-from / stored-to addresses are consecutive.
@@ -1533,17 +1538,18 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
15331538
public:
15341539
VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
15351540
bool Consecutive, bool Reverse)
1536-
: VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load),
1541+
: VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}),
1542+
VPValue(VPValue::VPVMemoryInstructionSC, &Load, this), Ingredient(Load),
15371543
Consecutive(Consecutive), Reverse(Reverse) {
15381544
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
1539-
new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this);
15401545
setMask(Mask);
15411546
}
15421547

15431548
VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr,
15441549
VPValue *StoredValue, VPValue *Mask,
15451550
bool Consecutive, bool Reverse)
15461551
: VPRecipeBase(VPWidenMemoryInstructionSC, {Addr, StoredValue}),
1552+
VPValue(VPValue::VPVMemoryInstructionSC, &Store, this),
15471553
Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) {
15481554
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
15491555
setMask(Mask);

llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ define void @mloadstore_f32(float* noalias nocapture %a, float* noalias nocaptur
55
; CHECK: vector.body:
66
; CHECK: %[[LOAD1:.*]] = load <vscale x 4 x float>, <vscale x 4 x float>*
77
; CHECK-NEXT: %[[MASK:.*]] = fcmp ogt <vscale x 4 x float> %[[LOAD1]],
8-
; CHECK-NEXT: %[[GEPA:.*]] = getelementptr inbounds float, float* %a,
8+
; CHECK-NEXT: %[[GEPA:.*]] = getelementptr float, float* %a,
99
; CHECK-NEXT: %[[MLOAD_PTRS:.*]] = bitcast float* %[[GEPA]] to <vscale x 4 x float>*
1010
; CHECK-NEXT: %[[LOAD2:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* %[[MLOAD_PTRS]], i32 4, <vscale x 4 x i1> %[[MASK]]
1111
; CHECK-NEXT: %[[FADD:.*]] = fadd <vscale x 4 x float> %[[LOAD1]], %[[LOAD2]]
@@ -42,7 +42,7 @@ define void @mloadstore_i32(i32* noalias nocapture %a, i32* noalias nocapture re
4242
; CHECK: vector.body:
4343
; CHECK: %[[LOAD1:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>*
4444
; CHECK-NEXT: %[[MASK:.*]] = icmp ne <vscale x 4 x i32> %[[LOAD1]],
45-
; CHECK-NEXT: %[[GEPA:.*]] = getelementptr inbounds i32, i32* %a,
45+
; CHECK-NEXT: %[[GEPA:.*]] = getelementptr i32, i32* %a,
4646
; CHECK-NEXT: %[[MLOAD_PTRS:.*]] = bitcast i32* %[[GEPA]] to <vscale x 4 x i32>*
4747
; CHECK-NEXT: %[[LOAD2:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* %[[MLOAD_PTRS]], i32 4, <vscale x 4 x i1> %[[MASK]]
4848
; CHECK-NEXT: %[[FADD:.*]] = add <vscale x 4 x i32> %[[LOAD1]], %[[LOAD2]]

llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; This is the loop in c++ being vectorize in this file with
1+
; This is the loop in c++ being vectorize in this file with
22
; experimental.vector.reverse
33

44
;#pragma clang loop vectorize_width(4, scalable)
@@ -18,7 +18,7 @@ target triple = "aarch64-unknown-linux-gnu"
1818
define void @vector_reverse_mask_nxv4i1(double* %a, double* %cond, i64 %N) #0 {
1919
; CHECK-LABEL: vector.body:
2020
; CHECK: %[[REVERSE6:.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
21-
; CHECK: %[[WIDEMSKLOAD:.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0nxv4f64(<vscale x 4 x double>* nonnull %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE6]], <vscale x 4 x double> poison)
21+
; CHECK: %[[WIDEMSKLOAD:.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0nxv4f64(<vscale x 4 x double>* %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE6]], <vscale x 4 x double> poison)
2222
; CHECK-NEXT: %[[FADD:.*]] = fadd <vscale x 4 x double> %[[WIDEMSKLOAD]]
2323
; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
2424
; CHECK: call void @llvm.masked.store.nxv4f64.p0nxv4f64(<vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE9]]

llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,16 +51,16 @@ define void @vector_reverse_mask_v4i1(double* %a, double* %cond, i64 %N) #0 {
5151
; CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x double> [[WIDE_LOAD6]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
5252
; CHECK-NEXT: [[TMP8:%.*]] = fcmp une <4 x double> [[REVERSE]], zeroinitializer
5353
; CHECK-NEXT: [[TMP9:%.*]] = fcmp une <4 x double> [[REVERSE7]], zeroinitializer
54-
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
55-
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[TMP10]], i64 -3
54+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr double, double* [[A]], i64 [[TMP1]]
55+
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, double* [[TMP10]], i64 -3
5656
; CHECK-NEXT: [[REVERSE8:%.*]] = shufflevector <4 x i1> [[TMP8]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
5757
; CHECK-NEXT: [[TMP12:%.*]] = bitcast double* [[TMP11]] to <4 x double>*
58-
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP12]], i32 8, <4 x i1> [[REVERSE8]], <4 x double> poison), !alias.scope !3, !noalias !0
59-
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, double* [[TMP10]], i64 -4
60-
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP13]], i64 -3
58+
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP12]], i32 8, <4 x i1> [[REVERSE8]], <4 x double> poison), !alias.scope !3, !noalias !0
59+
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP10]], i64 -4
60+
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, double* [[TMP13]], i64 -3
6161
; CHECK-NEXT: [[REVERSE10:%.*]] = shufflevector <4 x i1> [[TMP9]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
6262
; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>*
63-
; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP15]], i32 8, <4 x i1> [[REVERSE10]], <4 x double> poison), !alias.scope !3, !noalias !0
63+
; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP15]], i32 8, <4 x i1> [[REVERSE10]], <4 x double> poison), !alias.scope !3, !noalias !0
6464
; CHECK-NEXT: [[TMP16:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
6565
; CHECK-NEXT: [[TMP17:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD11]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
6666
; CHECK-NEXT: [[TMP18:%.*]] = bitcast double* [[TMP11]] to <4 x double>*

llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,10 @@ define void @drop_scalar_nuw_nsw(float* noalias nocapture readonly %input,
2626
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
2727
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
2828
; CHECK: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
29-
; CHECK-NEXT: [[TMP5:%.*]] = sub nuw nsw i64 [[TMP0]], 1
30-
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[INPUT:%.*]], i64 [[TMP5]]
29+
; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP0]], 1
30+
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[INPUT:%.*]], i64 [[TMP5]]
3131
; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
32-
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
32+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, float* [[TMP6]], i32 0
3333
; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
3434
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP9]], i32 4, <4 x i1> [[TMP7]], <4 x float> poison), !invariant.load !0
3535
entry:
@@ -67,20 +67,20 @@ define void @drop_nonpred_scalar_nuw_nsw(float* noalias nocapture readonly %inpu
6767
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ]
6868
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
6969
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
70-
; CHECK: [[TMP5:%.*]] = sub nuw nsw i64 [[TMP0]], 1
71-
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[INPUT:%.*]], i64 [[TMP5]]
70+
; CHECK: [[TMP5:%.*]] = sub i64 [[TMP0]], 1
71+
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[INPUT:%.*]], i64 [[TMP5]]
7272
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
7373
; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
74-
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
74+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, float* [[TMP6]], i32 0
7575
; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
7676
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP9]], i32 4, <4 x i1> [[TMP7]], <4 x float> poison), !invariant.load !0
7777
entry:
7878
br label %loop.header
7979

8080
loop.header:
8181
%iv = phi i64 [ 0, %entry ], [ %iv.inc, %if.end ]
82-
%i27 = sub nuw nsw i64 %iv, 1
83-
%i29 = getelementptr inbounds float, float* %input, i64 %i27
82+
%i27 = sub i64 %iv, 1
83+
%i29 = getelementptr float, float* %input, i64 %i27
8484
%i23 = icmp eq i64 %iv, 0
8585
br i1 %i23, label %if.end, label %if.then
8686

@@ -151,8 +151,8 @@ define void @drop_vector_nuw_nsw(float* noalias nocapture readonly %input,
151151
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
152152
; CHECK: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
153153
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float*, float** [[PTRS:%.*]], i64 [[TMP0]]
154-
; CHECK-NEXT: [[TMP6:%.*]] = sub nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
155-
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[INPUT:%.*]], <4 x i64> [[TMP6]]
154+
; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
155+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr float, float* [[INPUT:%.*]], <4 x i64> [[TMP6]]
156156
; CHECK: [[TMP10:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
157157
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float*> [[TMP7]], i32 0
158158
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, float* [[TMP11]], i32 0
@@ -238,10 +238,10 @@ define void @drop_scalar_exact(float* noalias nocapture readonly %input,
238238
; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
239239
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], zeroinitializer
240240
; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i1> [[TMP4]], [[TMP6]]
241-
; CHECK-NEXT: [[TMP8:%.*]] = sdiv exact i64 [[TMP0]], 1
242-
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[INPUT:%.*]], i64 [[TMP8]]
241+
; CHECK-NEXT: [[TMP8:%.*]] = sdiv i64 [[TMP0]], 1
242+
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, float* [[INPUT:%.*]], i64 [[TMP8]]
243243
; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
244-
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0
244+
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP9]], i32 0
245245
; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <4 x float>*
246246
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP12]], i32 4, <4 x i1> [[TMP10]], <4 x float> poison), !invariant.load !0
247247
entry:

0 commit comments

Comments
 (0)