686 changes: 316 additions & 370 deletions llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll

Large diffs are not rendered by default.

597 changes: 246 additions & 351 deletions llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll

Large diffs are not rendered by default.

486 changes: 240 additions & 246 deletions llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll

Large diffs are not rendered by default.

156 changes: 140 additions & 16 deletions llvm/test/Transforms/LoopVectorize/X86/optsize.ll
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,48 @@ define i32 @foo_optsize() #0 {
; CHECK: for.end:
; CHECK-NEXT: ret i32 0
;
; AUTOVF-LABEL: @foo_optsize(
; AUTOVF-NEXT: entry:
; AUTOVF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; AUTOVF: vector.ph:
; AUTOVF-NEXT: br label [[VECTOR_BODY:%.*]]
; AUTOVF: vector.body:
; AUTOVF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AUTOVF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i32> undef, i32 [[INDEX]], i32 0
; AUTOVF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i32> [[BROADCAST_SPLATINSERT]], <32 x i32> undef, <32 x i32> zeroinitializer
; AUTOVF-NEXT: [[INDUCTION:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; AUTOVF-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; AUTOVF-NEXT: [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
; AUTOVF-NEXT: [[TMP2:%.*]] = icmp ule <32 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
; AUTOVF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
; AUTOVF-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* [[TMP4]], i32 1, <32 x i1> [[TMP2]], <32 x i8> undef)
; AUTOVF-NEXT: [[TMP5:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
; AUTOVF-NEXT: [[TMP6:%.*]] = extractelement <32 x i1> [[TMP5]], i32 0
; AUTOVF-NEXT: [[TMP7:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AUTOVF-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> [[TMP7]], <32 x i8>* [[TMP8]], i32 1, <32 x i1> [[TMP2]])
; AUTOVF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 32
; AUTOVF-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224
; AUTOVF-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; AUTOVF: middle.block:
; AUTOVF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AUTOVF: scalar.ph:
; AUTOVF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 224, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; AUTOVF-NEXT: br label [[FOR_BODY:%.*]]
; AUTOVF: for.body:
; AUTOVF-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
; AUTOVF-NEXT: [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; AUTOVF-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0
; AUTOVF-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
; AUTOVF-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
; AUTOVF-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
; AUTOVF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
; AUTOVF-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2
; AUTOVF: for.end:
; AUTOVF-NEXT: ret i32 0
;

entry:
br label %for.body
Expand Down Expand Up @@ -116,6 +158,48 @@ define i32 @foo_minsize() #1 {
; CHECK: for.end:
; CHECK-NEXT: ret i32 0
;
; AUTOVF-LABEL: @foo_minsize(
; AUTOVF-NEXT: entry:
; AUTOVF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; AUTOVF: vector.ph:
; AUTOVF-NEXT: br label [[VECTOR_BODY:%.*]]
; AUTOVF: vector.body:
; AUTOVF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AUTOVF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i32> undef, i32 [[INDEX]], i32 0
; AUTOVF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i32> [[BROADCAST_SPLATINSERT]], <32 x i32> undef, <32 x i32> zeroinitializer
; AUTOVF-NEXT: [[INDUCTION:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; AUTOVF-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; AUTOVF-NEXT: [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
; AUTOVF-NEXT: [[TMP2:%.*]] = icmp ule <32 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
; AUTOVF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
; AUTOVF-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* [[TMP4]], i32 1, <32 x i1> [[TMP2]], <32 x i8> undef)
; AUTOVF-NEXT: [[TMP5:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
; AUTOVF-NEXT: [[TMP6:%.*]] = extractelement <32 x i1> [[TMP5]], i32 0
; AUTOVF-NEXT: [[TMP7:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AUTOVF-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> [[TMP7]], <32 x i8>* [[TMP8]], i32 1, <32 x i1> [[TMP2]])
; AUTOVF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 32
; AUTOVF-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224
; AUTOVF-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
; AUTOVF: middle.block:
; AUTOVF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AUTOVF: scalar.ph:
; AUTOVF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 224, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; AUTOVF-NEXT: br label [[FOR_BODY:%.*]]
; AUTOVF: for.body:
; AUTOVF-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
; AUTOVF-NEXT: [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; AUTOVF-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0
; AUTOVF-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
; AUTOVF-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
; AUTOVF-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
; AUTOVF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
; AUTOVF-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !5
; AUTOVF: for.end:
; AUTOVF-NEXT: ret i32 0
;

entry:
br label %for.body
Expand All @@ -140,15 +224,39 @@ attributes #1 = { minsize }

; We can't vectorize this one because we version for stride==1; even having TC
; a multiple of VF.
; CHECK-LABEL: @scev4stride1
; CHECK-NOT: vector.scevcheck
; CHECK-NOT: vector.body:
; CHECK-LABEL: for.body:
; AUTOVF-LABEL: @scev4stride1
; AUTOVF-NOT: vector.scevcheck
; AUTOVF-NOT: vector.body:
; AUTOVF-LABEL: for.body:
define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 {
; CHECK-LABEL: @scev4stride1(
; CHECK-NEXT: for.body.preheader:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K:%.*]]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[MUL]]
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_07]]
; CHECK-NEXT: store i32 [[TMP0]], i32* [[ARRAYIDX1]], align 4
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 256
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
; CHECK: for.end.loopexit:
; CHECK-NEXT: ret void
;
; AUTOVF-LABEL: @scev4stride1(
; AUTOVF-NEXT: for.body.preheader:
; AUTOVF-NEXT: br label [[FOR_BODY:%.*]]
; AUTOVF: for.body:
; AUTOVF-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
; AUTOVF-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K:%.*]]
; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[MUL]]
; AUTOVF-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AUTOVF-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_07]]
; AUTOVF-NEXT: store i32 [[TMP0]], i32* [[ARRAYIDX1]], align 4
; AUTOVF-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1
; AUTOVF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 256
; AUTOVF-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
; AUTOVF: for.end.loopexit:
; AUTOVF-NEXT: ret void
;
for.body.preheader:
br label %for.body

Expand All @@ -174,15 +282,31 @@ attributes #2 = { optsize }
; We can't vectorize this one because we version for overflow check and tiny
; trip count leads to opt-for-size (which otherwise could fold the tail by
; masking).
; CHECK-LABEL: @main
; CHECK-NOT: vector.scevcheck
; CHECK-NOT: vector.body:
; CHECK-LABEL: for.cond:
; AUTOVF-LABEL: @main
; AUTOVF-NOT: vector.scevcheck
; AUTOVF-NOT: vector.body:
; AUTOVF-LABEL: for.cond:
define i32 @main() local_unnamed_addr {
; CHECK-LABEL: @main(
; CHECK-NEXT: while.cond:
; CHECK-NEXT: br label [[FOR_COND:%.*]]
; CHECK: for.cond:
; CHECK-NEXT: [[D_0:%.*]] = phi i32 [ 0, [[WHILE_COND:%.*]] ], [ [[ADD:%.*]], [[FOR_COND]] ]
; CHECK-NEXT: [[CONV:%.*]] = and i32 [[D_0]], 65535
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[CONV]], 4
; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[CONV]], 1
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND]], label [[WHILE_COND_LOOPEXIT:%.*]]
; CHECK: while.cond.loopexit:
; CHECK-NEXT: ret i32 0
;
; AUTOVF-LABEL: @main(
; AUTOVF-NEXT: while.cond:
; AUTOVF-NEXT: br label [[FOR_COND:%.*]]
; AUTOVF: for.cond:
; AUTOVF-NEXT: [[D_0:%.*]] = phi i32 [ 0, [[WHILE_COND:%.*]] ], [ [[ADD:%.*]], [[FOR_COND]] ]
; AUTOVF-NEXT: [[CONV:%.*]] = and i32 [[D_0]], 65535
; AUTOVF-NEXT: [[CMP:%.*]] = icmp ult i32 [[CONV]], 4
; AUTOVF-NEXT: [[ADD]] = add nuw nsw i32 [[CONV]], 1
; AUTOVF-NEXT: br i1 [[CMP]], label [[FOR_COND]], label [[WHILE_COND_LOOPEXIT:%.*]]
; AUTOVF: while.cond.loopexit:
; AUTOVF-NEXT: ret i32 0
;
while.cond:
br label %for.cond

Expand Down
20 changes: 8 additions & 12 deletions llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
Original file line number Diff line number Diff line change
Expand Up @@ -40,16 +40,16 @@ define i32 @main() local_unnamed_addr #0 {
; CHECK-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[UMAX:%.*]] = select i1 [[TMP6]], i32 [[TMP2]], i32 [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = sub i32 [[TMP5]], [[UMAX]]
; CHECK-NEXT: [[UMIN:%.*]] = select i1 [[TMP6]], i32 [[TMP2]], i32 [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = sub i32 [[TMP5]], [[UMIN]]
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP7]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[TMP8:%.*]] = add i8 [[CONV3]], -1
; CHECK-NEXT: [[TMP9:%.*]] = zext i8 [[TMP8]] to i32
; CHECK-NEXT: [[TMP10:%.*]] = icmp ult i32 [[TMP2]], [[TMP9]]
; CHECK-NEXT: [[UMAX1:%.*]] = select i1 [[TMP10]], i32 [[TMP2]], i32 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[UMAX1]]
; CHECK-NEXT: [[UMIN1:%.*]] = select i1 [[TMP10]], i32 [[TMP2]], i32 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[UMIN1]]
; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8
; CHECK-NEXT: [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 1, i8 [[TMP12]])
; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0
Expand Down Expand Up @@ -77,10 +77,6 @@ define i32 @main() local_unnamed_addr #0 {
; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[INDEX]] to i8
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i8 [[CONV3]], [[TMP23]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> undef, i8 [[OFFSET_IDX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i8> [[BROADCAST_SPLAT]], <i8 0, i8 -1, i8 -2, i8 -3>
; CHECK-NEXT: [[INDUCTION3:%.*]] = add <4 x i8> [[BROADCAST_SPLAT]], <i8 -4, i8 -5, i8 -6, i8 -7>
; CHECK-NEXT: [[TMP24:%.*]] = add i8 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP25:%.*]] = add i8 [[OFFSET_IDX]], -4
; CHECK-NEXT: [[TMP26]] = add <4 x i32> [[VEC_PHI]], <i32 1, i32 1, i32 1, i32 1>
Expand All @@ -95,10 +91,10 @@ define i32 @main() local_unnamed_addr #0 {
; CHECK: middle.block:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP27]], [[TMP26]]
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <4 x i32> [[BIN_RDX4]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <4 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[BIN_RDX6]], i32 0
; CHECK-NEXT: [[BIN_RDX3:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <4 x i32> [[BIN_RDX3]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX5:%.*]] = add <4 x i32> [[BIN_RDX3]], [[RDX_SHUF4]]
; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[BIN_RDX5]], i32 0
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP7]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
Expand Down
3 changes: 0 additions & 3 deletions llvm/test/Transforms/LoopVectorize/X86/pr36524.ll
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@ define void @foo() {
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 3
; CHECK-NEXT: [[OFFSET_IDX1:%.*]] = add i64 2, [[INDEX]]
; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[OFFSET_IDX1]] to i32
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP11]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 0
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
Expand Down
376 changes: 323 additions & 53 deletions llvm/test/Transforms/LoopVectorize/X86/small-size.ll

Large diffs are not rendered by default.

3 changes: 0 additions & 3 deletions llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,6 @@ define i32 @matrix_row_col([100 x i32]* nocapture readonly %data, i32 %i, i32 %j
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
Expand Down
30 changes: 20 additions & 10 deletions llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -24,21 +24,18 @@ define void @vectorized(float* noalias nocapture %A, float* noalias nocapture re
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !0
; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4
; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !1
Expand Down Expand Up @@ -112,13 +109,26 @@ define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture r
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP2]], <8 x float> undef), !llvm.access.group !6
; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP2]])
; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP2]]), !llvm.access.group !6
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !7
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !6
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP11]], [[TMP12]]
; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !9
; CHECK: for.end:
; CHECK-NEXT: ret void
;
Expand Down Expand Up @@ -163,14 +173,14 @@ define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture r
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !6
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !6
; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4
; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !6
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
Expand Down
1,921 changes: 1,730 additions & 191 deletions llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions llvm/test/Transforms/LoopVectorize/fcmp-vectorize.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ define void @test1() {
; CHECK-LABEL: test1(
; CHECK-LABEL: vector.body:
; CHECK-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
; CHECK: %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
; CHECK: %index.next = add i32 %index, 4

entry:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,22 @@

define void @can_sink_after_store(i32 %x, i32* %ptr, i64 %tc) local_unnamed_addr #0 {
; CHECK-LABEL: vector.ph:
; CHECK: %broadcast.splatinsert1 = insertelement <4 x i32> undef, i32 %x, i32 0
; CHECK-NEXT: %broadcast.splat2 = shufflevector <4 x i32> %broadcast.splatinsert1, <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK: %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3
; CHECK-NEXT: br label %vector.body

; CHECK-LABEL: vector.body:
; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK-NEXT: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ]
; CHECK-NEXT: %offset.idx = add i64 1, %index
; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %offset.idx, i32 0
; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: %0 = add i64 %offset.idx, 0
; CHECK-NEXT: %1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %0
; CHECK-NEXT: %2 = getelementptr inbounds i32, i32* %1, i32 0
; CHECK-NEXT: %3 = bitcast i32* %2 to <4 x i32>*
; CHECK-NEXT: %wide.load = load <4 x i32>, <4 x i32>* %3, align 4
; CHECK-NEXT: %4 = shufflevector <4 x i32> %vector.recur, <4 x i32> %wide.load, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: %5 = add <4 x i32> %4, %broadcast.splat2
; CHECK-NEXT: %5 = add <4 x i32> %4, %broadcast.splat
; CHECK-NEXT: %6 = add <4 x i32> %5, %wide.load
; CHECK-NEXT: %7 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %0
; CHECK-NEXT: %8 = getelementptr inbounds i32, i32* %7, i32 0
Expand Down Expand Up @@ -65,25 +62,22 @@ exit:
; and not introduce traps on additional paths.
define void @sink_sdiv(i32 %x, i32* %ptr, i64 %tc) local_unnamed_addr #0 {
; CHECK-LABEL: vector.ph:
; CHECK: %broadcast.splatinsert1 = insertelement <4 x i32> undef, i32 %x, i32 0
; CHECK-NEXT: %broadcast.splat2 = shufflevector <4 x i32> %broadcast.splatinsert1, <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK: %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3
; CHECK-NEXT: br label %vector.body

; CHECK-LABEL: vector.body:
; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK-NEXT: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ]
; CHECK-NEXT: %offset.idx = add i64 1, %index
; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %offset.idx, i32 0
; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: %0 = add i64 %offset.idx, 0
; CHECK-NEXT: %1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %0
; CHECK-NEXT: %2 = getelementptr inbounds i32, i32* %1, i32 0
; CHECK-NEXT: %3 = bitcast i32* %2 to <4 x i32>*
; CHECK-NEXT: %wide.load = load <4 x i32>, <4 x i32>* %3, align 4
; CHECK-NEXT: %4 = shufflevector <4 x i32> %vector.recur, <4 x i32> %wide.load, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: %5 = sdiv <4 x i32> %4, %broadcast.splat2
; CHECK-NEXT: %5 = sdiv <4 x i32> %4, %broadcast.splat
; CHECK-NEXT: %6 = add <4 x i32> %5, %wide.load
; CHECK-NEXT: %7 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %0
; CHECK-NEXT: %8 = getelementptr inbounds i32, i32* %7, i32 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,6 @@ define float @minloopattr(float* nocapture readonly %arg) #0 {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[OFFSET_IDX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, float* [[ARG]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, float* [[TMP1]], i32 0
Expand Down
27 changes: 9 additions & 18 deletions llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,6 @@ define i32 @test(i32* nocapture %f) #0 {
; VEC-NEXT: br label [[VECTOR_BODY:%.*]]
; VEC: vector.body:
; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
; VEC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> undef, i64 [[INDEX]], i32 0
; VEC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> undef, <2 x i32> zeroinitializer
; VEC-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1>
; VEC-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[F:%.*]], i64 [[TMP0]]
; VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
Expand Down Expand Up @@ -458,25 +455,19 @@ define void @minimal_bit_widths(i1 %c) {
;
; VEC-LABEL: @minimal_bit_widths(
; VEC-NEXT: entry:
; VEC-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <2 x i1> undef, i1 [[C:%.*]], i32 0
; VEC-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT5]], <2 x i1> undef, <2 x i32> zeroinitializer
; VEC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> undef, i1 [[C:%.*]], i32 0
; VEC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> undef, <2 x i32> zeroinitializer
; VEC-NEXT: br label [[VECTOR_BODY:%.*]]
; VEC: vector.body:
; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
; VEC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> undef, i64 [[INDEX]], i32 0
; VEC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> undef, <2 x i32> zeroinitializer
; VEC-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1>
; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
; VEC-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; VEC-NEXT: [[OFFSET_IDX:%.*]] = sub i64 undef, [[INDEX]]
; VEC-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> undef, i64 [[OFFSET_IDX]], i32 0
; VEC-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> undef, <2 x i32> zeroinitializer
; VEC-NEXT: [[INDUCTION4:%.*]] = add <2 x i64> [[BROADCAST_SPLAT3]], <i64 0, i64 -1>
; VEC-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0
; VEC-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]]
; VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP2]], i32 0
; VEC-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <2 x i8>*
; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, <2 x i8>* [[TMP4]], align 1
; VEC-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT6]], i32 0
; VEC-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
; VEC-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; VEC: pred.store.if:
; VEC-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0
Expand All @@ -485,17 +476,17 @@ define void @minimal_bit_widths(i1 %c) {
; VEC-NEXT: store i8 [[TMP8]], i8* [[TMP2]], align 1
; VEC-NEXT: br label [[PRED_STORE_CONTINUE]]
; VEC: pred.store.continue:
; VEC-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT6]], i32 1
; VEC-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]]
; VEC: pred.store.if7:
; VEC-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
; VEC-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
; VEC: pred.store.if2:
; VEC-NEXT: [[TMP10:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1
; VEC-NEXT: [[TMP11:%.*]] = zext i8 [[TMP10]] to i32
; VEC-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8
; VEC-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1
; VEC-NEXT: [[TMP14:%.*]] = getelementptr i8, i8* undef, i64 [[TMP13]]
; VEC-NEXT: store i8 [[TMP12]], i8* [[TMP14]], align 1
; VEC-NEXT: br label [[PRED_STORE_CONTINUE8]]
; VEC: pred.store.continue8:
; VEC-NEXT: br label [[PRED_STORE_CONTINUE3]]
; VEC: pred.store.continue3:
; VEC-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; VEC-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
; VEC-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/Transforms/LoopVectorize/if-reduction.ll
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ for.end: ; preds = %for.body, %entry
; }

; CHECK-LABEL: @fcmp_val_fadd_select1(
; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], %broadcast.splat2
; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], %broadcast.splat
; CHECK: %[[V3:.*]] = fadd fast <4 x float> %[[V0]], %[[V2:.*]]
; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
define float @fcmp_val_fadd_select1(float* noalias %x, float %y, i32 %N) nounwind readonly {
Expand Down Expand Up @@ -143,7 +143,7 @@ for.end: ; preds = %for.body, %entry
; }

; CHECK-LABEL: @fcmp_val_fadd_select2(
; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], %broadcast.splat2
; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], %broadcast.splat
; CHECK: %[[V3:.*]] = fadd fast <4 x double> %[[V0]], %[[V2:.*]]
; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
define double @fcmp_val_fadd_select2(double* noalias %x, double %y, i32 %N) nounwind readonly {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-vector-width=4 -S < %s | FileCheck %s

; This is the test case from PR26314.
Expand All @@ -22,16 +23,95 @@
; }
; }

; CHECK-LABEL: Test
; CHECK: <4 x i64>
; CHECK: <4 x i32>, <4 x i32>
; CHECK: !{!"llvm.loop.isvectorized", i32 1}

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

%struct.s = type { [32 x i32], [32 x i32], [32 x [32 x i32]] }

define void @Test(%struct.s* nocapture %obj, i64 %z) #0 {
; CHECK-LABEL: @Test(
; CHECK-NEXT: [[OBJ4:%.*]] = bitcast %struct.s* [[OBJ:%.*]] to i8*
; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr [[STRUCT_S:%.*]], %struct.s* [[OBJ]], i64 0, i32 0, i64 [[Z:%.*]]
; CHECK-NEXT: [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8*
; CHECK-NEXT: br label [[DOTOUTER_PREHEADER:%.*]]
; CHECK: .outer.preheader:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[I_NEXT:%.*]], [[DOTOUTER:%.*]] ]
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 0
; CHECK-NEXT: [[SCEVGEP1:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[Z]]
; CHECK-NEXT: [[SCEVGEP23:%.*]] = bitcast i32* [[SCEVGEP2]] to i8*
; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 1, i64 [[I]]
; CHECK-NEXT: [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8*
; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP78]], i64 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 1, i64 [[I]]
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[Z]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP56]]
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[OBJ4]], [[SCEVGEP23]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT: [[BC:%.*]] = bitcast i32* [[TMP1]] to i8*
; CHECK-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[SCEVGEP1]], [[UGLYGEP]]
; CHECK-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[BC]], [[SCEVGEP23]]
; CHECK-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true
; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[Z]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[Z]], [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 0, i64 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !alias.scope !0
; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP1]], !alias.scope !3
; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP1]], !alias.scope !3
; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP1]], !alias.scope !3
; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP1]], !alias.scope !3
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP7]], i32 1
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3
; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP17]], align 4, !alias.scope !5, !noalias !7
; CHECK-NEXT: [[TMP18:%.*]] = add nsw <4 x i32> [[TMP14]], [[WIDE_LOAD12]]
; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP18]], <4 x i32>* [[TMP19]], align 4, !alias.scope !5, !noalias !7
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[Z]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[DOTOUTER]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTOUTER_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label [[DOTINNER:%.*]]
; CHECK: .exit:
; CHECK-NEXT: ret void
; CHECK: .outer:
; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT: [[EXITCOND_OUTER:%.*]] = icmp eq i64 [[I_NEXT]], 32
; CHECK-NEXT: br i1 [[EXITCOND_OUTER]], label [[DOTEXIT:%.*]], label [[DOTOUTER_PREHEADER]]
; CHECK: .inner:
; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[J_NEXT:%.*]], [[DOTINNER]] ]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 0, i64 [[J]]
; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP1]]
; CHECK-NEXT: [[TMP24:%.*]] = add nsw i32 [[TMP23]], [[TMP22]]
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[J]]
; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]]
; CHECK-NEXT: [[TMP27:%.*]] = add nsw i32 [[TMP24]], [[TMP26]]
; CHECK-NEXT: store i32 [[TMP27]], i32* [[TMP25]]
; CHECK-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J]], 1
; CHECK-NEXT: [[EXITCOND_INNER:%.*]] = icmp eq i64 [[J_NEXT]], [[Z]]
; CHECK-NEXT: br i1 [[EXITCOND_INNER]], label [[DOTOUTER]], label [[DOTINNER]], !llvm.loop !10
;
br label %.outer.preheader


Expand All @@ -42,7 +122,7 @@ define void @Test(%struct.s* nocapture %obj, i64 %z) #0 {

.exit:
ret void

.outer:
%i.next = add nuw nsw i64 %i, 1
%exitcond.outer = icmp eq i64 %i.next, 32
Expand All @@ -57,7 +137,7 @@ define void @Test(%struct.s* nocapture %obj, i64 %z) #0 {
%6 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 2, i64 %i, i64 %j
%7 = load i32, i32* %6
%8 = add nsw i32 %5, %7
store i32 %8, i32* %6
store i32 %8, i32* %6
%j.next = add nuw nsw i64 %j, 1
%exitcond.inner = icmp eq i64 %j.next, %z
br i1 %exitcond.inner, label %.outer, label %.inner
Expand Down
3 changes: 0 additions & 3 deletions llvm/test/Transforms/LoopVectorize/pr35773.ll
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,6 @@ for.body:
; CHECK-NEXT: [[I32_IV:%.*]] = phi <4 x i32> [ <i32 0, i32 9, i32 18, i32 27>, [[VECTOR_PH]] ], [ [[I32_IV_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[IV_FROM_TRUNC:%.*]] = phi <4 x i8> [ <i8 0, i8 9, i8 18, i8 27>, [[VECTOR_PH]] ], [ [[IV_FROM_TRUNC_NEXT:%.*]], [[VECTOR_BODY]] ]

; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[MAIN_IV]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[MAIN_IV]], 0

; CHECK-NEXT: [[I8_IV_NEXT]] = add <4 x i8> [[I8_IV]], [[IV_FROM_TRUNC]]
Expand Down
75 changes: 53 additions & 22 deletions llvm/test/Transforms/LoopVectorize/pr44488-predication.ll
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -S | FileCheck %s

; Test case for PR44488. Checks that the correct predicates are created for
Expand All @@ -9,8 +10,12 @@

define i16 @test_true_and_false_branch_equal() {
; CHECK-LABEL: @test_true_and_false_branch_equal(
; CHECK-LABEL: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], [[PRED_SREM_CONTINUE2:%.*]] ]
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SREM_CONTINUE2:%.*]] ]
; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[INDEX]] to i16
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i16 99, [[TMP0]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> undef, i16 [[OFFSET_IDX]], i32 0
Expand All @@ -21,33 +26,59 @@ define i16 @test_true_and_false_branch_equal() {
; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* @v_38, align 1
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i16> undef, i16 [[TMP2]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i16> [[TMP4]], i16 [[TMP3]], i32 1
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq <2 x i16> [[TMP5]], <i16 32767, i16 32767>
; CHECK-NEXT: [[CMP2:%.*]] = icmp eq <2 x i16> [[TMP5]], zeroinitializer
; CHECK-NEXT: [[NOT_CMP2:%.*]] = xor <2 x i1> [[CMP2]], <i1 true, i1 true>
; CHECK-NEXT: [[PRED1:%.*]] = extractelement <2 x i1> [[NOT_CMP2]], i32 0
; CHECK-NEXT: br i1 [[PRED1]], label [[PRED_SREM_IF:%.*]], label [[PRED_SREM_CONTINUE:%.*]]
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i16> [[TMP5]], <i16 32767, i16 32767>
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <2 x i16> [[TMP5]], zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], <i1 true, i1 true>
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_SREM_IF:%.*]], label [[PRED_SREM_CONTINUE:%.*]]
; CHECK: pred.srem.if:
; CHECK-NEXT: [[TMP14:%.*]] = srem i16 5786, [[TMP2]]
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i16> undef, i16 [[TMP14]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = srem i16 5786, [[TMP2]]
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i16> undef, i16 [[TMP10]], i32 0
; CHECK-NEXT: br label [[PRED_SREM_CONTINUE]]
; CHECK: pred.srem.continue:
; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i16> [ undef, %vector.body ], [ [[TMP15]], [[PRED_SREM_IF]] ]
; CHECK-NEXT: [[PRED2:%.*]] = extractelement <2 x i1> [[NOT_CMP2]], i32 1
; CHECK-NEXT: br i1 [[PRED2]], label [[PRED_SREM_IF1:%.*]], label [[PRED_SREM_CONTINUE2]]
; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i16> [ undef, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_SREM_IF]] ]
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_SREM_IF1:%.*]], label [[PRED_SREM_CONTINUE2]]
; CHECK: pred.srem.if1:
; CHECK-NEXT: [[TMP18:%.*]] = srem i16 5786, [[TMP3]]
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i16> [[TMP16]], i16 [[TMP18]], i32 1
; CHECK-NEXT: [[TMP14:%.*]] = srem i16 5786, [[TMP3]]
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i16> [[TMP12]], i16 [[TMP14]], i32 1
; CHECK-NEXT: br label [[PRED_SREM_CONTINUE2]]
; CHECK: pred.srem.continue2:
; CHECK-NEXT: [[TMP20:%.*]] = phi <2 x i16> [ [[TMP16]], [[PRED_SREM_CONTINUE]] ], [ [[TMP19]], [[PRED_SREM_IF1]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[CMP2]], <2 x i16> <i16 5786, i16 5786>, <2 x i16> [[TMP20]]
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 0
; CHECK-NEXT: store i16 [[TMP22]], i16* @v_39, align 1
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1
; CHECK-NEXT: store i16 [[TMP23]], i16* @v_39, align 1
; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i16> [ [[TMP12]], [[PRED_SREM_CONTINUE]] ], [ [[TMP15]], [[PRED_SREM_IF1]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i16> <i16 5786, i16 5786>, <2 x i16> [[TMP16]]
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 0
; CHECK-NEXT: store i16 [[TMP17]], i16* @v_39, align 1
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1
; CHECK-NEXT: store i16 [[TMP18]], i16* @v_39, align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
; CHECK-NEXT: br i1 [[TMP24]], label %middle.block, label %vector.body, !llvm.loop !0
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 12, 12
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 111, [[MIDDLE_BLOCK]] ], [ 99, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_07:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC7:%.*]], [[FOR_LATCH:%.*]] ]
; CHECK-NEXT: [[LV:%.*]] = load i16, i16* @v_38, align 1
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i16 [[LV]], 32767
; CHECK-NEXT: br i1 [[CMP1]], label [[COND_END:%.*]], label [[COND_END]]
; CHECK: cond.end:
; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i16 [[LV]], 0
; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_LATCH]], label [[COND_FALSE4:%.*]]
; CHECK: cond.false4:
; CHECK-NEXT: [[REM:%.*]] = srem i16 5786, [[LV]]
; CHECK-NEXT: br label [[FOR_LATCH]]
; CHECK: for.latch:
; CHECK-NEXT: [[COND6:%.*]] = phi i16 [ [[REM]], [[COND_FALSE4]] ], [ 5786, [[COND_END]] ]
; CHECK-NEXT: store i16 [[COND6]], i16* @v_39, align 1
; CHECK-NEXT: [[INC7]] = add nsw i16 [[I_07]], 1
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[INC7]], 111
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop !2
; CHECK: exit:
; CHECK-NEXT: [[RV:%.*]] = load i16, i16* @v_39, align 1
; CHECK-NEXT: ret i16 [[RV]]
;
entry:
br label %for.body
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,6 @@ define void @test(float* %A, i32 %x) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw i64 [[TMP15]], 1
; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
Expand All @@ -46,9 +43,6 @@ define void @test(float* %A, i32 %x) {
; CHECK-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP21]] to <4 x float>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP22]], align 4
; CHECK-NEXT: [[TMP23:%.*]] = trunc i64 [[INDEX]] to i32
; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP23]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION6:%.*]] = add <4 x i32> [[BROADCAST_SPLAT5]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 0
; CHECK-NEXT: [[TMP25:%.*]] = mul i32 [[TMP24]], [[X]]
; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP25]] to i64
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,6 @@
; CHECK-LABEL: @test_fshl
; CHECK-LABEL: vector.body:
; CHECK-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: %0 = add i32 %index, 0
; CHECK-NEXT: %1 = call <4 x i16> @llvm.fshl.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i16> <i16 15, i16 15, i16 15, i16 15>)
; CHECK-NEXT: %index.next = add i32 %index, 4
Expand Down