44 changes: 0 additions & 44 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,9 @@ define dso_local arm_aapcs_vfpcc void @sext_i8(i16* noalias nocapture %a, i8* no
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB0_1: @ %vector.ph
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dlstp.16 lr, r2
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r3, #8
; CHECK-NEXT: vldrb.s16 q0, [r1], #8
; CHECK-NEXT: vldrh.u16 q1, [r0]
; CHECK-NEXT: vadd.i16 q0, q1, q0
Expand All @@ -28,21 +26,12 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%0 = getelementptr inbounds i8, i8* %b, i32 %index

; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11
%1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)

%2 = bitcast i8* %0 to <8 x i8>*
%wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef)
%3 = sext <8 x i8> %wide.masked.load to <8 x i16>
Expand All @@ -69,11 +58,9 @@ define dso_local arm_aapcs_vfpcc void @zext_i8(i16* noalias nocapture %a, i8* no
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB1_1: @ %vector.ph
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dlstp.16 lr, r2
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r3, #8
; CHECK-NEXT: vldrb.u16 q0, [r1], #8
; CHECK-NEXT: vldrh.u16 q1, [r0]
; CHECK-NEXT: vadd.i16 q0, q1, q0
Expand All @@ -88,21 +75,12 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%0 = getelementptr inbounds i8, i8* %b, i32 %index

; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11
%1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)

%2 = bitcast i8* %0 to <8 x i8>*
%wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef)
%3 = zext <8 x i8> %wide.masked.load to <8 x i16>
Expand All @@ -129,11 +107,9 @@ define dso_local arm_aapcs_vfpcc void @sext_i16(i32* noalias nocapture %a, i16*
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB2_1: @ %vector.ph
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: vldrh.s32 q0, [r1], #8
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vadd.i32 q0, q1, q0
Expand All @@ -148,21 +124,12 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i16, i16* %b, i32 %index

; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

%2 = bitcast i16* %0 to <4 x i16>*
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
%3 = sext <4 x i16> %wide.masked.load to <4 x i32>
Expand All @@ -189,11 +156,9 @@ define dso_local arm_aapcs_vfpcc void @zext_i16(i32* noalias nocapture %a, i16*
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB3_1: @ %vector.ph
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: vldrh.u32 q0, [r1], #8
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vadd.i32 q0, q1, q0
Expand All @@ -208,21 +173,12 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i16, i16* %b, i32 %index

; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

%2 = bitcast i16* %0 to <4 x i16>*
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
%3 = zext <4 x i16> %wide.masked.load to <4 x i32>
Expand Down
22 changes: 0 additions & 22 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,9 @@ define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocaptur
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB0_8
; CHECK-NEXT: .LBB0_4: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB0_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmul.f32 q0, q1, q0
Expand Down Expand Up @@ -122,21 +120,12 @@ for.body.preheader.new: ; preds = %for.body.preheader
vector.ph: ; preds = %vector.memcheck
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert21 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat22 = shufflevector <4 x i32> %broadcast.splatinsert21, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%2 = getelementptr inbounds float, float* %b, i32 %index

; %3 = icmp ule <4 x i32> %induction, %broadcast.splat22
%3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

%4 = bitcast float* %2 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %3, <4 x float> undef)
%5 = getelementptr inbounds float, float* %c, i32 %index
Expand Down Expand Up @@ -225,12 +214,10 @@ define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: add.w lr, r12, r3, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpstt
Expand Down Expand Up @@ -262,22 +249,13 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds float, float* %b, i32 %index

; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

%2 = bitcast float* %0 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
%3 = getelementptr inbounds float, float* %c, i32 %index
Expand Down
121 changes: 0 additions & 121 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll

Large diffs are not rendered by default.

30 changes: 0 additions & 30 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,6 @@ define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readon
; CHECK: for.cond1.preheader.us.preheader:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
; CHECK-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TT:%.*]] = add i32 [[N_VEC]], -4
; CHECK-NEXT: [[TT1:%.*]] = lshr i32 [[TT]], 2
; CHECK-NEXT: [[TT2:%.*]] = add nuw nsw i32 [[TT1]], 1
Expand All @@ -30,9 +27,6 @@ define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readon
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TT4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT14:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TT5:%.*]] = phi i32 [ [[START]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT15:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TT6:%.*]] = getelementptr inbounds i16, i16* [[TT3]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4
Expand Down Expand Up @@ -66,9 +60,6 @@ entry:
for.cond1.preheader.us.preheader: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert28 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat29 = shufflevector <4 x i32> %broadcast.splatinsert28, <4 x i32> undef, <4 x i32> zeroinitializer
%tt = add i32 %n.vec, -4
%tt1 = lshr i32 %tt, 2
%tt2 = add nuw nsw i32 %tt1, 1
Expand All @@ -88,14 +79,8 @@ vector.body: ; preds = %vector.body, %for.c
%index = phi i32 [ 0, %for.cond1.preheader.us ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ %tt4, %for.cond1.preheader.us ], [ %tt14, %vector.body ]
%tt5 = phi i32 [ %start, %for.cond1.preheader.us ], [ %tt15, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tt6 = getelementptr inbounds i16, i16* %tt3, i32 %index

; %tt7 = icmp ule <4 x i32> %induction, %broadcast.splat29
%tt7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

%tt8 = bitcast i16* %tt6 to <4 x i16>*
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tt8, i32 2, <4 x i1> %tt7, <4 x i16> undef)
%tt9 = sext <4 x i16> %wide.masked.load to <4 x i32>
Expand Down Expand Up @@ -130,9 +115,6 @@ define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B
; CHECK: for.cond1.preheader.us.preheader:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT27]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TT:%.*]] = add i32 [[N_VEC]], -4
; CHECK-NEXT: [[TT1:%.*]] = lshr i32 [[TT]], 2
; CHECK-NEXT: [[TT2:%.*]] = add nuw nsw i32 [[TT1]], 1
Expand All @@ -151,9 +133,6 @@ define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TT4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT12:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TT5:%.*]] = phi i32 [ [[START]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT13:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TT6:%.*]] = getelementptr inbounds i32, i32* [[TT3]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4
Expand Down Expand Up @@ -185,9 +164,6 @@ entry:
for.cond1.preheader.us.preheader: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert27 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat28 = shufflevector <4 x i32> %broadcast.splatinsert27, <4 x i32> undef, <4 x i32> zeroinitializer
%tt = add i32 %n.vec, -4
%tt1 = lshr i32 %tt, 2
%tt2 = add nuw nsw i32 %tt1, 1
Expand All @@ -207,14 +183,8 @@ vector.body: ; preds = %vector.body, %for.c
%index = phi i32 [ 0, %for.cond1.preheader.us ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ %tt4, %for.cond1.preheader.us ], [ %tt12, %vector.body ]
%tt5 = phi i32 [ %start, %for.cond1.preheader.us ], [ %tt13, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tt6 = getelementptr inbounds i32, i32* %tt3, i32 %index

; %tt7 = icmp ule <4 x i32> %induction, %broadcast.splat28
%tt7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

%tt8 = bitcast i32* %tt6 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tt8, i32 4, <4 x i1> %tt7, <4 x i32> undef)
%tt9 = getelementptr inbounds i32, i32* %B, i32 %index
Expand Down
10 changes: 0 additions & 10 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 15
%n.vec = and i32 %n.rnd.up, -16
%trip.count.minus.1 = add i32 %N, -1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -100,7 +99,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %N, -1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -172,7 +170,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 15
%n.vec = and i32 %n.rnd.up, -16
%trip.count.minus.1 = add i32 %N, -1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -242,7 +239,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %N, -1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -314,7 +310,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 15
%n.vec = and i32 %n.rnd.up, -16
%trip.count.minus.1 = add i32 %N, -1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -384,7 +379,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %N, -1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -481,7 +475,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -510,7 +503,6 @@ middle.block: ; preds = %vector.body
vector.ph47: ; preds = %middle.block
%n.rnd.up48 = add i32 %N, 3
%n.vec50 = and i32 %n.rnd.up48, -4
%trip.count.minus.154 = add i32 %N, -1
%i11 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %i10, i32 0
br label %vector.body46

Expand Down Expand Up @@ -594,7 +586,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %N, -1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -719,7 +710,6 @@ lor.end: ; preds = %entry, %lor.rhs
vector.ph: ; preds = %lor.end
%n.rnd.up = add i32 %4, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %4, -1
%5 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %0, i32 0
br label %vector.body

Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp %s -o - | FileCheck %s
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -enable-arm-maskedgatscat=false %s -o - | FileCheck %s

define void @remat_vctp(i32* %arg, i32* %arg1, i32* %arg2, i32* %arg3, i32* %arg4, i16 zeroext %arg5) {
; CHECK-LABEL: remat_vctp:
Expand Down
330 changes: 330 additions & 0 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-vpt-debug.mir

Large diffs are not rendered by default.

45 changes: 0 additions & 45 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
Original file line number Diff line number Diff line change
Expand Up @@ -22,23 +22,14 @@ entry:
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <16 x i32> %broadcast.splatinsert10, <16 x i32> undef, <16 x i32> zeroinitializer
%start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
%broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
%induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tmp = getelementptr inbounds i8, i8* %a, i32 %index

; %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)

%tmp2 = bitcast i8* %tmp to <16 x i8>*
%wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
%tmp3 = getelementptr inbounds i8, i8* %b, i32 %index
Expand Down Expand Up @@ -79,23 +70,14 @@ entry:
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
%start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tmp = getelementptr inbounds i16, i16* %a, i32 %index

; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)

%tmp2 = bitcast i16* %tmp to <8 x i16>*
%wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
%tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
Expand Down Expand Up @@ -135,20 +117,13 @@ entry:
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
Expand Down Expand Up @@ -190,20 +165,13 @@ entry:
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
Expand Down Expand Up @@ -262,10 +230,7 @@ vector.body: ; preds = %vector.body, %vecto
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp = getelementptr inbounds i32, i32* %a, i32 %index

; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

%wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
Expand Down Expand Up @@ -321,10 +286,7 @@ vector.body: ; preds = %vector.body, %vecto
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp = getelementptr inbounds i32, i32* %a, i32 %index

; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

%wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
Expand Down Expand Up @@ -370,7 +332,6 @@ entry:


vector.ph:
%trip.count.minus.1 = add i32 %N, -1
%scevgep = getelementptr i32, i32* %A, i32 8
%scevgep30 = getelementptr i32, i32* %C, i32 8
%scevgep37 = getelementptr i32, i32* %B, i32 8
Expand Down Expand Up @@ -459,9 +420,7 @@ vector.body: ; preds = %vector.body, %vecto
%lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*

%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 42)

%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
Expand Down Expand Up @@ -495,7 +454,6 @@ entry:
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
br label %vector.body

Expand All @@ -509,9 +467,7 @@ vector.body: ; preds = %vector.body, %vecto
%lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*

%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %index)

%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
Expand Down Expand Up @@ -546,7 +502,6 @@ entry:
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
br label %vector.body

Expand Down
40 changes: 0 additions & 40 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
Original file line number Diff line number Diff line change
Expand Up @@ -10,22 +10,17 @@ define dso_local void @foo(i32* noalias nocapture %A, i32* noalias nocapture rea
; CHECK-NEXT: [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 32003, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
; CHECK-NEXT: [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
; CHECK-NEXT: [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP1]])
; CHECK-NEXT: [[TMP3]] = sub i32 [[TMP1]], 4
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP2]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
Expand All @@ -48,13 +43,7 @@ vector.body:
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
%lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

; %1 = icmp ult <4 x i32> %induction, <i32 32002, i32 32002, i32 32002, i32 32002>
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)

%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
%2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
Expand Down Expand Up @@ -244,11 +233,7 @@ vector.body:
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

; Non-uniform constant vector here. This can't be represented with
; @llvm.get.active.lane.mask, but let's keep this test as a sanity check:
%1 = icmp ult <4 x i32> %induction, <i32 0, i32 3200, i32 32002, i32 32002>

%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
%2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
Expand Down Expand Up @@ -285,13 +270,8 @@ vector.body:
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
%lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

; BTC = UINT_MAX, and scalar trip count BTC + 1 would overflow:
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 4294967295)

%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
%2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
Expand Down Expand Up @@ -328,12 +308,7 @@ vector.body:
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
%lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)

%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
%2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
Expand Down Expand Up @@ -371,13 +346,8 @@ vector.body:
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
%lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

; The induction variable %N is not an IV:
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32003)

%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
%2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
Expand Down Expand Up @@ -414,12 +384,7 @@ vector.body:
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
%lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)

%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
%2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
Expand Down Expand Up @@ -460,9 +425,6 @@ vector.body:
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
%lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
Expand Down Expand Up @@ -514,10 +476,8 @@ vector.body: ; preds = %vector.body, %vecto
%lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
%lsr.iv3335 = bitcast i32* %lsr.iv33 to <4 x i32>*
%lsr.iv2830 = bitcast i32* %lsr.iv28 to <4 x i32>*

; It's using %j.025, the induction variable from its outer loop:
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %j.025, i32 4096)

%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%wide.masked.load27 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3335, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%1 = add nsw <4 x i32> %wide.masked.load27, %wide.masked.load
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ entry:
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
br label %vector.body

Expand All @@ -92,13 +91,10 @@ vector.body: ; preds = %vector.body, %vecto
%lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]

%lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*

%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %blockSize, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %blockSize, -1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -77,7 +76,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %blockSize, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %blockSize, -1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %blockSize, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %blockSize, -1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -72,7 +71,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -118,7 +116,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -164,7 +161,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -210,7 +206,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -260,7 +255,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %n, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %n, -1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %blockSize, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %blockSize, -1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -77,7 +76,6 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %blockSize, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %blockSize, -1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down
25 changes: 0 additions & 25 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ vector.ph:
%tmp = add i32 %N, -1
%n.rnd.up = add i32 %tmp, 8
%n.vec = and i32 %n.rnd.up, -8
%broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0
%broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer
%0 = add i32 %n.vec, -8
%1 = lshr i32 %0, 3
%2 = add i32 %1, 1
Expand All @@ -30,14 +28,8 @@ vector.body: ; preds = %vector.body, %vecto
%index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ]
%vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp8, %vector.body ]
%3 = phi i32 [ %start, %vector.ph], [ %4, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tmp2 = getelementptr inbounds i16, i16* %A, i32 %index

; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
%tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)

%tmp4 = bitcast i16* %tmp2 to <8 x i16>*
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
%tmp5 = getelementptr inbounds i16, i16* %B, i32 %index
Expand Down Expand Up @@ -87,8 +79,6 @@ vector.ph:
%tmp = add i32 %N, -1
%n.rnd.up = add nuw nsw i32 %tmp, 8
%n.vec = and i32 %n.rnd.up, -8
%broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0
%broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer
%broadcast.splatinsert3 = insertelement <8 x i16> undef, i16 %B, i32 0
%broadcast.splat4 = shufflevector <8 x i16> %broadcast.splatinsert3, <8 x i16> undef, <8 x i32> zeroinitializer
%0 = add i32 %n.vec, -8
Expand All @@ -101,14 +91,8 @@ vector.body: ; preds = %vector.body, %vecto
%index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ]
%vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp6, %vector.body ]
%3 = phi i32 [ %start, %vector.ph], [ %4, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tmp2 = getelementptr inbounds i16, i16* %A, i32 %index

; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
%tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)

%tmp4 = bitcast i16* %tmp2 to <8 x i16>*
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
%tmp5 = add <8 x i16> %vec.phi, %broadcast.splat4
Expand Down Expand Up @@ -151,8 +135,6 @@ entry:
%tmp = add i32 %N, -1
%n.rnd.up = add nuw nsw i32 %tmp, 8
%n.vec = and i32 %n.rnd.up, -8
%broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0
%broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer
%broadcast.splatinsert3 = insertelement <8 x i16> undef, i16 %B, i32 0
%broadcast.splat4 = shufflevector <8 x i16> %broadcast.splatinsert3, <8 x i16> undef, <8 x i32> zeroinitializer
%0 = add i32 %n.vec, -8
Expand All @@ -165,14 +147,8 @@ vector.body: ; preds = %vector.body, %vecto
%index = phi i32 [ 0, %entry], [ %index.next, %vector.body ]
%vec.phi = phi <8 x i16> [ zeroinitializer, %entry], [ %tmp6, %vector.body ]
%3 = phi i32 [ %start, %entry ], [ %4, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tmp2 = getelementptr inbounds i16, i16* %A, i32 %index

; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
%tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)

%tmp4 = bitcast i16* %tmp2 to <8 x i16>*
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
%tmp5 = add <8 x i16> %vec.phi, %broadcast.splat4
Expand Down Expand Up @@ -227,7 +203,6 @@ for.body:
br i1 %cmp433, label %vector.ph, label %for.end

vector.ph: ; preds = %for.body
%trip.count.minus.1 = add i32 %8, -1
%start = call i32 @llvm.start.loop.iterations.i32(i32 %7)
br label %vector.body

Expand Down
28 changes: 1 addition & 27 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,14 @@ entry:
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
%start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tmp = getelementptr inbounds i16, i16* %a, i32 %index

; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
%tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)

%tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
%tmp2 = bitcast i16* %tmp to <8 x i16>*
%wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
%tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
Expand Down Expand Up @@ -72,8 +63,6 @@ entry:

vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
%broadcast.splatinsert10.store = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11.store = shufflevector <4 x i32> %broadcast.splatinsert10.store, <4 x i32> undef, <4 x i32> zeroinitializer
%start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
Expand All @@ -83,14 +72,8 @@ vector.body: ; preds = %vector.body, %vecto
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%store.idx = phi i32 [ 0, %vector.ph ], [ %store.idx.next, %vector.body ]
%tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tmp = getelementptr inbounds i16, i16* %a, i32 %index

; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
%tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)

%tmp2 = bitcast i16* %tmp to <8 x i16>*
%wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
%tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
Expand Down Expand Up @@ -136,23 +119,14 @@ entry:
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
%start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp = getelementptr inbounds i32, i32* %a, i32 %index

; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,6 @@ for.body: ; preds = %for.end, %for.body.
br i1 %cmp433, label %vector.ph, label %for.end

vector.ph: ; preds = %for.body
%trip.count.minus.1 = add i32 %i8, -1
%start = call i32 @llvm.start.loop.iterations.i32(i32 %i7)
br label %vector.body

Expand Down
89 changes: 6 additions & 83 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
Original file line number Diff line number Diff line change
Expand Up @@ -11,27 +11,25 @@ define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* no
; CHECK-NEXT: .LBB0_1: @ %vector.ph
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: adds r3, r2, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q0, [r0], #16
; CHECK-NEXT: vldrwt.u32 q1, [r0], #16
; CHECK-NEXT: vldrwt.u32 q2, [r1], #16
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: vmul.i32 q0, q2, q0
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: vmul.i32 q1, q2, q1
; CHECK-NEXT: vadd.i32 q1, q1, q0
; CHECK-NEXT: le lr, .LBB0_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: pop {r7, pc}
entry:
Expand All @@ -41,22 +39,13 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i32, i32* %a, i32 %index

; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

%2 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
%3 = getelementptr inbounds i32, i32* %b, i32 %index
Expand Down Expand Up @@ -93,15 +82,13 @@ define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: add.w lr, r3, r1, lsr #2
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q0, [r0], #16
; CHECK-NEXT: adds r1, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB1_2
Expand All @@ -116,22 +103,13 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i32, i32* %a, i32 %index

; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

%2 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
%3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
Expand Down Expand Up @@ -164,15 +142,13 @@ define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: add.w lr, r3, r1, lsr #2
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q0, [r0], #16
; CHECK-NEXT: adds r1, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB2_2
Expand All @@ -187,22 +163,13 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i32, i32* %a, i32 %index

; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

%2 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
%3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
Expand All @@ -228,11 +195,9 @@ define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB3_1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vmul.i32 q0, q0, r2
; CHECK-NEXT: vstrw.32 q0, [r0], #16
Expand All @@ -246,23 +211,14 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i32, i32* %b, i32 %index

; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

%2 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
%3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11
Expand All @@ -285,11 +241,9 @@ define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB4_1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB4_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vadd.i32 q0, q0, r2
; CHECK-NEXT: vstrw.32 q0, [r0], #16
Expand All @@ -303,23 +257,14 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%0 = getelementptr inbounds i32, i32* %b, i32 %index

; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

%2 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
%3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11
Expand All @@ -342,11 +287,9 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(i8* noalias nocaptur
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB5_1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.8 lr, r3
; CHECK-NEXT: .LBB5_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #16
; CHECK-NEXT: vldrb.u8 q0, [r1], #16
; CHECK-NEXT: vldrb.u8 q1, [r2], #16
; CHECK-NEXT: vmul.i8 q0, q1, q0
Expand All @@ -361,21 +304,12 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 15
%n.vec = and i32 %n.rnd.up, -16
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert12 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat13 = shufflevector <16 x i32> %broadcast.splatinsert12, <16 x i32> undef, <16 x i32> zeroinitializer
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
%induction = add <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%0 = getelementptr inbounds i8, i8* %b, i32 %index

; %1 = icmp ule <16 x i32> %induction, %broadcast.splat13
%1 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)

%2 = bitcast i8* %0 to <16 x i8>*
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %2, i32 1, <16 x i1> %1, <16 x i8> undef)
%3 = getelementptr inbounds i8, i8* %c, i32 %index
Expand All @@ -402,11 +336,9 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapt
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB6_1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.16 lr, r3
; CHECK-NEXT: .LBB6_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #8
; CHECK-NEXT: vldrh.u16 q0, [r1], #16
; CHECK-NEXT: vldrh.u16 q1, [r2], #16
; CHECK-NEXT: vmul.i16 q0, q1, q0
Expand All @@ -421,21 +353,12 @@ entry:
vector.ph: ; preds = %entry
%n.rnd.up = add i32 %N, 7
%n.vec = and i32 %n.rnd.up, -8
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert12 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> undef, <8 x i32> zeroinitializer
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%0 = getelementptr inbounds i16, i16* %b, i32 %index

; %1 = icmp ule <8 x i32> %induction, %broadcast.splat13
%1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)

%2 = bitcast i16* %0 to <8 x i16>*
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %2, i32 2, <8 x i1> %1, <8 x i16> undef)
%3 = getelementptr inbounds i16, i16* %c, i32 %index
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,6 @@ entry:
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
%start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
br label %vector.body

Expand All @@ -44,13 +41,7 @@ vector.body: ; preds = %vector.body, %vecto
%6 = phi i32 [ %start, %vector.ph ], [ %10, %vector.body ]
%lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>*
%lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

; %7 = icmp ule <4 x i32> %induction, %broadcast.splat12
%7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %7, <4 x i32> undef)
%wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %7, <4 x i32> undef)
%8 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/Thumb2/block-placement.mir
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ body: |
; CHECK: bb.2:
; CHECK: successors: %bb.3(0x80000000)
; CHECK: t2WhileLoopStart killed renamable $r0, %bb.1, implicit-def dead $cpsr
; CHECK: tB %bb.3, 14 /* CC::al */, $noreg
; CHECK: t2B %bb.3, 14 /* CC::al */, $noreg
; CHECK: bb.1:
; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
; CHECK: bb.3:
Expand Down Expand Up @@ -145,7 +145,7 @@ body: |
; CHECK: $lr = tMOVr $r0, 14 /* CC::al */, $noreg
; CHECK: renamable $r0 = t2ADDrs killed renamable $r2, killed $r0, 18, 14 /* CC::al */, $noreg, $noreg
; CHECK: t2WhileLoopStart killed renamable $lr, %bb.1, implicit-def dead $cpsr
; CHECK: tB %bb.3, 14 /* CC::al */, $noreg
; CHECK: t2B %bb.3, 14 /* CC::al */, $noreg
; CHECK: bb.1:
; CHECK: successors: %bb.4(0x80000000)
; CHECK: tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/Thumb2/lsll0.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -enable-arm-maskedgatscat=false -verify-machineinstrs %s -o - | FileCheck %s

define void @_Z4loopPxS_iS_i(i64* %d) {
; CHECK-LABEL: _Z4loopPxS_iS_i:
Expand Down
156 changes: 12 additions & 144 deletions llvm/test/CodeGen/Thumb2/mve-fma-loops.ll

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,6 @@ define arm_aapcs_vfpcc void @gather_pre_inc(i32* noalias nocapture readonly %dat
; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
; CHECK-NEXT: .long 0 @ 0x0
vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -288,7 +287,6 @@ define arm_aapcs_vfpcc void @gather_post_inc(i32* noalias nocapture readonly %da
; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
vector.ph41: ; preds = %for.body6.preheader
%ind.end47 = shl i32 %n.vec43, 1
br label %vector.body39

vector.body39: ; preds = %vector.body39, %vector.ph41
Expand Down
73 changes: 54 additions & 19 deletions llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll
Original file line number Diff line number Diff line change
@@ -1,18 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py


; RUN: opt --arm-mve-gather-scatter-lowering -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -S -o 2>/dev/null - | FileCheck %s
; RUN: opt --arm-mve-gather-scatter-lowering -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -S -o - | FileCheck %s

define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: @push_out_add_sub_block(
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 1
; CHECK-NEXT: [[PUSHEDOUTADD:%.*]] = add <4 x i32> <i32 0, i32 2, i32 4, i32 6>, <i32 6, i32 6, i32 6, i32 6>
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[PUSHEDOUTADD]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY_END]] ]
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 50
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 48
; CHECK-NEXT: br i1 [[TMP0]], label [[LOWER_BLOCK:%.*]], label [[END:%.*]]
; CHECK: lower.block:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[VEC_IND]], i32 32, i32 2, i32 1)
Expand All @@ -23,20 +20,19 @@ define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture reado
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
; CHECK-NEXT: br label [[VECTOR_BODY_END]]
; CHECK: vector.body.end:
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
; CHECK-NEXT: br i1 [[TMP4]], label [[END]], label [[VECTOR_BODY]]
; CHECK: end:
; CHECK-NEXT: ret void
;

vector.ph:
%ind.end = shl i32 %n.vec, 1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
%0 = icmp eq i32 %index, 50
%0 = icmp eq i32 %index, 48
br i1 %0, label %lower.block, label %end

lower.block: ; preds = %vector.body
Expand All @@ -61,15 +57,14 @@ end:
define arm_aapcs_vfpcc void @push_out_mul_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: @push_out_mul_sub_block(
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 1
; CHECK-NEXT: [[PUSHEDOUTMUL:%.*]] = mul <4 x i32> <i32 0, i32 2, i32 4, i32 6>, <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[PRODUCT:%.*]] = mul <4 x i32> <i32 8, i32 8, i32 8, i32 8>, <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[PUSHEDOUTADD:%.*]] = add <4 x i32> [[PUSHEDOUTMUL]], <i32 6, i32 6, i32 6, i32 6>
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[PUSHEDOUTADD]], [[VECTOR_PH]] ], [ [[INCREMENTPUSHEDOUTMUL:%.*]], [[VECTOR_BODY_END]] ]
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 50
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 48
; CHECK-NEXT: br i1 [[TMP0]], label [[LOWER_BLOCK:%.*]], label [[END:%.*]]
; CHECK: lower.block:
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[VEC_IND]], i32 32, i32 2, i32 1)
Expand All @@ -80,20 +75,19 @@ define arm_aapcs_vfpcc void @push_out_mul_sub_block(i32* noalias nocapture reado
; CHECK-NEXT: br label [[VECTOR_BODY_END]]
; CHECK: vector.body.end:
; CHECK-NEXT: [[INCREMENTPUSHEDOUTMUL]] = add <4 x i32> [[VEC_IND]], [[PRODUCT]]
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
; CHECK-NEXT: br i1 [[TMP4]], label [[END]], label [[VECTOR_BODY]]
; CHECK: end:
; CHECK-NEXT: ret void
;

vector.ph:
%ind.end = shl i32 %n.vec, 1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
%0 = icmp eq i32 %index, 50
%0 = icmp eq i32 %index, 48
br i1 %0, label %lower.block, label %end

lower.block: ; preds = %vector.body
Expand All @@ -120,7 +114,6 @@ end:
define arm_aapcs_vfpcc void @push_out_mul_sub_loop(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: @push_out_mul_sub_loop(
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 2
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
Expand All @@ -138,19 +131,18 @@ define arm_aapcs_vfpcc void @push_out_mul_sub_loop(i32* noalias nocapture readon
; CHECK-NEXT: br label [[VECTOR_2_BODY_END:%.*]]
; CHECK: vector.2.body.end:
; CHECK-NEXT: [[INDEX_2_NEXT:%.*]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_2_NEXT]], 15
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_2_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_BODY_END]], label [[VECTOR_2_BODY]]
; CHECK: vector.body.end:
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
; CHECK-NEXT: br i1 [[TMP6]], label [[END:%.*]], label [[VECTOR_BODY]]
; CHECK: end:
; CHECK-NEXT: ret void
;

vector.ph:
%ind.end = shl i32 %n.vec, 2
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand All @@ -162,7 +154,6 @@ vector.2.ph:
br label %vector.2.body

vector.2.body: ; preds = %vector.body
%index.2 = phi i32 [ 0, %vector.2.ph ], [ %index.2.next, %vector.2.body.end ]
%0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
%1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
%2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
Expand All @@ -174,7 +165,7 @@ vector.2.body: ; preds = %vector.body

vector.2.body.end: ; preds = %lower.block
%index.2.next = add i32 %index, 4
%5 = icmp eq i32 %index.2.next, 15
%5 = icmp eq i32 %index.2.next, 16
br i1 %5, label %vector.body.end, label %vector.2.body

vector.body.end: ; preds = %lower.block
Expand All @@ -187,4 +178,48 @@ end:
ret void;
}

define arm_aapcs_vfpcc void @invariant_add(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: @invariant_add(
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[L0:%.*]] = mul <4 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[L1:%.*]] = add <4 x i32> [[L0]], [[VEC_IND]]
; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[L1]], i32 32, i32 2, i32 1)
; CHECK-NEXT: [[L3:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[L4:%.*]] = bitcast i32* [[L3]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP0]], <4 x i32>* [[L4]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
; CHECK-NEXT: [[L5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
; CHECK-NEXT: br i1 [[L5]], label [[END:%.*]], label [[VECTOR_BODY]]
; CHECK: end:
; CHECK-NEXT: ret void
;

vector.ph:
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%l0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
%l1 = add <4 x i32> %l0, %vec.ind
%l2 = getelementptr inbounds i32, i32* %data, <4 x i32> %l1
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %l2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%l3 = getelementptr inbounds i32, i32* %dst, i32 %index
%l4 = bitcast i32* %l3 to <4 x i32>*
store <4 x i32> %wide.masked.gather, <4 x i32>* %l4, align 4
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%l5 = icmp eq i32 %index.next, %n.vec
br i1 %l5, label %end, label %vector.body

end:
ret void;
}


declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
85 changes: 42 additions & 43 deletions llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ define arm_aapcs_vfpcc void @push_out_mul_gather(i32* noalias nocapture readonly
; CHECK-NEXT: .long 4294967272 @ 0xffffffe8

vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -83,7 +82,6 @@ define arm_aapcs_vfpcc void @push_out_add_gather(i32* noalias nocapture readonly
; CHECK-NEXT: .long 16 @ 0x10

vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -127,7 +125,6 @@ define arm_aapcs_vfpcc void @push_out_mul_add_gather(i32* noalias nocapture read
; CHECK-NEXT: .long 0 @ 0x0

vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -173,7 +170,6 @@ define arm_aapcs_vfpcc void @push_out_mul_scatter(i32* noalias nocapture readonl
<4 x i32> %to.store) {

vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -215,7 +211,6 @@ define arm_aapcs_vfpcc void @push_out_add_scatter(i32* noalias nocapture readonl
<4 x i32> %to.store) {

vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -259,7 +254,6 @@ define arm_aapcs_vfpcc void @push_out_mul_gather_scatter(i32* noalias nocapture
i32* noalias nocapture %dst, i32 %n.vec) {

vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand Down Expand Up @@ -301,7 +295,6 @@ define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture reado
; CHECK-NEXT: .long 16 @ 0x10

vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand All @@ -328,26 +321,29 @@ end:
ret void;
}

define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec, <4 x i32>* %x) {
; CHECK-LABEL: non_gatscat_use1:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: adr r3, .LCPI7_0
; CHECK-NEXT: vmov.i32 q0, #0x8
; CHECK-NEXT: vldrw.u32 q2, [r3]
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: adr.w r12, .LCPI7_0
; CHECK-NEXT: vmov.i32 q0, #0x9
; CHECK-NEXT: vldrw.u32 q3, [r12]
; CHECK-NEXT: vmov.i32 q1, #0xc
; CHECK-NEXT: vmov.i32 q2, #0x8
; CHECK-NEXT: .LBB7_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vadd.i32 q3, q2, q0
; CHECK-NEXT: vmlas.u32 q2, q1, r0
; CHECK-NEXT: vldrw.u32 q4, [q2, #24]
; CHECK-NEXT: vadd.i32 q4, q3, q2
; CHECK-NEXT: vmul.i32 q5, q3, q0
; CHECK-NEXT: vmlas.u32 q3, q1, r0
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q2, q3
; CHECK-NEXT: vstrb.8 q4, [r1], #16
; CHECK-NEXT: vldrw.u32 q6, [q3, #24]
; CHECK-NEXT: vmov q3, q4
; CHECK-NEXT: vstrw.32 q5, [r3]
; CHECK-NEXT: vstrb.8 q6, [r1], #16
; CHECK-NEXT: bne .LBB7_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
Expand All @@ -358,7 +354,6 @@ define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %d
; CHECK-NEXT: .long 6 @ 0x6

vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand All @@ -372,6 +367,7 @@ vector.body: ; preds = %vector.body, %vecto
%4 = bitcast i32* %3 to <4 x i32>*
store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
%non_gatscat_use = mul <4 x i32> %0, <i32 3, i32 3, i32 3, i32 3>
store <4 x i32> %non_gatscat_use, <4 x i32>* %x, align 4
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%5 = icmp eq i32 %index.next, %n.vec
Expand All @@ -381,26 +377,31 @@ end:
ret void;
}

define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec, <4 x i32>* %x) {
; CHECK-LABEL: non_gatscat_use2:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: adr r3, .LCPI8_0
; CHECK-NEXT: vmov.i32 q0, #0x8
; CHECK-NEXT: vldrw.u32 q2, [r3]
; CHECK-NEXT: vmov.i32 q1, #0xc
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: adr.w r12, .LCPI8_0
; CHECK-NEXT: vmov.i32 q0, #0x12
; CHECK-NEXT: vldrw.u32 q4, [r12]
; CHECK-NEXT: vmov.i32 q1, #0x9
; CHECK-NEXT: vmov.i32 q2, #0x8
; CHECK-NEXT: vmov.i32 q3, #0xc
; CHECK-NEXT: .LBB8_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vadd.i32 q3, q2, q0
; CHECK-NEXT: vmlas.u32 q2, q1, r0
; CHECK-NEXT: vldrw.u32 q4, [q2, #24]
; CHECK-NEXT: vadd.i32 q5, q4, q2
; CHECK-NEXT: vmul.i32 q6, q4, q1
; CHECK-NEXT: vmlas.u32 q4, q3, r0
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q2, q3
; CHECK-NEXT: vstrb.8 q4, [r1], #16
; CHECK-NEXT: vldrw.u32 q7, [q4, #24]
; CHECK-NEXT: vadd.i32 q4, q6, q0
; CHECK-NEXT: vstrw.32 q4, [r3]
; CHECK-NEXT: vmov q4, q5
; CHECK-NEXT: vstrb.8 q7, [r1], #16
; CHECK-NEXT: bne .LBB8_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
Expand All @@ -411,7 +412,6 @@ define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %d
; CHECK-NEXT: .long 6 @ 0x6

vector.ph: ; preds = %for.body.preheader
%ind.end = shl i32 %n.vec, 1
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
Expand All @@ -425,6 +425,7 @@ vector.body: ; preds = %vector.body, %vecto
%4 = bitcast i32* %3 to <4 x i32>*
store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
%non_gatscat_use = mul <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
store <4 x i32> %non_gatscat_use, <4 x i32>* %x, align 4
%index.next = add i32 %index, 4
%vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
%5 = icmp eq i32 %index.next, %n.vec
Expand Down Expand Up @@ -528,7 +529,6 @@ for.cond8.preheader.us.us.preheader.preheader: ; preds = %entry
%2 = add nuw i32 %1, 1
%min.iters.check = icmp ult i32 %0, 6
%n.vec = and i32 %2, -4
%ind.end = shl i32 %n.vec, 1
%broadcast.splatinsert86 = insertelement <4 x i32> undef, i32 %m, i32 0
%broadcast.splat87 = shufflevector <4 x i32> %broadcast.splatinsert86, <4 x i32> undef, <4 x i32> zeroinitializer
%cmp.n = icmp eq i32 %2, %n.vec
Expand Down Expand Up @@ -854,12 +854,12 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: add.w r8, r7, #10
; CHECK-NEXT: adr r7, .LCPI11_0
; CHECK-NEXT: ldr r1, [sp, #96]
; CHECK-NEXT: vdup.32 q1, r2
; CHECK-NEXT: vldrw.u32 q0, [r7]
; CHECK-NEXT: vdup.32 q0, r2
; CHECK-NEXT: vldrw.u32 q1, [r7]
; CHECK-NEXT: mov.w r10, #0
; CHECK-NEXT: mov.w r9, #6
; CHECK-NEXT: movs r6, #11
; CHECK-NEXT: vshl.i32 q1, q1, #2
; CHECK-NEXT: vshl.i32 q0, q0, #2
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: .LBB11_1: @ %for.body10.i
; CHECK-NEXT: @ =>This Loop Header: Depth=1
Expand Down Expand Up @@ -894,10 +894,10 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: mul r4, r11, r6
; CHECK-NEXT: vdup.32 q3, r5
; CHECK-NEXT: vdup.32 q2, r7
; CHECK-NEXT: vadd.i32 q4, q0, r4
; CHECK-NEXT: vadd.i32 q4, q1, r4
; CHECK-NEXT: vmla.u32 q3, q4, r2
; CHECK-NEXT: adds r4, #113
; CHECK-NEXT: vadd.i32 q4, q0, r4
; CHECK-NEXT: vadd.i32 q4, q1, r4
; CHECK-NEXT: mov r4, r8
; CHECK-NEXT: vmla.u32 q2, q4, r2
; CHECK-NEXT: .LBB11_5: @ %vector.body
Expand All @@ -907,8 +907,8 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4
; CHECK-NEXT: @ => This Inner Loop Header: Depth=5
; CHECK-NEXT: vldrb.s32 q6, [r0, q2]
; CHECK-NEXT: vadd.i32 q5, q2, q1
; CHECK-NEXT: vadd.i32 q4, q3, q1
; CHECK-NEXT: vadd.i32 q5, q2, q0
; CHECK-NEXT: vadd.i32 q4, q3, q0
; CHECK-NEXT: subs r4, #4
; CHECK-NEXT: vadd.i32 q2, q6, r2
; CHECK-NEXT: vldrb.s32 q6, [r1, q3]
Expand Down Expand Up @@ -978,7 +978,6 @@ for.body10.i: ; preds = %for.cond.cleanup20.
br i1 0, label %for.cond.cleanup20.i, label %for.cond22.preheader.lr.ph.i

for.cond22.preheader.lr.ph.i: ; preds = %for.body10.i
%ind.end = add nsw i32 0, %n.vec
%.splatinsert = insertelement <4 x i32> undef, i32 0, i32 0
%.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %.splat, <i32 0, i32 1, i32 2, i32 3>
Expand Down
38 changes: 38 additions & 0 deletions llvm/test/CodeGen/Thumb2/mve-gather-unused.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s

; This files has some unused gathers, making sure that they do not cause
; problems as the function gets simplified.

define arm_aapcs_vfpcc void @unused1(<4 x i32*> %offs) {
; CHECK-LABEL: unused1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: bx lr
entry:
%gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
ret void
}

define arm_aapcs_vfpcc void @unused2(<4 x i32*> %offs) {
; CHECK-LABEL: unused2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: bx lr
entry:
%gather1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%gather2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
ret void
}

define arm_aapcs_vfpcc void @unused2_used(<4 x i32*> %offs) {
; CHECK-LABEL: unused2_used:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: bx lr
entry:
%gather1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%gather2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%unused = add <4 x i32> %gather1, %gather2
ret void
}


declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/Thumb2/mve-phireg.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat=false -verify-machineinstrs %s -o - | FileCheck %s

; verify-machineinstrs previously caught the incorrect use of QPR in the stack reloads.

Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/Thumb2/mve-pred-xor.ll
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @cmpugez_v4i1(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: cmpugez_v4i1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcmp.i32 ne, q0, zr
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vcmp.i32 eq, q0, zr
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: bx lr
entry:
%c1 = icmp eq <4 x i32> %a, zeroinitializer
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/Thumb2/mve-selectcc.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat=false -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK

define arm_aapcs_vfpcc <4 x i32> @test_v4i32(i32 %x, <4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: test_v4i32:
Expand Down
145 changes: 145 additions & 0 deletions llvm/test/CodeGen/Thumb2/mve-tailpred-loopinvariant.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s

; This test has an instruction that gets sunk into the loop, that is a
; active.lane.mask operand. (%exitcount.ptrcnt.to.int = ptrtoint). We
; need to make sure it is loop invariant.

define i32 @a(i32* readnone %b, i8* %c) {
; CHECK-LABEL: a:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: it ls
; CHECK-NEXT: popls {r4, pc}
; CHECK-NEXT: .LBB0_1: @ %while.body.preheader
; CHECK-NEXT: subs r0, r0, r1
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w r2, r0, #15
; CHECK-NEXT: mov r12, r1
; CHECK-NEXT: bic r2, r2, #15
; CHECK-NEXT: subs r2, #16
; CHECK-NEXT: add.w lr, r3, r2, lsr #4
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r3, r1, r2
; CHECK-NEXT: vctp.8 r0
; CHECK-NEXT: vmov.8 q0[0], r3
; CHECK-NEXT: adds r4, r3, #1
; CHECK-NEXT: vmov.8 q0[1], r4
; CHECK-NEXT: adds r4, r3, #2
; CHECK-NEXT: vmov.8 q0[2], r4
; CHECK-NEXT: adds r4, r3, #3
; CHECK-NEXT: vmov.8 q0[3], r4
; CHECK-NEXT: adds r4, r3, #4
; CHECK-NEXT: vmov.8 q0[4], r4
; CHECK-NEXT: adds r4, r3, #5
; CHECK-NEXT: vmov.8 q0[5], r4
; CHECK-NEXT: adds r4, r3, #6
; CHECK-NEXT: vmov.8 q0[6], r4
; CHECK-NEXT: adds r4, r3, #7
; CHECK-NEXT: vmov.8 q0[7], r4
; CHECK-NEXT: add.w r4, r3, #8
; CHECK-NEXT: vmov.8 q0[8], r4
; CHECK-NEXT: add.w r4, r3, #9
; CHECK-NEXT: vmov.8 q0[9], r4
; CHECK-NEXT: add.w r4, r3, #10
; CHECK-NEXT: vmov.8 q0[10], r4
; CHECK-NEXT: add.w r4, r3, #11
; CHECK-NEXT: vmov.8 q0[11], r4
; CHECK-NEXT: add.w r4, r3, #12
; CHECK-NEXT: vmov.8 q0[12], r4
; CHECK-NEXT: add.w r4, r3, #13
; CHECK-NEXT: vmov.8 q0[13], r4
; CHECK-NEXT: add.w r4, r3, #14
; CHECK-NEXT: adds r2, #16
; CHECK-NEXT: subs r0, #16
; CHECK-NEXT: vmov.8 q0[14], r4
; CHECK-NEXT: adds r3, #15
; CHECK-NEXT: vmov.8 q0[15], r3
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrbt.8 q0, [r12], #16
; CHECK-NEXT: le lr, .LBB0_2
; CHECK-NEXT: @ %bb.3: @ %while.end
; CHECK-NEXT: pop {r4, pc}
entry:
%0 = bitcast i32* %b to i8*
%cmp3 = icmp ugt i8* %0, %c
br i1 %cmp3, label %while.body.preheader, label %while.end

while.body.preheader: ; preds = %entry
%c5 = ptrtoint i8* %c to i32
%1 = sub i32 0, %c5
%uglygep = getelementptr i8, i8* %0, i32 %1
%exitcount.ptrcnt.to.int = ptrtoint i8* %uglygep to i32
%n.rnd.up = add i32 %exitcount.ptrcnt.to.int, 15
%n.vec = and i32 %n.rnd.up, -16
br label %vector.body

vector.body: ; preds = %vector.body, %while.body.preheader
%index = phi i32 [ 0, %while.body.preheader ], [ %index.next, %vector.body ]
%next.gep = getelementptr i8, i8* %c, i32 %index
%2 = or i32 %index, 1
%next.gep7 = getelementptr i8, i8* %c, i32 %2
%3 = or i32 %index, 2
%next.gep8 = getelementptr i8, i8* %c, i32 %3
%4 = or i32 %index, 3
%next.gep9 = getelementptr i8, i8* %c, i32 %4
%5 = or i32 %index, 4
%next.gep10 = getelementptr i8, i8* %c, i32 %5
%6 = or i32 %index, 5
%next.gep11 = getelementptr i8, i8* %c, i32 %6
%7 = or i32 %index, 6
%next.gep12 = getelementptr i8, i8* %c, i32 %7
%8 = or i32 %index, 7
%next.gep13 = getelementptr i8, i8* %c, i32 %8
%9 = or i32 %index, 8
%next.gep14 = getelementptr i8, i8* %c, i32 %9
%10 = or i32 %index, 9
%next.gep15 = getelementptr i8, i8* %c, i32 %10
%11 = or i32 %index, 10
%next.gep16 = getelementptr i8, i8* %c, i32 %11
%12 = or i32 %index, 11
%next.gep17 = getelementptr i8, i8* %c, i32 %12
%13 = or i32 %index, 12
%next.gep18 = getelementptr i8, i8* %c, i32 %13
%14 = or i32 %index, 13
%next.gep19 = getelementptr i8, i8* %c, i32 %14
%15 = or i32 %index, 14
%next.gep20 = getelementptr i8, i8* %c, i32 %15
%16 = or i32 %index, 15
%next.gep21 = getelementptr i8, i8* %c, i32 %16
%17 = insertelement <16 x i8*> poison, i8* %next.gep, i32 0
%18 = insertelement <16 x i8*> %17, i8* %next.gep7, i32 1
%19 = insertelement <16 x i8*> %18, i8* %next.gep8, i32 2
%20 = insertelement <16 x i8*> %19, i8* %next.gep9, i32 3
%21 = insertelement <16 x i8*> %20, i8* %next.gep10, i32 4
%22 = insertelement <16 x i8*> %21, i8* %next.gep11, i32 5
%23 = insertelement <16 x i8*> %22, i8* %next.gep12, i32 6
%24 = insertelement <16 x i8*> %23, i8* %next.gep13, i32 7
%25 = insertelement <16 x i8*> %24, i8* %next.gep14, i32 8
%26 = insertelement <16 x i8*> %25, i8* %next.gep15, i32 9
%27 = insertelement <16 x i8*> %26, i8* %next.gep16, i32 10
%28 = insertelement <16 x i8*> %27, i8* %next.gep17, i32 11
%29 = insertelement <16 x i8*> %28, i8* %next.gep18, i32 12
%30 = insertelement <16 x i8*> %29, i8* %next.gep19, i32 13
%31 = insertelement <16 x i8*> %30, i8* %next.gep20, i32 14
%32 = insertelement <16 x i8*> %31, i8* %next.gep21, i32 15
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %exitcount.ptrcnt.to.int)
%33 = ptrtoint <16 x i8*> %32 to <16 x i32>
%34 = trunc <16 x i32> %33 to <16 x i8>
%35 = bitcast i8* %next.gep to <16 x i8>*
call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %34, <16 x i8>* %35, i32 1, <16 x i1> %active.lane.mask)
%index.next = add i32 %index, 16
%36 = icmp eq i32 %index.next, %n.vec
br i1 %36, label %while.end, label %vector.body

while.end: ; preds = %vector.body, %entry
ret i32 undef
}

declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
Loading