Original file line number Diff line number Diff line change
Expand Up @@ -30,25 +30,42 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
; ENABLED-NEXT: mov r9, r12
; ENABLED-NEXT: uxth r0, r0
; ENABLED-NEXT: rsbs r5, r0, #0
; ENABLED-NEXT: b .LBB0_4
; ENABLED-NEXT: .LBB0_2: @ in Loop: Header=BB0_4 Depth=1
; ENABLED-NEXT: movs r0, #0
; ENABLED-NEXT: .LBB0_3: @ %for.end
; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1
; ENABLED-NEXT: b .LBB0_5
; ENABLED-NEXT: .LBB0_2: @ %vector.body
; ENABLED-NEXT: @ Parent Loop BB0_5 Depth=1
; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2
; ENABLED-NEXT: vctp.32 r4
; ENABLED-NEXT: vmov q0, q1
; ENABLED-NEXT: vpstt
; ENABLED-NEXT: vldrht.s32 q1, [r0], #8
; ENABLED-NEXT: vldrht.s32 q2, [r7], #8
; ENABLED-NEXT: mov lr, r6
; ENABLED-NEXT: vmul.i32 q1, q2, q1
; ENABLED-NEXT: subs r6, #1
; ENABLED-NEXT: vshl.s32 q1, r5
; ENABLED-NEXT: subs r4, #4
; ENABLED-NEXT: vadd.i32 q1, q1, q0
; ENABLED-NEXT: le lr, .LBB0_2
; ENABLED-NEXT: @ %bb.3: @ %middle.block
; ENABLED-NEXT: @ in Loop: Header=BB0_5 Depth=1
; ENABLED-NEXT: vpsel q0, q1, q0
; ENABLED-NEXT: vaddv.u32 r0, q0
; ENABLED-NEXT: .LBB0_4: @ %for.end
; ENABLED-NEXT: @ in Loop: Header=BB0_5 Depth=1
; ENABLED-NEXT: lsrs r0, r0, #16
; ENABLED-NEXT: sub.w r9, r9, #1
; ENABLED-NEXT: strh.w r0, [r1, r8, lsl #1]
; ENABLED-NEXT: add.w r8, r8, #1
; ENABLED-NEXT: add.w r10, r10, #2
; ENABLED-NEXT: cmp r8, r3
; ENABLED-NEXT: beq .LBB0_8
; ENABLED-NEXT: .LBB0_4: @ %for.body
; ENABLED-NEXT: .LBB0_5: @ %for.body
; ENABLED-NEXT: @ =>This Loop Header: Depth=1
; ENABLED-NEXT: @ Child Loop BB0_6 Depth 2
; ENABLED-NEXT: @ Child Loop BB0_2 Depth 2
; ENABLED-NEXT: cmp r2, r8
; ENABLED-NEXT: ble .LBB0_2
; ENABLED-NEXT: @ %bb.5: @ %vector.ph
; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1
; ENABLED-NEXT: ble .LBB0_7
; ENABLED-NEXT: @ %bb.6: @ %vector.ph
; ENABLED-NEXT: @ in Loop: Header=BB0_5 Depth=1
; ENABLED-NEXT: bic r0, r9, #3
; ENABLED-NEXT: movs r7, #1
; ENABLED-NEXT: subs r0, #4
Expand All @@ -62,26 +79,10 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
; ENABLED-NEXT: mov r7, r10
; ENABLED-NEXT: dls lr, r0
; ENABLED-NEXT: ldr r0, [sp] @ 4-byte Reload
; ENABLED-NEXT: .LBB0_6: @ %vector.body
; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1
; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2
; ENABLED-NEXT: vctp.32 r4
; ENABLED-NEXT: vmov q0, q1
; ENABLED-NEXT: vpstt
; ENABLED-NEXT: vldrht.s32 q1, [r0], #8
; ENABLED-NEXT: vldrht.s32 q2, [r7], #8
; ENABLED-NEXT: mov lr, r6
; ENABLED-NEXT: vmul.i32 q1, q2, q1
; ENABLED-NEXT: subs r6, #1
; ENABLED-NEXT: vshl.s32 q1, r5
; ENABLED-NEXT: subs r4, #4
; ENABLED-NEXT: vadd.i32 q1, q1, q0
; ENABLED-NEXT: le lr, .LBB0_6
; ENABLED-NEXT: @ %bb.7: @ %middle.block
; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1
; ENABLED-NEXT: vpsel q0, q1, q0
; ENABLED-NEXT: vaddv.u32 r0, q0
; ENABLED-NEXT: b .LBB0_3
; ENABLED-NEXT: b .LBB0_2
; ENABLED-NEXT: .LBB0_7: @ in Loop: Header=BB0_5 Depth=1
; ENABLED-NEXT: movs r0, #0
; ENABLED-NEXT: b .LBB0_4
; ENABLED-NEXT: .LBB0_8: @ %for.end17
; ENABLED-NEXT: add sp, #4
; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
Expand All @@ -101,25 +102,42 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
; NOREDUCTIONS-NEXT: mov r9, r12
; NOREDUCTIONS-NEXT: uxth r0, r0
; NOREDUCTIONS-NEXT: rsbs r5, r0, #0
; NOREDUCTIONS-NEXT: b .LBB0_4
; NOREDUCTIONS-NEXT: .LBB0_2: @ in Loop: Header=BB0_4 Depth=1
; NOREDUCTIONS-NEXT: movs r0, #0
; NOREDUCTIONS-NEXT: .LBB0_3: @ %for.end
; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1
; NOREDUCTIONS-NEXT: b .LBB0_5
; NOREDUCTIONS-NEXT: .LBB0_2: @ %vector.body
; NOREDUCTIONS-NEXT: @ Parent Loop BB0_5 Depth=1
; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2
; NOREDUCTIONS-NEXT: vctp.32 r4
; NOREDUCTIONS-NEXT: vmov q0, q1
; NOREDUCTIONS-NEXT: vpstt
; NOREDUCTIONS-NEXT: vldrht.s32 q1, [r0], #8
; NOREDUCTIONS-NEXT: vldrht.s32 q2, [r7], #8
; NOREDUCTIONS-NEXT: mov lr, r6
; NOREDUCTIONS-NEXT: vmul.i32 q1, q2, q1
; NOREDUCTIONS-NEXT: subs r6, #1
; NOREDUCTIONS-NEXT: vshl.s32 q1, r5
; NOREDUCTIONS-NEXT: subs r4, #4
; NOREDUCTIONS-NEXT: vadd.i32 q1, q1, q0
; NOREDUCTIONS-NEXT: le lr, .LBB0_2
; NOREDUCTIONS-NEXT: @ %bb.3: @ %middle.block
; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_5 Depth=1
; NOREDUCTIONS-NEXT: vpsel q0, q1, q0
; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0
; NOREDUCTIONS-NEXT: .LBB0_4: @ %for.end
; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_5 Depth=1
; NOREDUCTIONS-NEXT: lsrs r0, r0, #16
; NOREDUCTIONS-NEXT: sub.w r9, r9, #1
; NOREDUCTIONS-NEXT: strh.w r0, [r1, r8, lsl #1]
; NOREDUCTIONS-NEXT: add.w r8, r8, #1
; NOREDUCTIONS-NEXT: add.w r10, r10, #2
; NOREDUCTIONS-NEXT: cmp r8, r3
; NOREDUCTIONS-NEXT: beq .LBB0_8
; NOREDUCTIONS-NEXT: .LBB0_4: @ %for.body
; NOREDUCTIONS-NEXT: .LBB0_5: @ %for.body
; NOREDUCTIONS-NEXT: @ =>This Loop Header: Depth=1
; NOREDUCTIONS-NEXT: @ Child Loop BB0_6 Depth 2
; NOREDUCTIONS-NEXT: @ Child Loop BB0_2 Depth 2
; NOREDUCTIONS-NEXT: cmp r2, r8
; NOREDUCTIONS-NEXT: ble .LBB0_2
; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph
; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1
; NOREDUCTIONS-NEXT: ble .LBB0_7
; NOREDUCTIONS-NEXT: @ %bb.6: @ %vector.ph
; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_5 Depth=1
; NOREDUCTIONS-NEXT: bic r0, r9, #3
; NOREDUCTIONS-NEXT: movs r7, #1
; NOREDUCTIONS-NEXT: subs r0, #4
Expand All @@ -133,26 +151,10 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
; NOREDUCTIONS-NEXT: mov r7, r10
; NOREDUCTIONS-NEXT: dls lr, r0
; NOREDUCTIONS-NEXT: ldr r0, [sp] @ 4-byte Reload
; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body
; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1
; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2
; NOREDUCTIONS-NEXT: vctp.32 r4
; NOREDUCTIONS-NEXT: vmov q0, q1
; NOREDUCTIONS-NEXT: vpstt
; NOREDUCTIONS-NEXT: vldrht.s32 q1, [r0], #8
; NOREDUCTIONS-NEXT: vldrht.s32 q2, [r7], #8
; NOREDUCTIONS-NEXT: mov lr, r6
; NOREDUCTIONS-NEXT: vmul.i32 q1, q2, q1
; NOREDUCTIONS-NEXT: subs r6, #1
; NOREDUCTIONS-NEXT: vshl.s32 q1, r5
; NOREDUCTIONS-NEXT: subs r4, #4
; NOREDUCTIONS-NEXT: vadd.i32 q1, q1, q0
; NOREDUCTIONS-NEXT: le lr, .LBB0_6
; NOREDUCTIONS-NEXT: @ %bb.7: @ %middle.block
; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1
; NOREDUCTIONS-NEXT: vpsel q0, q1, q0
; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0
; NOREDUCTIONS-NEXT: b .LBB0_3
; NOREDUCTIONS-NEXT: b .LBB0_2
; NOREDUCTIONS-NEXT: .LBB0_7: @ in Loop: Header=BB0_5 Depth=1
; NOREDUCTIONS-NEXT: movs r0, #0
; NOREDUCTIONS-NEXT: b .LBB0_4
; NOREDUCTIONS-NEXT: .LBB0_8: @ %for.end17
; NOREDUCTIONS-NEXT: add sp, #4
; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,7 @@ define i32 @vcmp_new_vpst_combination(i32 %len, i32* nocapture readonly %arr) {
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp7 = icmp sgt i32 %len, 0
Expand Down
46 changes: 23 additions & 23 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
Original file line number Diff line number Diff line change
Expand Up @@ -60,39 +60,39 @@ define void @nested(i32* nocapture readonly %x, i32* nocapture readnone %y, i32*
; CHECK-NEXT: ldr r5, [sp, #28]
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: b .LBB1_4
; CHECK-NEXT: .LBB1_2: @ in Loop: Header=BB1_4 Depth=1
; CHECK-NEXT: mov r4, r3
; CHECK-NEXT: .LBB1_3: @ %if.end
; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1
; CHECK-NEXT: str.w r4, [r2, r1, lsl #2]
; CHECK-NEXT: adds r1, #1
; CHECK-NEXT: cmp r1, r3
; CHECK-NEXT: beq .LBB1_8
; CHECK-NEXT: .LBB1_4: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB1_6 Depth 2
; CHECK-NEXT: adds r7, r5, #3
; CHECK-NEXT: cmp.w r12, r7, lsr #2
; CHECK-NEXT: beq .LBB1_2
; CHECK-NEXT: @ %bb.5: @ %do.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1
; CHECK-NEXT: b .LBB1_6
; CHECK-NEXT: .LBB1_2: @ %do.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
; CHECK-NEXT: bic r9, r7, #3
; CHECK-NEXT: mov r7, r5
; CHECK-NEXT: mov r4, r3
; CHECK-NEXT: add.w r8, r0, r9, lsl #2
; CHECK-NEXT: dlstp.32 lr, r5
; CHECK-NEXT: .LBB1_6: @ %do.body
; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1
; CHECK-NEXT: .LBB1_3: @ %do.body
; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vaddva.s32 r4, q0
; CHECK-NEXT: letp lr, .LBB1_6
; CHECK-NEXT: @ %bb.7: @ %if.end.loopexit
; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1
; CHECK-NEXT: letp lr, .LBB1_3
; CHECK-NEXT: @ %bb.4: @ %if.end.loopexit
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
; CHECK-NEXT: sub.w r5, r5, r9
; CHECK-NEXT: mov r0, r8
; CHECK-NEXT: b .LBB1_3
; CHECK-NEXT: .LBB1_5: @ %if.end
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
; CHECK-NEXT: str.w r4, [r2, r1, lsl #2]
; CHECK-NEXT: adds r1, #1
; CHECK-NEXT: cmp r1, r3
; CHECK-NEXT: beq .LBB1_8
; CHECK-NEXT: .LBB1_6: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB1_3 Depth 2
; CHECK-NEXT: adds r7, r5, #3
; CHECK-NEXT: cmp.w r12, r7, lsr #2
; CHECK-NEXT: bne .LBB1_2
; CHECK-NEXT: @ %bb.7: @ in Loop: Header=BB1_6 Depth=1
; CHECK-NEXT: mov r4, r3
; CHECK-NEXT: b .LBB1_5
; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
entry:
Expand Down
64 changes: 31 additions & 33 deletions llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1102,20 +1102,31 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
; CHECK-NEXT: add.w r3, r12, #16
; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: b .LBB16_4
; CHECK-NEXT: .LBB16_3: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: b .LBB16_6
; CHECK-NEXT: .LBB16_3: @ %while.body76
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldrh r1, [r6], #2
; CHECK-NEXT: vldrh.u16 q1, [r0], #2
; CHECK-NEXT: subs.w lr, lr, #1
; CHECK-NEXT: vfma.f16 q0, q1, r1
; CHECK-NEXT: bne .LBB16_3
; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: add.w r5, r5, r8, lsl #1
; CHECK-NEXT: .LBB16_5: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: subs.w r9, r9, #1
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vstrb.8 q0, [r2], #8
; CHECK-NEXT: add.w r0, r5, r0, lsl #1
; CHECK-NEXT: add.w r5, r0, #8
; CHECK-NEXT: beq.w .LBB16_12
; CHECK-NEXT: .LBB16_4: @ %while.body
; CHECK-NEXT: .LBB16_6: @ %while.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB16_6 Depth 2
; CHECK-NEXT: @ Child Loop BB16_10 Depth 2
; CHECK-NEXT: @ Child Loop BB16_8 Depth 2
; CHECK-NEXT: @ Child Loop BB16_3 Depth 2
; CHECK-NEXT: vldrw.u32 q0, [r1], #8
; CHECK-NEXT: ldrh.w lr, [r12, #14]
; CHECK-NEXT: ldrh.w r0, [r12, #12]
Expand Down Expand Up @@ -1152,14 +1163,14 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
; CHECK-NEXT: adds r5, #16
; CHECK-NEXT: vfma.f16 q0, q1, lr
; CHECK-NEXT: cmp r0, #16
; CHECK-NEXT: blo .LBB16_7
; CHECK-NEXT: @ %bb.5: @ %for.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: blo .LBB16_11
; CHECK-NEXT: @ %bb.7: @ %for.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
; CHECK-NEXT: dls lr, r0
; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: .LBB16_6: @ %for.body
; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1
; CHECK-NEXT: .LBB16_8: @ %for.body
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldrh r0, [r6], #16
; CHECK-NEXT: vldrw.u32 q1, [r5]
Expand Down Expand Up @@ -1190,32 +1201,19 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: adds r5, #16
; CHECK-NEXT: vfma.f16 q0, q1, r1
; CHECK-NEXT: le lr, .LBB16_6
; CHECK-NEXT: b .LBB16_8
; CHECK-NEXT: .LBB16_7: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: .LBB16_8: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: le lr, .LBB16_8
; CHECK-NEXT: .LBB16_9: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: cmp.w r8, #0
; CHECK-NEXT: beq.w .LBB16_3
; CHECK-NEXT: b .LBB16_9
; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: beq.w .LBB16_5
; CHECK-NEXT: @ %bb.10: @ %while.body76.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: mov r0, r5
; CHECK-NEXT: mov lr, r8
; CHECK-NEXT: .LBB16_10: @ %while.body76
; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldrh r1, [r6], #2
; CHECK-NEXT: vldrh.u16 q1, [r0], #2
; CHECK-NEXT: subs.w lr, lr, #1
; CHECK-NEXT: vfma.f16 q0, q1, r1
; CHECK-NEXT: bne .LBB16_10
; CHECK-NEXT: b .LBB16_11
; CHECK-NEXT: .LBB16_11: @ %while.end.loopexit
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: add.w r5, r5, r8, lsl #1
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: .LBB16_11: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: b .LBB16_9
; CHECK-NEXT: .LBB16_12: @ %if.end
; CHECK-NEXT: add sp, #24
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
Expand Down
90 changes: 45 additions & 45 deletions llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1074,19 +1074,30 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
; CHECK-NEXT: str r4, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: b .LBB16_4
; CHECK-NEXT: .LBB16_3: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: b .LBB16_6
; CHECK-NEXT: .LBB16_3: @ %while.body76
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldr r0, [r7], #4
; CHECK-NEXT: vldrw.u32 q1, [r6], #4
; CHECK-NEXT: subs.w lr, lr, #1
; CHECK-NEXT: vfma.f32 q0, q1, r0
; CHECK-NEXT: bne .LBB16_3
; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: add.w r5, r5, r3, lsl #2
; CHECK-NEXT: .LBB16_5: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: subs.w r10, r10, #1
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: add.w r0, r5, r0, lsl #2
; CHECK-NEXT: add.w r5, r0, #16
; CHECK-NEXT: beq .LBB16_12
; CHECK-NEXT: .LBB16_4: @ %while.body
; CHECK-NEXT: .LBB16_6: @ %while.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB16_6 Depth 2
; CHECK-NEXT: @ Child Loop BB16_10 Depth 2
; CHECK-NEXT: @ Child Loop BB16_8 Depth 2
; CHECK-NEXT: @ Child Loop BB16_3 Depth 2
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: ldrd r7, r6, [r12]
; CHECK-NEXT: ldrd r0, r4, [r12, #8]
Expand All @@ -1112,14 +1123,14 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
; CHECK-NEXT: vfma.f32 q0, q3, r11
; CHECK-NEXT: cmp r0, #16
; CHECK-NEXT: vfma.f32 q0, q1, r8
; CHECK-NEXT: blo .LBB16_7
; CHECK-NEXT: @ %bb.5: @ %for.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: blo .LBB16_11
; CHECK-NEXT: @ %bb.7: @ %for.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: dls lr, r0
; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: .LBB16_6: @ %for.body
; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1
; CHECK-NEXT: .LBB16_8: @ %for.body
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldm.w r7, {r0, r3, r4, r6}
; CHECK-NEXT: vldrw.u32 q1, [r5], #32
Expand All @@ -1142,34 +1153,21 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
; CHECK-NEXT: adds r7, #32
; CHECK-NEXT: vfma.f32 q0, q3, r11
; CHECK-NEXT: vfma.f32 q0, q1, r9
; CHECK-NEXT: le lr, .LBB16_6
; CHECK-NEXT: b .LBB16_8
; CHECK-NEXT: .LBB16_7: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: .LBB16_8: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: le lr, .LBB16_8
; CHECK-NEXT: .LBB16_9: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: ldrd r9, r1, [sp, #24] @ 8-byte Folded Reload
; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: cmp.w r3, #0
; CHECK-NEXT: beq .LBB16_3
; CHECK-NEXT: b .LBB16_9
; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: beq .LBB16_5
; CHECK-NEXT: @ %bb.10: @ %while.body76.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: mov r6, r5
; CHECK-NEXT: mov lr, r3
; CHECK-NEXT: .LBB16_10: @ %while.body76
; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldr r0, [r7], #4
; CHECK-NEXT: vldrw.u32 q1, [r6], #4
; CHECK-NEXT: subs.w lr, lr, #1
; CHECK-NEXT: vfma.f32 q0, q1, r0
; CHECK-NEXT: bne .LBB16_10
; CHECK-NEXT: b .LBB16_11
; CHECK-NEXT: .LBB16_11: @ %while.end.loopexit
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: add.w r5, r5, r3, lsl #2
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: .LBB16_11: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: b .LBB16_9
; CHECK-NEXT: .LBB16_12: @ %if.end
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
Expand Down Expand Up @@ -1581,25 +1579,27 @@ define arm_aapcs_vfpcc void @fms(float* nocapture readonly %pSrc1, float* nocapt
; CHECK-NEXT: @ %bb.1: @ %do.body.preheader
; CHECK-NEXT: ldr.w r12, [sp, #20]
; CHECK-NEXT: lsr.w r5, lr, #2
; CHECK-NEXT: .LBB18_2: @ %do.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB18_3 Depth 2
; CHECK-NEXT: ldr r4, [r2]
; CHECK-NEXT: dls lr, r5
; CHECK-NEXT: vdup.32 q0, r4
; CHECK-NEXT: .LBB18_3: @ %while.body
; CHECK-NEXT: @ Parent Loop BB18_2 Depth=1
; CHECK-NEXT: b .LBB18_4
; CHECK-NEXT: .LBB18_2: @ %while.body
; CHECK-NEXT: @ Parent Loop BB18_4 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vldrw.u32 q2, [r0], #16
; CHECK-NEXT: vfms.f32 q2, q1, q0
; CHECK-NEXT: vstrb.8 q2, [r3], #16
; CHECK-NEXT: le lr, .LBB18_3
; CHECK-NEXT: @ %bb.4: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB18_2 Depth=1
; CHECK-NEXT: le lr, .LBB18_2
; CHECK-NEXT: @ %bb.3: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB18_4 Depth=1
; CHECK-NEXT: subs.w r12, r12, #1
; CHECK-NEXT: add.w r2, r2, #4
; CHECK-NEXT: bne .LBB18_2
; CHECK-NEXT: beq .LBB18_5
; CHECK-NEXT: .LBB18_4: @ %do.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB18_2 Depth 2
; CHECK-NEXT: ldr r4, [r2]
; CHECK-NEXT: dls lr, r5
; CHECK-NEXT: vdup.32 q0, r4
; CHECK-NEXT: b .LBB18_2
; CHECK-NEXT: .LBB18_5: @ %do.end
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
Expand Down
146 changes: 78 additions & 68 deletions llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
Original file line number Diff line number Diff line change
Expand Up @@ -325,23 +325,25 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(i32* noalias nocapture read
; CHECK-NEXT: adr r3, .LCPI8_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: .LBB8_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB8_3 Depth 2
; CHECK-NEXT: dls lr, r4
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: .LBB8_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB8_2 Depth=1
; CHECK-NEXT: b .LBB8_4
; CHECK-NEXT: .LBB8_2: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB8_4 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q2, [q1, #16]!
; CHECK-NEXT: vstrb.8 q2, [r0], #16
; CHECK-NEXT: le lr, .LBB8_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB8_2 Depth=1
; CHECK-NEXT: le lr, .LBB8_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB8_4 Depth=1
; CHECK-NEXT: cmp r12, r2
; CHECK-NEXT: bne .LBB8_2
; CHECK-NEXT: @ %bb.5: @ %for.cond.cleanup
; CHECK-NEXT: beq .LBB8_5
; CHECK-NEXT: .LBB8_4: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB8_2 Depth 2
; CHECK-NEXT: dls lr, r4
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: b .LBB8_2
; CHECK-NEXT: .LBB8_5: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.6:
Expand Down Expand Up @@ -402,28 +404,30 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(i32* noalias nocapture rea
; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vadd.i32 q2, q2, r0
; CHECK-NEXT: .LBB9_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB9_3 Depth 2
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vmov q5, q2
; CHECK-NEXT: .LBB9_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB9_2 Depth=1
; CHECK-NEXT: b .LBB9_4
; CHECK-NEXT: .LBB9_2: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB9_4 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q6, [q5, #48]!
; CHECK-NEXT: vldrw.u32 q7, [q3, #48]!
; CHECK-NEXT: vadd.i32 q6, q7, q6
; CHECK-NEXT: vldrw.u32 q7, [q4, #48]!
; CHECK-NEXT: vadd.i32 q6, q6, q7
; CHECK-NEXT: vstrb.8 q6, [r0], #16
; CHECK-NEXT: le lr, .LBB9_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=1
; CHECK-NEXT: le lr, .LBB9_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB9_4 Depth=1
; CHECK-NEXT: cmp r12, r2
; CHECK-NEXT: bne .LBB9_2
; CHECK-NEXT: beq .LBB9_5
; CHECK-NEXT: .LBB9_4: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB9_2 Depth 2
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vmov q5, q2
; CHECK-NEXT: b .LBB9_2
; CHECK-NEXT: .LBB9_5: @ %for.cond.cleanup
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r4, r5, r7, pc}
Expand Down Expand Up @@ -498,23 +502,25 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_large(i32* noalias nocapture reado
; CHECK-NEXT: adr r3, .LCPI10_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: .LBB10_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB10_3 Depth 2
; CHECK-NEXT: dls lr, r4
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: .LBB10_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB10_2 Depth=1
; CHECK-NEXT: b .LBB10_4
; CHECK-NEXT: .LBB10_2: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB10_4 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q2, [q1, #508]!
; CHECK-NEXT: vstrb.8 q2, [r0], #16
; CHECK-NEXT: le lr, .LBB10_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB10_2 Depth=1
; CHECK-NEXT: le lr, .LBB10_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB10_4 Depth=1
; CHECK-NEXT: cmp r12, r2
; CHECK-NEXT: bne .LBB10_2
; CHECK-NEXT: @ %bb.5: @ %for.cond.cleanup
; CHECK-NEXT: beq .LBB10_5
; CHECK-NEXT: .LBB10_4: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB10_2 Depth 2
; CHECK-NEXT: dls lr, r4
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: b .LBB10_2
; CHECK-NEXT: .LBB10_5: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.6:
Expand Down Expand Up @@ -578,15 +584,9 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(i16* noalias nocapture read
; CHECK-NEXT: adr r6, .LCPI11_0
; CHECK-NEXT: vldrw.u32 q0, [r6]
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB11_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB11_3 Depth 2
; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: dls lr, r1
; CHECK-NEXT: ldr r4, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: .LBB11_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1
; CHECK-NEXT: b .LBB11_4
; CHECK-NEXT: .LBB11_2: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB11_4 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vmov.u16 r7, q2[6]
; CHECK-NEXT: vmov.u16 r3, q2[4]
Expand Down Expand Up @@ -632,11 +632,19 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(i16* noalias nocapture read
; CHECK-NEXT: vmov.16 q3[6], r5
; CHECK-NEXT: vmov.16 q3[7], r6
; CHECK-NEXT: vstrb.8 q3, [r4], #16
; CHECK-NEXT: le lr, .LBB11_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=1
; CHECK-NEXT: le lr, .LBB11_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=1
; CHECK-NEXT: cmp r8, r2
; CHECK-NEXT: bne .LBB11_2
; CHECK-NEXT: beq .LBB11_5
; CHECK-NEXT: .LBB11_4: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB11_2 Depth 2
; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: dls lr, r1
; CHECK-NEXT: ldr r4, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: b .LBB11_2
; CHECK-NEXT: .LBB11_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: vpop {d8, d9}
Expand Down Expand Up @@ -717,17 +725,9 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
; CHECK-NEXT: str r1, [sp, #52] @ 4-byte Spill
; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: .LBB12_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB12_3 Depth 2
; CHECK-NEXT: ldr r1, [sp, #52] @ 4-byte Reload
; CHECK-NEXT: dls lr, r1
; CHECK-NEXT: ldr r4, [sp, #60] @ 4-byte Reload
; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload
; CHECK-NEXT: .LBB12_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB12_2 Depth=1
; CHECK-NEXT: b .LBB12_4
; CHECK-NEXT: .LBB12_2: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB12_4 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vmov.u16 r3, q5[2]
; CHECK-NEXT: vmov.u16 r5, q5[0]
Expand Down Expand Up @@ -864,12 +864,22 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
; CHECK-NEXT: vadd.i16 q0, q0, q2
; CHECK-NEXT: vadd.i16 q0, q0, q1
; CHECK-NEXT: vstrb.8 q0, [r4], #16
; CHECK-NEXT: le lr, .LBB12_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB12_2 Depth=1
; CHECK-NEXT: le lr, .LBB12_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB12_4 Depth=1
; CHECK-NEXT: ldr r1, [sp, #56] @ 4-byte Reload
; CHECK-NEXT: cmp r1, r2
; CHECK-NEXT: bne.w .LBB12_2
; CHECK-NEXT: beq .LBB12_5
; CHECK-NEXT: .LBB12_4: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB12_2 Depth 2
; CHECK-NEXT: ldr r1, [sp, #52] @ 4-byte Reload
; CHECK-NEXT: dls lr, r1
; CHECK-NEXT: ldr r4, [sp, #60] @ 4-byte Reload
; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload
; CHECK-NEXT: b .LBB12_2
; CHECK-NEXT: .LBB12_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #104
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
Expand Down
156 changes: 81 additions & 75 deletions llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
Original file line number Diff line number Diff line change
Expand Up @@ -460,29 +460,23 @@ define dso_local void @arm_mat_mult_q31(i32* noalias nocapture readonly %A, i32*
; CHECK-NEXT: vldrw.u32 q2, [r7]
; CHECK-NEXT: vldrw.u32 q0, [r6]
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: .LBB9_1: @ %for.cond8.preheader.us.us.preheader
; CHECK-NEXT: b .LBB9_2
; CHECK-NEXT: .LBB9_1: @ %for.cond4.for.cond.cleanup6_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=1
; CHECK-NEXT: add.w r8, r8, #1
; CHECK-NEXT: cmp r8, r3
; CHECK-NEXT: beq .LBB9_6
; CHECK-NEXT: .LBB9_2: @ %for.cond8.preheader.us.us.preheader
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB9_2 Depth 2
; CHECK-NEXT: @ Child Loop BB9_5 Depth 2
; CHECK-NEXT: @ Child Loop BB9_3 Depth 3
; CHECK-NEXT: mul r11, r8, r9
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: mul r7, r8, r12
; CHECK-NEXT: .LBB9_2: @ %vector.ph
; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB9_3 Depth 3
; CHECK-NEXT: vdup.32 q5, r7
; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT: vshl.i32 q5, q5, #2
; CHECK-NEXT: vmov q6, q1
; CHECK-NEXT: vadd.i32 q5, q5, r0
; CHECK-NEXT: dls lr, r10
; CHECK-NEXT: vmov.i32 q4, #0x0
; CHECK-NEXT: vadd.i32 q5, q5, q0
; CHECK-NEXT: vmlas.u32 q6, q2, r5
; CHECK-NEXT: b .LBB9_5
; CHECK-NEXT: .LBB9_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2
; CHECK-NEXT: @ Parent Loop BB9_2 Depth=1
; CHECK-NEXT: @ Parent Loop BB9_5 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
; CHECK-NEXT: vadd.i32 q7, q6, q3
; CHECK-NEXT: vldrw.u32 q0, [r1, q6, uxtw #2]
Expand All @@ -492,19 +486,28 @@ define dso_local void @arm_mat_mult_q31(i32* noalias nocapture readonly %A, i32*
; CHECK-NEXT: vadd.i32 q4, q0, q4
; CHECK-NEXT: le lr, .LBB9_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2
; CHECK-NEXT: @ in Loop: Header=BB9_5 Depth=2
; CHECK-NEXT: add.w r4, r5, r11
; CHECK-NEXT: adds r5, #1
; CHECK-NEXT: vaddv.u32 r6, q4
; CHECK-NEXT: cmp r5, r9
; CHECK-NEXT: str.w r6, [r2, r4, lsl #2]
; CHECK-NEXT: bne .LBB9_2
; CHECK-NEXT: @ %bb.5: @ %for.cond4.for.cond.cleanup6_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB9_1 Depth=1
; CHECK-NEXT: add.w r8, r8, #1
; CHECK-NEXT: cmp r8, r3
; CHECK-NEXT: bne .LBB9_1
; CHECK-NEXT: @ %bb.6: @ %for.end25
; CHECK-NEXT: beq .LBB9_1
; CHECK-NEXT: .LBB9_5: @ %vector.ph
; CHECK-NEXT: @ Parent Loop BB9_2 Depth=1
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB9_3 Depth 3
; CHECK-NEXT: vdup.32 q5, r7
; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT: vshl.i32 q5, q5, #2
; CHECK-NEXT: vmov q6, q1
; CHECK-NEXT: vadd.i32 q5, q5, r0
; CHECK-NEXT: dls lr, r10
; CHECK-NEXT: vmov.i32 q4, #0x0
; CHECK-NEXT: vadd.i32 q5, q5, q0
; CHECK-NEXT: vmlas.u32 q6, q2, r5
; CHECK-NEXT: b .LBB9_3
; CHECK-NEXT: .LBB9_6: @ %for.end25
; CHECK-NEXT: add sp, #24
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
Expand Down Expand Up @@ -861,36 +864,43 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: movs r6, #11
; CHECK-NEXT: vshl.i32 q1, q1, #2
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: .LBB11_1: @ %for.body10.i
; CHECK-NEXT: b .LBB11_2
; CHECK-NEXT: .LBB11_1: @ %for.cond.cleanup20.i
; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=1
; CHECK-NEXT: ldr r5, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: ldr r7, [sp, #148]
; CHECK-NEXT: adds r5, #1
; CHECK-NEXT: cmp r5, r7
; CHECK-NEXT: it eq
; CHECK-NEXT: moveq r5, #0
; CHECK-NEXT: .LBB11_2: @ %for.body10.i
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB11_2 Depth 2
; CHECK-NEXT: @ Child Loop BB11_3 Depth 3
; CHECK-NEXT: @ Child Loop BB11_4 Depth 4
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
; CHECK-NEXT: @ Child Loop BB11_4 Depth 2
; CHECK-NEXT: @ Child Loop BB11_9 Depth 3
; CHECK-NEXT: @ Child Loop BB11_5 Depth 4
; CHECK-NEXT: @ Child Loop BB11_6 Depth 5
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: .LBB11_2: @ %for.cond22.preheader.i
; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1
; CHECK-NEXT: b .LBB11_4
; CHECK-NEXT: .LBB11_3: @ %for.cond.cleanup26.i
; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=2
; CHECK-NEXT: adds r7, #1
; CHECK-NEXT: cmp r7, r3
; CHECK-NEXT: beq .LBB11_1
; CHECK-NEXT: .LBB11_4: @ %for.cond22.preheader.i
; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB11_3 Depth 3
; CHECK-NEXT: @ Child Loop BB11_4 Depth 4
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
; CHECK-NEXT: @ Child Loop BB11_9 Depth 3
; CHECK-NEXT: @ Child Loop BB11_5 Depth 4
; CHECK-NEXT: @ Child Loop BB11_6 Depth 5
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: .LBB11_3: @ %for.body27.i
; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2
; CHECK-NEXT: @ => This Loop Header: Depth=3
; CHECK-NEXT: @ Child Loop BB11_4 Depth 4
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
; CHECK-NEXT: dls lr, r9
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov.w r11, #4
; CHECK-NEXT: .LBB11_4: @ %for.body78.us.i
; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2
; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3
; CHECK-NEXT: b .LBB11_9
; CHECK-NEXT: .LBB11_5: @ %for.body78.us.i
; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1
; CHECK-NEXT: @ Parent Loop BB11_4 Depth=2
; CHECK-NEXT: @ Parent Loop BB11_9 Depth=3
; CHECK-NEXT: @ => This Loop Header: Depth=4
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
; CHECK-NEXT: @ Child Loop BB11_6 Depth 5
; CHECK-NEXT: mul r4, r11, r6
; CHECK-NEXT: vdup.32 q3, r5
; CHECK-NEXT: vdup.32 q2, r7
Expand All @@ -900,11 +910,11 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: vadd.i32 q4, q0, r4
; CHECK-NEXT: mov r4, r8
; CHECK-NEXT: vmla.u32 q2, q4, r2
; CHECK-NEXT: .LBB11_5: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2
; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3
; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4
; CHECK-NEXT: .LBB11_6: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1
; CHECK-NEXT: @ Parent Loop BB11_4 Depth=2
; CHECK-NEXT: @ Parent Loop BB11_9 Depth=3
; CHECK-NEXT: @ Parent Loop BB11_5 Depth=4
; CHECK-NEXT: @ => This Inner Loop Header: Depth=5
; CHECK-NEXT: vldrb.s32 q6, [r0, q2]
; CHECK-NEXT: vadd.i32 q5, q2, q1
Expand All @@ -915,31 +925,27 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: vmov q3, q4
; CHECK-NEXT: vmlava.u32 r12, q2, q6
; CHECK-NEXT: vmov q2, q5
; CHECK-NEXT: bne .LBB11_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=4
; CHECK-NEXT: bne .LBB11_6
; CHECK-NEXT: @ %bb.7: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB11_5 Depth=4
; CHECK-NEXT: add.w r11, r11, #1
; CHECK-NEXT: le lr, .LBB11_4
; CHECK-NEXT: @ %bb.7: @ %for.cond.cleanup77.i
; CHECK-NEXT: @ in Loop: Header=BB11_3 Depth=3
; CHECK-NEXT: le lr, .LBB11_5
; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup77.i
; CHECK-NEXT: @ in Loop: Header=BB11_9 Depth=3
; CHECK-NEXT: adds r5, #1
; CHECK-NEXT: add.w r10, r10, #1
; CHECK-NEXT: cmp r5, r2
; CHECK-NEXT: bne .LBB11_3
; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup26.i
; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=2
; CHECK-NEXT: adds r7, #1
; CHECK-NEXT: cmp r7, r3
; CHECK-NEXT: bne .LBB11_2
; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup20.i
; CHECK-NEXT: @ in Loop: Header=BB11_1 Depth=1
; CHECK-NEXT: ldr r5, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: ldr r7, [sp, #148]
; CHECK-NEXT: adds r5, #1
; CHECK-NEXT: cmp r5, r7
; CHECK-NEXT: it eq
; CHECK-NEXT: moveq r5, #0
; CHECK-NEXT: b .LBB11_1
; CHECK-NEXT: beq .LBB11_3
; CHECK-NEXT: .LBB11_9: @ %for.body27.i
; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1
; CHECK-NEXT: @ Parent Loop BB11_4 Depth=2
; CHECK-NEXT: @ => This Loop Header: Depth=3
; CHECK-NEXT: @ Child Loop BB11_5 Depth 4
; CHECK-NEXT: @ Child Loop BB11_6 Depth 5
; CHECK-NEXT: dls lr, r9
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov.w r11, #4
; CHECK-NEXT: b .LBB11_5
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI11_0:
Expand Down
28 changes: 15 additions & 13 deletions llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,25 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(i32* noalias nocapture read
; CHECK-NEXT: adr r3, .LCPI0_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: .LBB0_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB0_3 Depth 2
; CHECK-NEXT: dls lr, r4
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: .LBB0_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1
; CHECK-NEXT: b .LBB0_4
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB0_4 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q2, [q1, #16]!
; CHECK-NEXT: vstrb.8 q2, [r0], #16
; CHECK-NEXT: le lr, .LBB0_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: le lr, .LBB0_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB0_4 Depth=1
; CHECK-NEXT: cmp r12, r2
; CHECK-NEXT: bne .LBB0_2
; CHECK-NEXT: @ %bb.5: @ %for.cond.cleanup
; CHECK-NEXT: beq .LBB0_5
; CHECK-NEXT: .LBB0_4: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB0_2 Depth 2
; CHECK-NEXT: dls lr, r4
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: b .LBB0_2
; CHECK-NEXT: .LBB0_5: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.6:
Expand Down
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1866,8 +1866,7 @@ define arm_aapcs_vfpcc void @usatmul_4_q15(i16* nocapture readonly %pSrcA, i16*
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB11_8
; CHECK-NEXT: cbz r3, .LBB11_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB11_3
Expand Down Expand Up @@ -2132,8 +2131,7 @@ define arm_aapcs_vfpcc void @ssatmul_4_q7(i8* nocapture readonly %pSrcA, i8* noc
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB13_8
; CHECK-NEXT: cbz r3, .LBB13_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB13_3
Expand Down
28 changes: 15 additions & 13 deletions llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
Original file line number Diff line number Diff line change
Expand Up @@ -160,24 +160,26 @@ define arm_aapcs_vfpcc void @scatter_inc_v4i32_complex(<4 x i32> %data1, <4 x i3
; CHECK-NEXT: vadd.i32 q4, q3, r0
; CHECK-NEXT: vldrw.u32 q3, [r12]
; CHECK-NEXT: vadd.i32 q3, q3, r0
; CHECK-NEXT: .LBB3_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB3_3 Depth 2
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: vmov q6, q4
; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
; CHECK-NEXT: vmov q5, q3
; CHECK-NEXT: .LBB3_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1
; CHECK-NEXT: b .LBB3_4
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB3_4 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vstrw.32 q0, [q5, #48]!
; CHECK-NEXT: vstrw.32 q1, [q6, #48]!
; CHECK-NEXT: vstrw.32 q2, [q7, #48]!
; CHECK-NEXT: le lr, .LBB3_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1
; CHECK-NEXT: le lr, .LBB3_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB3_4 Depth=1
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: bne .LBB3_2
; CHECK-NEXT: beq .LBB3_5
; CHECK-NEXT: .LBB3_4: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB3_2 Depth 2
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: vmov q6, q4
; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
; CHECK-NEXT: vmov q5, q3
; CHECK-NEXT: b .LBB3_2
; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #24
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
Expand Down
114 changes: 56 additions & 58 deletions llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ define i32 @add_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB0_7
; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: b .LBB0_9
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB0_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
Expand All @@ -45,7 +45,7 @@ define i32 @add_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: ldr r1, [r2], #4
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: le lr, .LBB0_8
; CHECK-NEXT: .LBB0_9: @ %for.cond.cleanup
; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
Expand Down Expand Up @@ -206,8 +206,8 @@ define i32 @and_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB2_7
; CHECK-NEXT: .LBB2_3:
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: b .LBB2_9
; CHECK-NEXT: mov.w r0, #-1
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB2_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
Expand Down Expand Up @@ -306,8 +306,8 @@ define i32 @or_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB3_7
; CHECK-NEXT: .LBB3_3:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB3_9
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB3_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
Expand Down Expand Up @@ -406,8 +406,8 @@ define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB4_7
; CHECK-NEXT: .LBB4_3:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB4_9
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB4_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
Expand Down Expand Up @@ -507,7 +507,8 @@ define float @fadd_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB5_7
; CHECK-NEXT: .LBB5_3:
; CHECK-NEXT: vldr s0, .LCPI5_0
; CHECK-NEXT: b .LBB5_9
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB5_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
Expand Down Expand Up @@ -608,7 +609,8 @@ define float @fmul_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB6_7
; CHECK-NEXT: .LBB6_3:
; CHECK-NEXT: vmov.f32 s0, #1.000000e+00
; CHECK-NEXT: b .LBB6_9
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB6_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
Expand Down Expand Up @@ -704,8 +706,8 @@ define i32 @smin_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB7_7
; CHECK-NEXT: .LBB7_3:
; CHECK-NEXT: mvn r2, #-2147483648
; CHECK-NEXT: b .LBB7_9
; CHECK-NEXT: mvn r0, #-2147483648
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB7_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
Expand Down Expand Up @@ -804,7 +806,7 @@ define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB8_7
; CHECK-NEXT: .LBB8_3:
; CHECK-NEXT: mvn r0, #-2147483648
; CHECK-NEXT: b .LBB8_9
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB8_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
Expand Down Expand Up @@ -832,7 +834,7 @@ define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: csel r0, r0, r1, lt
; CHECK-NEXT: le lr, .LBB8_8
; CHECK-NEXT: .LBB8_9: @ %for.cond.cleanup
; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
Expand Down Expand Up @@ -900,8 +902,8 @@ define i32 @smax_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB9_7
; CHECK-NEXT: .LBB9_3:
; CHECK-NEXT: mov.w r2, #-2147483648
; CHECK-NEXT: b .LBB9_9
; CHECK-NEXT: mov.w r0, #-2147483648
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB9_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
Expand Down Expand Up @@ -1000,7 +1002,7 @@ define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB10_7
; CHECK-NEXT: .LBB10_3:
; CHECK-NEXT: mov.w r0, #-2147483648
; CHECK-NEXT: b .LBB10_9
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB10_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
Expand Down Expand Up @@ -1028,7 +1030,7 @@ define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: csel r0, r0, r1, gt
; CHECK-NEXT: le lr, .LBB10_8
; CHECK-NEXT: .LBB10_9: @ %for.cond.cleanup
; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
Expand Down Expand Up @@ -1096,8 +1098,8 @@ define i32 @umin_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB11_7
; CHECK-NEXT: .LBB11_3:
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: b .LBB11_9
; CHECK-NEXT: mov.w r0, #-1
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB11_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
Expand Down Expand Up @@ -1196,7 +1198,7 @@ define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB12_7
; CHECK-NEXT: .LBB12_3:
; CHECK-NEXT: mov.w r0, #-1
; CHECK-NEXT: b .LBB12_9
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB12_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
Expand Down Expand Up @@ -1224,7 +1226,7 @@ define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: csel r0, r0, r1, hi
; CHECK-NEXT: le lr, .LBB12_8
; CHECK-NEXT: .LBB12_9: @ %for.cond.cleanup
; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
Expand Down Expand Up @@ -1292,8 +1294,8 @@ define i32 @umax_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB13_7
; CHECK-NEXT: .LBB13_3:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB13_9
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB13_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
Expand Down Expand Up @@ -1392,7 +1394,7 @@ define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB14_7
; CHECK-NEXT: .LBB14_3:
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: b .LBB14_9
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB14_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
Expand Down Expand Up @@ -1420,7 +1422,7 @@ define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: csel r0, r0, r1, hi
; CHECK-NEXT: le lr, .LBB14_8
; CHECK-NEXT: .LBB14_9: @ %for.cond.cleanup
; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
Expand Down Expand Up @@ -1489,7 +1491,8 @@ define float @fmin_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB15_7
; CHECK-NEXT: .LBB15_3:
; CHECK-NEXT: vldr s0, .LCPI15_0
; CHECK-NEXT: b .LBB15_9
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB15_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
Expand Down Expand Up @@ -1594,7 +1597,8 @@ define float @fmax_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB16_7
; CHECK-NEXT: .LBB16_3:
; CHECK-NEXT: vldr s0, .LCPI16_0
; CHECK-NEXT: b .LBB16_9
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB16_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
Expand Down Expand Up @@ -1701,8 +1705,7 @@ define i32 @add4i32(i32* noalias nocapture readonly %x, i32 %n) {
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB17_4:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6.not = icmp eq i32 %n, 0
Expand Down Expand Up @@ -1752,8 +1755,7 @@ define i32 @mla4i32(i32* noalias nocapture readonly %x, i32* noalias nocapture r
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB18_4:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp8.not = icmp eq i32 %n, 0
Expand Down Expand Up @@ -1806,8 +1808,7 @@ define i32 @add8i32(i16* noalias nocapture readonly %x, i32 %n) {
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB19_4:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6.not = icmp eq i32 %n, 0
Expand Down Expand Up @@ -1858,8 +1859,7 @@ define i32 @mla8i32(i16* noalias nocapture readonly %x, i16* noalias nocapture r
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB20_4:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp9.not = icmp eq i32 %n, 0
Expand Down Expand Up @@ -1914,8 +1914,7 @@ define i32 @add16i32(i8* noalias nocapture readonly %x, i32 %n) {
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB21_4:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6.not = icmp eq i32 %n, 0
Expand Down Expand Up @@ -1966,8 +1965,7 @@ define i32 @mla16i32(i8* noalias nocapture readonly %x, i8* noalias nocapture re
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB22_4:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp9.not = icmp eq i32 %n, 0
Expand Down Expand Up @@ -2327,7 +2325,7 @@ define i64 @add4i64(i32* noalias nocapture readonly %x, i32 %n) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r1, .LBB29_3
; CHECK-NEXT: cbz r1, .LBB29_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r3, r2
Expand All @@ -2337,14 +2335,14 @@ define i64 @add4i64(i32* noalias nocapture readonly %x, i32 %n) {
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vaddlva.s32 r2, r3, q0
; CHECK-NEXT: letp lr, .LBB29_2
; CHECK-NEXT: b .LBB29_4
; CHECK-NEXT: .LBB29_3:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: .LBB29_4: @ %for.cond.cleanup
; CHECK-NEXT: .LBB29_3: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB29_4:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: b .LBB29_3
entry:
%cmp6.not = icmp eq i32 %n, 0
br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
Expand Down Expand Up @@ -2380,7 +2378,7 @@ define i64 @mla4i64(i32* noalias nocapture readonly %x, i32* noalias nocapture r
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r2, .LBB30_3
; CHECK-NEXT: cbz r2, .LBB30_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r3, r12
Expand All @@ -2391,14 +2389,14 @@ define i64 @mla4i64(i32* noalias nocapture readonly %x, i32* noalias nocapture r
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vmlalva.s32 r12, r3, q1, q0
; CHECK-NEXT: letp lr, .LBB30_2
; CHECK-NEXT: b .LBB30_4
; CHECK-NEXT: .LBB30_3:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: .LBB30_4: @ %for.cond.cleanup
; CHECK-NEXT: .LBB30_3: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB30_4:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: b .LBB30_3
entry:
%cmp9.not = icmp eq i32 %n, 0
br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
Expand Down Expand Up @@ -2439,7 +2437,7 @@ define i64 @mla8i64(i16* noalias nocapture readonly %x, i16* noalias nocapture r
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r2, .LBB31_3
; CHECK-NEXT: cbz r2, .LBB31_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r3, r12
Expand All @@ -2450,14 +2448,14 @@ define i64 @mla8i64(i16* noalias nocapture readonly %x, i16* noalias nocapture r
; CHECK-NEXT: vldrh.u16 q1, [r1], #16
; CHECK-NEXT: vmlalva.s16 r12, r3, q1, q0
; CHECK-NEXT: letp lr, .LBB31_2
; CHECK-NEXT: b .LBB31_4
; CHECK-NEXT: .LBB31_3:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: .LBB31_4: @ %for.cond.cleanup
; CHECK-NEXT: .LBB31_3: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB31_4:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: b .LBB31_3
entry:
%cmp9.not = icmp eq i32 %n, 0
br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@ define void @arm_cmplx_mag_squared_f16(half* nocapture readonly %pSrc, half* noc
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: beq .LBB0_8
; CHECK-NEXT: cbz r2, .LBB0_8
; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
; CHECK-NEXT: cmp r2, #8
; CHECK-NEXT: blo .LBB0_9
Expand Down
25 changes: 0 additions & 25 deletions llvm/test/Transforms/HardwareLoops/ARM/structure.ll
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -hardware-loops %s -S -o - | \
; RUN: FileCheck %s
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi %s -o - | \
; RUN: FileCheck %s --check-prefix=CHECK-LLC
; RUN: opt -mtriple=thumbv8.1m.main -loop-unroll -unroll-remainder=false -S < %s | \
; RUN: llc -mtriple=thumbv8.1m.main | FileCheck %s --check-prefix=CHECK-UNROLL
; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -hardware-loops \
Expand Down Expand Up @@ -65,15 +63,6 @@ do.end:
; CHECK-NOT: [[LOOP_DEC1:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
; CHECK-NOT: br i1 [[LOOP_DEC1]], label %while.cond1.preheader.us, label %while.end7

; CHECK-LLC: nested:
; CHECK-LLC-NOT: mov lr, r1
; CHECK-LLC: dls lr, r1
; CHECK-LLC-NOT: mov lr, r1
; CHECK-LLC: [[LOOP_HEADER:\.LBB[0-9._]+]]:
; CHECK-LLC: le lr, [[LOOP_HEADER]]
; CHECK-LLC-NOT: b [[LOOP_EXIT:\.LBB[0-9._]+]]
; CHECK-LLC: [[LOOP_EXIT:\.LBB[0-9._]+]]:

define void @nested(i32* nocapture %A, i32 %N) {
entry:
%cmp20 = icmp eq i32 %N, 0
Expand Down Expand Up @@ -363,12 +352,6 @@ for.body:
; CHECK: call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
; CHECK: call i32 @llvm.loop.decrement.reg.i32(

; CHECK-LLC-LABEL: unroll_inc_unsigned:
; CHECK-LLC: wls lr, r3, [[EXIT:.LBB[0-9_]+]]
; CHECK-LLC: [[HEADER:.LBB[0-9_]+]]:
; CHECK-LLC: le lr, [[HEADER]]
; CHECK-LLC-NEXT: [[EXIT]]:

; TODO: We should be able to support the unrolled loop body.
; CHECK-UNROLL-LABEL: unroll_inc_unsigned
; CHECK-UNROLL: [[PREHEADER:.LBB[0-9_]+]]: @ %for.body.preheader
Expand Down Expand Up @@ -407,14 +390,6 @@ for.body:
; CHECK: call i32 @llvm.start.loop.iterations.i32(i32 %N)
; CHECK: call i32 @llvm.loop.decrement.reg.i32(

; TODO: An unnecessary register is being held to hold COUNT, lr should just
; be used instead.
; CHECK-LLC-LABEL: unroll_dec_int:
; CHECK-LLC: dls lr, r3
; CHECK-LLC-NOT: mov lr, r3
; CHECK-LLC: [[HEADER:.LBB[0-9_]+]]:
; CHECK-LLC: le lr, [[HEADER]]

; CHECK-UNROLL-LABEL: unroll_dec_int:
; CHECK-UNROLL: wls lr, {{.*}}, [[PROLOGUE_EXIT:.LBB[0-9_]+]]
; CHECK-UNROLL-NEXT: [[PROLOGUE:.LBB[0-9_]+]]:
Expand Down