146 changes: 75 additions & 71 deletions llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
Original file line number Diff line number Diff line change
Expand Up @@ -447,78 +447,79 @@ end:
define dso_local void @arm_mat_mult_q31(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %n, i32 %m, i32 %l) local_unnamed_addr #0 {
; CHECK-LABEL: arm_mat_mult_q31:
; CHECK: @ %bb.0: @ %for.cond8.preheader.us.us.preheader.preheader
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: ldrd r9, r12, [sp, #120]
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: ldrd r9, r12, [sp, #128]
; CHECK-NEXT: sub.w r7, r12, #1
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: add.w r7, r6, r7, lsr #1
; CHECK-NEXT: vdup.32 q1, r9
; CHECK-NEXT: bic r7, r7, #3
; CHECK-NEXT: vshl.i32 q3, q1, #3
; CHECK-NEXT: subs r7, #4
; CHECK-NEXT: add.w r10, r6, r7, lsr #2
; CHECK-NEXT: adr r7, .LCPI9_0
; CHECK-NEXT: adr r6, .LCPI9_1
; CHECK-NEXT: vldrw.u32 q2, [r7]
; CHECK-NEXT: vldrw.u32 q0, [r6]
; CHECK-NEXT: adr r7, .LCPI9_0
; CHECK-NEXT: vldrw.u32 q1, [r7]
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: vdup.32 q0, r9
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: vshl.i32 q3, q0, #3
; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: .LBB9_1: @ %for.cond8.preheader.us.us.preheader
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB9_2 Depth 2
; CHECK-NEXT: @ Child Loop BB9_3 Depth 3
; CHECK-NEXT: mul r11, r8, r9
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: mul r7, r8, r12
; CHECK-NEXT: mul lr, r8, r12
; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: mul r6, r8, r9
; CHECK-NEXT: vdup.32 q4, lr
; CHECK-NEXT: vshl.i32 q4, q4, #2
; CHECK-NEXT: vadd.i32 q4, q4, r0
; CHECK-NEXT: vadd.i32 q4, q4, q0
; CHECK-NEXT: .LBB9_2: @ %vector.ph
; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB9_3 Depth 3
; CHECK-NEXT: vdup.32 q5, r7
; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT: vshl.i32 q5, q5, #2
; CHECK-NEXT: vmov q6, q1
; CHECK-NEXT: vadd.i32 q5, q5, r0
; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: vmov q7, q2
; CHECK-NEXT: dls lr, r10
; CHECK-NEXT: vmov.i32 q4, #0x0
; CHECK-NEXT: vadd.i32 q5, q5, q0
; CHECK-NEXT: vmlas.i32 q6, q2, r5
; CHECK-NEXT: vmov.i32 q5, #0x0
; CHECK-NEXT: vmlas.i32 q7, q0, r7
; CHECK-NEXT: vmov q6, q4
; CHECK-NEXT: .LBB9_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
; CHECK-NEXT: vadd.i32 q7, q6, q3
; CHECK-NEXT: vldrw.u32 q0, [r1, q6, uxtw #2]
; CHECK-NEXT: vldrw.u32 q6, [q5, #32]!
; CHECK-NEXT: vmul.i32 q0, q0, q6
; CHECK-NEXT: vmov q6, q7
; CHECK-NEXT: vadd.i32 q4, q0, q4
; CHECK-NEXT: vadd.i32 q0, q7, q3
; CHECK-NEXT: vldrw.u32 q1, [r1, q7, uxtw #2]
; CHECK-NEXT: vldrw.u32 q7, [q6, #32]!
; CHECK-NEXT: vmul.i32 q1, q1, q7
; CHECK-NEXT: vmov q7, q0
; CHECK-NEXT: vadd.i32 q5, q1, q5
; CHECK-NEXT: le lr, .LBB9_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2
; CHECK-NEXT: add.w r4, r5, r11
; CHECK-NEXT: adds r5, #1
; CHECK-NEXT: vaddv.u32 r6, q4
; CHECK-NEXT: cmp r5, r9
; CHECK-NEXT: str.w r6, [r2, r4, lsl #2]
; CHECK-NEXT: adds r5, r7, r6
; CHECK-NEXT: adds r7, #1
; CHECK-NEXT: vaddv.u32 r4, q5
; CHECK-NEXT: cmp r7, r9
; CHECK-NEXT: str.w r4, [r2, r5, lsl #2]
; CHECK-NEXT: bne .LBB9_2
; CHECK-NEXT: @ %bb.5: @ %for.cond4.for.cond.cleanup6_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB9_1 Depth=1
; CHECK-NEXT: add.w r8, r8, #1
; CHECK-NEXT: cmp r8, r3
; CHECK-NEXT: bne .LBB9_1
; CHECK-NEXT: @ %bb.6: @ %for.end25
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.7:
; CHECK-NEXT: .LCPI9_0:
Expand Down Expand Up @@ -859,18 +860,18 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: ldrd r2, r7, [sp, #104]
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #24
; CHECK-NEXT: sub sp, #24
; CHECK-NEXT: ldrd r2, r7, [sp, #136]
; CHECK-NEXT: add.w r8, r7, #10
; CHECK-NEXT: adr r7, .LCPI11_0
; CHECK-NEXT: ldr r1, [sp, #96]
; CHECK-NEXT: ldr r1, [sp, #128]
; CHECK-NEXT: vdup.32 q0, r2
; CHECK-NEXT: vldrw.u32 q1, [r7]
; CHECK-NEXT: mov.w r10, #0
; CHECK-NEXT: mov.w r9, #6
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: mov.w r10, #6
; CHECK-NEXT: movs r6, #11
; CHECK-NEXT: vshl.i32 q0, q0, #2
; CHECK-NEXT: movs r5, #0
Expand All @@ -880,74 +881,77 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: @ Child Loop BB11_3 Depth 3
; CHECK-NEXT: @ Child Loop BB11_4 Depth 4
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: mov.w r9, #0
; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: .LBB11_2: @ %for.cond22.preheader.i
; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB11_3 Depth 3
; CHECK-NEXT: @ Child Loop BB11_4 Depth 4
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: vdup.32 q2, r9
; CHECK-NEXT: vstrw.32 q2, [sp, #8] @ 16-byte Spill
; CHECK-NEXT: .LBB11_3: @ %for.body27.i
; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2
; CHECK-NEXT: @ => This Loop Header: Depth=3
; CHECK-NEXT: @ Child Loop BB11_4 Depth 4
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
; CHECK-NEXT: dls lr, r9
; CHECK-NEXT: dls lr, r10
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov.w r11, #4
; CHECK-NEXT: vdup.32 q3, r7
; CHECK-NEXT: .LBB11_4: @ %for.body78.us.i
; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2
; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3
; CHECK-NEXT: @ => This Loop Header: Depth=4
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
; CHECK-NEXT: mul r4, r11, r6
; CHECK-NEXT: vdup.32 q3, r5
; CHECK-NEXT: vdup.32 q2, r7
; CHECK-NEXT: vadd.i32 q4, q1, r4
; CHECK-NEXT: vmla.i32 q3, q4, r2
; CHECK-NEXT: adds r4, #113
; CHECK-NEXT: vadd.i32 q4, q1, r4
; CHECK-NEXT: mov r4, r8
; CHECK-NEXT: vmla.i32 q2, q4, r2
; CHECK-NEXT: mul r5, r11, r6
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vadd.i32 q5, q1, r5
; CHECK-NEXT: vmla.i32 q4, q5, r2
; CHECK-NEXT: vldrw.u32 q5, [sp, #8] @ 16-byte Reload
; CHECK-NEXT: adds r5, #113
; CHECK-NEXT: vadd.i32 q6, q1, r5
; CHECK-NEXT: mov r5, r8
; CHECK-NEXT: vmla.i32 q5, q6, r2
; CHECK-NEXT: .LBB11_5: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2
; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3
; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4
; CHECK-NEXT: @ => This Inner Loop Header: Depth=5
; CHECK-NEXT: vldrb.s32 q6, [r0, q2]
; CHECK-NEXT: vadd.i32 q5, q2, q0
; CHECK-NEXT: vadd.i32 q4, q3, q0
; CHECK-NEXT: subs r4, #4
; CHECK-NEXT: vadd.i32 q2, q6, r2
; CHECK-NEXT: vldrb.s32 q6, [r1, q3]
; CHECK-NEXT: vmov q3, q4
; CHECK-NEXT: vmlava.u32 r12, q2, q6
; CHECK-NEXT: vmov q2, q5
; CHECK-NEXT: vldrb.s32 q2, [r0, q5]
; CHECK-NEXT: vadd.i32 q7, q5, q0
; CHECK-NEXT: vldrb.s32 q5, [r1, q4]
; CHECK-NEXT: vadd.i32 q6, q4, q0
; CHECK-NEXT: vadd.i32 q2, q2, r2
; CHECK-NEXT: subs r5, #4
; CHECK-NEXT: vmlava.u32 r12, q2, q5
; CHECK-NEXT: vmov q5, q7
; CHECK-NEXT: vmov q4, q6
; CHECK-NEXT: bne .LBB11_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=4
; CHECK-NEXT: add.w r11, r11, #1
; CHECK-NEXT: le lr, .LBB11_4
; CHECK-NEXT: @ %bb.7: @ %for.cond.cleanup77.i
; CHECK-NEXT: @ in Loop: Header=BB11_3 Depth=3
; CHECK-NEXT: adds r5, #1
; CHECK-NEXT: add.w r10, r10, #1
; CHECK-NEXT: cmp r5, r2
; CHECK-NEXT: adds r7, #1
; CHECK-NEXT: adds r4, #1
; CHECK-NEXT: cmp r7, r2
; CHECK-NEXT: bne .LBB11_3
; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup26.i
; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=2
; CHECK-NEXT: adds r7, #1
; CHECK-NEXT: cmp r7, r3
; CHECK-NEXT: add.w r9, r9, #1
; CHECK-NEXT: cmp r9, r3
; CHECK-NEXT: bne .LBB11_2
; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup20.i
; CHECK-NEXT: @ in Loop: Header=BB11_1 Depth=1
; CHECK-NEXT: ldr r5, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: ldr r7, [sp, #148]
; CHECK-NEXT: ldr r7, [sp, #180]
; CHECK-NEXT: adds r5, #1
; CHECK-NEXT: cmp r5, r7
; CHECK-NEXT: it eq
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/WebAssembly/reg-stackify.ll
Original file line number Diff line number Diff line change
Expand Up @@ -471,8 +471,7 @@ define i32 @commute_to_fix_ordering(i32 %arg) {
; CHECK-LABEL: multiple_defs:
; CHECK: f64.add $push[[NUM0:[0-9]+]]=, ${{[0-9]+}}, $pop{{[0-9]+}}{{$}}
; CHECK-NEXT: local.tee $push[[NUM1:[0-9]+]]=, $[[NUM2:[0-9]+]]=, $pop[[NUM0]]{{$}}
; CHECK-NEXT: f64.select $push{{[0-9]+}}=, $pop{{[0-9]+}}, $pop[[NUM1]], ${{[0-9]+}}{{$}}
; CHECK: $[[NUM2]]=,
; CHECK-NEXT: f64.select ${{[0-9]+}}=, $pop{{[0-9]+}}, $pop[[NUM1]], ${{[0-9]+}}{{$}}
; NOREGS-LABEL: multiple_defs:
; NOREGS: f64.add
; NOREGS: local.tee
Expand Down