Expand Up
@@ -447,78 +447,79 @@ end:
define dso_local void @arm_mat_mult_q31 (i32* noalias nocapture readonly %A , i32* noalias nocapture readonly %B , i32* noalias nocapture %C , i32 %n , i32 %m , i32 %l ) local_unnamed_addr #0 {
; CHECK-LABEL: arm_mat_mult_q31:
; CHECK: @ %bb.0: @ %for.cond8.preheader.us.us.preheader.preheader
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: ldrd r9, r12, [sp, #120 ]
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: ldrd r9, r12, [sp, #128 ]
; CHECK-NEXT: sub.w r7, r12, #1
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: add.w r7, r6, r7, lsr #1
; CHECK-NEXT: vdup.32 q1, r9
; CHECK-NEXT: bic r7, r7, #3
; CHECK-NEXT: vshl.i32 q3, q1, #3
; CHECK-NEXT: subs r7, #4
; CHECK-NEXT: add.w r10, r6, r7, lsr #2
; CHECK-NEXT: adr r7, .LCPI9_0
; CHECK-NEXT: adr r6, .LCPI9_1
; CHECK-NEXT: vldrw.u32 q2, [r7]
; CHECK-NEXT: vldrw.u32 q0, [r6]
; CHECK-NEXT: adr r7, .LCPI9_0
; CHECK-NEXT: vldrw.u32 q1, [r7]
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: vdup.32 q0, r9
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: vshl.i32 q3, q0, #3
; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: .LBB9_1: @ %for.cond8.preheader.us.us.preheader
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB9_2 Depth 2
; CHECK-NEXT: @ Child Loop BB9_3 Depth 3
; CHECK-NEXT: mul r11, r8, r9
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: mul r7, r8, r12
; CHECK-NEXT: mul lr, r8, r12
; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: mul r6, r8, r9
; CHECK-NEXT: vdup.32 q4, lr
; CHECK-NEXT: vshl.i32 q4, q4, #2
; CHECK-NEXT: vadd.i32 q4, q4, r0
; CHECK-NEXT: vadd.i32 q4, q4, q0
; CHECK-NEXT: .LBB9_2: @ %vector.ph
; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB9_3 Depth 3
; CHECK-NEXT: vdup.32 q5, r7
; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT: vshl.i32 q5, q5, #2
; CHECK-NEXT: vmov q6, q1
; CHECK-NEXT: vadd.i32 q5, q5, r0
; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: vmov q7, q2
; CHECK-NEXT: dls lr, r10
; CHECK-NEXT: vmov.i32 q4 , #0x0
; CHECK-NEXT: vadd .i32 q5, q5, q0
; CHECK-NEXT: vmlas.i32 q6, q2, r5
; CHECK-NEXT: vmov.i32 q5 , #0x0
; CHECK-NEXT: vmlas .i32 q7, q0, r7
; CHECK-NEXT: vmov q6, q4
; CHECK-NEXT: .LBB9_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
; CHECK-NEXT: vadd.i32 q7, q6 , q3
; CHECK-NEXT: vldrw.u32 q0 , [r1, q6 , uxtw #2]
; CHECK-NEXT: vldrw.u32 q6 , [q5 , #32]!
; CHECK-NEXT: vmul.i32 q0, q0, q6
; CHECK-NEXT: vmov q6, q7
; CHECK-NEXT: vadd.i32 q4, q0, q4
; CHECK-NEXT: vadd.i32 q0, q7 , q3
; CHECK-NEXT: vldrw.u32 q1 , [r1, q7 , uxtw #2]
; CHECK-NEXT: vldrw.u32 q7 , [q6 , #32]!
; CHECK-NEXT: vmul.i32 q1, q1, q7
; CHECK-NEXT: vmov q7, q0
; CHECK-NEXT: vadd.i32 q5, q1, q5
; CHECK-NEXT: le lr, .LBB9_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2
; CHECK-NEXT: add.w r4, r5, r11
; CHECK-NEXT: adds r5 , #1
; CHECK-NEXT: vaddv.u32 r6, q4
; CHECK-NEXT: cmp r5 , r9
; CHECK-NEXT: str.w r6 , [r2, r4 , lsl #2]
; CHECK-NEXT: adds r5, r7, r6
; CHECK-NEXT: adds r7 , #1
; CHECK-NEXT: vaddv.u32 r4, q5
; CHECK-NEXT: cmp r7 , r9
; CHECK-NEXT: str.w r4 , [r2, r5 , lsl #2]
; CHECK-NEXT: bne .LBB9_2
; CHECK-NEXT: @ %bb.5: @ %for.cond4.for.cond.cleanup6_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB9_1 Depth=1
; CHECK-NEXT: add.w r8, r8, #1
; CHECK-NEXT: cmp r8, r3
; CHECK-NEXT: bne .LBB9_1
; CHECK-NEXT: @ %bb.6: @ %for.end25
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.7:
; CHECK-NEXT: .LCPI9_0:
Expand Down
Expand Up
@@ -859,18 +860,18 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: ldrd r2, r7, [sp, #104 ]
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15 }
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15 }
; CHECK-NEXT: .pad #24
; CHECK-NEXT: sub sp, #24
; CHECK-NEXT: ldrd r2, r7, [sp, #136 ]
; CHECK-NEXT: add.w r8, r7, #10
; CHECK-NEXT: adr r7, .LCPI11_0
; CHECK-NEXT: ldr r1, [sp, #96 ]
; CHECK-NEXT: ldr r1, [sp, #128 ]
; CHECK-NEXT: vdup.32 q0, r2
; CHECK-NEXT: vldrw.u32 q1, [r7]
; CHECK-NEXT: mov.w r10 , #0
; CHECK-NEXT: mov.w r9 , #6
; CHECK-NEXT: movs r4 , #0
; CHECK-NEXT: mov.w r10 , #6
; CHECK-NEXT: movs r6, #11
; CHECK-NEXT: vshl.i32 q0, q0, #2
; CHECK-NEXT: movs r5, #0
Expand All
@@ -880,74 +881,77 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: @ Child Loop BB11_3 Depth 3
; CHECK-NEXT: @ Child Loop BB11_4 Depth 4
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
; CHECK-NEXT: movs r7 , #0
; CHECK-NEXT: mov.w r9 , #0
; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: .LBB11_2: @ %for.cond22.preheader.i
; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB11_3 Depth 3
; CHECK-NEXT: @ Child Loop BB11_4 Depth 4
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: vdup.32 q2, r9
; CHECK-NEXT: vstrw.32 q2, [sp, #8] @ 16-byte Spill
; CHECK-NEXT: .LBB11_3: @ %for.body27.i
; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2
; CHECK-NEXT: @ => This Loop Header: Depth=3
; CHECK-NEXT: @ Child Loop BB11_4 Depth 4
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
; CHECK-NEXT: dls lr, r9
; CHECK-NEXT: dls lr, r10
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov.w r11, #4
; CHECK-NEXT: vdup.32 q3, r7
; CHECK-NEXT: .LBB11_4: @ %for.body78.us.i
; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2
; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3
; CHECK-NEXT: @ => This Loop Header: Depth=4
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
; CHECK-NEXT: mul r4 , r11, r6
; CHECK-NEXT: vdup.32 q3, r5
; CHECK-NEXT: vdup.32 q2, r7
; CHECK-NEXT: vadd .i32 q4, q1, r4
; CHECK-NEXT: vmla.i32 q3, q4, r2
; CHECK-NEXT: adds r4 , #113
; CHECK-NEXT: vadd.i32 q4 , q1, r4
; CHECK-NEXT: mov r4 , r8
; CHECK-NEXT: vmla.i32 q2, q4 , r2
; CHECK-NEXT: mul r5 , r11, r6
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vadd.i32 q5, q1, r5
; CHECK-NEXT: vmla .i32 q4, q5, r2
; CHECK-NEXT: vldrw.u32 q5, [sp, #8] @ 16-byte Reload
; CHECK-NEXT: adds r5 , #113
; CHECK-NEXT: vadd.i32 q6 , q1, r5
; CHECK-NEXT: mov r5 , r8
; CHECK-NEXT: vmla.i32 q5, q6 , r2
; CHECK-NEXT: .LBB11_5: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2
; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3
; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4
; CHECK-NEXT: @ => This Inner Loop Header: Depth=5
; CHECK-NEXT: vldrb.s32 q6 , [r0, q2 ]
; CHECK-NEXT: vadd.i32 q5, q2 , q0
; CHECK-NEXT: vadd.i32 q4, q3, q0
; CHECK-NEXT: subs r4, #4
; CHECK-NEXT: vadd.i32 q2, q6 , r2
; CHECK-NEXT: vldrb.s32 q6, [r1, q3]
; CHECK-NEXT: vmov q3, q4
; CHECK-NEXT: vmlava.u32 r12, q2, q6
; CHECK-NEXT: vmov q2, q5
; CHECK-NEXT: vldrb.s32 q2 , [r0, q5 ]
; CHECK-NEXT: vadd.i32 q7, q5 , q0
; CHECK-NEXT: vldrb.s32 q5, [r1, q4]
; CHECK-NEXT: vadd.i32 q6, q4, q0
; CHECK-NEXT: vadd.i32 q2, q2 , r2
; CHECK-NEXT: subs r5, #4
; CHECK-NEXT: vmlava.u32 r12, q2, q5
; CHECK-NEXT: vmov q5, q7
; CHECK-NEXT: vmov q4, q6
; CHECK-NEXT: bne .LBB11_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=4
; CHECK-NEXT: add.w r11, r11, #1
; CHECK-NEXT: le lr, .LBB11_4
; CHECK-NEXT: @ %bb.7: @ %for.cond.cleanup77.i
; CHECK-NEXT: @ in Loop: Header=BB11_3 Depth=3
; CHECK-NEXT: adds r5 , #1
; CHECK-NEXT: add.w r10, r10 , #1
; CHECK-NEXT: cmp r5 , r2
; CHECK-NEXT: adds r7 , #1
; CHECK-NEXT: adds r4 , #1
; CHECK-NEXT: cmp r7 , r2
; CHECK-NEXT: bne .LBB11_3
; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup26.i
; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=2
; CHECK-NEXT: adds r7 , #1
; CHECK-NEXT: cmp r7 , r3
; CHECK-NEXT: add.w r9, r9 , #1
; CHECK-NEXT: cmp r9 , r3
; CHECK-NEXT: bne .LBB11_2
; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup20.i
; CHECK-NEXT: @ in Loop: Header=BB11_1 Depth=1
; CHECK-NEXT: ldr r5, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: ldr r7, [sp, #148 ]
; CHECK-NEXT: ldr r7, [sp, #180 ]
; CHECK-NEXT: adds r5, #1
; CHECK-NEXT: cmp r5, r7
; CHECK-NEXT: it eq
Expand Down