392 changes: 211 additions & 181 deletions llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll

Large diffs are not rendered by default.

97 changes: 61 additions & 36 deletions llvm/test/CodeGen/AMDGPU/swdev380865.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16,63 +16,88 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce)
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_mov_b64 s[0:1], 0
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x0
; CHECK-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
; CHECK-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
; CHECK-NEXT: ; kill: killed $sgpr0_sgpr1
; CHECK-NEXT: s_mov_b32 s7, 0x401c0000
; CHECK-NEXT: s_mov_b32 s5, 0x40280000
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_writelane_b32 v2, s2, 0
; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
; CHECK-NEXT: s_mov_b32 s1, 0x40140000
; CHECK-NEXT: s_mov_b32 s1, 0x40180000
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_writelane_b32 v2, s2, 0
; CHECK-NEXT: v_writelane_b32 v2, s0, 1
; CHECK-NEXT: v_writelane_b32 v2, s1, 2
; CHECK-NEXT: s_mov_b32 s1, 0x40240000
; CHECK-NEXT: s_mov_b32 s1, 0x40220000
; CHECK-NEXT: v_writelane_b32 v2, s0, 3
; CHECK-NEXT: v_mov_b32_e32 v0, s6
; CHECK-NEXT: v_writelane_b32 v2, s1, 4
; CHECK-NEXT: s_mov_b32 s3, 0x40260000
; CHECK-NEXT: s_mov_b32 s5, 0x40280000
; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_mov_b32 s1, 0x40240000
; CHECK-NEXT: v_writelane_b32 v2, s0, 5
; CHECK-NEXT: v_writelane_b32 v2, s1, 6
; CHECK-NEXT: s_mov_b32 s1, 0x40260000
; CHECK-NEXT: v_writelane_b32 v2, s0, 7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: v_writelane_b32 v2, s1, 8
; CHECK-NEXT: v_mov_b32_e32 v1, s3
; CHECK-NEXT: .LBB0_1: ; %for.cond4.preheader
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 0
; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: s_mov_b32 s7, 0x40140000
; CHECK-NEXT: v_writelane_b32 v2, s0, 5
; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7]
; CHECK-NEXT: s_mov_b32 s2, 0
; CHECK-NEXT: s_mov_b32 s3, 0x40140000
; CHECK-NEXT: v_writelane_b32 v2, s6, 9
; CHECK-NEXT: v_writelane_b32 v2, s7, 10
; CHECK-NEXT: v_writelane_b32 v2, s0, 11
; CHECK-NEXT: v_readlane_b32 s6, v2, 1
; CHECK-NEXT: v_readlane_b32 s7, v2, 2
; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
; CHECK-NEXT: s_mov_b32 s1, s7
; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: s_mov_b32 s7, 0x40140000
; CHECK-NEXT: s_mov_b32 s0, s6
; CHECK-NEXT: v_readlane_b32 s6, v2, 6
; CHECK-NEXT: s_mov_b32 s0, s2
; CHECK-NEXT: v_writelane_b32 v2, s6, 1
; CHECK-NEXT: v_writelane_b32 v2, s7, 2
; CHECK-NEXT: v_readlane_b32 s6, v2, 9
; CHECK-NEXT: v_readlane_b32 s7, v2, 10
; CHECK-NEXT: s_mov_b32 s6, s2
; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[0:1]
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: v_readlane_b32 s7, v2, 7
; CHECK-NEXT: s_mov_b32 s1, 0x40140000
; CHECK-NEXT: s_mov_b32 s6, s0
; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7]
; CHECK-NEXT: v_readlane_b32 s6, v2, 8
; CHECK-NEXT: v_readlane_b32 s7, v2, 9
; CHECK-NEXT: s_mov_b32 s6, s0
; CHECK-NEXT: v_readlane_b32 s0, v2, 3
; CHECK-NEXT: v_readlane_b32 s1, v2, 4
; CHECK-NEXT: s_mov_b32 s3, s1
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: s_mov_b32 s1, 0x40140000
; CHECK-NEXT: s_mov_b32 s2, s0
; CHECK-NEXT: s_mov_b32 s1, s3
; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7]
; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: s_mov_b32 s7, 0x40140000
; CHECK-NEXT: s_mov_b32 s0, s6
; CHECK-NEXT: s_mov_b32 s2, s6
; CHECK-NEXT: s_mov_b32 s4, s6
; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[0:1]
; CHECK-NEXT: v_readlane_b32 s0, v2, 0
; CHECK-NEXT: v_writelane_b32 v2, s0, 3
; CHECK-NEXT: v_writelane_b32 v2, s1, 4
; CHECK-NEXT: v_readlane_b32 s0, v2, 5
; CHECK-NEXT: v_readlane_b32 s1, v2, 6
; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
; CHECK-NEXT: s_mov_b32 s3, s1
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: s_mov_b32 s1, 0x40140000
; CHECK-NEXT: s_mov_b32 s2, s0
; CHECK-NEXT: s_mov_b32 s1, s3
; CHECK-NEXT: v_writelane_b32 v2, s0, 5
; CHECK-NEXT: v_writelane_b32 v2, s1, 6
; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
; CHECK-NEXT: v_readlane_b32 s0, v2, 7
; CHECK-NEXT: v_readlane_b32 s1, v2, 8
; CHECK-NEXT: s_mov_b32 s3, s1
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: s_mov_b32 s1, 0x40140000
; CHECK-NEXT: s_mov_b32 s2, s0
; CHECK-NEXT: s_mov_b32 s1, s3
; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
; CHECK-NEXT: v_readlane_b32 s2, v2, 5
; CHECK-NEXT: v_writelane_b32 v2, s0, 7
; CHECK-NEXT: s_mov_b32 s4, s0
; CHECK-NEXT: v_writelane_b32 v2, s1, 8
; CHECK-NEXT: v_readlane_b32 s0, v2, 0
; CHECK-NEXT: v_readlane_b32 s2, v2, 11
; CHECK-NEXT: s_add_i32 s2, s2, s0
; CHECK-NEXT: v_writelane_b32 v2, s2, 5
; CHECK-NEXT: v_readlane_b32 s0, v2, 5
; CHECK-NEXT: s_cmpk_lt_i32 s0, 0xa00
; CHECK-NEXT: v_writelane_b32 v2, s2, 11
; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[4:5]
; CHECK-NEXT: v_readlane_b32 s0, v2, 11
; CHECK-NEXT: s_cmpk_lt_i32 s0, 0xa00
; CHECK-NEXT: s_cbranch_scc1 .LBB0_1
; CHECK-NEXT: ; %bb.2: ; %for.cond.cleanup.loopexit
; CHECK-NEXT: v_mov_b32_e32 v3, 0
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,8 @@ body: |
%13 = S2_asl_r_p_acc %13, %47, %8.isub_lo
%51 = A2_tfrpi 0
; CHECK: $d0 = S2_extractup undef renamable $d0, 6, 25
; CHECK: $d1 = A2_tfrpi 2
; CHECK: $d2 = S2_extractup undef renamable $d0, 6, 25
; CHECK: $d0 = A2_tfrpi 2
; CHECK: $d13 = A2_tfrpi -1
; CHECK-NOT: undef $r4
Expand Down
42 changes: 27 additions & 15 deletions llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1024,8 +1024,10 @@ middle.block: ; preds = %vector.body
define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
; CHECK-LABEL: DCT_mve7:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r11, lr}
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #72
Expand Down Expand Up @@ -1072,6 +1074,7 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: vmov q4, q2
; CHECK-NEXT: vmov q5, q2
; CHECK-NEXT: vmov q3, q2
; CHECK-NEXT: vmov q6, q2
; CHECK-NEXT: vmov q1, q2
; CHECK-NEXT: mov r12, r7
; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill
Expand All @@ -1080,16 +1083,20 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vctp.32 r12
; CHECK-NEXT: adds r6, r3, r5
; CHECK-NEXT: add.w r10, r3, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q7, [r1], #16
; CHECK-NEXT: vldrwt.u32 q0, [r3], #16
; CHECK-NEXT: add.w r11, r6, r5
; CHECK-NEXT: add.w r11, r10, r5
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q5, q0, q7
; CHECK-NEXT: vldrwt.u32 q0, [r11]
; CHECK-NEXT: vldrwt.u32 q0, [r10]
; CHECK-NEXT: add.w r6, r11, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q6, q0, q7
; CHECK-NEXT: vldrwt.u32 q0, [r11]
; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
; CHECK-NEXT: vmov q6, q5
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q1, q0, q7
Expand Down Expand Up @@ -1171,7 +1178,8 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: .LBB6_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #72
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r11, pc}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
%i = load i32, ptr %NumInputs, align 4
Expand Down Expand Up @@ -1346,6 +1354,7 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: adds r1, r0, #1
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: vmov q5, q3
; CHECK-NEXT: vmov q6, q3
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vmov q7, q3
; CHECK-NEXT: vmov q2, q3
Expand All @@ -1358,43 +1367,46 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vctp.32 r10
; CHECK-NEXT: add.w r11, r3, r6
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q0, [r9], #16
; CHECK-NEXT: vldrwt.u32 q1, [r11]
; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
; CHECK-NEXT: add.w r5, r11, r6
; CHECK-NEXT: sub.w r10, r10, #4
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q6, q1, q0
; CHECK-NEXT: vldrwt.u32 q1, [r11]
; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
; CHECK-NEXT: vmov q6, q5
; CHECK-NEXT: vmov q5, q3
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q7, q1, q0
; CHECK-NEXT: vmov q5, q3
; CHECK-NEXT: vmov q3, q4
; CHECK-NEXT: vmov q4, q2
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q1, [r5]
; CHECK-NEXT: vldrw.u32 q2, [sp, #56] @ 16-byte Reload
; CHECK-NEXT: adds r7, r5, r6
; CHECK-NEXT: adds r5, r7, r6
; CHECK-NEXT: sub.w r10, r10, #4
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: vldrwt.u32 q1, [r7]
; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload
; CHECK-NEXT: adds r7, r5, r6
; CHECK-NEXT: adds r5, r7, r6
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: vldrwt.u32 q1, [r5]
; CHECK-NEXT: adds r7, r5, r6
; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill
; CHECK-NEXT: vmov q2, q4
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: vldrwt.u32 q1, [r7]
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: adds r5, r7, r6
; CHECK-NEXT: vmov q3, q5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q4, q1, q0
; CHECK-NEXT: vldrwt.u32 q1, [r5]
; CHECK-NEXT: vmov q3, q5
; CHECK-NEXT: vmov q5, q6
; CHECK-NEXT: add r5, r6
; CHECK-NEXT: vpstt
Expand Down
57 changes: 31 additions & 26 deletions llvm/test/CodeGen/Thumb2/mve-vst3.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1077,58 +1077,63 @@ define void @vst3_v16f32(ptr %src, ptr %dst) {
; CHECK-NEXT: vldrw.u32 q2, [r0, #64]
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r0, #128]
; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q3, [r0, #160]
; CHECK-NEXT: vldrw.u32 q7, [r0, #112]
; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
; CHECK-NEXT: vstrw.32 q3, [sp, #112] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q3, [r0, #96]
; CHECK-NEXT: vmov.f32 s25, s1
; CHECK-NEXT: vldrw.u32 q3, [r0, #160]
; CHECK-NEXT: vmov.f32 s24, s9
; CHECK-NEXT: vldrw.u32 q5, [r0, #144]
; CHECK-NEXT: vstrw.32 q3, [sp, #96] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q3, [r0, #96]
; CHECK-NEXT: vmov.f32 s26, s6
; CHECK-NEXT: vldrw.u32 q7, [r0, #112]
; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q3, [r0, #80]
; CHECK-NEXT: vmov.f32 s24, s9
; CHECK-NEXT: vmov.f32 s27, s10
; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
; CHECK-NEXT: vmov.f32 s26, s6
; CHECK-NEXT: vmov.f32 s25, s1
; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
; CHECK-NEXT: vmov.f32 s27, s10
; CHECK-NEXT: vstrw.32 q3, [sp, #96] @ 16-byte Spill
; CHECK-NEXT: vstrw.32 q6, [r1, #16]
; CHECK-NEXT: vmov.f32 s24, s2
; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill
; CHECK-NEXT: vmov.f32 s27, s3
; CHECK-NEXT: vmov.f32 s14, s0
; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s12, s4
; CHECK-NEXT: vstrw.32 q6, [r1, #16]
; CHECK-NEXT: vmov.f32 s13, s8
; CHECK-NEXT: vmov.f32 s15, s5
; CHECK-NEXT: vmov.f32 s13, s8
; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s24, s2
; CHECK-NEXT: vmov.f32 s27, s3
; CHECK-NEXT: vmov.f32 s2, s12
; CHECK-NEXT: vmov.f32 s0, s16
; CHECK-NEXT: vmov.f32 s1, s28
; CHECK-NEXT: vmov.f32 s3, s17
; CHECK-NEXT: vmov.f32 s25, s7
; CHECK-NEXT: vmov.f32 s6, s0
; CHECK-NEXT: vmov.f32 s13, s1
; CHECK-NEXT: vmov.f32 s0, s2
; CHECK-NEXT: vmov.f32 s4, s16
; CHECK-NEXT: vmov.f32 s5, s28
; CHECK-NEXT: vmov.f32 s7, s17
; CHECK-NEXT: vmov.f32 s1, s19
; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill
; CHECK-NEXT: vmov.f32 s2, s31
; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s26, s11
; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload
; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s15, s30
; CHECK-NEXT: vstrw.32 q6, [r1, #32]
; CHECK-NEXT: vmov.f32 s17, s1
; CHECK-NEXT: vldrw.u32 q6, [sp, #96] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q6, [sp, #80] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s30, s0
; CHECK-NEXT: vmov.f32 s0, s2
; CHECK-NEXT: vmov.f32 s1, s11
; CHECK-NEXT: vmov.f32 s2, s7
; CHECK-NEXT: vmov.f32 s14, s18
; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill
; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill
; CHECK-NEXT: vmov.f32 s18, s10
; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s28, s8
; CHECK-NEXT: vmov.f32 s31, s9
; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s12, s29
; CHECK-NEXT: vmov.f32 s29, s4
; CHECK-NEXT: vstrw.32 q3, [r1, #160]
Expand All @@ -1143,14 +1148,14 @@ define void @vst3_v16f32(ptr %src, ptr %dst) {
; CHECK-NEXT: vmov.f32 s8, s1
; CHECK-NEXT: vmov.f32 s11, s2
; CHECK-NEXT: vmov.f32 s22, s3
; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s7, s9
; CHECK-NEXT: vstrw.32 q0, [r1, #128]
; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s9, s21
; CHECK-NEXT: vstrw.32 q1, [r1, #48]
; CHECK-NEXT: vstrw.32 q0, [r1, #144]
; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s21, s27
; CHECK-NEXT: vstrw.32 q2, [r1, #64]
; CHECK-NEXT: vstrw.32 q0, [r1, #176]
Expand Down