diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index e5095e971add17..99c8f86e9081d2 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -792,8 +792,11 @@ defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>; + +let GlobalPriority = true in { defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>; defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>; +} def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add VGPR_32, LDS_DIRECT_CLASS)> { @@ -833,8 +836,11 @@ defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>; defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>; defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>; defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], (add VGPR_256)>; + +let GlobalPriority = true in { defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>; defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>; +} multiclass ARegClass regTypes, dag regList> { let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1 in { @@ -854,8 +860,11 @@ defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>; defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>; defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>; defm AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>; + +let GlobalPriority = true in { defm AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>; defm AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>; +} } // End GeneratePressureSet = 0 @@ -910,8 +919,11 @@ defm AV_160 : AVRegClass<5, VReg_160.RegTypes, (add VGPR_160), (add AGPR_160)>; defm AV_192 : AVRegClass<6, VReg_192.RegTypes, (add VGPR_192), (add AGPR_192)>; defm AV_224 : AVRegClass<7, VReg_224.RegTypes, (add VGPR_224), (add AGPR_224)>; defm AV_256 : AVRegClass<8, VReg_256.RegTypes, (add VGPR_256), (add AGPR_256)>; + +let GlobalPriority = true in { defm AV_512 : AVRegClass<16, VReg_512.RegTypes, (add VGPR_512), (add AGPR_512)>; defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_1024)>; +} //===----------------------------------------------------------------------===// // Register operands diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll index 36bb7ca7af0e99..cfa56f05fac7a1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -11,237 +11,166 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[19:22], v[0:1], off offset:64 -; GCN-NEXT: global_load_dwordx4 v[23:26], v[0:1], off offset:80 -; GCN-NEXT: global_load_dwordx4 v[27:30], v[0:1], off offset:96 -; GCN-NEXT: global_load_dwordx4 v[31:34], v[0:1], off offset:112 -; GCN-NEXT: global_load_dwordx4 v[35:38], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[39:42], v[0:1], off offset:144 -; GCN-NEXT: global_load_dwordx4 v[43:46], v[0:1], off offset:160 -; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:176 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v6, v2 +; GCN-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[56:59], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:64 +; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:80 +; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:96 +; GCN-NEXT: global_load_dwordx4 v[60:63], v[0:1], off offset:112 +; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 +; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:144 +; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:160 +; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:176 +; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:192 +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:208 ; GCN-NEXT: s_add_i32 s32, s32, 0x10000 ; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:192 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[51:54], v[0:1], off offset:208 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[55:58], v[0:1], off offset:224 -; GCN-NEXT: global_load_dwordx4 v[59:62], v[0:1], off offset:240 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:260 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:264 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:268 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:272 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:276 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:280 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:284 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:288 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:292 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:296 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:332 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:336 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:340 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:344 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:348 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:352 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:356 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:360 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: v_and_b32_e32 v0, 63, v2 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:224 +; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:240 ; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_add_u32_e32 v1, 0x100, v1 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:256 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:260 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:264 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:268 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:272 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:276 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:280 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:284 +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:288 +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:292 +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:296 +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:300 +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:304 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:308 +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:312 +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:316 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:320 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:324 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:328 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:332 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:336 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:340 +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:344 +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:348 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:352 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:356 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:360 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:364 +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:368 +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:372 +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:376 +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:380 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:400 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:404 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:408 +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:412 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:432 +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:436 +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:440 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:444 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:448 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:452 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:456 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:460 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: v_and_b32_e32 v0, 63, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_add_u32_e32 v0, v1, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v15 -; GCN-NEXT: v_mov_b32_e32 v13, v16 -; GCN-NEXT: v_mov_b32_e32 v14, v17 -; GCN-NEXT: v_mov_b32_e32 v15, v18 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:432 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:436 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:440 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:460 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v7 -; GCN-NEXT: v_mov_b32_e32 v5, v8 -; GCN-NEXT: v_mov_b32_e32 v6, v9 -; GCN-NEXT: v_mov_b32_e32 v7, v10 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:492 -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:508 +; GCN-NEXT: v_mov_b32_e32 v16, v20 +; GCN-NEXT: v_mov_b32_e32 v17, v21 +; GCN-NEXT: v_mov_b32_e32 v18, v22 +; GCN-NEXT: v_mov_b32_e32 v19, v23 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:464 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:468 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:472 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:476 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -257,241 +186,170 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[19:22], v[0:1], off offset:64 -; GCN-NEXT: global_load_dwordx4 v[23:26], v[0:1], off offset:80 -; GCN-NEXT: global_load_dwordx4 v[27:30], v[0:1], off offset:96 -; GCN-NEXT: global_load_dwordx4 v[31:34], v[0:1], off offset:112 -; GCN-NEXT: global_load_dwordx4 v[35:38], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[39:42], v[0:1], off offset:144 -; GCN-NEXT: global_load_dwordx4 v[43:46], v[0:1], off offset:160 -; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:176 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v6, v2 +; GCN-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[56:59], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:64 +; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:80 +; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:96 +; GCN-NEXT: global_load_dwordx4 v[60:63], v[0:1], off offset:112 +; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 +; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:144 +; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:160 +; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:176 +; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:192 +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:208 ; GCN-NEXT: s_add_i32 s32, s32, 0x10000 ; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:192 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[51:54], v[0:1], off offset:208 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[55:58], v[0:1], off offset:224 -; GCN-NEXT: global_load_dwordx4 v[59:62], v[0:1], off offset:240 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:260 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:264 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:268 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:272 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:276 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:280 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:284 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:288 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:292 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:296 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:332 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:336 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:340 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:344 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:348 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:352 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:356 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:360 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: v_bfe_u32 v0, v2, 1, 6 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:224 +; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:240 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:256 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:260 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:264 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:268 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:272 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:276 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:280 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:284 +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:288 +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:292 +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:296 +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:300 +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:304 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:308 +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:312 +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:316 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:320 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:324 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:328 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:332 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:336 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:340 +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:344 +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:348 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:352 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:356 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:360 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:364 +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:368 +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:372 +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:376 +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:380 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:400 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:404 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:408 +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:412 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:432 +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:436 +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:440 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:444 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:448 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:452 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:456 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:460 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: v_bfe_u32 v0, v6, 1, 6 +; GCN-NEXT: v_lshrrev_b32_e64 v5, 6, s33 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: v_and_b32_e32 v1, 1, v2 +; GCN-NEXT: v_add_u32_e32 v5, 0x100, v5 +; GCN-NEXT: v_add_u32_e32 v0, v5, v0 +; GCN-NEXT: v_and_b32_e32 v1, 1, v6 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v15 -; GCN-NEXT: v_mov_b32_e32 v13, v16 -; GCN-NEXT: v_mov_b32_e32 v14, v17 -; GCN-NEXT: v_mov_b32_e32 v15, v18 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:432 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:436 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:440 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:460 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v7 -; GCN-NEXT: v_mov_b32_e32 v5, v8 -; GCN-NEXT: v_mov_b32_e32 v6, v9 -; GCN-NEXT: v_mov_b32_e32 v7, v10 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:492 -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:508 -; GCN-NEXT: v_lshrrev_b32_e64 v7, 6, s33 -; GCN-NEXT: v_add_u32_e32 v7, 0x100, v7 -; GCN-NEXT: v_add_u32_e32 v0, v7, v0 +; GCN-NEXT: v_mov_b32_e32 v16, v20 +; GCN-NEXT: v_mov_b32_e32 v17, v21 +; GCN-NEXT: v_mov_b32_e32 v18, v22 +; GCN-NEXT: v_mov_b32_e32 v19, v23 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:464 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:468 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:472 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:476 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s33, s4 -; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: s_waitcnt vmcnt(16) ; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -507,150 +365,111 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:64 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:80 -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:96 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:112 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:144 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:160 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v6, v2 +; GCN-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[56:59], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:64 +; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:80 +; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:96 +; GCN-NEXT: global_load_dwordx4 v[60:63], v[0:1], off offset:112 +; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 +; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:144 +; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:160 ; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:176 -; GCN-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:192 +; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:192 +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:208 ; GCN-NEXT: s_add_i32 s32, s32, 0x10000 ; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:208 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[56:59], v[0:1], off offset:224 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:224 ; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:240 -; GCN-NEXT: v_and_b32_e32 v0, 31, v2 -; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x100, v2 -; GCN-NEXT: v_add_u32_e32 v1, v2, v0 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:260 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:264 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:268 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:272 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:276 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:280 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:284 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:288 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:292 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:296 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:332 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:336 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:340 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:344 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:348 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:352 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:356 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:360 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:256 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:260 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:264 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:268 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:272 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:276 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:280 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:284 +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:288 +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:292 +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:296 +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:300 +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:304 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:308 +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:312 +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:316 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:320 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:324 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:328 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:332 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:336 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:340 +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:344 +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:348 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:352 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:356 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:360 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:364 +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:368 +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:372 +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:376 +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:380 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:400 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:404 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:408 +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:412 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:428 ; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:432 ; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:436 ; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:440 ; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, v16 -; GCN-NEXT: v_mov_b32_e32 v4, v17 -; GCN-NEXT: v_mov_b32_e32 v5, v18 -; GCN-NEXT: v_mov_b32_e32 v6, v19 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:460 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:448 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:452 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:456 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:460 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload @@ -667,37 +486,46 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: v_and_b32_e32 v0, 31, v6 +; GCN-NEXT: v_lshrrev_b32_e64 v5, 6, s33 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: v_add_u32_e32 v5, 0x100, v5 +; GCN-NEXT: v_add_u32_e32 v1, v5, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v20 -; GCN-NEXT: v_mov_b32_e32 v5, v21 -; GCN-NEXT: v_mov_b32_e32 v6, v22 -; GCN-NEXT: v_mov_b32_e32 v7, v23 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:492 +; GCN-NEXT: v_mov_b32_e32 v16, v20 +; GCN-NEXT: v_mov_b32_e32 v17, v21 +; GCN-NEXT: v_mov_b32_e32 v18, v22 +; GCN-NEXT: v_mov_b32_e32 v19, v23 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:464 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:468 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:472 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:476 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 ; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496 ; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500 ; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504 ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508 ; GCN-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll index 2e97f4d74e9343..9b0943ea753bb0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -339,118 +339,115 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v2 -; GFX10-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:32 -; GFX10-NEXT: v_add_nc_u32_e32 v19, 1, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v19 -; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v3 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cndmask_b32_e32 v12, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v13, v9, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v14, v8, v10, s4 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v6, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 2, v19 +; GFX10-NEXT: v_cndmask_b32_e64 v15, v9, v11, s4 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 2, v3 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e32 v5, v15, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v8, vcc_lo -; GFX10-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 3, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v9, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v4, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v15, v5, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v16, v12, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v17, v13, v7, vcc_lo +; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v7, s4 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 4, v19 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 4, v3 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v12, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v14, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v19 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v3 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v16, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v15, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v16, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 7, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v17, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, v18, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v12, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 7, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v14, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v15, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v4i128_vgpr_idx: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off -; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off offset:16 -; GFX11-NEXT: global_load_b128 v[11:14], v[0:1], off offset:32 -; GFX11-NEXT: global_load_b128 v[15:18], v[0:1], off offset:48 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off +; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off offset:32 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_dual_cndmask_b32 v20, v3, v5 :: v_dual_cndmask_b32 v21, v4, v6 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_dual_cndmask_b32 v17, v13, v15 :: v_dual_cndmask_b32 v16, v12, v14 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v18, v12, v14, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v19, v13, v15, s0 +; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:48 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v21, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_cndmask_b32 v0, v20, v7 :: v_dual_add_nc_u32 v19, 1, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v17, v5 :: v_dual_cndmask_b32 v0, v16, v4 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v3 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v19 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v9 :: v_dual_cndmask_b32 v1, v1, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v4, v18, v4, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v19, v5, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v5, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v6, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v19 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v7, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v3 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v11 :: v_dual_cndmask_b32 v1, v1, v12 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_cndmask_b32 v1, v1, v9 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v8, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v19 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v13 :: v_dual_cndmask_b32 v1, v1, v14 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v9, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v19 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v3 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v15 :: v_dual_cndmask_b32 v1, v1, v16 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v11, s0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v12 :: v_dual_cndmask_b32 v1, v1, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v12, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v19 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v17 :: v_dual_cndmask_b32 v1, v1, v18 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v13, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v14, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v19 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v15, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v16, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v19 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v3, v17, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v4, v18, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, v14, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v15, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr %element = extractelement <4 x i128> %vector, i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll index b0bb5c516c3256..73dc2961df8f2e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -21,10 +21,10 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in, ; GCN-NEXT: global_load_dwordx4 v[4:7], v64, s[0:1] offset:144 ; GCN-NEXT: global_load_dwordx4 v[8:11], v64, s[0:1] offset:160 ; GCN-NEXT: global_load_dwordx4 v[12:15], v64, s[0:1] offset:176 -; GCN-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:192 -; GCN-NEXT: global_load_dwordx4 v[20:23], v64, s[0:1] offset:208 -; GCN-NEXT: global_load_dwordx4 v[24:27], v64, s[0:1] offset:224 -; GCN-NEXT: global_load_dwordx4 v[28:31], v64, s[0:1] offset:240 +; GCN-NEXT: global_load_dwordx4 v[28:31], v64, s[0:1] offset:192 +; GCN-NEXT: global_load_dwordx4 v[24:27], v64, s[0:1] offset:208 +; GCN-NEXT: global_load_dwordx4 v[20:23], v64, s[0:1] offset:224 +; GCN-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:240 ; GCN-NEXT: s_waitcnt vmcnt(6) ; GCN-NEXT: v_mov_b32_e32 v5, 0x3e7 ; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[2:3] offset:128 @@ -34,11 +34,11 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in, ; GCN-NEXT: s_waitcnt vmcnt(7) ; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[2:3] offset:176 ; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[2:3] offset:192 +; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:192 ; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[2:3] offset:208 +; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[2:3] offset:208 ; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[2:3] offset:224 +; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[2:3] offset:224 ; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[2:3] ; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[2:3] offset:16 ; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[2:3] offset:32 @@ -48,7 +48,7 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in, ; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[2:3] offset:96 ; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[2:3] offset:112 ; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240 +; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[2:3] offset:240 ; GCN-NEXT: s_endpgm ; ; GFX10-LABEL: v_insert_v64i32_37: @@ -67,24 +67,24 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in, ; GFX10-NEXT: global_load_dwordx4 v[56:59], v64, s[0:1] offset:96 ; GFX10-NEXT: global_load_dwordx4 v[60:63], v64, s[0:1] offset:112 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v64, s[0:1] offset:144 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v64, s[0:1] offset:160 +; GFX10-NEXT: global_load_dwordx4 v[20:23], v64, s[0:1] offset:160 ; GFX10-NEXT: global_load_dwordx4 v[12:15], v64, s[0:1] offset:176 -; GFX10-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:192 -; GFX10-NEXT: global_load_dwordx4 v[20:23], v64, s[0:1] offset:208 -; GFX10-NEXT: global_load_dwordx4 v[24:27], v64, s[0:1] offset:224 -; GFX10-NEXT: global_load_dwordx4 v[28:31], v64, s[0:1] offset:240 +; GFX10-NEXT: global_load_dwordx4 v[28:31], v64, s[0:1] offset:192 +; GFX10-NEXT: global_load_dwordx4 v[24:27], v64, s[0:1] offset:208 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v64, s[0:1] offset:224 +; GFX10-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:240 ; GFX10-NEXT: s_waitcnt vmcnt(6) ; GFX10-NEXT: v_mov_b32_e32 v5, 0x3e7 ; GFX10-NEXT: global_store_dwordx4 v64, v[0:3], s[2:3] offset:128 ; GFX10-NEXT: global_store_dwordx4 v64, v[4:7], s[2:3] offset:144 ; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: global_store_dwordx4 v64, v[8:11], s[2:3] offset:160 +; GFX10-NEXT: global_store_dwordx4 v64, v[20:23], s[2:3] offset:160 ; GFX10-NEXT: s_waitcnt vmcnt(4) ; GFX10-NEXT: global_store_dwordx4 v64, v[12:15], s[2:3] offset:176 ; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: global_store_dwordx4 v64, v[16:19], s[2:3] offset:192 +; GFX10-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:192 ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: global_store_dwordx4 v64, v[20:23], s[2:3] offset:208 +; GFX10-NEXT: global_store_dwordx4 v64, v[24:27], s[2:3] offset:208 ; GFX10-NEXT: global_store_dwordx4 v64, v[32:35], s[2:3] ; GFX10-NEXT: global_store_dwordx4 v64, v[36:39], s[2:3] offset:16 ; GFX10-NEXT: global_store_dwordx4 v64, v[40:43], s[2:3] offset:32 @@ -94,9 +94,9 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in, ; GFX10-NEXT: global_store_dwordx4 v64, v[56:59], s[2:3] offset:96 ; GFX10-NEXT: global_store_dwordx4 v64, v[60:63], s[2:3] offset:112 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: global_store_dwordx4 v64, v[24:27], s[2:3] offset:224 +; GFX10-NEXT: global_store_dwordx4 v64, v[8:11], s[2:3] offset:224 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240 +; GFX10-NEXT: global_store_dwordx4 v64, v[16:19], s[2:3] offset:240 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_insert_v64i32_37: @@ -117,24 +117,24 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, 0x3e7 ; GFX11-NEXT: s_clause 0x6 -; GFX11-NEXT: global_load_b128 v[0:3], v64, s[0:1] offset:128 -; GFX11-NEXT: global_load_b128 v[8:11], v64, s[0:1] offset:160 -; GFX11-NEXT: global_load_b128 v[12:15], v64, s[0:1] offset:176 -; GFX11-NEXT: global_load_b128 v[16:19], v64, s[0:1] offset:192 -; GFX11-NEXT: global_load_b128 v[20:23], v64, s[0:1] offset:208 -; GFX11-NEXT: global_load_b128 v[24:27], v64, s[0:1] offset:224 -; GFX11-NEXT: global_load_b128 v[28:31], v64, s[0:1] offset:240 +; GFX11-NEXT: global_load_b128 v[28:31], v64, s[0:1] offset:128 +; GFX11-NEXT: global_load_b128 v[24:27], v64, s[0:1] offset:160 +; GFX11-NEXT: global_load_b128 v[20:23], v64, s[0:1] offset:176 +; GFX11-NEXT: global_load_b128 v[0:3], v64, s[0:1] offset:192 +; GFX11-NEXT: global_load_b128 v[16:19], v64, s[0:1] offset:208 +; GFX11-NEXT: global_load_b128 v[8:11], v64, s[0:1] offset:224 +; GFX11-NEXT: global_load_b128 v[12:15], v64, s[0:1] offset:240 ; GFX11-NEXT: s_waitcnt vmcnt(6) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v64, v[0:3], s[2:3] offset:128 +; GFX11-NEXT: global_store_b128 v64, v[28:31], s[2:3] offset:128 ; GFX11-NEXT: global_store_b128 v64, v[4:7], s[2:3] offset:144 ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: global_store_b128 v64, v[8:11], s[2:3] offset:160 +; GFX11-NEXT: global_store_b128 v64, v[24:27], s[2:3] offset:160 ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: global_store_b128 v64, v[12:15], s[2:3] offset:176 +; GFX11-NEXT: global_store_b128 v64, v[20:23], s[2:3] offset:176 ; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: s_clause 0x8 -; GFX11-NEXT: global_store_b128 v64, v[16:19], s[2:3] offset:192 +; GFX11-NEXT: global_store_b128 v64, v[0:3], s[2:3] offset:192 ; GFX11-NEXT: global_store_b128 v64, v[32:35], s[2:3] ; GFX11-NEXT: global_store_b128 v64, v[36:39], s[2:3] offset:16 ; GFX11-NEXT: global_store_b128 v64, v[40:43], s[2:3] offset:32 @@ -144,11 +144,11 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in, ; GFX11-NEXT: global_store_b128 v64, v[56:59], s[2:3] offset:96 ; GFX11-NEXT: global_store_b128 v64, v[60:63], s[2:3] offset:112 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: global_store_b128 v64, v[20:23], s[2:3] offset:208 +; GFX11-NEXT: global_store_b128 v64, v[16:19], s[2:3] offset:208 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: global_store_b128 v64, v[24:27], s[2:3] offset:224 +; GFX11-NEXT: global_store_b128 v64, v[8:11], s[2:3] offset:224 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b128 v64, v[28:31], s[2:3] offset:240 +; GFX11-NEXT: global_store_b128 v64, v[12:15], s[2:3] offset:240 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index 4f8ba532db04a5..41683b290d7aca 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -819,22 +819,22 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GPRIDX-NEXT: s_mov_b32 s9, 0x40080000 ; GPRIDX-NEXT: s_mov_b32 s8, s18 ; GPRIDX-NEXT: s_mov_b64 s[6:7], 2.0 -; GPRIDX-NEXT: v_mov_b32_e32 v3, s4 -; GPRIDX-NEXT: v_mov_b32_e32 v4, s5 -; GPRIDX-NEXT: v_mov_b32_e32 v5, s6 -; GPRIDX-NEXT: v_mov_b32_e32 v6, s7 -; GPRIDX-NEXT: v_mov_b32_e32 v7, s8 -; GPRIDX-NEXT: v_mov_b32_e32 v8, s9 -; GPRIDX-NEXT: v_mov_b32_e32 v9, s10 -; GPRIDX-NEXT: v_mov_b32_e32 v10, s11 -; GPRIDX-NEXT: v_mov_b32_e32 v11, s12 -; GPRIDX-NEXT: v_mov_b32_e32 v12, s13 -; GPRIDX-NEXT: v_mov_b32_e32 v13, s14 -; GPRIDX-NEXT: v_mov_b32_e32 v14, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s16 -; GPRIDX-NEXT: v_mov_b32_e32 v16, s17 -; GPRIDX-NEXT: v_mov_b32_e32 v17, s18 -; GPRIDX-NEXT: v_mov_b32_e32 v18, s19 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s4 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s6 +; GPRIDX-NEXT: v_mov_b32_e32 v7, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v8, s8 +; GPRIDX-NEXT: v_mov_b32_e32 v9, s9 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s10 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s11 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s12 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s13 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s14 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s15 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s16 +; GPRIDX-NEXT: v_mov_b32_e32 v17, s17 +; GPRIDX-NEXT: v_mov_b32_e32 v18, s18 +; GPRIDX-NEXT: v_mov_b32_e32 v19, s19 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 2, v2 @@ -843,29 +843,29 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 5, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 6, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[14:15], 7, v2 -; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[16:17] -; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v1, s[16:17] -; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v0, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v0, s[12:13] -; GPRIDX-NEXT: v_cndmask_b32_e64 v17, v17, v0, s[14:15] -; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v1, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v1, s[12:13] -; GPRIDX-NEXT: v_cndmask_b32_e64 v18, v18, v1, s[14:15] -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v4, v0, s[16:17] +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v6, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v5, v1, s[16:17] +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v7, v1, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v8, v0, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v10, v0, s[6:7] +; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v12, v0, s[8:9] +; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v14, v0, s[10:11] +; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v16, v0, s[12:13] +; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v18, v0, s[14:15] +; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v9, v1, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v11, v1, s[6:7] +; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v13, v1, s[8:9] +; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v15, v1, s[10:11] +; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v17, v1, s[12:13] +; GPRIDX-NEXT: v_cndmask_b32_e64 v17, v19, v1, s[14:15] +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[6:9], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[10:13], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[15:18], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[14:17], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_setpc_b64 s[30:31] ; @@ -1025,23 +1025,23 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do ; GPRIDX-NEXT: s_mov_b32 s10, s12 ; GPRIDX-NEXT: s_mov_b32 s12, s14 ; GPRIDX-NEXT: s_mov_b32 s14, s16 -; GPRIDX-NEXT: v_mov_b32_e32 v16, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s14 -; GPRIDX-NEXT: v_mov_b32_e32 v14, s13 -; GPRIDX-NEXT: v_mov_b32_e32 v13, s12 -; GPRIDX-NEXT: v_mov_b32_e32 v12, s11 -; GPRIDX-NEXT: v_mov_b32_e32 v11, s10 -; GPRIDX-NEXT: v_mov_b32_e32 v10, s9 -; GPRIDX-NEXT: v_mov_b32_e32 v9, s8 -; GPRIDX-NEXT: v_mov_b32_e32 v8, s7 -; GPRIDX-NEXT: v_mov_b32_e32 v7, s6 -; GPRIDX-NEXT: v_mov_b32_e32 v6, s5 -; GPRIDX-NEXT: v_mov_b32_e32 v5, s4 -; GPRIDX-NEXT: v_mov_b32_e32 v4, s3 -; GPRIDX-NEXT: v_mov_b32_e32 v3, s2 -; GPRIDX-NEXT: v_mov_b32_e32 v2, s1 -; GPRIDX-NEXT: v_mov_b32_e32 v1, s0 -; GPRIDX-NEXT: v_mov_b32_e32 v17, s18 +; GPRIDX-NEXT: v_mov_b32_e32 v17, s15 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s14 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s13 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s12 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s11 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s10 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s9 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s8 +; GPRIDX-NEXT: v_mov_b32_e32 v9, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v8, s6 +; GPRIDX-NEXT: v_mov_b32_e32 v7, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s4 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s2 +; GPRIDX-NEXT: v_mov_b32_e32 v3, s1 +; GPRIDX-NEXT: v_mov_b32_e32 v2, s0 +; GPRIDX-NEXT: v_mov_b32_e32 v1, s18 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 @@ -1050,30 +1050,30 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 -; GPRIDX-NEXT: v_mov_b32_e32 v0, s19 -; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[12:13] -; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[12:13] -; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v17, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v17, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v0, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v0, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v0, s[10:11] -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[1:4], off +; GPRIDX-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[12:13] +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v4, v1, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v6, v1, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v8, v1, s[2:3] +; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v10, v1, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v12, v1, s[6:7] +; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v14, v1, s[8:9] +; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v16, v1, s[10:11] +; GPRIDX-NEXT: v_mov_b32_e32 v16, s19 +; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v3, v16, s[12:13] +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v5, v16, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v7, v16, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v9, v16, s[2:3] +; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v11, v16, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v13, v16, s[6:7] +; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v15, v16, s[8:9] +; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v17, v16, s[10:11] +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[5:8], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[9:12], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[13:16], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_endpgm ; @@ -1447,22 +1447,22 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do ; GPRIDX-NEXT: s_mov_b32 s10, s12 ; GPRIDX-NEXT: s_mov_b32 s12, s14 ; GPRIDX-NEXT: s_mov_b32 s14, s16 -; GPRIDX-NEXT: v_mov_b32_e32 v18, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 -; GPRIDX-NEXT: v_mov_b32_e32 v16, s13 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s12 -; GPRIDX-NEXT: v_mov_b32_e32 v14, s11 -; GPRIDX-NEXT: v_mov_b32_e32 v13, s10 -; GPRIDX-NEXT: v_mov_b32_e32 v12, s9 -; GPRIDX-NEXT: v_mov_b32_e32 v11, s8 -; GPRIDX-NEXT: v_mov_b32_e32 v10, s7 -; GPRIDX-NEXT: v_mov_b32_e32 v9, s6 -; GPRIDX-NEXT: v_mov_b32_e32 v8, s5 -; GPRIDX-NEXT: v_mov_b32_e32 v7, s4 -; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 -; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 -; GPRIDX-NEXT: v_mov_b32_e32 v4, s1 -; GPRIDX-NEXT: v_mov_b32_e32 v3, s0 +; GPRIDX-NEXT: v_mov_b32_e32 v19, s15 +; GPRIDX-NEXT: v_mov_b32_e32 v18, s14 +; GPRIDX-NEXT: v_mov_b32_e32 v17, s13 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s12 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s11 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s10 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s9 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s8 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s6 +; GPRIDX-NEXT: v_mov_b32_e32 v9, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v8, s4 +; GPRIDX-NEXT: v_mov_b32_e32 v7, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s2 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s1 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s0 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v2 @@ -1471,29 +1471,29 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v2 -; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[12:13] -; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v1, s[12:13] -; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v0, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v0, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v17, v17, v0, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v1, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v1, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v18, v18, v1, s[10:11] -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v4, v0, s[12:13] +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v6, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v5, v1, s[12:13] +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v7, v1, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v8, v0, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v10, v0, s[2:3] +; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v12, v0, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v14, v0, s[6:7] +; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v16, v0, s[8:9] +; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v18, v0, s[10:11] +; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v9, v1, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v11, v1, s[2:3] +; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v13, v1, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v15, v1, s[6:7] +; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v17, v1, s[8:9] +; GPRIDX-NEXT: v_cndmask_b32_e64 v17, v19, v1, s[10:11] +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[6:9], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[10:13], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[15:18], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[14:17], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index cd7b5018e9bd1b..1c955ee622d2c4 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -878,34 +878,33 @@ entry: define amdgpu_kernel void @double15_inselt(<15 x double> addrspace(1)* %out, <15 x double> %vec, i32 %sel) { ; GCN-LABEL: double15_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xa4 ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x114 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x104 +; GCN-NEXT: s_load_dwordx4 s[20:23], s[0:1], 0x104 ; GCN-NEXT: s_load_dwordx8 s[24:31], s[0:1], 0xe4 ; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v28, s2 -; GCN-NEXT: v_mov_b32_e32 v24, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_load_dword s4, s[0:1], 0x124 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: v_mov_b32_e32 v28, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s2, s4, 1 -; GCN-NEXT: v_mov_b32_e32 v4, s12 -; GCN-NEXT: v_mov_b32_e32 v5, s13 -; GCN-NEXT: v_mov_b32_e32 v6, s14 -; GCN-NEXT: v_mov_b32_e32 v7, s15 -; GCN-NEXT: v_mov_b32_e32 v8, s16 -; GCN-NEXT: v_mov_b32_e32 v9, s17 -; GCN-NEXT: v_mov_b32_e32 v10, s18 -; GCN-NEXT: v_mov_b32_e32 v11, s19 -; GCN-NEXT: v_mov_b32_e32 v12, s20 -; GCN-NEXT: v_mov_b32_e32 v13, s21 -; GCN-NEXT: v_mov_b32_e32 v14, s22 -; GCN-NEXT: v_mov_b32_e32 v15, s23 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NEXT: v_mov_b32_e32 v6, s10 +; GCN-NEXT: v_mov_b32_e32 v7, s11 +; GCN-NEXT: v_mov_b32_e32 v8, s12 +; GCN-NEXT: v_mov_b32_e32 v9, s13 +; GCN-NEXT: v_mov_b32_e32 v10, s14 +; GCN-NEXT: v_mov_b32_e32 v11, s15 +; GCN-NEXT: v_mov_b32_e32 v12, s16 +; GCN-NEXT: v_mov_b32_e32 v13, s17 +; GCN-NEXT: v_mov_b32_e32 v14, s18 +; GCN-NEXT: v_mov_b32_e32 v15, s19 ; GCN-NEXT: v_mov_b32_e32 v16, s24 ; GCN-NEXT: v_mov_b32_e32 v17, s25 ; GCN-NEXT: v_mov_b32_e32 v18, s26 @@ -914,9 +913,10 @@ define amdgpu_kernel void @double15_inselt(<15 x double> addrspace(1)* %out, <15 ; GCN-NEXT: v_mov_b32_e32 v21, s29 ; GCN-NEXT: v_mov_b32_e32 v22, s30 ; GCN-NEXT: v_mov_b32_e32 v23, s31 -; GCN-NEXT: v_mov_b32_e32 v25, s5 -; GCN-NEXT: v_mov_b32_e32 v26, s6 -; GCN-NEXT: v_mov_b32_e32 v27, s7 +; GCN-NEXT: v_mov_b32_e32 v24, s20 +; GCN-NEXT: v_mov_b32_e32 v25, s21 +; GCN-NEXT: v_mov_b32_e32 v26, s22 +; GCN-NEXT: v_mov_b32_e32 v27, s23 ; GCN-NEXT: v_mov_b32_e32 v29, s3 ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index c30a35996994bc..6248bef24b3a8a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -39,13 +39,13 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(<32 x float> addrspace(3)* no ; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:8192 ; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:8288 ; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v1 -; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[116:119], v4 offset:57456 +; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:24688 +; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:24672 +; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:24656 +; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:24640 +; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:24624 +; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:24608 +; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24592 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 @@ -60,28 +60,28 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(<32 x float> addrspace(3)* no ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] ; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:24608 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:24576 -; GCN-NEXT: ds_read_b128 a[112:115], v4 offset:57440 -; GCN-NEXT: ds_read_b128 a[108:111], v4 offset:57424 -; GCN-NEXT: ds_read_b128 a[104:107], v4 offset:57408 -; GCN-NEXT: ds_read_b128 a[88:91], v4 offset:57344 -; GCN-NEXT: ds_read_b128 a[92:95], v4 offset:57360 -; GCN-NEXT: ds_read_b128 a[96:99], v4 offset:57376 +; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24576 +; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:49264 +; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:49248 +; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:49232 +; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49216 +; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49200 +; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49184 +; GCN-NEXT: ds_read_b128 a[28:31], v4 offset:57456 +; GCN-NEXT: ds_read_b128 a[24:27], v4 offset:57440 +; GCN-NEXT: ds_read_b128 a[20:23], v4 offset:57424 +; GCN-NEXT: ds_read_b128 a[16:19], v4 offset:57408 +; GCN-NEXT: ds_read_b128 a[0:3], v4 offset:57344 +; GCN-NEXT: ds_read_b128 a[4:7], v4 offset:57360 +; GCN-NEXT: ds_read_b128 a[8:11], v4 offset:57376 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288 ; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304 ; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:49168 ; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:49152 -; GCN-NEXT: ds_read_b128 a[100:103], v4 offset:57392 +; GCN-NEXT: ds_read_b128 a[12:15], v4 offset:57392 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[88:119], v2, v3, a[88:119] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256 ; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:8272 ; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:8224 @@ -91,15 +91,15 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(<32 x float> addrspace(3)* no ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[56:87], v2, v3, a[56:87] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:32864 -; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:32880 -; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:32848 -; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:32800 -; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:32816 -; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:32768 -; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:32784 -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[88:119], v2, v3, a[88:119] ; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:24672 ; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:24688 ; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:24640 @@ -110,14 +110,14 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(<32 x float> addrspace(3)* no ; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24592 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 2 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:16480 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:16496 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:16448 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:16464 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:16416 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:16432 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:16384 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16400 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16400 ; GCN-NEXT: s_endpgm entry: call void @llvm.amdgcn.iglp.opt(i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 2c67f4a8cb7df1..10878248f46be5 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -2162,26 +2162,30 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspa ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 +; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s20, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s21, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s22, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s23, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s24, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s25, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s26, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s27, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s29, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s30, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s31, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s33, s17, 16 -; GCN-HSA-NEXT: s_lshr_b32 s34, s16, 16 -; GCN-HSA-NEXT: s_lshr_b32 s35, s19, 16 -; GCN-HSA-NEXT: s_lshr_b32 s36, s18, 16 +; GCN-HSA-NEXT: s_lshr_b32 s18, s1, 16 +; GCN-HSA-NEXT: s_lshr_b32 s19, s0, 16 +; GCN-HSA-NEXT: s_lshr_b32 s20, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s21, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s22, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s23, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s24, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s25, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s27, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s29, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s30, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s31, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s33, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s34, s14, 16 +; GCN-HSA-NEXT: s_and_b32 s35, s1, 0xffff +; GCN-HSA-NEXT: s_and_b32 s36, s0, 0xffff +; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff @@ -2194,80 +2198,76 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspa ; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-HSA-NEXT: s_and_b32 s17, s17, 0xffff -; GCN-HSA-NEXT: s_and_b32 s16, s16, 0xffff -; GCN-HSA-NEXT: s_and_b32 s19, s19, 0xffff -; GCN-HSA-NEXT: s_and_b32 s18, s18, 0xffff -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s34 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s31 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s30 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s26 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v32i16_to_v32i32: @@ -2548,112 +2548,112 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspa ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 +; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_ashr_i32 s20, s5, 16 -; GCN-HSA-NEXT: s_ashr_i32 s21, s4, 16 -; GCN-HSA-NEXT: s_ashr_i32 s22, s7, 16 -; GCN-HSA-NEXT: s_ashr_i32 s23, s6, 16 -; GCN-HSA-NEXT: s_ashr_i32 s24, s9, 16 -; GCN-HSA-NEXT: s_ashr_i32 s25, s8, 16 -; GCN-HSA-NEXT: s_ashr_i32 s26, s11, 16 -; GCN-HSA-NEXT: s_ashr_i32 s27, s10, 16 -; GCN-HSA-NEXT: s_ashr_i32 s28, s13, 16 -; GCN-HSA-NEXT: s_ashr_i32 s29, s12, 16 -; GCN-HSA-NEXT: s_ashr_i32 s30, s15, 16 -; GCN-HSA-NEXT: s_ashr_i32 s31, s14, 16 -; GCN-HSA-NEXT: s_ashr_i32 s33, s17, 16 -; GCN-HSA-NEXT: s_ashr_i32 s34, s16, 16 -; GCN-HSA-NEXT: s_ashr_i32 s35, s19, 16 -; GCN-HSA-NEXT: s_ashr_i32 s36, s18, 16 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s16, s16 -; GCN-HSA-NEXT: s_sext_i32_i16 s19, s19 -; GCN-HSA-NEXT: s_sext_i32_i16 s18, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: s_sext_i32_i16 s17, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: s_ashr_i32 s18, s1, 16 +; GCN-HSA-NEXT: s_ashr_i32 s19, s0, 16 +; GCN-HSA-NEXT: s_ashr_i32 s22, s3, 16 +; GCN-HSA-NEXT: s_ashr_i32 s23, s2, 16 +; GCN-HSA-NEXT: s_ashr_i32 s24, s5, 16 +; GCN-HSA-NEXT: s_ashr_i32 s25, s4, 16 +; GCN-HSA-NEXT: s_ashr_i32 s26, s7, 16 +; GCN-HSA-NEXT: s_ashr_i32 s27, s6, 16 +; GCN-HSA-NEXT: s_ashr_i32 s28, s9, 16 +; GCN-HSA-NEXT: s_ashr_i32 s29, s8, 16 +; GCN-HSA-NEXT: s_ashr_i32 s30, s11, 16 +; GCN-HSA-NEXT: s_ashr_i32 s31, s10, 16 +; GCN-HSA-NEXT: s_ashr_i32 s33, s13, 16 +; GCN-HSA-NEXT: s_ashr_i32 s34, s12, 16 +; GCN-HSA-NEXT: s_ashr_i32 s35, s15, 16 +; GCN-HSA-NEXT: s_ashr_i32 s36, s14, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s21, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 +; GCN-HSA-NEXT: s_sext_i32_i16 s20, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 +; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 +; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s34 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s17 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 -; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s26 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 ; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 ; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 -; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s26 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 +; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v32i16_to_v32i32: @@ -2853,28 +2853,28 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspa define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[36:51], s[18:19], 0x10 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s1, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s0, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s14, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s1, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s0, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s39, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s41, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s43, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s44, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s45, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s47, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s49, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s14, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s51, s1, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s52, s0, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s53, s3, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s54, s2, 0xffff @@ -2890,67 +2890,67 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa ; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s55, s37, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s56, s36, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s57, s39, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s58, s38, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s59, s41, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s60, s40, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s61, s43, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s62, s42, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s63, s45, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s64, s44, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s65, s47, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s66, s46, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s67, s49, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s68, s48, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s69, s51, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s70, s50, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s37, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s36, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s39, s39, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s38, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s40, s40, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s43, s43, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s42, s42, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s45, s45, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s44, s44, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s47, s47, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s46, s46, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s49, s49, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s48, s48, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s51, s51, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s50, s50, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s41, s41, 0xffff -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s55, s17, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s56, s16, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s57, s19, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s58, s18, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s59, s21, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s60, s20, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s61, s23, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s62, s22, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s63, s25, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s64, s24, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s65, s27, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s66, s26, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s67, s29, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s68, s28, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s69, s31, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s70, s30, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s17, s17, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s19, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s20, s20, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s23, s23, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s22, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s25, s25, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s24, s24, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s27, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s26, s26, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s29, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s28, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s31, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s30, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s21, s21, 0xffff +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s36 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s37 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s50 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s70 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s51 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s69 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s28 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s68 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s49 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s67 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s46 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s26 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s66 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s47 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s27 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s44 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s24 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s45 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s22 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s61 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 @@ -2958,64 +2958,64 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s57 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s55 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s50 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s49 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s48 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s47 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s46 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s45 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s44 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s43 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s42 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s40 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s38 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s53 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s52 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s51 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -3024,32 +3024,65 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-HSA-NEXT: s_load_dwordx16 s[36:51], s[18:19], 0x10 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s20, s1, 16 ; GCN-HSA-NEXT: s_lshr_b32 s21, s0, 16 ; GCN-HSA-NEXT: s_lshr_b32 s22, s3, 16 ; GCN-HSA-NEXT: s_lshr_b32 s23, s2, 16 ; GCN-HSA-NEXT: s_lshr_b32 s24, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s25, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s26, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s27, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s29, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s30, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s31, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s33, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s34, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s35, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s52, s14, 16 -; GCN-HSA-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s26, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s30, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s33, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s35, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s37, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s38, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s39, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s40, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s41, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s42, s14, 16 +; GCN-HSA-NEXT: s_and_b32 s25, s1, 0xffff +; GCN-HSA-NEXT: s_and_b32 s27, s0, 0xffff +; GCN-HSA-NEXT: s_and_b32 s29, s3, 0xffff +; GCN-HSA-NEXT: s_and_b32 s31, s2, 0xffff +; GCN-HSA-NEXT: s_and_b32 s34, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s36, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s43, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s44, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s45, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s46, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s47, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s48, s10, 0xffff +; GCN-HSA-NEXT: s_and_b32 s49, s13, 0xffff +; GCN-HSA-NEXT: s_and_b32 s50, s12, 0xffff +; GCN-HSA-NEXT: s_and_b32 s51, s15, 0xffff +; GCN-HSA-NEXT: s_and_b32 s52, s14, 0xffff +; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 +; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_lshr_b32 s18, s1, 16 +; GCN-HSA-NEXT: s_lshr_b32 s19, s0, 16 +; GCN-HSA-NEXT: s_lshr_b32 s53, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s54, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s55, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s56, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s57, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s58, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s59, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s60, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s61, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s62, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s63, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s64, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s65, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s66, s14, 16 +; GCN-HSA-NEXT: s_and_b32 s67, s1, 0xffff +; GCN-HSA-NEXT: s_and_b32 s68, s0, 0xffff ; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff ; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-HSA-NEXT: s_and_b32 s53, s7, 0xffff -; GCN-HSA-NEXT: s_and_b32 s54, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff @@ -3058,179 +3091,145 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa ; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-HSA-NEXT: s_lshr_b32 s18, s37, 16 -; GCN-HSA-NEXT: s_lshr_b32 s19, s36, 16 -; GCN-HSA-NEXT: s_lshr_b32 s55, s39, 16 -; GCN-HSA-NEXT: s_lshr_b32 s56, s38, 16 -; GCN-HSA-NEXT: s_lshr_b32 s57, s41, 16 -; GCN-HSA-NEXT: s_lshr_b32 s58, s40, 16 -; GCN-HSA-NEXT: s_lshr_b32 s59, s43, 16 -; GCN-HSA-NEXT: s_lshr_b32 s60, s42, 16 -; GCN-HSA-NEXT: s_lshr_b32 s61, s45, 16 -; GCN-HSA-NEXT: s_lshr_b32 s62, s44, 16 -; GCN-HSA-NEXT: s_lshr_b32 s63, s47, 16 -; GCN-HSA-NEXT: s_lshr_b32 s64, s46, 16 -; GCN-HSA-NEXT: s_lshr_b32 s65, s49, 16 -; GCN-HSA-NEXT: s_lshr_b32 s66, s48, 16 -; GCN-HSA-NEXT: s_lshr_b32 s67, s51, 16 -; GCN-HSA-NEXT: s_lshr_b32 s68, s50, 16 -; GCN-HSA-NEXT: s_and_b32 s37, s37, 0xffff -; GCN-HSA-NEXT: s_and_b32 s36, s36, 0xffff -; GCN-HSA-NEXT: s_and_b32 s39, s39, 0xffff -; GCN-HSA-NEXT: s_and_b32 s38, s38, 0xffff -; GCN-HSA-NEXT: s_and_b32 s41, s41, 0xffff -; GCN-HSA-NEXT: s_and_b32 s40, s40, 0xffff -; GCN-HSA-NEXT: s_and_b32 s43, s43, 0xffff -; GCN-HSA-NEXT: s_and_b32 s42, s42, 0xffff -; GCN-HSA-NEXT: s_and_b32 s45, s45, 0xffff -; GCN-HSA-NEXT: s_and_b32 s44, s44, 0xffff -; GCN-HSA-NEXT: s_and_b32 s47, s47, 0xffff -; GCN-HSA-NEXT: s_and_b32 s46, s46, 0xffff -; GCN-HSA-NEXT: s_and_b32 s49, s49, 0xffff -; GCN-HSA-NEXT: s_and_b32 s48, s48, 0xffff -; GCN-HSA-NEXT: s_and_b32 s51, s51, 0xffff -; GCN-HSA-NEXT: s_and_b32 s50, s50, 0xffff -; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xe0 -; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 -; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s62 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s61 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 -; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s60 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s59 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x80 -; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s60 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s66 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s65 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s58 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 -; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s57 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s64 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s63 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s39 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s68 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s14 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s67 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 ; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GCN-HSA-NEXT: s_add_u32 s6, s16, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GCN-HSA-NEXT: s_add_u32 s6, s16, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -3756,216 +3755,217 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa ; ; GCN-HSA-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x0 -; GCN-HSA-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x10 +; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_ashr_i32 s4, s17, 16 -; GCN-HSA-NEXT: s_ashr_i32 s5, s16, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s6, s17 -; GCN-HSA-NEXT: s_sext_i32_i16 s7, s16 -; GCN-HSA-NEXT: s_ashr_i32 s8, s19, 16 -; GCN-HSA-NEXT: s_ashr_i32 s9, s18, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s10, s19 -; GCN-HSA-NEXT: s_sext_i32_i16 s11, s18 -; GCN-HSA-NEXT: s_ashr_i32 s12, s21, 16 -; GCN-HSA-NEXT: s_ashr_i32 s13, s20, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s14, s21 -; GCN-HSA-NEXT: s_sext_i32_i16 s15, s20 -; GCN-HSA-NEXT: s_ashr_i32 s16, s23, 16 -; GCN-HSA-NEXT: s_ashr_i32 s17, s22, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s18, s23 -; GCN-HSA-NEXT: s_sext_i32_i16 s19, s22 -; GCN-HSA-NEXT: s_ashr_i32 s20, s25, 16 -; GCN-HSA-NEXT: s_ashr_i32 s21, s24, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s22, s25 -; GCN-HSA-NEXT: s_sext_i32_i16 s23, s24 -; GCN-HSA-NEXT: s_ashr_i32 s24, s27, 16 -; GCN-HSA-NEXT: s_ashr_i32 s25, s26, 16 -; GCN-HSA-NEXT: s_ashr_i32 s33, s29, 16 -; GCN-HSA-NEXT: s_ashr_i32 s34, s28, 16 -; GCN-HSA-NEXT: s_ashr_i32 s35, s31, 16 -; GCN-HSA-NEXT: s_ashr_i32 s52, s30, 16 -; GCN-HSA-NEXT: s_ashr_i32 s53, s37, 16 -; GCN-HSA-NEXT: s_ashr_i32 s54, s36, 16 -; GCN-HSA-NEXT: s_ashr_i32 s55, s39, 16 -; GCN-HSA-NEXT: s_ashr_i32 s56, s38, 16 -; GCN-HSA-NEXT: s_ashr_i32 s57, s41, 16 -; GCN-HSA-NEXT: s_ashr_i32 s58, s40, 16 -; GCN-HSA-NEXT: s_ashr_i32 s59, s43, 16 -; GCN-HSA-NEXT: s_ashr_i32 s60, s42, 16 -; GCN-HSA-NEXT: s_ashr_i32 s61, s45, 16 -; GCN-HSA-NEXT: s_ashr_i32 s62, s44, 16 -; GCN-HSA-NEXT: s_ashr_i32 s63, s47, 16 -; GCN-HSA-NEXT: s_ashr_i32 s64, s46, 16 -; GCN-HSA-NEXT: s_ashr_i32 s65, s49, 16 -; GCN-HSA-NEXT: s_ashr_i32 s66, s48, 16 -; GCN-HSA-NEXT: s_ashr_i32 s67, s51, 16 -; GCN-HSA-NEXT: s_ashr_i32 s68, s50, 16 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s47, s47 -; GCN-HSA-NEXT: s_sext_i32_i16 s46, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 +; GCN-HSA-NEXT: s_ashr_i32 s20, s1, 16 +; GCN-HSA-NEXT: s_ashr_i32 s21, s0, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s22, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s23, s0 +; GCN-HSA-NEXT: s_ashr_i32 s24, s3, 16 +; GCN-HSA-NEXT: s_ashr_i32 s25, s2, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s26, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s27, s2 +; GCN-HSA-NEXT: s_ashr_i32 s28, s5, 16 +; GCN-HSA-NEXT: s_ashr_i32 s29, s4, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s30, s5 +; GCN-HSA-NEXT: s_sext_i32_i16 s31, s4 +; GCN-HSA-NEXT: s_ashr_i32 s33, s7, 16 +; GCN-HSA-NEXT: s_ashr_i32 s34, s6, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s35, s7 +; GCN-HSA-NEXT: s_sext_i32_i16 s36, s6 +; GCN-HSA-NEXT: s_ashr_i32 s37, s9, 16 +; GCN-HSA-NEXT: s_ashr_i32 s38, s8, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s39, s9 +; GCN-HSA-NEXT: s_sext_i32_i16 s40, s8 +; GCN-HSA-NEXT: s_ashr_i32 s41, s11, 16 +; GCN-HSA-NEXT: s_ashr_i32 s42, s10, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s43, s11 +; GCN-HSA-NEXT: s_sext_i32_i16 s44, s10 +; GCN-HSA-NEXT: s_ashr_i32 s45, s13, 16 +; GCN-HSA-NEXT: s_ashr_i32 s46, s12, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s47, s13 +; GCN-HSA-NEXT: s_sext_i32_i16 s48, s12 +; GCN-HSA-NEXT: s_ashr_i32 s49, s15, 16 +; GCN-HSA-NEXT: s_ashr_i32 s50, s14, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s51, s15 +; GCN-HSA-NEXT: s_sext_i32_i16 s52, s14 +; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 +; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_ashr_i32 s18, s1, 16 +; GCN-HSA-NEXT: s_ashr_i32 s19, s0, 16 +; GCN-HSA-NEXT: s_ashr_i32 s55, s3, 16 +; GCN-HSA-NEXT: s_ashr_i32 s56, s2, 16 +; GCN-HSA-NEXT: s_ashr_i32 s57, s5, 16 +; GCN-HSA-NEXT: s_ashr_i32 s58, s4, 16 +; GCN-HSA-NEXT: s_ashr_i32 s59, s7, 16 +; GCN-HSA-NEXT: s_ashr_i32 s60, s6, 16 +; GCN-HSA-NEXT: s_ashr_i32 s61, s9, 16 +; GCN-HSA-NEXT: s_ashr_i32 s62, s8, 16 +; GCN-HSA-NEXT: s_ashr_i32 s63, s11, 16 +; GCN-HSA-NEXT: s_ashr_i32 s64, s10, 16 +; GCN-HSA-NEXT: s_ashr_i32 s65, s13, 16 +; GCN-HSA-NEXT: s_ashr_i32 s66, s12, 16 +; GCN-HSA-NEXT: s_ashr_i32 s67, s15, 16 +; GCN-HSA-NEXT: s_ashr_i32 s68, s14, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s54, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 +; GCN-HSA-NEXT: s_sext_i32_i16 s53, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 +; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] -; GCN-HSA-NEXT: s_sext_i32_i16 s45, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s44, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s44 +; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] -; GCN-HSA-NEXT: s_sext_i32_i16 s51, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_sext_i32_i16 s50, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s43, s43 -; GCN-HSA-NEXT: s_sext_i32_i16 s42, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 +; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 +; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s60 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] -; GCN-HSA-NEXT: s_sext_i32_i16 s49, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_sext_i32_i16 s48, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s36, s36 -; GCN-HSA-NEXT: s_sext_i32_i16 s39, s39 -; GCN-HSA-NEXT: s_sext_i32_i16 s38, s38 -; GCN-HSA-NEXT: s_sext_i32_i16 s41, s41 -; GCN-HSA-NEXT: s_sext_i32_i16 s40, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48 +; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2 +; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 +; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: s_sext_i32_i16 s29, s29 -; GCN-HSA-NEXT: s_sext_i32_i16 s28, s28 -; GCN-HSA-NEXT: s_sext_i32_i16 s31, s31 -; GCN-HSA-NEXT: s_sext_i32_i16 s30, s30 -; GCN-HSA-NEXT: s_sext_i32_i16 s37, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s57 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s39 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s54 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s54 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 ; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s27, s27 -; GCN-HSA-NEXT: s_sext_i32_i16 s26, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s20 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s12 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; @@ -6050,260 +6050,262 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspa define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s1, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s3, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s5, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s7, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s9, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s11, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s13, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s17, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s19, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s18, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s16, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s0, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s3, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s17, s17, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s19, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s13 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s33 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 +; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s3, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s20, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s21, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s22, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s23, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s24, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s25, s17, 16 -; GCN-HSA-NEXT: s_lshr_b32 s26, s19, 16 -; GCN-HSA-NEXT: s_lshr_b32 s27, s18, 16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s16, 16 -; GCN-HSA-NEXT: s_lshr_b32 s29, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s30, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s31, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s33, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s34, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s4, 16 -; GCN-HSA-NEXT: s_and_b32 s35, s4, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s19, s1, 16 +; GCN-HSA-NEXT: s_lshr_b32 s20, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s21, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s22, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s23, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s24, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s25, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s27, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s29, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s30, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s31, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s33, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s34, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s18, s0, 16 +; GCN-HSA-NEXT: s_and_b32 s35, s0, 0xffff +; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-HSA-NEXT: s_and_b32 s16, s16, 0xffff -; GCN-HSA-NEXT: s_and_b32 s18, s18, 0xffff -; GCN-HSA-NEXT: s_and_b32 s36, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s36, s1, 0xffff +; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff ; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff ; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff -; GCN-HSA-NEXT: s_and_b32 s17, s17, 0xffff -; GCN-HSA-NEXT: s_and_b32 s19, s19, 0xffff -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x90 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s19 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s17 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xa0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x80 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll index 3c26845ea48a38..28539a49a965ff 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s -; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s -; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY908 %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY90A %s +; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY90A %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY90A-GISEL %s ; RUN: llc -march=amdgcn -mcpu=gfx90a -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,FAST %s ; Check that Dst and SrcC of MFMA instructions reading more than 4 registers as SrcC @@ -29,8 +29,15 @@ bb: } ; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32: -; GREEDY: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] -; GREEDY: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] +; GREEDY908: v_mfma_f32_16x16x1{{.*}} a[18:33], v{{[0-9]+}}, v{{[0-9]+}}, a[18:33] +; GREEDY908: v_mfma_f32_16x16x1{{.*}} a[2:17], v{{[0-9]+}}, v{{[0-9]+}}, a[18:33] + +; GREEDY90A: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[16:31] +; GREEDY90A: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[16:31] + +; GREEDY90A-GISEL: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] +; GREEDY90A-GISEL: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] + ; FAST: v_mfma_f32_16x16x1{{.*}} a[32:47], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47] ; FAST: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47] ; GCN: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index 57e5342480929a..a615a5e6a7248c 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -792,23 +792,17 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i, ; ; GLOBALNESS0-LABEL: kernel: ; GLOBALNESS0: ; %bb.0: ; %bb -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s15, 0 ; GLOBALNESS0-NEXT: s_load_dwordx4 s[56:59], s[8:9], 0x0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s14, 1 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s10, 2 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s11, 3 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v44, 0 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s6, 4 ; GLOBALNESS0-NEXT: global_store_dword v[0:1], v44, off -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s7, 5 +; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7] ; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: global_load_dword v0, v44, s[56:57] ; GLOBALNESS0-NEXT: s_mov_b32 s61, 0 ; GLOBALNESS0-NEXT: s_mov_b32 s60, s61 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s4, 6 ; GLOBALNESS0-NEXT: s_mov_b32 s62, s61 ; GLOBALNESS0-NEXT: s_mov_b32 s63, s61 ; GLOBALNESS0-NEXT: s_mov_b32 s64, s61 @@ -840,7 +834,6 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i, ; GLOBALNESS0-NEXT: s_mov_b32 s90, s61 ; GLOBALNESS0-NEXT: s_mov_b32 s91, s61 ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a32, s60 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s5, 7 ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a33, s61 ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a34, s62 ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a35, s63 @@ -873,123 +866,123 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i, ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a62, s90 ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a63, s91 ; GLOBALNESS0-NEXT: s_movk_i32 s60, 0x80 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s60, 8 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s61, 9 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s62, 10 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s63, 11 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s64, 12 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s65, 13 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s66, 14 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s67, 15 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s68, 16 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s69, 17 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s70, 18 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s71, 19 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s72, 20 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s73, 21 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s74, 22 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s75, 23 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s76, 24 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s77, 25 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s78, 26 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s79, 27 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s80, 28 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s81, 29 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s82, 30 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s83, 31 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s84, 32 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s85, 33 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s86, 34 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s60, 0 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s61, 1 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s62, 2 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s63, 3 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s64, 4 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s65, 5 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s66, 6 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s67, 7 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s68, 8 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s69, 9 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s70, 10 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s71, 11 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s72, 12 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s73, 13 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s74, 14 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s75, 15 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s76, 16 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s77, 17 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s78, 18 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s79, 19 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s80, 20 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s81, 21 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s82, 22 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s83, 23 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s84, 24 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s85, 25 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s86, 26 +; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5] ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS0-NEXT: s_load_dword s7, s[8:9], 0x20 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s87, 35 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s88, 36 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s89, 37 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s87, 27 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s88, 28 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s89, 29 ; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, 0x40994400 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s90, 38 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s90, 30 ; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GLOBALNESS0-NEXT: s_mov_b64 s[48:49], s[8:9] -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s91, 39 +; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9] +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s91, 31 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[8:9], s[4:5], v[44:45] ; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s8, 40 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s8, 32 ; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s9, 41 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s9, 33 ; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s58, 0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s4, 42 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s5, 43 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s4, 34 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s5, 35 ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: s_xor_b64 s[38:39], s[4:5], -1 +; GLOBALNESS0-NEXT: s_xor_b64 s[46:47], s[4:5], -1 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s4, 44 +; GLOBALNESS0-NEXT: s_xor_b64 s[50:51], s[4:5], -1 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s7, 0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s5, 45 ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: s_getpc_b64 s[6:7] ; GLOBALNESS0-NEXT: s_add_u32 s6, s6, wobble@gotpcrel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12 -; GLOBALNESS0-NEXT: s_xor_b64 s[100:101], s[4:5], -1 +; GLOBALNESS0-NEXT: s_xor_b64 s[52:53], s[4:5], -1 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s4, 46 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s5, 47 -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[4:5], 1, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s4, 48 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s5, 49 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s4, 36 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s5, 37 ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s4, 50 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s5, 51 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s4, 38 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s5, 39 ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s4, 52 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s4, 40 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s5, 41 +; GLOBALNESS0-NEXT: s_mov_b32 s57, 0x3ff00000 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s56, 42 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s57, 43 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s58, 44 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s59, 45 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s60, 46 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s61, 47 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s62, 48 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s63, 49 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s64, 50 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s65, 51 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s66, 52 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s67, 53 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s68, 54 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s78, 0 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s69, 55 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s79, 1 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s70, 56 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s80, 2 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s71, 57 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s81, 3 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s72, 58 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s82, 4 +; GLOBALNESS0-NEXT: s_load_dwordx2 s[100:101], s[6:7], 0x0 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s73, 59 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s83, 5 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s74, 60 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s84, 6 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s75, 61 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s85, 7 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s76, 62 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s86, 8 +; GLOBALNESS0-NEXT: s_mov_b32 s44, s16 +; GLOBALNESS0-NEXT: s_mov_b32 s45, s15 +; GLOBALNESS0-NEXT: s_mov_b32 s42, s14 +; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11] ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v1 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s5, 53 -; GLOBALNESS0-NEXT: s_mov_b32 s41, 0x3ff00000 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s40, 54 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s50, 0 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s51, 1 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s52, 2 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s53, 3 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s54, 4 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s55, 5 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s56, 6 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s57, 7 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s58, 8 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s59, 9 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s60, 10 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s61, 11 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s62, 12 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s41, 55 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s63, 13 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s42, 56 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s64, 14 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s43, 57 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s65, 15 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s44, 58 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s66, 16 -; GLOBALNESS0-NEXT: s_load_dwordx2 s[36:37], s[6:7], 0x0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s45, 59 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s67, 17 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s46, 60 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s68, 18 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s47, 61 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s69, 19 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s48, 62 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s70, 20 -; GLOBALNESS0-NEXT: s_mov_b32 s33, s16 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s49, 63 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s71, 21 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[48:49], 1, v0 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s77, 63 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s87, 9 ; GLOBALNESS0-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS0-NEXT: s_branch .LBB1_4 ; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v41, 52 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v41, 53 +; GLOBALNESS0-NEXT: v_readlane_b32 s6, v41, 40 +; GLOBALNESS0-NEXT: v_readlane_b32 s7, v41, 41 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow6 @@ -999,7 +992,7 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i, ; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow19 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a63, v31 -; GLOBALNESS0-NEXT: v_readlane_b32 s4, v42, 22 +; GLOBALNESS0-NEXT: v_readlane_b32 s4, v42, 10 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a62, v30 ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a61, v29 @@ -1032,61 +1025,58 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i, ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a34, v2 ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a33, v1 ; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a32, v0 -; GLOBALNESS0-NEXT: v_readlane_b32 s5, v42, 23 +; GLOBALNESS0-NEXT: v_readlane_b32 s5, v42, 11 ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_30 ; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS0-NEXT: ; Child Loop BB1_17 Depth 2 -; GLOBALNESS0-NEXT: v_readlane_b32 s60, v41, 8 -; GLOBALNESS0-NEXT: v_readlane_b32 s61, v41, 9 +; GLOBALNESS0-NEXT: v_readlane_b32 s60, v41, 0 +; GLOBALNESS0-NEXT: v_readlane_b32 s61, v41, 1 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[60:61], s[60:61] op_sel:[0,1] ; GLOBALNESS0-NEXT: flat_load_dword v40, v[0:1] -; GLOBALNESS0-NEXT: s_add_u32 s8, s48, 40 +; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS0-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GLOBALNESS0-NEXT: flat_load_dword v46, v[0:1] -; GLOBALNESS0-NEXT: v_readlane_b32 s4, v41, 6 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v41, 4 -; GLOBALNESS0-NEXT: v_readlane_b32 s10, v41, 2 -; GLOBALNESS0-NEXT: s_addc_u32 s9, s49, 0 -; GLOBALNESS0-NEXT: v_readlane_b32 s5, v41, 7 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v41, 5 -; GLOBALNESS0-NEXT: v_readlane_b32 s11, v41, 3 -; GLOBALNESS0-NEXT: v_readlane_b32 s12, v41, 1 -; GLOBALNESS0-NEXT: v_readlane_b32 s13, v41, 0 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s33 +; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] +; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] +; GLOBALNESS0-NEXT: s_mov_b32 s12, s42 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s45 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s44 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 -; GLOBALNESS0-NEXT: v_readlane_b32 s62, v41, 10 -; GLOBALNESS0-NEXT: v_readlane_b32 s63, v41, 11 -; GLOBALNESS0-NEXT: v_readlane_b32 s64, v41, 12 -; GLOBALNESS0-NEXT: v_readlane_b32 s65, v41, 13 -; GLOBALNESS0-NEXT: v_readlane_b32 s66, v41, 14 -; GLOBALNESS0-NEXT: v_readlane_b32 s67, v41, 15 -; GLOBALNESS0-NEXT: v_readlane_b32 s68, v41, 16 -; GLOBALNESS0-NEXT: v_readlane_b32 s69, v41, 17 -; GLOBALNESS0-NEXT: v_readlane_b32 s70, v41, 18 -; GLOBALNESS0-NEXT: v_readlane_b32 s71, v41, 19 -; GLOBALNESS0-NEXT: v_readlane_b32 s72, v41, 20 -; GLOBALNESS0-NEXT: v_readlane_b32 s73, v41, 21 -; GLOBALNESS0-NEXT: v_readlane_b32 s74, v41, 22 -; GLOBALNESS0-NEXT: v_readlane_b32 s75, v41, 23 -; GLOBALNESS0-NEXT: v_readlane_b32 s76, v41, 24 -; GLOBALNESS0-NEXT: v_readlane_b32 s77, v41, 25 -; GLOBALNESS0-NEXT: v_readlane_b32 s78, v41, 26 -; GLOBALNESS0-NEXT: v_readlane_b32 s79, v41, 27 -; GLOBALNESS0-NEXT: v_readlane_b32 s80, v41, 28 -; GLOBALNESS0-NEXT: v_readlane_b32 s81, v41, 29 -; GLOBALNESS0-NEXT: v_readlane_b32 s82, v41, 30 -; GLOBALNESS0-NEXT: v_readlane_b32 s83, v41, 31 -; GLOBALNESS0-NEXT: v_readlane_b32 s84, v41, 32 -; GLOBALNESS0-NEXT: v_readlane_b32 s85, v41, 33 -; GLOBALNESS0-NEXT: v_readlane_b32 s86, v41, 34 -; GLOBALNESS0-NEXT: v_readlane_b32 s87, v41, 35 -; GLOBALNESS0-NEXT: v_readlane_b32 s88, v41, 36 -; GLOBALNESS0-NEXT: v_readlane_b32 s89, v41, 37 -; GLOBALNESS0-NEXT: v_readlane_b32 s90, v41, 38 -; GLOBALNESS0-NEXT: v_readlane_b32 s91, v41, 39 +; GLOBALNESS0-NEXT: v_readlane_b32 s62, v41, 2 +; GLOBALNESS0-NEXT: v_readlane_b32 s63, v41, 3 +; GLOBALNESS0-NEXT: v_readlane_b32 s64, v41, 4 +; GLOBALNESS0-NEXT: v_readlane_b32 s65, v41, 5 +; GLOBALNESS0-NEXT: v_readlane_b32 s66, v41, 6 +; GLOBALNESS0-NEXT: v_readlane_b32 s67, v41, 7 +; GLOBALNESS0-NEXT: v_readlane_b32 s68, v41, 8 +; GLOBALNESS0-NEXT: v_readlane_b32 s69, v41, 9 +; GLOBALNESS0-NEXT: v_readlane_b32 s70, v41, 10 +; GLOBALNESS0-NEXT: v_readlane_b32 s71, v41, 11 +; GLOBALNESS0-NEXT: v_readlane_b32 s72, v41, 12 +; GLOBALNESS0-NEXT: v_readlane_b32 s73, v41, 13 +; GLOBALNESS0-NEXT: v_readlane_b32 s74, v41, 14 +; GLOBALNESS0-NEXT: v_readlane_b32 s75, v41, 15 +; GLOBALNESS0-NEXT: v_readlane_b32 s76, v41, 16 +; GLOBALNESS0-NEXT: v_readlane_b32 s77, v41, 17 +; GLOBALNESS0-NEXT: v_readlane_b32 s78, v41, 18 +; GLOBALNESS0-NEXT: v_readlane_b32 s79, v41, 19 +; GLOBALNESS0-NEXT: v_readlane_b32 s80, v41, 20 +; GLOBALNESS0-NEXT: v_readlane_b32 s81, v41, 21 +; GLOBALNESS0-NEXT: v_readlane_b32 s82, v41, 22 +; GLOBALNESS0-NEXT: v_readlane_b32 s83, v41, 23 +; GLOBALNESS0-NEXT: v_readlane_b32 s84, v41, 24 +; GLOBALNESS0-NEXT: v_readlane_b32 s85, v41, 25 +; GLOBALNESS0-NEXT: v_readlane_b32 s86, v41, 26 +; GLOBALNESS0-NEXT: v_readlane_b32 s87, v41, 27 +; GLOBALNESS0-NEXT: v_readlane_b32 s88, v41, 28 +; GLOBALNESS0-NEXT: v_readlane_b32 s89, v41, 29 +; GLOBALNESS0-NEXT: v_readlane_b32 s90, v41, 30 +; GLOBALNESS0-NEXT: v_readlane_b32 s91, v41, 31 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[36:37] +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[100:101] ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[54:55] ; GLOBALNESS0-NEXT: ; kill: killed $sgpr4_sgpr5 @@ -1115,37 +1105,41 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i, ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: .LBB1_9: ; %Flow18 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s8, 22 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s9, 23 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s8, 10 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s9, 11 ; GLOBALNESS0-NEXT: .LBB1_10: ; %Flow16 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s60, v41, 8 -; GLOBALNESS0-NEXT: v_readlane_b32 s61, v41, 9 -; GLOBALNESS0-NEXT: v_readlane_b32 s68, v41, 16 -; GLOBALNESS0-NEXT: v_readlane_b32 s69, v41, 17 -; GLOBALNESS0-NEXT: s_mov_b64 s[56:57], s[60:61] -; GLOBALNESS0-NEXT: v_readlane_b32 s70, v41, 18 -; GLOBALNESS0-NEXT: v_readlane_b32 s71, v41, 19 -; GLOBALNESS0-NEXT: v_readlane_b32 s72, v41, 20 -; GLOBALNESS0-NEXT: v_readlane_b32 s73, v41, 21 -; GLOBALNESS0-NEXT: v_readlane_b32 s74, v41, 22 -; GLOBALNESS0-NEXT: v_readlane_b32 s75, v41, 23 -; GLOBALNESS0-NEXT: v_readlane_b32 s76, v41, 24 -; GLOBALNESS0-NEXT: v_readlane_b32 s77, v41, 25 -; GLOBALNESS0-NEXT: v_readlane_b32 s78, v41, 26 -; GLOBALNESS0-NEXT: v_readlane_b32 s79, v41, 27 -; GLOBALNESS0-NEXT: v_readlane_b32 s80, v41, 28 -; GLOBALNESS0-NEXT: v_readlane_b32 s81, v41, 29 -; GLOBALNESS0-NEXT: v_readlane_b32 s82, v41, 30 -; GLOBALNESS0-NEXT: v_readlane_b32 s83, v41, 31 -; GLOBALNESS0-NEXT: v_readlane_b32 s84, v41, 32 -; GLOBALNESS0-NEXT: v_readlane_b32 s85, v41, 33 -; GLOBALNESS0-NEXT: v_readlane_b32 s86, v41, 34 -; GLOBALNESS0-NEXT: v_readlane_b32 s87, v41, 35 -; GLOBALNESS0-NEXT: v_readlane_b32 s88, v41, 36 -; GLOBALNESS0-NEXT: v_readlane_b32 s89, v41, 37 -; GLOBALNESS0-NEXT: v_readlane_b32 s90, v41, 38 -; GLOBALNESS0-NEXT: v_readlane_b32 s91, v41, 39 +; GLOBALNESS0-NEXT: v_readlane_b32 s64, v41, 0 +; GLOBALNESS0-NEXT: v_readlane_b32 s65, v41, 1 +; GLOBALNESS0-NEXT: v_readlane_b32 s68, v41, 4 +; GLOBALNESS0-NEXT: v_readlane_b32 s69, v41, 5 +; GLOBALNESS0-NEXT: s_mov_b64 s[56:57], s[64:65] +; GLOBALNESS0-NEXT: v_readlane_b32 s70, v41, 6 +; GLOBALNESS0-NEXT: v_readlane_b32 s71, v41, 7 +; GLOBALNESS0-NEXT: v_readlane_b32 s72, v41, 8 +; GLOBALNESS0-NEXT: v_readlane_b32 s73, v41, 9 +; GLOBALNESS0-NEXT: v_readlane_b32 s74, v41, 10 +; GLOBALNESS0-NEXT: v_readlane_b32 s75, v41, 11 +; GLOBALNESS0-NEXT: v_readlane_b32 s76, v41, 12 +; GLOBALNESS0-NEXT: v_readlane_b32 s77, v41, 13 +; GLOBALNESS0-NEXT: v_readlane_b32 s78, v41, 14 +; GLOBALNESS0-NEXT: v_readlane_b32 s79, v41, 15 +; GLOBALNESS0-NEXT: v_readlane_b32 s80, v41, 16 +; GLOBALNESS0-NEXT: v_readlane_b32 s81, v41, 17 +; GLOBALNESS0-NEXT: v_readlane_b32 s82, v41, 18 +; GLOBALNESS0-NEXT: v_readlane_b32 s83, v41, 19 +; GLOBALNESS0-NEXT: v_readlane_b32 s84, v41, 20 +; GLOBALNESS0-NEXT: v_readlane_b32 s85, v41, 21 +; GLOBALNESS0-NEXT: v_readlane_b32 s86, v41, 22 +; GLOBALNESS0-NEXT: v_readlane_b32 s87, v41, 23 +; GLOBALNESS0-NEXT: v_readlane_b32 s88, v41, 24 +; GLOBALNESS0-NEXT: v_readlane_b32 s89, v41, 25 +; GLOBALNESS0-NEXT: v_readlane_b32 s90, v41, 26 +; GLOBALNESS0-NEXT: v_readlane_b32 s91, v41, 27 +; GLOBALNESS0-NEXT: v_readlane_b32 s92, v41, 28 +; GLOBALNESS0-NEXT: v_readlane_b32 s93, v41, 29 +; GLOBALNESS0-NEXT: v_readlane_b32 s94, v41, 30 +; GLOBALNESS0-NEXT: v_readlane_b32 s95, v41, 31 ; GLOBALNESS0-NEXT: s_mov_b32 s68, s57 ; GLOBALNESS0-NEXT: s_mov_b32 s69, s57 ; GLOBALNESS0-NEXT: s_mov_b32 s70, s57 @@ -1196,68 +1190,74 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i, ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[26:27], s[94:95], s[94:95] op_sel:[0,1] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[28:29], s[96:97], s[96:97] op_sel:[0,1] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[30:31], s[98:99], s[98:99] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_readlane_b32 s62, v41, 10 -; GLOBALNESS0-NEXT: v_readlane_b32 s63, v41, 11 -; GLOBALNESS0-NEXT: v_readlane_b32 s64, v41, 12 -; GLOBALNESS0-NEXT: v_readlane_b32 s65, v41, 13 -; GLOBALNESS0-NEXT: v_readlane_b32 s66, v41, 14 -; GLOBALNESS0-NEXT: v_readlane_b32 s67, v41, 15 +; GLOBALNESS0-NEXT: v_readlane_b32 s66, v41, 2 +; GLOBALNESS0-NEXT: v_readlane_b32 s67, v41, 3 ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_3 ; GLOBALNESS0-NEXT: ; %bb.11: ; %baz.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: flat_load_dword v0, v[0:1] -; GLOBALNESS0-NEXT: v_readlane_b32 s60, v41, 8 -; GLOBALNESS0-NEXT: v_readlane_b32 s61, v41, 9 -; GLOBALNESS0-NEXT: v_readlane_b32 s62, v41, 10 -; GLOBALNESS0-NEXT: v_readlane_b32 s63, v41, 11 -; GLOBALNESS0-NEXT: v_readlane_b32 s64, v41, 12 -; GLOBALNESS0-NEXT: v_readlane_b32 s65, v41, 13 -; GLOBALNESS0-NEXT: v_readlane_b32 s66, v41, 14 -; GLOBALNESS0-NEXT: v_readlane_b32 s67, v41, 15 -; GLOBALNESS0-NEXT: v_readlane_b32 s68, v41, 16 -; GLOBALNESS0-NEXT: v_readlane_b32 s69, v41, 17 -; GLOBALNESS0-NEXT: v_readlane_b32 s70, v41, 18 -; GLOBALNESS0-NEXT: v_readlane_b32 s71, v41, 19 -; GLOBALNESS0-NEXT: v_readlane_b32 s72, v41, 20 -; GLOBALNESS0-NEXT: v_readlane_b32 s73, v41, 21 -; GLOBALNESS0-NEXT: v_readlane_b32 s74, v41, 22 -; GLOBALNESS0-NEXT: v_readlane_b32 s75, v41, 23 -; GLOBALNESS0-NEXT: v_readlane_b32 s76, v41, 24 -; GLOBALNESS0-NEXT: v_readlane_b32 s77, v41, 25 -; GLOBALNESS0-NEXT: v_readlane_b32 s78, v41, 26 -; GLOBALNESS0-NEXT: v_readlane_b32 s79, v41, 27 -; GLOBALNESS0-NEXT: v_readlane_b32 s80, v41, 28 -; GLOBALNESS0-NEXT: v_readlane_b32 s81, v41, 29 -; GLOBALNESS0-NEXT: v_readlane_b32 s82, v41, 30 -; GLOBALNESS0-NEXT: v_readlane_b32 s83, v41, 31 -; GLOBALNESS0-NEXT: v_readlane_b32 s84, v41, 32 -; GLOBALNESS0-NEXT: v_readlane_b32 s85, v41, 33 -; GLOBALNESS0-NEXT: v_readlane_b32 s86, v41, 34 -; GLOBALNESS0-NEXT: v_readlane_b32 s87, v41, 35 -; GLOBALNESS0-NEXT: v_readlane_b32 s88, v41, 36 -; GLOBALNESS0-NEXT: v_readlane_b32 s89, v41, 37 -; GLOBALNESS0-NEXT: v_readlane_b32 s90, v41, 38 -; GLOBALNESS0-NEXT: v_readlane_b32 s91, v41, 39 +; GLOBALNESS0-NEXT: v_readlane_b32 s60, v41, 0 +; GLOBALNESS0-NEXT: v_readlane_b32 s61, v41, 1 +; GLOBALNESS0-NEXT: v_readlane_b32 s62, v41, 2 +; GLOBALNESS0-NEXT: v_readlane_b32 s63, v41, 3 +; GLOBALNESS0-NEXT: v_readlane_b32 s64, v41, 4 +; GLOBALNESS0-NEXT: v_readlane_b32 s65, v41, 5 +; GLOBALNESS0-NEXT: v_readlane_b32 s66, v41, 6 +; GLOBALNESS0-NEXT: v_readlane_b32 s67, v41, 7 +; GLOBALNESS0-NEXT: v_readlane_b32 s68, v41, 8 +; GLOBALNESS0-NEXT: v_readlane_b32 s69, v41, 9 +; GLOBALNESS0-NEXT: v_readlane_b32 s70, v41, 10 +; GLOBALNESS0-NEXT: v_readlane_b32 s71, v41, 11 +; GLOBALNESS0-NEXT: v_readlane_b32 s72, v41, 12 +; GLOBALNESS0-NEXT: v_readlane_b32 s73, v41, 13 +; GLOBALNESS0-NEXT: v_readlane_b32 s74, v41, 14 +; GLOBALNESS0-NEXT: v_readlane_b32 s75, v41, 15 +; GLOBALNESS0-NEXT: v_readlane_b32 s76, v41, 16 +; GLOBALNESS0-NEXT: v_readlane_b32 s77, v41, 17 +; GLOBALNESS0-NEXT: v_readlane_b32 s78, v41, 18 +; GLOBALNESS0-NEXT: v_readlane_b32 s79, v41, 19 +; GLOBALNESS0-NEXT: v_readlane_b32 s80, v41, 20 +; GLOBALNESS0-NEXT: v_readlane_b32 s81, v41, 21 +; GLOBALNESS0-NEXT: v_readlane_b32 s82, v41, 22 +; GLOBALNESS0-NEXT: v_readlane_b32 s83, v41, 23 +; GLOBALNESS0-NEXT: v_readlane_b32 s84, v41, 24 +; GLOBALNESS0-NEXT: v_readlane_b32 s85, v41, 25 +; GLOBALNESS0-NEXT: v_readlane_b32 s86, v41, 26 +; GLOBALNESS0-NEXT: v_readlane_b32 s87, v41, 27 +; GLOBALNESS0-NEXT: v_readlane_b32 s88, v41, 28 +; GLOBALNESS0-NEXT: v_readlane_b32 s89, v41, 29 +; GLOBALNESS0-NEXT: v_readlane_b32 s90, v41, 30 +; GLOBALNESS0-NEXT: v_readlane_b32 s91, v41, 31 ; GLOBALNESS0-NEXT: s_mov_b64 s[56:57], s[60:61] -; GLOBALNESS0-NEXT: v_readlane_b32 s60, v41, 54 -; GLOBALNESS0-NEXT: v_readlane_b32 s61, v41, 55 -; GLOBALNESS0-NEXT: v_readlane_b32 s62, v41, 56 -; GLOBALNESS0-NEXT: v_readlane_b32 s63, v41, 57 -; GLOBALNESS0-NEXT: v_readlane_b32 s64, v41, 58 -; GLOBALNESS0-NEXT: v_readlane_b32 s65, v41, 59 -; GLOBALNESS0-NEXT: v_readlane_b32 s66, v41, 60 -; GLOBALNESS0-NEXT: v_readlane_b32 s67, v41, 61 -; GLOBALNESS0-NEXT: v_readlane_b32 s68, v41, 62 -; GLOBALNESS0-NEXT: v_readlane_b32 s69, v41, 63 -; GLOBALNESS0-NEXT: v_readlane_b32 s70, v42, 0 -; GLOBALNESS0-NEXT: v_readlane_b32 s71, v42, 1 -; GLOBALNESS0-NEXT: v_readlane_b32 s72, v42, 2 -; GLOBALNESS0-NEXT: v_readlane_b32 s73, v42, 3 -; GLOBALNESS0-NEXT: v_readlane_b32 s74, v42, 4 -; GLOBALNESS0-NEXT: v_readlane_b32 s75, v42, 5 -; GLOBALNESS0-NEXT: v_readlane_b32 s76, v42, 6 -; GLOBALNESS0-NEXT: v_readlane_b32 s77, v42, 7 +; GLOBALNESS0-NEXT: v_readlane_b32 s60, v41, 42 +; GLOBALNESS0-NEXT: v_readlane_b32 s61, v41, 43 +; GLOBALNESS0-NEXT: v_readlane_b32 s62, v41, 44 +; GLOBALNESS0-NEXT: v_readlane_b32 s63, v41, 45 +; GLOBALNESS0-NEXT: v_readlane_b32 s64, v41, 46 +; GLOBALNESS0-NEXT: v_readlane_b32 s65, v41, 47 +; GLOBALNESS0-NEXT: v_readlane_b32 s66, v41, 48 +; GLOBALNESS0-NEXT: v_readlane_b32 s67, v41, 49 +; GLOBALNESS0-NEXT: v_readlane_b32 s68, v41, 50 +; GLOBALNESS0-NEXT: v_readlane_b32 s69, v41, 51 +; GLOBALNESS0-NEXT: v_readlane_b32 s70, v41, 52 +; GLOBALNESS0-NEXT: v_readlane_b32 s71, v41, 53 +; GLOBALNESS0-NEXT: v_readlane_b32 s72, v41, 54 +; GLOBALNESS0-NEXT: v_readlane_b32 s73, v41, 55 +; GLOBALNESS0-NEXT: v_readlane_b32 s74, v41, 56 +; GLOBALNESS0-NEXT: v_readlane_b32 s75, v41, 57 +; GLOBALNESS0-NEXT: v_readlane_b32 s76, v41, 58 +; GLOBALNESS0-NEXT: v_readlane_b32 s77, v41, 59 +; GLOBALNESS0-NEXT: v_readlane_b32 s78, v41, 60 +; GLOBALNESS0-NEXT: v_readlane_b32 s79, v41, 61 +; GLOBALNESS0-NEXT: v_readlane_b32 s80, v41, 62 +; GLOBALNESS0-NEXT: v_readlane_b32 s81, v41, 63 +; GLOBALNESS0-NEXT: v_readlane_b32 s82, v42, 0 +; GLOBALNESS0-NEXT: v_readlane_b32 s83, v42, 1 +; GLOBALNESS0-NEXT: v_readlane_b32 s84, v42, 2 +; GLOBALNESS0-NEXT: v_readlane_b32 s85, v42, 3 +; GLOBALNESS0-NEXT: v_readlane_b32 s86, v42, 4 +; GLOBALNESS0-NEXT: v_readlane_b32 s87, v42, 5 ; GLOBALNESS0-NEXT: s_mov_b32 s60, s57 ; GLOBALNESS0-NEXT: s_mov_b32 s62, s57 ; GLOBALNESS0-NEXT: s_mov_b32 s63, s61 @@ -1269,63 +1269,53 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i, ; GLOBALNESS0-NEXT: s_mov_b32 s69, s61 ; GLOBALNESS0-NEXT: s_mov_b32 s70, s57 ; GLOBALNESS0-NEXT: s_mov_b32 s71, s61 -; GLOBALNESS0-NEXT: s_mov_b32 s41, s61 -; GLOBALNESS0-NEXT: s_mov_b64 s[96:97], s[54:55] -; GLOBALNESS0-NEXT: v_readlane_b32 s78, v42, 8 -; GLOBALNESS0-NEXT: v_readlane_b32 s79, v42, 9 -; GLOBALNESS0-NEXT: v_readlane_b32 s80, v42, 10 -; GLOBALNESS0-NEXT: v_readlane_b32 s81, v42, 11 -; GLOBALNESS0-NEXT: v_readlane_b32 s82, v42, 12 -; GLOBALNESS0-NEXT: v_readlane_b32 s83, v42, 13 -; GLOBALNESS0-NEXT: v_readlane_b32 s84, v42, 14 -; GLOBALNESS0-NEXT: v_readlane_b32 s85, v42, 15 -; GLOBALNESS0-NEXT: v_readlane_b32 s86, v42, 16 -; GLOBALNESS0-NEXT: v_readlane_b32 s87, v42, 17 -; GLOBALNESS0-NEXT: v_readlane_b32 s88, v42, 18 -; GLOBALNESS0-NEXT: v_readlane_b32 s89, v42, 19 -; GLOBALNESS0-NEXT: v_readlane_b32 s90, v42, 20 -; GLOBALNESS0-NEXT: v_readlane_b32 s91, v42, 21 ; GLOBALNESS0-NEXT: s_mov_b32 s72, s57 ; GLOBALNESS0-NEXT: s_mov_b32 s73, s61 ; GLOBALNESS0-NEXT: s_mov_b32 s74, s57 ; GLOBALNESS0-NEXT: s_mov_b32 s75, s61 ; GLOBALNESS0-NEXT: s_mov_b32 s76, s57 ; GLOBALNESS0-NEXT: s_mov_b32 s77, s61 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s40, 54 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s50, 0 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s51, 1 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s52, 2 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s53, 3 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s54, 4 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s55, 5 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s56, 6 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s57, 7 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s58, 8 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s59, 9 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s60, 10 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s61, 11 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s62, 12 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s41, 55 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s63, 13 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s42, 56 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s64, 14 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s43, 57 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s65, 15 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s44, 58 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s66, 16 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s45, 59 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s67, 17 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s46, 60 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s68, 18 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s47, 61 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s69, 19 +; GLOBALNESS0-NEXT: s_mov_b32 s57, s61 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s56, 42 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s57, 43 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s58, 44 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s59, 45 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s60, 46 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s61, 47 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s62, 48 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s63, 49 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s64, 50 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s65, 51 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s66, 52 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s67, 53 +; GLOBALNESS0-NEXT: v_readlane_b32 s88, v42, 6 +; GLOBALNESS0-NEXT: v_readlane_b32 s89, v42, 7 +; GLOBALNESS0-NEXT: v_readlane_b32 s90, v42, 8 +; GLOBALNESS0-NEXT: v_readlane_b32 s91, v42, 9 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s68, 54 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s78, 0 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s69, 55 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s79, 1 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s70, 56 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s80, 2 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s71, 57 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s81, 3 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s72, 58 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s82, 4 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s73, 59 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s83, 5 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s74, 60 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s84, 6 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s75, 61 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s85, 7 +; GLOBALNESS0-NEXT: s_mov_b64 s[92:93], s[54:55] ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s48, 62 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s70, 20 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[54:55], 0, v0 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s76, 62 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s86, 8 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[60:61], s[60:61] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s49, 63 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s71, 21 +; GLOBALNESS0-NEXT: v_writelane_b32 v41, s77, 63 +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s87, 9 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], s[62:63], s[62:63] op_sel:[0,1] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[4:5], s[64:65], s[64:65] op_sel:[0,1] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[6:7], s[66:67], s[66:67] op_sel:[0,1] @@ -1341,17 +1331,15 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i, ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[26:27], s[86:87], s[86:87] op_sel:[0,1] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[28:29], s[88:89], s[88:89] op_sel:[0,1] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[30:31], s[90:91], s[90:91] op_sel:[0,1] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[34:35], s[6:7] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[88:89], s[54:55] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS0-NEXT: ; %bb.12: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GLOBALNESS0-NEXT: v_readlane_b32 s4, v41, 46 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s6, 24 -; GLOBALNESS0-NEXT: v_readlane_b32 s5, v41, 47 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s7, 25 -; GLOBALNESS0-NEXT: s_mov_b32 s99, s59 +; GLOBALNESS0-NEXT: v_readlane_b32 s4, v41, 36 +; GLOBALNESS0-NEXT: v_readlane_b32 s5, v41, 37 +; GLOBALNESS0-NEXT: s_mov_b32 s91, s59 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_14 ; GLOBALNESS0-NEXT: ; %bb.13: ; %bb39.i @@ -1362,81 +1350,69 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i, ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc -; GLOBALNESS0-NEXT: v_readlane_b32 s60, v41, 40 -; GLOBALNESS0-NEXT: v_readlane_b32 s62, v41, 42 +; GLOBALNESS0-NEXT: v_readlane_b32 s60, v41, 34 +; GLOBALNESS0-NEXT: v_readlane_b32 s62, v41, 32 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[56:57], 0, v[0:1] ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[58:59], 0, v2 -; GLOBALNESS0-NEXT: v_readlane_b32 s61, v41, 41 -; GLOBALNESS0-NEXT: v_readlane_b32 s63, v41, 43 +; GLOBALNESS0-NEXT: v_readlane_b32 s61, v41, 35 +; GLOBALNESS0-NEXT: v_readlane_b32 s63, v41, 33 ; GLOBALNESS0-NEXT: s_branch .LBB1_17 ; GLOBALNESS0-NEXT: .LBB1_15: ; %Flow7 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_17 Depth=2 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS0-NEXT: .LBB1_16: ; %bb63.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_17 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[100:101] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[52:53] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS0-NEXT: .LBB1_17: ; %bb44.i ; GLOBALNESS0-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS0-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[38:39] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[46:47] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_16 ; GLOBALNESS0-NEXT: ; %bb.18: ; %bb46.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_17 Depth=2 -; GLOBALNESS0-NEXT: v_readlane_b32 s4, v41, 44 -; GLOBALNESS0-NEXT: v_readlane_b32 s5, v41, 45 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[50:51] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_16 ; GLOBALNESS0-NEXT: ; %bb.19: ; %bb50.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_17 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[60:61] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[62:63] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_22 ; GLOBALNESS0-NEXT: ; %bb.20: ; %bb3.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_17 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[62:63] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[60:61] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_22 ; GLOBALNESS0-NEXT: ; %bb.21: ; %bb6.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_17 Depth=2 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[56:57] ; GLOBALNESS0-NEXT: .LBB1_22: ; %spam.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_17 Depth=2 -; GLOBALNESS0-NEXT: v_readlane_b32 s4, v41, 48 -; GLOBALNESS0-NEXT: v_readlane_b32 s5, v41, 49 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[48:49] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_16 ; GLOBALNESS0-NEXT: ; %bb.23: ; %bb55.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_17 Depth=2 -; GLOBALNESS0-NEXT: s_add_u32 s64, s48, 40 -; GLOBALNESS0-NEXT: v_readlane_b32 s40, v41, 6 -; GLOBALNESS0-NEXT: v_readlane_b32 s42, v41, 4 -; GLOBALNESS0-NEXT: v_readlane_b32 s44, v41, 2 -; GLOBALNESS0-NEXT: s_addc_u32 s65, s49, 0 -; GLOBALNESS0-NEXT: v_readlane_b32 s41, v41, 7 -; GLOBALNESS0-NEXT: v_readlane_b32 s43, v41, 5 -; GLOBALNESS0-NEXT: v_readlane_b32 s45, v41, 3 -; GLOBALNESS0-NEXT: v_readlane_b32 s46, v41, 1 -; GLOBALNESS0-NEXT: v_readlane_b32 s47, v41, 0 +; GLOBALNESS0-NEXT: s_add_u32 s64, s38, 40 +; GLOBALNESS0-NEXT: s_addc_u32 s65, s39, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[42:43] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[64:65] -; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[44:45] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s46 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s47 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s33 +; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] +; GLOBALNESS0-NEXT: s_mov_b32 s12, s42 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s45 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s44 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[36:37] +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[100:101] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[42:43] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[64:65] -; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[44:45] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s46 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s47 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s33 +; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] +; GLOBALNESS0-NEXT: s_mov_b32 s12, s42 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s45 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s44 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[0:1], a[32:33], off -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[36:37] +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[100:101] ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[58:59] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.24: ; %bb62.i @@ -1447,250 +1423,10 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i, ; GLOBALNESS0-NEXT: s_branch .LBB1_15 ; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow14 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s56, v41, 8 -; GLOBALNESS0-NEXT: v_readlane_b32 s57, v41, 9 -; GLOBALNESS0-NEXT: v_readlane_b32 s58, v41, 10 -; GLOBALNESS0-NEXT: v_readlane_b32 s59, v41, 11 -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[48:49] -; GLOBALNESS0-NEXT: s_mov_b64 s[48:49], s[56:57] -; GLOBALNESS0-NEXT: v_readlane_b32 s60, v41, 12 -; GLOBALNESS0-NEXT: v_readlane_b32 s61, v41, 13 -; GLOBALNESS0-NEXT: v_readlane_b32 s62, v41, 14 -; GLOBALNESS0-NEXT: v_readlane_b32 s63, v41, 15 -; GLOBALNESS0-NEXT: v_readlane_b32 s64, v41, 16 -; GLOBALNESS0-NEXT: v_readlane_b32 s65, v41, 17 -; GLOBALNESS0-NEXT: s_mov_b32 s56, s49 -; GLOBALNESS0-NEXT: s_mov_b32 s57, s49 -; GLOBALNESS0-NEXT: s_mov_b32 s58, s49 -; GLOBALNESS0-NEXT: s_mov_b32 s59, s49 -; GLOBALNESS0-NEXT: v_readlane_b32 s66, v41, 18 -; GLOBALNESS0-NEXT: v_readlane_b32 s67, v41, 19 -; GLOBALNESS0-NEXT: s_mov_b32 s60, s49 -; GLOBALNESS0-NEXT: s_mov_b32 s61, s49 -; GLOBALNESS0-NEXT: s_mov_b32 s62, s49 -; GLOBALNESS0-NEXT: s_mov_b32 s63, s49 -; GLOBALNESS0-NEXT: s_mov_b32 s64, s49 -; GLOBALNESS0-NEXT: s_mov_b32 s65, s49 -; GLOBALNESS0-NEXT: s_mov_b64 s[52:53], s[56:57] -; GLOBALNESS0-NEXT: s_mov_b32 s66, s49 -; GLOBALNESS0-NEXT: s_mov_b32 s67, s49 -; GLOBALNESS0-NEXT: s_mov_b64 s[54:55], s[58:59] -; GLOBALNESS0-NEXT: s_mov_b64 s[56:57], s[60:61] -; GLOBALNESS0-NEXT: s_mov_b64 s[58:59], s[62:63] -; GLOBALNESS0-NEXT: s_mov_b64 s[60:61], s[64:65] -; GLOBALNESS0-NEXT: v_readlane_b32 s68, v41, 20 -; GLOBALNESS0-NEXT: v_readlane_b32 s69, v41, 21 -; GLOBALNESS0-NEXT: v_readlane_b32 s70, v41, 22 -; GLOBALNESS0-NEXT: v_readlane_b32 s71, v41, 23 -; GLOBALNESS0-NEXT: v_readlane_b32 s72, v41, 24 -; GLOBALNESS0-NEXT: v_readlane_b32 s73, v41, 25 -; GLOBALNESS0-NEXT: v_readlane_b32 s74, v41, 26 -; GLOBALNESS0-NEXT: v_readlane_b32 s75, v41, 27 -; GLOBALNESS0-NEXT: v_readlane_b32 s76, v41, 28 -; GLOBALNESS0-NEXT: v_readlane_b32 s77, v41, 29 -; GLOBALNESS0-NEXT: v_readlane_b32 s78, v41, 30 -; GLOBALNESS0-NEXT: v_readlane_b32 s79, v41, 31 -; GLOBALNESS0-NEXT: s_mov_b64 s[62:63], s[66:67] -; GLOBALNESS0-NEXT: v_readlane_b32 s80, v41, 32 -; GLOBALNESS0-NEXT: v_readlane_b32 s81, v41, 33 -; GLOBALNESS0-NEXT: v_readlane_b32 s82, v41, 34 -; GLOBALNESS0-NEXT: v_readlane_b32 s83, v41, 35 -; GLOBALNESS0-NEXT: v_readlane_b32 s84, v41, 36 -; GLOBALNESS0-NEXT: v_readlane_b32 s85, v41, 37 -; GLOBALNESS0-NEXT: v_readlane_b32 s86, v41, 38 -; GLOBALNESS0-NEXT: v_readlane_b32 s87, v41, 39 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s48, 8 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s49, 9 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s50, 10 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s51, 11 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s52, 12 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s53, 13 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s54, 14 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s55, 15 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s56, 16 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s57, 17 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s58, 18 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s59, 19 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s60, 20 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s61, 21 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s62, 22 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s63, 23 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s64, 24 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s65, 25 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s66, 26 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s67, 27 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s68, 28 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s69, 29 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s70, 30 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s71, 31 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s72, 32 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s73, 33 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s74, 34 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s75, 35 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s76, 36 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s77, 37 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s78, 38 -; GLOBALNESS0-NEXT: v_writelane_b32 v41, s79, 39 -; GLOBALNESS0-NEXT: v_readlane_b32 s64, v41, 8 -; GLOBALNESS0-NEXT: v_readlane_b32 s65, v41, 9 -; GLOBALNESS0-NEXT: s_mov_b64 s[48:49], s[64:65] -; GLOBALNESS0-NEXT: s_mov_b32 s64, s49 -; GLOBALNESS0-NEXT: s_mov_b64 s[48:49], s[52:53] -; GLOBALNESS0-NEXT: v_readlane_b32 s66, v41, 10 -; GLOBALNESS0-NEXT: v_readlane_b32 s67, v41, 11 -; GLOBALNESS0-NEXT: v_readlane_b32 s68, v41, 12 -; GLOBALNESS0-NEXT: v_readlane_b32 s69, v41, 13 -; GLOBALNESS0-NEXT: v_readlane_b32 s70, v41, 14 -; GLOBALNESS0-NEXT: v_readlane_b32 s71, v41, 15 -; GLOBALNESS0-NEXT: v_readlane_b32 s72, v41, 16 -; GLOBALNESS0-NEXT: v_readlane_b32 s73, v41, 17 -; GLOBALNESS0-NEXT: v_readlane_b32 s74, v41, 18 -; GLOBALNESS0-NEXT: v_readlane_b32 s75, v41, 19 -; GLOBALNESS0-NEXT: v_readlane_b32 s76, v41, 20 -; GLOBALNESS0-NEXT: v_readlane_b32 s77, v41, 21 -; GLOBALNESS0-NEXT: v_readlane_b32 s78, v41, 22 -; GLOBALNESS0-NEXT: v_readlane_b32 s79, v41, 23 -; GLOBALNESS0-NEXT: v_readlane_b32 s80, v41, 24 -; GLOBALNESS0-NEXT: v_readlane_b32 s81, v41, 25 -; GLOBALNESS0-NEXT: v_readlane_b32 s82, v41, 26 -; GLOBALNESS0-NEXT: v_readlane_b32 s83, v41, 27 -; GLOBALNESS0-NEXT: v_readlane_b32 s84, v41, 28 -; GLOBALNESS0-NEXT: v_readlane_b32 s85, v41, 29 -; GLOBALNESS0-NEXT: v_readlane_b32 s86, v41, 30 -; GLOBALNESS0-NEXT: v_readlane_b32 s87, v41, 31 -; GLOBALNESS0-NEXT: v_readlane_b32 s88, v41, 32 -; GLOBALNESS0-NEXT: v_readlane_b32 s89, v41, 33 -; GLOBALNESS0-NEXT: v_readlane_b32 s90, v41, 34 -; GLOBALNESS0-NEXT: v_readlane_b32 s91, v41, 35 -; GLOBALNESS0-NEXT: v_readlane_b32 s92, v41, 36 -; GLOBALNESS0-NEXT: v_readlane_b32 s93, v41, 37 -; GLOBALNESS0-NEXT: v_readlane_b32 s94, v41, 38 -; GLOBALNESS0-NEXT: v_readlane_b32 s95, v41, 39 -; GLOBALNESS0-NEXT: s_mov_b64 s[50:51], s[54:55] -; GLOBALNESS0-NEXT: s_mov_b64 s[52:53], s[56:57] -; GLOBALNESS0-NEXT: s_mov_b64 s[54:55], s[58:59] -; GLOBALNESS0-NEXT: s_mov_b64 s[56:57], s[60:61] -; GLOBALNESS0-NEXT: s_mov_b64 s[58:59], s[62:63] -; GLOBALNESS0-NEXT: s_mov_b32 s60, s64 +; GLOBALNESS0-NEXT: v_readlane_b32 s56, v41, 0 +; GLOBALNESS0-NEXT: v_readlane_b32 s57, v41, 1 ; GLOBALNESS0-NEXT: v_readlane_b32 s64, v41, 8 ; GLOBALNESS0-NEXT: v_readlane_b32 s65, v41, 9 -; GLOBALNESS0-NEXT: s_mov_b64 s[44:45], s[64:65] -; GLOBALNESS0-NEXT: s_mov_b32 s61, s45 -; GLOBALNESS0-NEXT: s_mov_b64 s[44:45], s[48:49] -; GLOBALNESS0-NEXT: v_readlane_b32 s66, v41, 10 -; GLOBALNESS0-NEXT: v_readlane_b32 s67, v41, 11 -; GLOBALNESS0-NEXT: v_readlane_b32 s68, v41, 12 -; GLOBALNESS0-NEXT: v_readlane_b32 s69, v41, 13 -; GLOBALNESS0-NEXT: v_readlane_b32 s70, v41, 14 -; GLOBALNESS0-NEXT: v_readlane_b32 s71, v41, 15 -; GLOBALNESS0-NEXT: v_readlane_b32 s72, v41, 16 -; GLOBALNESS0-NEXT: v_readlane_b32 s73, v41, 17 -; GLOBALNESS0-NEXT: v_readlane_b32 s74, v41, 18 -; GLOBALNESS0-NEXT: v_readlane_b32 s75, v41, 19 -; GLOBALNESS0-NEXT: v_readlane_b32 s76, v41, 20 -; GLOBALNESS0-NEXT: v_readlane_b32 s77, v41, 21 -; GLOBALNESS0-NEXT: v_readlane_b32 s78, v41, 22 -; GLOBALNESS0-NEXT: v_readlane_b32 s79, v41, 23 -; GLOBALNESS0-NEXT: v_readlane_b32 s80, v41, 24 -; GLOBALNESS0-NEXT: v_readlane_b32 s81, v41, 25 -; GLOBALNESS0-NEXT: v_readlane_b32 s82, v41, 26 -; GLOBALNESS0-NEXT: v_readlane_b32 s83, v41, 27 -; GLOBALNESS0-NEXT: v_readlane_b32 s84, v41, 28 -; GLOBALNESS0-NEXT: v_readlane_b32 s85, v41, 29 -; GLOBALNESS0-NEXT: v_readlane_b32 s86, v41, 30 -; GLOBALNESS0-NEXT: v_readlane_b32 s87, v41, 31 -; GLOBALNESS0-NEXT: v_readlane_b32 s88, v41, 32 -; GLOBALNESS0-NEXT: v_readlane_b32 s89, v41, 33 -; GLOBALNESS0-NEXT: v_readlane_b32 s90, v41, 34 -; GLOBALNESS0-NEXT: v_readlane_b32 s91, v41, 35 -; GLOBALNESS0-NEXT: s_mov_b64 s[46:47], s[50:51] -; GLOBALNESS0-NEXT: s_mov_b64 s[48:49], s[52:53] -; GLOBALNESS0-NEXT: s_mov_b64 s[50:51], s[54:55] -; GLOBALNESS0-NEXT: s_mov_b64 s[52:53], s[56:57] -; GLOBALNESS0-NEXT: s_mov_b64 s[54:55], s[58:59] -; GLOBALNESS0-NEXT: s_mov_b64 s[56:57], s[60:61] -; GLOBALNESS0-NEXT: v_readlane_b32 s60, v41, 8 -; GLOBALNESS0-NEXT: v_readlane_b32 s61, v41, 9 -; GLOBALNESS0-NEXT: v_readlane_b32 s62, v41, 10 -; GLOBALNESS0-NEXT: v_readlane_b32 s63, v41, 11 -; GLOBALNESS0-NEXT: v_readlane_b32 s64, v41, 12 -; GLOBALNESS0-NEXT: v_readlane_b32 s65, v41, 13 -; GLOBALNESS0-NEXT: v_readlane_b32 s66, v41, 14 -; GLOBALNESS0-NEXT: v_readlane_b32 s67, v41, 15 -; GLOBALNESS0-NEXT: v_readlane_b32 s68, v41, 16 -; GLOBALNESS0-NEXT: v_readlane_b32 s69, v41, 17 -; GLOBALNESS0-NEXT: v_readlane_b32 s70, v41, 18 -; GLOBALNESS0-NEXT: v_readlane_b32 s71, v41, 19 -; GLOBALNESS0-NEXT: v_readlane_b32 s72, v41, 20 -; GLOBALNESS0-NEXT: v_readlane_b32 s73, v41, 21 -; GLOBALNESS0-NEXT: v_readlane_b32 s74, v41, 22 -; GLOBALNESS0-NEXT: v_readlane_b32 s75, v41, 23 -; GLOBALNESS0-NEXT: v_readlane_b32 s76, v41, 24 -; GLOBALNESS0-NEXT: v_readlane_b32 s77, v41, 25 -; GLOBALNESS0-NEXT: v_readlane_b32 s78, v41, 26 -; GLOBALNESS0-NEXT: v_readlane_b32 s79, v41, 27 -; GLOBALNESS0-NEXT: v_readlane_b32 s80, v41, 28 -; GLOBALNESS0-NEXT: v_readlane_b32 s81, v41, 29 -; GLOBALNESS0-NEXT: v_readlane_b32 s82, v41, 30 -; GLOBALNESS0-NEXT: v_readlane_b32 s83, v41, 31 -; GLOBALNESS0-NEXT: v_readlane_b32 s84, v41, 32 -; GLOBALNESS0-NEXT: v_readlane_b32 s85, v41, 33 -; GLOBALNESS0-NEXT: v_readlane_b32 s86, v41, 34 -; GLOBALNESS0-NEXT: v_readlane_b32 s87, v41, 35 -; GLOBALNESS0-NEXT: v_readlane_b32 s88, v41, 36 -; GLOBALNESS0-NEXT: v_readlane_b32 s89, v41, 37 -; GLOBALNESS0-NEXT: v_readlane_b32 s90, v41, 38 -; GLOBALNESS0-NEXT: v_readlane_b32 s91, v41, 39 -; GLOBALNESS0-NEXT: s_mov_b32 s58, s61 -; GLOBALNESS0-NEXT: v_readlane_b32 s60, v41, 8 -; GLOBALNESS0-NEXT: v_readlane_b32 s61, v41, 9 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[36:37] -; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[60:61] -; GLOBALNESS0-NEXT: v_readlane_b32 s92, v41, 36 -; GLOBALNESS0-NEXT: v_readlane_b32 s93, v41, 37 -; GLOBALNESS0-NEXT: v_readlane_b32 s94, v41, 38 -; GLOBALNESS0-NEXT: v_readlane_b32 s95, v41, 39 -; GLOBALNESS0-NEXT: v_readlane_b32 s62, v41, 10 -; GLOBALNESS0-NEXT: v_readlane_b32 s63, v41, 11 -; GLOBALNESS0-NEXT: v_readlane_b32 s64, v41, 12 -; GLOBALNESS0-NEXT: v_readlane_b32 s65, v41, 13 -; GLOBALNESS0-NEXT: v_readlane_b32 s66, v41, 14 -; GLOBALNESS0-NEXT: v_readlane_b32 s67, v41, 15 -; GLOBALNESS0-NEXT: v_readlane_b32 s68, v41, 16 -; GLOBALNESS0-NEXT: v_readlane_b32 s69, v41, 17 -; GLOBALNESS0-NEXT: v_readlane_b32 s70, v41, 18 -; GLOBALNESS0-NEXT: v_readlane_b32 s71, v41, 19 -; GLOBALNESS0-NEXT: v_readlane_b32 s72, v41, 20 -; GLOBALNESS0-NEXT: v_readlane_b32 s73, v41, 21 -; GLOBALNESS0-NEXT: v_readlane_b32 s74, v41, 22 -; GLOBALNESS0-NEXT: v_readlane_b32 s75, v41, 23 -; GLOBALNESS0-NEXT: v_readlane_b32 s76, v41, 24 -; GLOBALNESS0-NEXT: v_readlane_b32 s77, v41, 25 -; GLOBALNESS0-NEXT: v_readlane_b32 s78, v41, 26 -; GLOBALNESS0-NEXT: v_readlane_b32 s79, v41, 27 -; GLOBALNESS0-NEXT: v_readlane_b32 s80, v41, 28 -; GLOBALNESS0-NEXT: v_readlane_b32 s81, v41, 29 -; GLOBALNESS0-NEXT: v_readlane_b32 s82, v41, 30 -; GLOBALNESS0-NEXT: v_readlane_b32 s83, v41, 31 -; GLOBALNESS0-NEXT: v_readlane_b32 s84, v41, 32 -; GLOBALNESS0-NEXT: v_readlane_b32 s85, v41, 33 -; GLOBALNESS0-NEXT: v_readlane_b32 s86, v41, 34 -; GLOBALNESS0-NEXT: v_readlane_b32 s87, v41, 35 -; GLOBALNESS0-NEXT: v_readlane_b32 s88, v41, 36 -; GLOBALNESS0-NEXT: v_readlane_b32 s89, v41, 37 -; GLOBALNESS0-NEXT: v_readlane_b32 s90, v41, 38 -; GLOBALNESS0-NEXT: v_readlane_b32 s91, v41, 39 -; GLOBALNESS0-NEXT: s_mov_b32 s59, s37 -; GLOBALNESS0-NEXT: s_mov_b64 s[62:63], s[58:59] -; GLOBALNESS0-NEXT: v_readlane_b32 s64, v41, 8 -; GLOBALNESS0-NEXT: s_mov_b64 s[60:61], s[56:57] -; GLOBALNESS0-NEXT: s_mov_b64 s[58:59], s[54:55] -; GLOBALNESS0-NEXT: s_mov_b64 s[56:57], s[52:53] -; GLOBALNESS0-NEXT: s_mov_b64 s[54:55], s[50:51] -; GLOBALNESS0-NEXT: s_mov_b64 s[52:53], s[48:49] -; GLOBALNESS0-NEXT: s_mov_b64 s[50:51], s[46:47] -; GLOBALNESS0-NEXT: s_mov_b64 s[48:49], s[44:45] -; GLOBALNESS0-NEXT: v_readlane_b32 s65, v41, 9 ; GLOBALNESS0-NEXT: v_readlane_b32 s66, v41, 10 ; GLOBALNESS0-NEXT: v_readlane_b32 s67, v41, 11 ; GLOBALNESS0-NEXT: v_readlane_b32 s68, v41, 12 @@ -1707,94 +1443,63 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i, ; GLOBALNESS0-NEXT: v_readlane_b32 s79, v41, 23 ; GLOBALNESS0-NEXT: v_readlane_b32 s80, v41, 24 ; GLOBALNESS0-NEXT: v_readlane_b32 s81, v41, 25 +; GLOBALNESS0-NEXT: s_mov_b32 s64, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s65, s57 +; GLOBALNESS0-NEXT: v_readlane_b32 s59, v41, 3 ; GLOBALNESS0-NEXT: v_readlane_b32 s82, v41, 26 ; GLOBALNESS0-NEXT: v_readlane_b32 s83, v41, 27 ; GLOBALNESS0-NEXT: v_readlane_b32 s84, v41, 28 ; GLOBALNESS0-NEXT: v_readlane_b32 s85, v41, 29 ; GLOBALNESS0-NEXT: v_readlane_b32 s86, v41, 30 ; GLOBALNESS0-NEXT: v_readlane_b32 s87, v41, 31 -; GLOBALNESS0-NEXT: v_readlane_b32 s88, v41, 32 -; GLOBALNESS0-NEXT: v_readlane_b32 s89, v41, 33 -; GLOBALNESS0-NEXT: v_readlane_b32 s90, v41, 34 -; GLOBALNESS0-NEXT: v_readlane_b32 s91, v41, 35 -; GLOBALNESS0-NEXT: s_mov_b32 s64, s65 -; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[48:49] -; GLOBALNESS0-NEXT: s_mov_b64 s[42:43], s[50:51] -; GLOBALNESS0-NEXT: s_mov_b64 s[44:45], s[52:53] -; GLOBALNESS0-NEXT: s_mov_b64 s[46:47], s[54:55] -; GLOBALNESS0-NEXT: s_mov_b64 s[48:49], s[56:57] -; GLOBALNESS0-NEXT: s_mov_b64 s[50:51], s[58:59] -; GLOBALNESS0-NEXT: s_mov_b64 s[52:53], s[60:61] -; GLOBALNESS0-NEXT: s_mov_b64 s[54:55], s[62:63] -; GLOBALNESS0-NEXT: s_mov_b32 s56, s65 -; GLOBALNESS0-NEXT: v_readlane_b32 s60, v41, 8 -; GLOBALNESS0-NEXT: v_readlane_b32 s61, v41, 9 -; GLOBALNESS0-NEXT: v_readlane_b32 s62, v41, 10 -; GLOBALNESS0-NEXT: v_readlane_b32 s63, v41, 11 -; GLOBALNESS0-NEXT: v_readlane_b32 s64, v41, 12 -; GLOBALNESS0-NEXT: v_readlane_b32 s65, v41, 13 -; GLOBALNESS0-NEXT: v_readlane_b32 s66, v41, 14 -; GLOBALNESS0-NEXT: v_readlane_b32 s67, v41, 15 -; GLOBALNESS0-NEXT: v_readlane_b32 s68, v41, 16 -; GLOBALNESS0-NEXT: v_readlane_b32 s69, v41, 17 -; GLOBALNESS0-NEXT: v_readlane_b32 s70, v41, 18 -; GLOBALNESS0-NEXT: v_readlane_b32 s71, v41, 19 -; GLOBALNESS0-NEXT: s_mov_b32 s57, s61 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[40:41], s[40:41] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], s[42:43], s[42:43] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[4:5], s[44:45], s[44:45] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[6:7], s[46:47], s[46:47] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[8:9], s[48:49], s[48:49] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[10:11], s[50:51], s[50:51] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[12:13], s[52:53], s[52:53] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[14:15], s[54:55], s[54:55] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[16:17], s[56:57], s[56:57] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[18:19], s[58:59], s[58:59] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[20:21], s[60:61], s[60:61] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[22:23], s[62:63], s[62:63] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[24:25], s[64:65], s[64:65] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[26:27], s[66:67], s[66:67] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[28:29], s[68:69], s[68:69] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[30:31], s[70:71], s[70:71] op_sel:[0,1] -; GLOBALNESS0-NEXT: s_mov_b64 s[48:49], s[6:7] -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v42, 24 -; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[4:5] -; GLOBALNESS0-NEXT: s_mov_b32 s59, s99 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v42, 25 -; GLOBALNESS0-NEXT: v_readlane_b32 s92, v41, 36 -; GLOBALNESS0-NEXT: v_readlane_b32 s93, v41, 37 -; GLOBALNESS0-NEXT: v_readlane_b32 s94, v41, 38 -; GLOBALNESS0-NEXT: v_readlane_b32 s95, v41, 39 -; GLOBALNESS0-NEXT: v_readlane_b32 s72, v41, 20 -; GLOBALNESS0-NEXT: v_readlane_b32 s73, v41, 21 -; GLOBALNESS0-NEXT: v_readlane_b32 s74, v41, 22 -; GLOBALNESS0-NEXT: v_readlane_b32 s75, v41, 23 -; GLOBALNESS0-NEXT: v_readlane_b32 s76, v41, 24 -; GLOBALNESS0-NEXT: v_readlane_b32 s77, v41, 25 -; GLOBALNESS0-NEXT: v_readlane_b32 s78, v41, 26 -; GLOBALNESS0-NEXT: v_readlane_b32 s79, v41, 27 -; GLOBALNESS0-NEXT: v_readlane_b32 s80, v41, 28 -; GLOBALNESS0-NEXT: v_readlane_b32 s81, v41, 29 -; GLOBALNESS0-NEXT: v_readlane_b32 s82, v41, 30 -; GLOBALNESS0-NEXT: v_readlane_b32 s83, v41, 31 -; GLOBALNESS0-NEXT: v_readlane_b32 s84, v41, 32 -; GLOBALNESS0-NEXT: v_readlane_b32 s85, v41, 33 -; GLOBALNESS0-NEXT: v_readlane_b32 s86, v41, 34 -; GLOBALNESS0-NEXT: v_readlane_b32 s87, v41, 35 -; GLOBALNESS0-NEXT: v_readlane_b32 s88, v41, 36 -; GLOBALNESS0-NEXT: v_readlane_b32 s89, v41, 37 -; GLOBALNESS0-NEXT: v_readlane_b32 s90, v41, 38 -; GLOBALNESS0-NEXT: v_readlane_b32 s91, v41, 39 +; GLOBALNESS0-NEXT: s_mov_b32 s66, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s67, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s68, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s69, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s70, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s71, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s72, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s73, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s74, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s75, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s76, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s77, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s78, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s79, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s80, s57 +; GLOBALNESS0-NEXT: s_mov_b32 s81, s57 +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[64:65], s[64:65] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_readlane_b32 s58, v41, 2 +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], s[66:67], s[66:67] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[4:5], s[68:69], s[68:69] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[6:7], s[70:71], s[70:71] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[8:9], s[72:73], s[72:73] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[10:11], s[74:75], s[74:75] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[12:13], s[76:77], s[76:77] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[14:15], s[78:79], s[78:79] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[16:17], s[80:81], s[80:81] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[18:19], s[82:83], s[82:83] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[20:21], s[84:85], s[84:85] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[22:23], s[86:87], s[86:87] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[24:25], s[88:89], s[88:89] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[26:27], s[90:91], s[90:91] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[28:29], s[92:93], s[92:93] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[30:31], s[94:95], s[94:95] op_sel:[0,1] +; GLOBALNESS0-NEXT: s_mov_b32 s59, s91 +; GLOBALNESS0-NEXT: v_readlane_b32 s60, v41, 4 +; GLOBALNESS0-NEXT: v_readlane_b32 s61, v41, 5 +; GLOBALNESS0-NEXT: v_readlane_b32 s62, v41, 6 +; GLOBALNESS0-NEXT: v_readlane_b32 s63, v41, 7 ; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow15 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[34:35] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[6:7] -; GLOBALNESS0-NEXT: s_mov_b64 s[54:55], s[96:97] +; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[88:89] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[54:55] +; GLOBALNESS0-NEXT: s_mov_b64 s[54:55], s[92:93] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v41, 50 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v41, 51 +; GLOBALNESS0-NEXT: v_readlane_b32 s6, v41, 38 +; GLOBALNESS0-NEXT: v_readlane_b32 s7, v41, 39 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i @@ -1814,40 +1519,32 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i, ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_32 ; GLOBALNESS0-NEXT: ; %bb.31: ; %bb7.i.i -; GLOBALNESS0-NEXT: s_add_u32 s8, s48, 40 -; GLOBALNESS0-NEXT: v_readlane_b32 s4, v41, 6 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v41, 4 -; GLOBALNESS0-NEXT: v_readlane_b32 s10, v41, 2 -; GLOBALNESS0-NEXT: s_addc_u32 s9, s49, 0 -; GLOBALNESS0-NEXT: v_readlane_b32 s5, v41, 7 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v41, 5 -; GLOBALNESS0-NEXT: v_readlane_b32 s11, v41, 3 -; GLOBALNESS0-NEXT: v_readlane_b32 s12, v41, 1 -; GLOBALNESS0-NEXT: v_readlane_b32 s13, v41, 0 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s33 +; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 +; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] +; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] +; GLOBALNESS0-NEXT: s_mov_b32 s12, s42 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s45 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s44 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 -; GLOBALNESS0-NEXT: s_mov_b32 s34, s33 ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GLOBALNESS0-NEXT: s_mov_b32 s33, s34 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS0-NEXT: .LBB1_32: ; %Flow ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_34 ; GLOBALNESS0-NEXT: ; %bb.33: ; %bb11.i.i -; GLOBALNESS0-NEXT: s_add_u32 s8, s48, 40 -; GLOBALNESS0-NEXT: v_readlane_b32 s4, v41, 6 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v41, 4 -; GLOBALNESS0-NEXT: v_readlane_b32 s10, v41, 2 -; GLOBALNESS0-NEXT: s_addc_u32 s9, s49, 0 -; GLOBALNESS0-NEXT: v_readlane_b32 s5, v41, 7 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v41, 5 -; GLOBALNESS0-NEXT: v_readlane_b32 s11, v41, 3 -; GLOBALNESS0-NEXT: v_readlane_b32 s12, v41, 1 -; GLOBALNESS0-NEXT: v_readlane_b32 s13, v41, 0 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s33 +; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 +; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] +; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] +; GLOBALNESS0-NEXT: s_mov_b32 s12, s42 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s45 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s44 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4