From c76738a6d366ec7f1be3cb1d133afca1a6f92478 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Mon, 24 Mar 2025 16:16:19 +0100 Subject: [PATCH] [AMDGPU][SelectionDAG] Use COPY instead of S_MOV_B32 to assign values to M0 This is consistent with what happens on GISel side. And allows the register coalescer to remove the redundant intermediate s_mov_b32. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 +- .../CodeGen/AMDGPU/extract_vector_dynelt.ll | 18 ++- .../CodeGen/AMDGPU/indirect-addressing-si.ll | 104 +++++++----------- .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 14 +-- llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 10 +- .../CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll | 25 ++--- .../CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll | 18 ++- .../AMDGPU/llvm.amdgcn.ds.gws.barrier.ll | 33 ++---- .../CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll | 32 ++---- .../llvm.amdgcn.struct.buffer.load.lds.ll | 42 +++---- ...mdgcn.struct.ptr.buffer.load.lds.gfx950.ll | 80 +++++--------- .../llvm.amdgcn.struct.ptr.buffer.load.lds.ll | 42 +++---- 12 files changed, 154 insertions(+), 274 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8657c0389cd40..b0c18715ef810 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4691,7 +4691,7 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, } else { // Move index from VCC into M0 if (Offset == 0) { - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0) .addReg(CurrentIdxReg, RegState::Kill); } else { BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) @@ -4805,7 +4805,7 @@ static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, if (Offset == 0) { // clang-format off - BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), 
AMDGPU::M0) .add(*Idx); // clang-format on } else { @@ -5400,9 +5400,11 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return BB; } case AMDGPU::SI_INIT_M0: { + MachineOperand &M0Init = MI.getOperand(0); BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), - TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .add(MI.getOperand(0)); + TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32), + AMDGPU::M0) + .add(M0Init); MI.eraseFromParent(); return BB; } diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll index a72e74167d564..10de973dac0c5 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -301,12 +301,11 @@ define amdgpu_kernel void @double8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_mov_b32 s10, s0 ; GCN-NEXT: s_mov_b32 s12, s0 ; GCN-NEXT: s_mov_b32 s14, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s18, s18, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v15, s15 -; GCN-NEXT: s_mov_b32 m0, s18 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 m0, s18, 1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s4 @@ -352,11 +351,10 @@ define amdgpu_kernel void @double7_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_mov_b32 s10, s0 ; GCN-NEXT: s_mov_b32 s12, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s16, s16, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v15, s15 -; GCN-NEXT: s_mov_b32 m0, s16 +; GCN-NEXT: s_lshl_b32 m0, s16, 1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s4 @@ -451,12 +449,11 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_mov_b32 s60, s36 ; GCN-NEXT: s_mov_b32 s62, s36 ; GCN-NEXT: s_mov_b32 s64, s36 -; GCN-NEXT: 
s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v31, s67 -; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 m0, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NEXT: v_mov_b32_e32 v3, s39 ; GCN-NEXT: v_mov_b32_e32 v4, s40 @@ -535,12 +532,11 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_mov_b32 s62, s36 ; GCN-NEXT: s_mov_b32 s64, s36 ; GCN-NEXT: s_mov_b32 s66, s36 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v31, s67 -; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 m0, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NEXT: v_mov_b32_e32 v3, s39 ; GCN-NEXT: v_mov_b32_e32 v4, s40 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index c75dc539cdcee..d0b54a866718c 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -154,8 +154,7 @@ define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) { ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_add_i32 s6, s6, 1 -; SI-MOVREL-NEXT: s_mov_b32 m0, s6 +; SI-MOVREL-NEXT: s_add_i32 m0, s6, 1 ; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 @@ -183,8 +182,7 @@ define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) { ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1 -; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: s_add_i32 m0, s2, 1 ; 
VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 @@ -439,12 +437,12 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_add_i32 s6, s6, 1 +; SI-MOVREL-NEXT: s_add_i32 m0, s6, 1 ; SI-MOVREL-NEXT: s_or_b32 s8, s8, 1 ; SI-MOVREL-NEXT: s_or_b32 s4, s23, 16 ; SI-MOVREL-NEXT: s_or_b32 s5, s22, 15 -; SI-MOVREL-NEXT: s_or_b32 s7, s21, 14 -; SI-MOVREL-NEXT: s_or_b32 s20, s20, 13 +; SI-MOVREL-NEXT: s_or_b32 s6, s21, 14 +; SI-MOVREL-NEXT: s_or_b32 s7, s20, 13 ; SI-MOVREL-NEXT: s_or_b32 s19, s19, 12 ; SI-MOVREL-NEXT: s_or_b32 s18, s18, 11 ; SI-MOVREL-NEXT: s_or_b32 s17, s17, 10 @@ -457,7 +455,6 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou ; SI-MOVREL-NEXT: s_or_b32 s10, s10, 3 ; SI-MOVREL-NEXT: s_or_b32 s9, s9, 2 ; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s8 -; SI-MOVREL-NEXT: s_mov_b32 m0, s6 ; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s9 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s10 ; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s11 @@ -469,8 +466,8 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou ; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s17 ; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s18 ; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s19 -; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s20 -; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s6 ; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s5 ; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s4 ; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 @@ -483,16 +480,16 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou ; VI-MOVREL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1 +; 
VI-MOVREL-NEXT: s_add_i32 m0, s2, 1 ; VI-MOVREL-NEXT: s_or_b32 s10, s10, 3 ; VI-MOVREL-NEXT: s_or_b32 s9, s9, 2 ; VI-MOVREL-NEXT: s_or_b32 s8, s8, 1 -; VI-MOVREL-NEXT: s_or_b32 s3, s23, 16 -; VI-MOVREL-NEXT: s_or_b32 s4, s22, 15 -; VI-MOVREL-NEXT: s_or_b32 s5, s21, 14 -; VI-MOVREL-NEXT: s_or_b32 s6, s20, 13 -; VI-MOVREL-NEXT: s_or_b32 s7, s19, 12 -; VI-MOVREL-NEXT: s_or_b32 s18, s18, 11 +; VI-MOVREL-NEXT: s_or_b32 s2, s23, 16 +; VI-MOVREL-NEXT: s_or_b32 s3, s22, 15 +; VI-MOVREL-NEXT: s_or_b32 s4, s21, 14 +; VI-MOVREL-NEXT: s_or_b32 s5, s20, 13 +; VI-MOVREL-NEXT: s_or_b32 s6, s19, 12 +; VI-MOVREL-NEXT: s_or_b32 s7, s18, 11 ; VI-MOVREL-NEXT: s_or_b32 s17, s17, 10 ; VI-MOVREL-NEXT: s_or_b32 s16, s16, 9 ; VI-MOVREL-NEXT: s_or_b32 s15, s15, 8 @@ -503,7 +500,6 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s8 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s9 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s10 -; VI-MOVREL-NEXT: s_mov_b32 m0, s2 ; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s11 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s12 ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s13 @@ -511,12 +507,12 @@ define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %ou ; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s15 ; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s16 ; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s17 -; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s18 -; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s7 -; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s6 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s5 -; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s4 -; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s7 +; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s6 +; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s4 +; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s3 +; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s2 ; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1 @@ -2079,7 +2075,7 @@ define 
amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_add_i32 s6, s6, 1 +; SI-MOVREL-NEXT: s_add_i32 m0, s6, 1 ; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 @@ -2094,7 +2090,6 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 -; SI-MOVREL-NEXT: s_mov_b32 m0, s6 ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 @@ -2112,8 +2107,7 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1 -; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: s_add_i32 m0, s2, 1 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 ; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 @@ -2435,7 +2429,7 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou ; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_and_b32 s4, s6, 0xffff +; SI-MOVREL-NEXT: s_and_b32 m0, s6, 0xffff ; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 @@ -2450,7 +2444,6 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou ; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 -; SI-MOVREL-NEXT: s_mov_b32 
m0, s4 ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16 @@ -2468,8 +2461,7 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: s_and_b32 s2, s2, 0xffff -; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: s_and_b32 m0, s2, 0xffff ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 ; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 @@ -2794,7 +2786,7 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; SI-MOVREL-NEXT: s_sext_i32_i16 s4, s6 -; SI-MOVREL-NEXT: s_add_i32 s4, s4, 1 +; SI-MOVREL-NEXT: s_add_i32 m0, s4, 1 ; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 @@ -2809,7 +2801,6 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, ; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 -; SI-MOVREL-NEXT: s_mov_b32 m0, s4 ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 @@ -2828,8 +2819,7 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; VI-MOVREL-NEXT: s_sext_i32_i16 s2, s2 -; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1 -; VI-MOVREL-NEXT: s_mov_b32 m0, s2 +; VI-MOVREL-NEXT: s_add_i32 m0, s2, 1 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 ; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 @@ -6932,9 +6922,9 @@ define amdgpu_kernel void 
@insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 -; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_add_i32 s3, s2, 1 ; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; SI-MOVREL-NEXT: s_add_i32 m0, s2, 1 ; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 @@ -6948,10 +6938,9 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000 -; SI-MOVREL-NEXT: s_mov_b32 m0, s3 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32 -; SI-MOVREL-NEXT: s_add_i32 s2, s2, 2 ; SI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 +; SI-MOVREL-NEXT: s_add_i32 m0, s2, 2 ; SI-MOVREL-NEXT: v_mov_b32_e32 v30, v14 ; SI-MOVREL-NEXT: v_mov_b32_e32 v29, v13 ; SI-MOVREL-NEXT: v_mov_b32_e32 v28, v12 @@ -6967,7 +6956,6 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; SI-MOVREL-NEXT: v_mov_b32_e32 v18, v2 ; SI-MOVREL-NEXT: v_mov_b32_e32 v17, v1 ; SI-MOVREL-NEXT: v_mov_b32_e32 v16, v0 -; SI-MOVREL-NEXT: s_mov_b32 m0, s2 ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v16, v32 @@ -6988,9 +6976,9 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000 -; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: s_add_i32 s3, s2, 1 ; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0 +; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) +; VI-MOVREL-NEXT: s_add_i32 m0, s2, 1 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000 ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000 ; 
VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000 @@ -7004,11 +6992,9 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000 ; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 ; VI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000 -; VI-MOVREL-NEXT: s_mov_b32 m0, s3 -; VI-MOVREL-NEXT: s_add_i32 s2, s2, 2 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32 +; VI-MOVREL-NEXT: s_add_i32 m0, s2, 2 ; VI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 -; VI-MOVREL-NEXT: s_mov_b32 m0, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 ; VI-MOVREL-NEXT: v_mov_b32_e32 v30, v14 ; VI-MOVREL-NEXT: v_mov_b32_e32 v29, v13 @@ -8057,8 +8043,7 @@ define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: s_add_i32 s12, s12, 15 -; SI-MOVREL-NEXT: s_mov_b32 m0, s12 +; SI-MOVREL-NEXT: s_add_i32 m0, s12, 15 ; SI-MOVREL-NEXT: s_mov_b32 s4, s0 ; SI-MOVREL-NEXT: s_mov_b32 s5, s1 ; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 @@ -8089,9 +8074,8 @@ define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out ; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s3 -; VI-MOVREL-NEXT: s_add_i32 s6, s6, 15 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s2 -; VI-MOVREL-NEXT: s_mov_b32 m0, s6 +; VI-MOVREL-NEXT: s_add_i32 m0, s6, 15 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0 @@ -8321,8 +8305,7 @@ define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, p ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) -; SI-MOVREL-NEXT: s_add_i32 s12, s12, 16 -; SI-MOVREL-NEXT: s_mov_b32 m0, s12 +; 
SI-MOVREL-NEXT: s_add_i32 m0, s12, 16 ; SI-MOVREL-NEXT: s_mov_b32 s4, s0 ; SI-MOVREL-NEXT: s_mov_b32 s5, s1 ; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 @@ -8353,9 +8336,8 @@ define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, p ; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s3 -; VI-MOVREL-NEXT: s_add_i32 s6, s6, 16 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s2 -; VI-MOVREL-NEXT: s_mov_b32 m0, s6 +; VI-MOVREL-NEXT: s_add_i32 m0, s6, 16 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0 @@ -8586,9 +8568,8 @@ define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; SI-MOVREL-NEXT: s_lshl_b32 m0, s12, 2 ; SI-MOVREL-NEXT: s_mov_b32 s4, s0 -; SI-MOVREL-NEXT: s_lshl_b32 s0, s12, 2 -; SI-MOVREL-NEXT: s_mov_b32 m0, s0 ; SI-MOVREL-NEXT: s_mov_b32 s5, s1 ; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v1 ; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -8618,12 +8599,11 @@ define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s3 -; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0 -; VI-MOVREL-NEXT: s_lshl_b32 s0, s6, 2 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s2 -; VI-MOVREL-NEXT: s_mov_b32 m0, s0 +; VI-MOVREL-NEXT: s_lshl_b32 m0, s6, 2 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) +; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s1 ; VI-MOVREL-NEXT: v_movrels_b32_e32 v0, v1 ; VI-MOVREL-NEXT: flat_store_dword v[16:17], v0 @@ -8862,7 +8842,7 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr 
addrspace(1) %out, ; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x40a00000 ; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: s_lshl_b32 s4, s6, 2 +; SI-MOVREL-NEXT: s_lshl_b32 m0, s6, 2 ; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s8 ; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s9 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s10 @@ -8879,7 +8859,6 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, ; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s21 ; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s22 ; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s23 -; SI-MOVREL-NEXT: s_mov_b32 m0, s4 ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16 ; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 @@ -8895,9 +8874,8 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, ; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x40a00000 ; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; VI-MOVREL-NEXT: s_lshl_b32 s2, s2, 2 +; VI-MOVREL-NEXT: s_lshl_b32 m0, s2, 2 ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s8 -; VI-MOVREL-NEXT: s_mov_b32 m0, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s9 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s10 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 6f0c850117208..4b9da7b49e997 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -654,7 +654,7 @@ define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %v ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 m0, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 @@ -671,7 +671,6 @@ define amdgpu_kernel void @double8_inselt(ptr 
addrspace(1) %out, <8 x double> %v ; GCN-NEXT: v_mov_b32_e32 v13, s21 ; GCN-NEXT: v_mov_b32_e32 v14, s22 ; GCN-NEXT: v_mov_b32_e32 v15, s23 -; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_add_u32 s2, s0, 48 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 ; GCN-NEXT: s_addc_u32 s3, s1, 0 @@ -720,15 +719,14 @@ define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %v ; GCN-NEXT: v_mov_b32_e32 v4, s12 ; GCN-NEXT: v_mov_b32_e32 v5, s13 ; GCN-NEXT: v_mov_b32_e32 v6, s14 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s0, s0, 1 ; GCN-NEXT: v_mov_b32_e32 v7, s15 ; GCN-NEXT: v_mov_b32_e32 v9, s1 ; GCN-NEXT: v_mov_b32_e32 v10, s2 ; GCN-NEXT: v_mov_b32_e32 v11, s3 ; GCN-NEXT: v_mov_b32_e32 v12, s16 ; GCN-NEXT: v_mov_b32_e32 v13, s17 -; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 m0, s0, 1 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 ; GCN-NEXT: s_add_u32 s0, s6, 16 ; GCN-NEXT: v_movreld_b32_e32 v1, v16 @@ -765,8 +763,7 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> ; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NEXT: s_lshl_b32 s0, s0, 1 -; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_lshl_b32 m0, s0, 1 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v2, s38 @@ -872,8 +869,7 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; GCN-NEXT: v_mov_b32_e32 v3, s11 ; GCN-NEXT: v_mov_b32_e32 v4, s12 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s0, s0, 1 -; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_lshl_b32 m0, s0, 1 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v5, s13 ; GCN-NEXT: v_mov_b32_e32 v6, s14 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 20d2b12a1ebfe..837c18fe7af0a 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ 
b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -2266,13 +2266,12 @@ define amdgpu_kernel void @dynamic_insertelement_v4f64(ptr addrspace(1) %out, <4 define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 x double> %a, i32 %b) #0 { ; SI-LABEL: dynamic_insertelement_v8f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[8:9], 0x20 ; SI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x10 +; SI-NEXT: s_load_dword s4, s[8:9], 0x20 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: v_mov_b32_e32 v16, 0x40200000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s4, s4, 1 ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: v_mov_b32_e32 v2, s14 @@ -2289,7 +2288,7 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 ; SI-NEXT: v_mov_b32_e32 v13, s25 ; SI-NEXT: v_mov_b32_e32 v14, s26 ; SI-NEXT: v_mov_b32_e32 v15, s27 -; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: s_lshl_b32 m0, s4, 1 ; SI-NEXT: v_movreld_b32_e32 v0, 0 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_movreld_b32_e32 v1, v16 @@ -2301,13 +2300,12 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 ; ; VI-LABEL: dynamic_insertelement_v8f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[8:9], 0x80 ; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40 +; VI-NEXT: s_load_dword s4, s[8:9], 0x80 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v16, 0x40200000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s4, 1 ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v2, s14 @@ -2324,7 +2322,7 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 ; VI-NEXT: v_mov_b32_e32 v13, s25 ; VI-NEXT: v_mov_b32_e32 v14, s26 ; VI-NEXT: v_mov_b32_e32 v15, s27 -; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: s_lshl_b32 m0, s4, 1 ; VI-NEXT: v_movreld_b32_e32 v0, 0 ; 
VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_movreld_b32_e32 v1, v16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll index d97fc356b30fc..2776e24379b9d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll @@ -1,11 +1,11 @@ -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9,GCN-SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s ; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-SDAG,GCN-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-SDAG,GCN-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9,CIPLUS-SDAG,GCN-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9,CIPLUS-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | 
FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s ; GCN-LABEL: {{^}}ds_append_lds: ; GCN: s_load_dword [[PTR:s[0-9]+]] @@ -35,8 +35,7 @@ define amdgpu_kernel void @ds_append_lds_max_offset(ptr addrspace(3) %lds, ptr a ; GCN-LABEL: {{^}}ds_append_no_fold_offset_si: ; GCN: s_load_dword [[PTR:s[0-9]+]] -; SI: s_add_i32 [[PTR]], [[PTR]], 16 -; SI: s_mov_b32 m0, [[PTR]] +; SI: s_add_i32 m0, [[PTR]], 16 ; SI: ds_append [[RESULT:v[0-9]+]]{{$}} ; CIPLUS: s_mov_b32 m0, [[PTR]] @@ -55,12 +54,8 @@ define amdgpu_kernel void @ds_append_no_fold_offset_si(ptr addrspace(4) %lds.ptr ; GCN-LABEL: {{^}}ds_append_lds_over_max_offset: ; GCN: s_load_dword [[PTR:s[0-9]+]] -; SI-SDAG: s_bitset1_b32 [[PTR]], 16 -; CIPLUS-SDAG: s_add_i32 [[PTR]], [[PTR]], 0x10000 -; GCN-SDAG: s_mov_b32 m0, [[PTR]] - -; SI-GISEL: s_bitset1_b32 m0, 16 -; CIPLUS-GISEL: s_add_u32 m0, [[PTR]], 0x10000 +; SI: s_or_b32 m0, [[PTR]], 0x10000 +; CIPLUS: s_add_{{i|u}}32 m0, [[PTR]], 0x10000 ; GCN: ds_append [[RESULT:v[0-9]+]]{{$}} ; GCN-NOT: buffer_wbinvl1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll index ffd0142d1d42c..5795af702f34f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll @@ -1,10 +1,10 @@ -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9,GCN-SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s ; XUN: llc -global-isel=1 
-mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,GCN-SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,GCN-SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9,GCN-SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s ; GCN-LABEL: {{^}}ds_consume_lds: @@ -35,8 +35,7 @@ define amdgpu_kernel void @ds_consume_lds_max_offset(ptr addrspace(3) %lds, ptr ; GCN-LABEL: {{^}}ds_consume_no_fold_offset_si: ; GCN: s_load_dword [[PTR:s[0-9]+]] -; SI: s_add_i32 [[PTR]], [[PTR]], 16 -; SI: s_mov_b32 m0, [[PTR]] +; SI: s_add_i32 m0, [[PTR]], 16 ; SI: ds_consume [[RESULT:v[0-9]+]]{{$}} ; CIPLUS: s_mov_b32 m0, [[PTR]] @@ -55,11 +54,8 @@ define amdgpu_kernel void @ds_consume_no_fold_offset_si(ptr addrspace(4) %lds.pt ; GCN-LABEL: {{^}}ds_consume_lds_over_max_offset: ; GCN: s_load_dword [[PTR:s[0-9]+]] -; SI: s_bitset1_b32 [[PTR]], 16 -; CIPLUS-SDAG: s_add_i32 [[PTR]], [[PTR]], 0x10000 -; CIPLUS-GISEL: 
s_add_u32 [[PTR]], [[PTR]], 0x10000 - -; GCN-SDAG: s_mov_b32 m0, [[PTR]] +; SI: s_or_b32 m0, [[PTR]], 0x10000 +; CIPLUS: s_add_{{i|u}}32 m0, [[PTR]], 0x10000 ; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}} ; GCN-NOT: buffer_wbinvl1 ; GCN: {{.*}}store{{.*}} [[RESULT]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll index a6fd38cab13d0..1e031517adb30 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll @@ -4,12 +4,12 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s ; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG,GFX9 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL,GFX9 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG,GFX10 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL,GFX10 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG,GFX10 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL,GFX10 %s +; RUN: llc -global-isel=0 
-mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX9 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX9 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,GFX10 %s ; Make sure the op is emitted bundled with a waitcnt with and without the retry loop, and the bundle is not removed by ExpandPostRAPseudos. 
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=postrapseudos -verify-machineinstrs < %s | FileCheck -check-prefix=MIR %s @@ -61,11 +61,7 @@ define amdgpu_kernel void @gws_barrier_offset63(i32 %val) #0 { ; GCN-LABEL: {{^}}gws_barrier_sgpr_offset: ; NOLOOP-DAG: s_load_{{dwordx2|b64}} s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]] -; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16 -; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} - -; NOLOOP-GISEL-DAG: s_lshl_b32 m0, s[[OFFSET]], 16 - +; NOLOOP-DAG: s_lshl_b32 m0, s[[OFFSET]], 16 ; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]] ; NOLOOP: ds_gws_barrier [[GWS_VAL]] gds{{$}} @@ -78,10 +74,7 @@ define amdgpu_kernel void @gws_barrier_sgpr_offset(i32 %val, i32 %offset) #0 { ; GCN-LABEL: {{^}}gws_barrier_sgpr_offset_add1: ; NOLOOP-DAG: s_load_{{dwordx2|b64}} s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]] -; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16 -; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} - -; NOLOOP-GISEL-DAG: s_lshl_b32 m0, s[[OFFSET]], 16 +; NOLOOP-DAG: s_lshl_b32 m0, s[[OFFSET]], 16 ; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]] ; NOLOOP: ds_gws_barrier [[GWS_VAL]] offset:1 gds{{$}} @@ -95,10 +88,7 @@ define amdgpu_kernel void @gws_barrier_sgpr_offset_add1(i32 %val, i32 %offset.ba ; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]] ; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0 -; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16 -; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} - -; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16 +; NOLOOP-DAG: s_lshl_b32 m0, [[READLANE]], 16 ; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], [[BAR_NUM]] ; NOLOOP: ds_gws_barrier [[GWS_VAL]] gds{{$}} @@ -113,10 +103,7 @@ define amdgpu_kernel void @gws_barrier_vgpr_offset(i32 %val) #0 { ; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]] ; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0 -; NOLOOP-SDAG-DAG: s_lshl_b32 
[[SHL:s[0-9]+]], [[READLANE]], 16 -; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} - -; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16 +; NOLOOP-DAG: s_lshl_b32 m0, [[READLANE]], 16 ; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], [[BAR_NUM]] ; NOLOOP: ds_gws_barrier [[GWS_VAL]] offset:3 gds{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll index 5db68e3a6c202..0949a60eae185 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll @@ -4,12 +4,12 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s ; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,LOOP %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP,NOLOOP-GISEL %s +; RUN: llc -global-isel=0 
-mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s ; Minimum offset ; GCN-LABEL: {{^}}gws_init_offset0: @@ -55,10 +55,7 @@ define amdgpu_kernel void @gws_init_offset63(i32 %val) #0 { ; GCN-LABEL: {{^}}gws_init_sgpr_offset: ; NOLOOP-DAG: s_load_{{dwordx2|b64}} s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]] -; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16 -; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} - -; NOLOOP-GISEL-DAG: s_lshl_b32 m0, s[[OFFSET]], 16 +; NOLOOP-DAG: s_lshl_b32 m0, s[[OFFSET]], 16 ; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]] ; NOLOOP: ds_gws_init [[GWS_VAL]] gds{{$}} @@ -71,10 +68,7 @@ define amdgpu_kernel void @gws_init_sgpr_offset(i32 %val, i32 %offset) #0 { ; GCN-LABEL: {{^}}gws_init_sgpr_offset_add1: ; NOLOOP-DAG: s_load_{{dwordx2|b64}} s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]] -; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16 -; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} - -; NOLOOP-GISEL-DAG: s_lshl_b32 m0, s[[OFFSET]], 16 +; NOLOOP-DAG: s_lshl_b32 m0, s[[OFFSET]], 16 ; NOLOOP-DAG: v_mov_b32_e32 [[GWS_VAL:v[0-9]+]], s[[BAR_NUM]] ; NOLOOP: ds_gws_init [[GWS_VAL]] offset:1 gds{{$}} @@ -88,10 +82,7 @@ define 
amdgpu_kernel void @gws_init_sgpr_offset_add1(i32 %val, i32 %offset.base) ; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]] ; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0 -; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16 -; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} - -; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16 +; NOLOOP-DAG: s_lshl_b32 m0, [[READLANE]], 16 ; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]] ; NOLOOP: ds_gws_init v0 gds{{$}} @@ -106,10 +97,7 @@ define amdgpu_kernel void @gws_init_vgpr_offset(i32 %val) #0 { ; NOLOOP-DAG: s_load_{{dword|b32}} [[BAR_NUM:s[0-9]+]] ; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0 -; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16 -; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} - -; NOLOOP-GISEL-DAG: s_lshl_b32 m0, [[READLANE]], 16 +; NOLOOP-DAG: s_lshl_b32 m0, [[READLANE]], 16 ; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]] ; NOLOOP: ds_gws_init v0 offset:3 gds{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll index 21dc07cf28fd6..5b752949859f2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.lds.ll @@ -1,36 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,SDAG -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GISEL +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN declare void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %vindex, i32 %voffset, i32 %soffset, i32 
%offset, i32 %aux) define amdgpu_ps float @buffer_load_lds_dword(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) { -; SDAG-LABEL: buffer_load_lds_dword: -; SDAG: ; %bb.0: ; %main_body -; SDAG-NEXT: v_mov_b32_e32 v0, 8 -; SDAG-NEXT: s_mov_b32 m0, s4 -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds -; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds -; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds -; SDAG-NEXT: v_mov_b32_e32 v0, s4 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: ds_read_b32 v0, v0 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: ; return to shader part epilog -; -; GISEL-LABEL: buffer_load_lds_dword: -; GISEL: ; %bb.0: ; %main_body -; GISEL-NEXT: s_mov_b32 m0, s4 -; GISEL-NEXT: v_mov_b32_e32 v0, 8 -; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds -; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds -; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds -; GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: ds_read_b32 v0, v0 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: ; return to shader part epilog +; GCN-LABEL: buffer_load_lds_dword: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 m0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, 8 +; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds +; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds +; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog main_body: call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1) diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll index 654e72daffedd..c1b8df0898076 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950 %s ; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s @@ -14,32 +14,18 @@ declare void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr ;--------------------------------------------------------------------- define amdgpu_ps float @buffer_load_lds_dwordx3(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { -; GFX950-SDAG-LABEL: buffer_load_lds_dwordx3: -; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 8 -; GFX950-SDAG-NEXT: s_mov_b32 m0, s4 -; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen lds -; GFX950-SDAG-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:4 sc0 lds -; GFX950-SDAG-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:8 nt lds -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX950-SDAG-NEXT: ds_read_b32 v0, v0 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX950-SDAG-NEXT: ; return to shader part epilog -; -; GFX950-GISEL-LABEL: buffer_load_lds_dwordx3: -; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: s_mov_b32 m0, s4 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 8 -; GFX950-GISEL-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen lds -; GFX950-GISEL-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:4 sc0 lds -; GFX950-GISEL-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:8 nt lds -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX950-GISEL-NEXT: ds_read_b32 v0, v0 -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: ; return to shader part epilog +; GFX950-LABEL: buffer_load_lds_dwordx3: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: v_mov_b32_e32 v0, 8 +; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen lds +; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:4 sc0 lds +; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:8 nt lds +; GFX950-NEXT: v_mov_b32_e32 v0, s4 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ds_read_b32 v0, v0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; return to shader part epilog call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 8, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 8, i32 0, i32 0, i32 4, i32 1) call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 8, i32 0, i32 0, i32 8, i32 2) @@ -107,32 +93,18 @@ define amdgpu_ps void @buffer_load_lds_dwordx3_vs_imm_offset(ptr addrspace(8) in ;--------------------------------------------------------------------- define amdgpu_ps float @buffer_load_lds_dwordx4(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { -; GFX950-SDAG-LABEL: buffer_load_lds_dwordx4: -; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 8 -; 
GFX950-SDAG-NEXT: s_mov_b32 m0, s4 -; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen lds -; GFX950-SDAG-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:4 sc0 lds -; GFX950-SDAG-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:8 nt lds -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX950-SDAG-NEXT: ds_read_b32 v0, v0 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: ; return to shader part epilog -; -; GFX950-GISEL-LABEL: buffer_load_lds_dwordx4: -; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: s_mov_b32 m0, s4 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 8 -; GFX950-GISEL-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen lds -; GFX950-GISEL-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:4 sc0 lds -; GFX950-GISEL-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:8 nt lds -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX950-GISEL-NEXT: ds_read_b32 v0, v0 -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: ; return to shader part epilog +; GFX950-LABEL: buffer_load_lds_dwordx4: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: v_mov_b32_e32 v0, 8 +; GFX950-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen lds +; GFX950-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:4 sc0 lds +; GFX950-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:8 nt lds +; GFX950-NEXT: v_mov_b32_e32 v0, s4 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ds_read_b32 v0, v0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; return to shader part epilog call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 8, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 8, i32 0, i32 0, i32 4, i32 1) call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) 
%lds, i32 16, i32 8, i32 0, i32 0, i32 8, i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.ll index 04a9f926acd5b..35c959f2e805c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.ll @@ -1,36 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,SDAG -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GISEL +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN declare void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %vindex, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux) define amdgpu_ps float @buffer_load_lds_dword(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { -; SDAG-LABEL: buffer_load_lds_dword: -; SDAG: ; %bb.0: ; %main_body -; SDAG-NEXT: v_mov_b32_e32 v0, 8 -; SDAG-NEXT: s_mov_b32 m0, s4 -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds -; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds -; SDAG-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds -; SDAG-NEXT: v_mov_b32_e32 v0, s4 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: ds_read_b32 v0, v0 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: ; return to shader part epilog -; -; GISEL-LABEL: buffer_load_lds_dword: -; GISEL: ; %bb.0: ; %main_body -; GISEL-NEXT: s_mov_b32 m0, s4 -; GISEL-NEXT: v_mov_b32_e32 v0, 8 -; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds -; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 
idxen offset:4 glc lds -; GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds -; GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: ds_read_b32 v0, v0 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: ; return to shader part epilog +; GCN-LABEL: buffer_load_lds_dword: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 m0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, 8 +; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen lds +; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:4 glc lds +; GCN-NEXT: buffer_load_dword v0, s[0:3], 0 idxen offset:8 slc lds +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog main_body: call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1)