diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index e48dca3cc9572b..4883b6a86ef8f7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3905,20 +3905,59 @@ bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
   return (LHSKnownZeros | *RHS).countTrailingOnes() >= ShAmtBits;
 }
 
+// Return the wave level SGPR base address if this is a wave address.
+static Register getWaveAddress(const MachineInstr *Def) {
+  return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
+             ? Def->getOperand(1).getReg()
+             : Register();
+}
+
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
     MachineOperand &Root) const {
-  MachineInstr *MI = Root.getParent();
-  MachineBasicBlock *MBB = MI->getParent();
+  Register Reg = Root.getReg();
+  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+
+  const MachineInstr *Def = MRI->getVRegDef(Reg);
+  if (Register WaveBase = getWaveAddress(Def)) {
+    return {{
+        [=](MachineInstrBuilder &MIB) { // rsrc
+          MIB.addReg(Info->getScratchRSrcReg());
+        },
+        [=](MachineInstrBuilder &MIB) { // soffset
+          MIB.addReg(WaveBase);
+        },
+        [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
+    }};
+  }
 
   int64_t Offset = 0;
+
+  // FIXME: Copy check is a hack
+  Register BasePtr;
+  if (mi_match(Reg, *MRI, m_GPtrAdd(m_Reg(BasePtr), m_Copy(m_ICst(Offset))))) {
+    if (!SIInstrInfo::isLegalMUBUFImmOffset(Offset))
+      return {};
+    const MachineInstr *BasePtrDef = MRI->getVRegDef(BasePtr);
+    Register WaveBase = getWaveAddress(BasePtrDef);
+    if (!WaveBase)
+      return {};
+
+    return {{
+        [=](MachineInstrBuilder &MIB) { // rsrc
+          MIB.addReg(Info->getScratchRSrcReg());
+        },
+        [=](MachineInstrBuilder &MIB) { // soffset
+          MIB.addReg(WaveBase);
+        },
+        [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
+    }};
+  }
+
   if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
       !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
     return {};
 
-  const MachineFunction *MF = MBB->getParent();
-  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
-
   return {{
       [=](MachineInstrBuilder &MIB) { // rsrc
         MIB.addReg(Info->getScratchRSrcReg());
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index 0519a9c7db321c..8e0f3fc4798911 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -14,21 +14,20 @@ define amdgpu_kernel void @kernel_caller_stack() {
 ; MUBUF: ; %bb.0:
 ; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
 ; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT: s_mov_b32 s32, 0
 ; MUBUF-NEXT: s_add_u32 s0, s0, s7
+; MUBUF-NEXT: s_mov_b32 s32, 0
 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0
-; MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; MUBUF-NEXT: v_mov_b32_e32 v1, 9
-; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; MUBUF-NEXT: v_mov_b32_e32 v1, 10
-; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
-; MUBUF-NEXT: v_mov_b32_e32 v1, 11
-; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
-; MUBUF-NEXT: v_mov_b32_e32 v1, 12
+; MUBUF-NEXT: v_mov_b32_e32 v0, 9
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
+; MUBUF-NEXT: v_mov_b32_e32 v0, 10
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
+; MUBUF-NEXT: v_mov_b32_e32 v0, 11
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
+; MUBUF-NEXT: v_mov_b32_e32 v0, 12
 ; MUBUF-NEXT: s_getpc_b64 s[4:5]
 ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
 ; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
-; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
 ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; MUBUF-NEXT: s_endpgm
 ;
@@ -112,42 +111,41 @@ define amdgpu_kernel void @kernel_caller_byval() {
 ; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:64
 ; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:68
 ; MUBUF-NEXT: s_movk_i32 s32, 0x1400
-; MUBUF-NEXT: v_lshrrev_b32_e64 v16, 6, s32
 ; MUBUF-NEXT: s_getpc_b64 s[4:5]
 ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
 ; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
 ; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v0, v16, s[0:3], 0 offen
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32
 ; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v1, v16, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v2, v16, s[0:3], 0 offen offset:8
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8
 ; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v3, v16, s[0:3], 0 offen offset:12
+; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12
 ; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v4, v16, s[0:3], 0 offen offset:16
+; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16
 ; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v5, v16, s[0:3], 0 offen offset:20
+; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20
 ; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v6, v16, s[0:3], 0 offen offset:24
+; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24
 ; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v7, v16, s[0:3], 0 offen offset:28
+; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28
 ; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v8, v16, s[0:3], 0 offen offset:32
+; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32
 ; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v9, v16, s[0:3], 0 offen offset:36
+; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36
 ; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v10, v16, s[0:3], 0 offen offset:40
+; MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40
 ; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen offset:44
+; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44
 ; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v12, v16, s[0:3], 0 offen offset:48
+; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48
 ; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen offset:52
+; MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52
 ; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen offset:56
+; MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56
 ; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen offset:60
+; MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60
 ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; MUBUF-NEXT: s_endpgm
 ;
@@ -244,20 +242,19 @@ define void @func_caller_stack() {
 ; MUBUF-NEXT: v_writelane_b32 v40, s33, 2
 ; MUBUF-NEXT: s_mov_b32 s33, s32
 ; MUBUF-NEXT: s_addk_i32 s32, 0x400
-; MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; MUBUF-NEXT: v_mov_b32_e32 v1, 9
-; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; MUBUF-NEXT: v_mov_b32_e32 v1, 10
-; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
-; MUBUF-NEXT: v_mov_b32_e32 v1, 11
+; MUBUF-NEXT: v_mov_b32_e32 v0, 9
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
+; MUBUF-NEXT: v_mov_b32_e32 v0, 10
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
+; MUBUF-NEXT: v_mov_b32_e32 v0, 11
 ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
-; MUBUF-NEXT: v_mov_b32_e32 v1, 12
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
+; MUBUF-NEXT: v_mov_b32_e32 v0, 12
 ; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
 ; MUBUF-NEXT: s_getpc_b64 s[4:5]
 ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
 ; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
-; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
 ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; MUBUF-NEXT: v_readlane_b32 s4, v40, 0
 ; MUBUF-NEXT: v_readlane_b32 s5, v40, 1
@@ -317,65 +314,64 @@ define void @func_caller_byval([16 x i32] addrspace(5)* %argptr) {
 ; MUBUF-NEXT: v_writelane_b32 v40, s33, 2
 ; MUBUF-NEXT: s_mov_b32 s33, s32
 ; MUBUF-NEXT: s_addk_i32 s32, 0x400
-; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s32
 ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
 ; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
 ; MUBUF-NEXT: s_getpc_b64 s[4:5]
 ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
 ; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
 ; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32
 ; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4
 ; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:8
 ; MUBUF-NEXT: s_nop 0
 ; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:12
 ; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:8
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8
 ; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:12
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12
 ; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:16
 ; MUBUF-NEXT: s_nop 0
 ; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:20
 ; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:16
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16
 ; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:20
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20
 ; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:24
 ; MUBUF-NEXT: s_nop 0
 ; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:28
 ; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:24
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:24
 ; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:28
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28
 ; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:32
 ; MUBUF-NEXT: s_nop 0
 ; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:36
 ; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:32
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:32
 ; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:36
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36
 ; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:40
 ; MUBUF-NEXT: s_nop 0
 ; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:44
 ; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:40
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:40
 ; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:44
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:44
 ; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:48
 ; MUBUF-NEXT: s_nop 0
 ; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:52
 ; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:48
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:48
 ; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:52
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52
 ; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:56
 ; MUBUF-NEXT: s_nop 0
 ; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:60
 ; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:56
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56
 ; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:60
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60
 ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; MUBUF-NEXT: v_readlane_b32 s4, v40, 0
 ; MUBUF-NEXT: v_readlane_b32 s5, v40, 1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
index 11b7c28860aea3..246c7686d646da 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
@@ -581,3 +581,144 @@ body: |
     G_STORE %1, %0 :: (store (s8), align 1, addrspace 5)
 
 ...
+
+---
+name: function_store_private_s32_to_4_wave_address
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: false
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  stackPtrOffsetReg: $sgpr32
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; GFX6-LABEL: name: function_store_private_s32_to_4_wave_address
+    ; GFX6: liveins: $vgpr0, $vgpr1
+    ; GFX6-NEXT: {{ $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    ; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9-NEXT: {{ $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    %0:vgpr(s32) = COPY $vgpr0
+    %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
+    G_STORE %0, %1 :: (store (s32), align 4, addrspace 5)
+
+...
+
+# Has regbank copy of constant
+---
+name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4095
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: false
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  stackPtrOffsetReg: $sgpr32
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; GFX6-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4095
+    ; GFX6: liveins: $vgpr0, $vgpr1
+    ; GFX6-NEXT: {{ $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    ; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4095
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9-NEXT: {{ $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    %0:vgpr(s32) = COPY $vgpr0
+    %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
+    %2:sgpr(s32) = G_CONSTANT i32 4095
+    %3:vgpr(s32) = COPY %2
+    %4:vgpr(p5) = G_PTR_ADD %1, %3
+    G_STORE %0, %4 :: (store (s32), align 4, addrspace 5)
+
+...
+
+---
+name: function_store_private_s32_to_4_wave_address_offset_4095
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: false
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  stackPtrOffsetReg: $sgpr32
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; GFX6-LABEL: name: function_store_private_s32_to_4_wave_address_offset_4095
+    ; GFX6: liveins: $vgpr0, $vgpr1
+    ; GFX6-NEXT: {{ $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
+    ; GFX6-NEXT: %3:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_LSHRREV_B32_e64_]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], %3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    ; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address_offset_4095
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9-NEXT: {{ $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[V_LSHRREV_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    %0:vgpr(s32) = COPY $vgpr0
+    %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
+    %2:vgpr(s32) = G_CONSTANT i32 4095
+    %3:vgpr(p5) = G_PTR_ADD %1, %2
+    G_STORE %0, %3 :: (store (s32), align 4, addrspace 5)
+
+...
+
+---
+name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4096
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: false
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  stackPtrOffsetReg: $sgpr32
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; GFX6-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4096
+    ; GFX6: liveins: $vgpr0, $vgpr1
+    ; GFX6-NEXT: {{ $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+    ; GFX6-NEXT: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_LSHRREV_B32_e64_]], [[COPY1]], 0, implicit $exec
+    ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], %4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    ; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4096
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9-NEXT: {{ $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+    ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHRREV_B32_e64_]], [[COPY1]], 0, implicit $exec
+    ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    %0:vgpr(s32) = COPY $vgpr0
+    %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
+    %2:sgpr(s32) = G_CONSTANT i32 4096
+    %3:vgpr(s32) = COPY %2
+    %4:vgpr(p5) = G_PTR_ADD %1, %3
+    G_STORE %0, %4 :: (store (s32), align 4, addrspace 5)
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index 6950088bd71b5a..3c32a431bc01d0 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -144,8 +144,7 @@ attributes #0 = { nounwind }
 
 ; GCN: amdpal.pipelines:
 ; GCN-NEXT: - .registers:
-; SDAG-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}}
-; GISEL-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ce{{$}}
+; GCN-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}}
 ; GCN-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}}
 ; GCN-NEXT: .shader_functions:
 ; GCN-NEXT: dynamic_stack:
@@ -187,15 +186,13 @@ attributes #0 = { nounwind }
 ; GFX8-NEXT: .sgpr_count: 0x28{{$}}
 ; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
 ; GCN-NEXT: .stack_frame_size_in_bytes: 0x90{{$}}
-; SDAG-NEXT: .vgpr_count: 0x2a{{$}}
-; GISEL-NEXT: .vgpr_count: 0x34{{$}}
+; GCN-NEXT: .vgpr_count: 0x2a{{$}}
 ; GCN-NEXT: no_stack_indirect_call:
 ; GCN-NEXT: .lds_size: 0{{$}}
 ; GFX8-NEXT: .sgpr_count: 0x28{{$}}
 ; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
 ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
-; SDAG-NEXT: .vgpr_count: 0x2a{{$}}
-; GISEL-NEXT: .vgpr_count: 0x34{{$}}
+; GCN-NEXT: .vgpr_count: 0x2a{{$}}
 ; GCN-NEXT: simple_lds:
 ; GCN-NEXT: .lds_size: 0x100{{$}}
 ; GCN-NEXT: .sgpr_count: 0x20{{$}}
@@ -227,8 +224,7 @@ attributes #0 = { nounwind }
 ; GFX8-NEXT: .sgpr_count: 0x28{{$}}
 ; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
 ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
-; SDAG-NEXT: .vgpr_count: 0x2b{{$}}
-; GISEL-NEXT: .vgpr_count: 0x34{{$}}
+; GCN-NEXT: .vgpr_count: 0x2b{{$}}
 ; GCN-NEXT: simple_stack_recurse:
 ; GCN-NEXT: .lds_size: 0{{$}}
 ; GCN-NEXT: .sgpr_count: 0x26{{$}}