diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 4ac7b6e79ff3e1..39e88482db94d7 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16258,12 +16258,31 @@ Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
   return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
 }
 
+Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
+  auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_implicitarg_ptr);
+  auto *Call = CGF.Builder.CreateCall(F);
+  Call->addRetAttr(
+      Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
+  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
+  return Call;
+}
+
 // \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
 Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
-  const unsigned XOffset = 4;
-  auto *DP = EmitAMDGPUDispatchPtr(CGF);
-  // Indexing the HSA kernel_dispatch_packet struct.
-  auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 2);
+  bool IsCOV_5 = CGF.getTarget().getTargetOpts().CodeObjectVersion ==
+                 clang::TargetOptions::COV_5;
+  Constant *Offset;
+  Value *DP;
+  if (IsCOV_5) {
+    // Indexing the implicit kernarg segment.
+    Offset = llvm::ConstantInt::get(CGF.Int32Ty, 12 + Index * 2);
+    DP = EmitAMDGPUImplicitArgPtr(CGF);
+  } else {
+    // Indexing the HSA kernel_dispatch_packet struct.
+    Offset = llvm::ConstantInt::get(CGF.Int32Ty, 4 + Index * 2);
+    DP = EmitAMDGPUDispatchPtr(CGF);
+  }
+
   auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
   auto *DstTy =
       CGF.Int16Ty->getPointerTo(GEP->getType()->getPointerAddressSpace());
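For readers cross-checking the two sets of magic numbers in this hunk: bytes 12/14/16 are the three uint16 hidden_group_size fields of the code-object-v5 implicit kernarg block, while bytes 4/6/8 are the workgroup_size_x/y/z fields of the HSA kernel_dispatch_packet. A minimal standalone sketch of that offset selection (illustrative only, not code from the patch):

```cpp
#include <cassert>
#include <cstdio>

// Mirrors the selection in EmitAMDGPUWorkGroupSize: each dimension's size is
// a 2-byte field, so the byte offset is a per-ABI base plus Index * 2.
// Index is 0, 1, or 2 for the x, y, or z dimension.
unsigned workGroupSizeByteOffset(bool IsCOV5, unsigned Index) {
  assert(Index < 3 && "dimension index must be 0, 1, or 2");
  const unsigned Base = IsCOV5 ? 12 : 4; // implicitarg block vs dispatch packet
  return Base + Index * 2;
}

int main() {
  for (unsigned I = 0; I < 3; ++I)
    std::printf("dim %u: pre-COV5 offset %u, COV5 offset %u\n", I,
                workGroupSizeByteOffset(false, I),
                workGroupSizeByteOffset(true, I));
}
```

The align 8 / dereferenceable(256) return attributes on the new intrinsic call are what the COV5 checks in the test below match against.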
diff --git a/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
index 5928320b89f00f..4c1c4c883a152f 100644
--- a/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
@@ -1,17 +1,31 @@
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
 // RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
-// RUN: | FileCheck %s
+// RUN: | FileCheck -check-prefix=PRECOV5 %s
+
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
+// RUN: -fcuda-is-device -mcode-object-version=5 -emit-llvm -o - -x hip %s \
+// RUN: | FileCheck -check-prefix=COV5 %s
 
 #include "Inputs/cuda.h"
 
-// CHECK-LABEL: test_get_workgroup_size
-// CHECK: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 4
-// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
-// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 6
-// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
-// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 8
-// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
+// PRECOV5-LABEL: test_get_workgroup_size
+// PRECOV5: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+// PRECOV5: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 4
+// PRECOV5: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
+// PRECOV5: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 6
+// PRECOV5: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
+// PRECOV5: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 8
+// PRECOV5: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
+
+// COV5-LABEL: test_get_workgroup_size
+// COV5: call align 8 dereferenceable(256) i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+// COV5: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 12
+// COV5: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
+// COV5: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 14
+// COV5: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
+// COV5: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 16
+// COV5: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
 
 __device__ void test_get_workgroup_size(int d, int *out) {
   switch (d) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 46748c9365cea4..997f9dd28dc3b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -542,16 +542,14 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
   bool funcRetrievesHeapPtr(Attributor &A) {
     if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
       return false;
-    auto Pos = llvm::AMDGPU::getHeapPtrImplicitArgPosition();
-    AAPointerInfo::OffsetAndSize OAS(Pos, 8);
+    AAPointerInfo::OffsetAndSize OAS(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
     return funcRetrievesImplicitKernelArg(A, OAS);
   }
 
   bool funcRetrievesQueuePtr(Attributor &A) {
     if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
       return false;
-    auto Pos = llvm::AMDGPU::getQueuePtrImplicitArgPosition();
-    AAPointerInfo::OffsetAndSize OAS(Pos, 8);
+    AAPointerInfo::OffsetAndSize OAS(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
     return funcRetrievesImplicitKernelArg(A, OAS);
   }
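One note on the AMDGPUAttributor change above: funcRetrievesImplicitKernelArg asks the pointer-info AA whether the function loads from a byte window of the implicit kernarg segment described by the (offset, size) pair, so swapping the two helper calls for the shared SIDefines constants is purely a refactor. A hypothetical stand-in for that interval test (names here are illustrative, not the Attributor API):

```cpp
#include <cstdint>
#include <cstdio>

// Byte window of one hidden kernel argument, relative to implicitarg_ptr.
struct OffsetAndSize {
  uint64_t Offset;
  uint64_t Size;
};

// Would a load of LoadSize bytes at LoadOffset touch the hidden argument?
// This is half-open interval intersection: [a, a+n) vs [b, b+m).
bool touchesImplicitArg(const OffsetAndSize &Arg, uint64_t LoadOffset,
                        uint64_t LoadSize) {
  return LoadOffset < Arg.Offset + Arg.Size &&
         Arg.Offset < LoadOffset + LoadSize;
}

int main() {
  const OffsetAndSize QueuePtr{200, 8}; // AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET
  std::printf("load @200 x8: %d\n", touchesImplicitArg(QueuePtr, 200, 8)); // 1
  std::printf("load @96  x8: %d\n", touchesImplicitArg(QueuePtr, 96, 8));  // 0
}
```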
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e4d86e2133b5d2..3a5728f99e6d62 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4382,10 +4382,14 @@ uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
   uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
                        ExplicitArgOffset;
   switch (Param) {
-  case GRID_DIM:
+  case FIRST_IMPLICIT:
     return ArgOffset;
-  case GRID_OFFSET:
-    return ArgOffset + 4;
+  case PRIVATE_BASE:
+    return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
+  case SHARED_BASE:
+    return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
+  case QUEUE_PTR:
+    return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
   }
   llvm_unreachable("unexpected implicit parameter type");
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 10eecb68fa1dec..73081483f1c3d0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -320,8 +320,9 @@ class AMDGPUTargetLowering : public TargetLowering {
 
   enum ImplicitParameter {
     FIRST_IMPLICIT,
-    GRID_DIM = FIRST_IMPLICIT,
-    GRID_OFFSET,
+    PRIVATE_BASE,
+    SHARED_BASE,
+    QUEUE_PTR,
   };
 
   /// Helper function that returns the byte offset of the given
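The offsets returned here are not absolute: getImplicitParameterOffset first rounds the kernel's explicit kernarg size up to the implicit-arg alignment and then adds the COV5 constant. A worked example with assumed sizes (the 20-byte explicit block and 8-byte alignment are made up for illustration):

```cpp
#include <cstdint>
#include <cstdio>

// Round Value up to the next multiple of Align (Align is a power of two here).
uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) & ~(Align - 1);
}

int main() {
  const uint64_t ExplicitKernArgSize = 20; // assumed kernel signature
  const uint64_t ImplicitArgAlign = 8;     // assumed subtarget alignment
  const uint64_t QUEUE_PTR_OFFSET = 200;   // from SIDefines.h below

  uint64_t Base = alignTo(ExplicitKernArgSize, ImplicitArgAlign); // 24
  std::printf("implicit block starts at %llu; queue_ptr at byte %llu\n",
              (unsigned long long)Base,
              (unsigned long long)(Base + QUEUE_PTR_OFFSET)); // 24, 224
}
```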
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 07c28e25467a8d..0cbaed0ad6d27d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1810,6 +1810,39 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
   }
 
+  // TODO: can we be smarter about machine pointer info?
+  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+  Register LoadAddr = MRI.createGenericVirtualRegister(
+      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+  // For code object version 5, private_base and shared_base are passed through
+  // implicit kernargs.
+  if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
+    AMDGPUTargetLowering::ImplicitParameter Param =
+        AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
+                                      : AMDGPUTargetLowering::PRIVATE_BASE;
+    uint64_t Offset =
+        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
+
+    Register KernargPtrReg = MRI.createGenericVirtualRegister(
+        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+
+    if (!loadInputValue(KernargPtrReg, B,
+                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
+      return Register();
+
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+        PtrInfo,
+        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+            MachineMemOperand::MOInvariant,
+        LLT::scalar(32), commonAlignment(Align(64), Offset));
+
+    // Pointer address
+    B.buildPtrAdd(LoadAddr, KernargPtrReg,
+                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
+    // Load address
+    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
+  }
+
   Register QueuePtr = MRI.createGenericVirtualRegister(
       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
 
@@ -1820,17 +1853,14 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
   // private_segment_aperture_base_hi.
   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
 
-  // TODO: can we be smarter about machine pointer info?
-  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
   MachineMemOperand *MMO = MF.getMachineMemOperand(
       PtrInfo,
       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
           MachineMemOperand::MOInvariant,
       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
 
-  Register LoadAddr;
-
-  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
+  B.buildPtrAdd(LoadAddr, QueuePtr,
+                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
 }
 
@@ -4817,6 +4847,47 @@ bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
 
 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
+  MachineFunction &MF = B.getMF();
+  const LLT S64 = LLT::scalar(64);
+
+  Register SGPR01(AMDGPU::SGPR0_SGPR1);
+  // For code object version 5, queue_ptr is passed through implicit kernarg.
+  if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
+    AMDGPUTargetLowering::ImplicitParameter Param =
+        AMDGPUTargetLowering::QUEUE_PTR;
+    uint64_t Offset =
+        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
+
+    Register KernargPtrReg = MRI.createGenericVirtualRegister(
+        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+
+    if (!loadInputValue(KernargPtrReg, B,
+                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
+      return false;
+
+    // TODO: can we be smarter about machine pointer info?
+    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+        PtrInfo,
+        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+            MachineMemOperand::MOInvariant,
+        LLT::scalar(64), commonAlignment(Align(64), Offset));
+
+    // Pointer address
+    Register LoadAddr = MRI.createGenericVirtualRegister(
+        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+    B.buildPtrAdd(LoadAddr, KernargPtrReg,
+                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
+    // Load address
+    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
+    B.buildCopy(SGPR01, Temp);
+    B.buildInstr(AMDGPU::S_TRAP)
+        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
+        .addReg(SGPR01, RegState::Implicit);
+    MI.eraseFromParent();
+    return true;
+  }
+
   // Pass queue pointer to trap handler as input, and insert trap instruction
   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
   Register LiveIn =
@@ -4824,7 +4895,6 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
     return false;
 
-  Register SGPR01(AMDGPU::SGPR0_SGPR1);
   B.buildCopy(SGPR01, LiveIn);
   B.buildInstr(AMDGPU::S_TRAP)
       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
       .addReg(SGPR01, RegState::Implicit);
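The only non-obvious expression in the GlobalISel path is the memory operand's alignment, commonAlignment(Align(64), Offset): the known 64-byte alignment of the kernarg segment combined with the load's byte offset, i.e. the largest power of two dividing both. A small sketch of how that works out for the COV5 offsets used here:

```cpp
#include <cstdint>
#include <cstdio>

// Largest power of two dividing both A and Offset; equivalently the lowest
// set bit of (A | Offset). Mirrors llvm::commonAlignment for power-of-two A.
uint64_t commonAlignment(uint64_t A, uint64_t Offset) {
  uint64_t Bits = A | Offset;
  return Bits & ~(Bits - 1);
}

int main() {
  // With no explicit kernargs (base 0, as in the trap tests below):
  // PRIVATE_BASE (192) keeps the full 64-byte alignment, SHARED_BASE (196)
  // drops to 4, and QUEUE_PTR (200) to 8.
  for (uint64_t Off : {192, 196, 200})
    std::printf("offset %3llu -> align %llu\n", (unsigned long long)Off,
                (unsigned long long)commonAlignment(64, Off));
}
```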
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 61814daa2a9b04..52232d8ab846dc 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -780,6 +780,17 @@ enum OpSel : uint64_t {
 
 } // namespace VOP3PEncoding
 
+namespace ImplicitArg {
+// Implicit kernel argument offsets for code object version 5.
+enum Offset_COV5 : unsigned {
+  HOSTCALL_PTR_OFFSET = 80,
+  HEAP_PTR_OFFSET = 96,
+  PRIVATE_BASE_OFFSET = 192,
+  SHARED_BASE_OFFSET = 196,
+  QUEUE_PTR_OFFSET = 200,
+};
+
+} // namespace ImplicitArg
 } // namespace AMDGPU
 
 #define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c9bcb72bb4b004..7053685fd28cfd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5442,24 +5442,41 @@ SDValue SITargetLowering::lowerTrapEndpgm(
   return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
 }
 
+SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
+    const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  uint64_t Offset = getImplicitParameterOffset(MF, Param);
+  SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
+  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+  return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
+                     MachineMemOperand::MODereferenceable |
+                         MachineMemOperand::MOInvariant);
+}
+
 SDValue SITargetLowering::lowerTrapHsaQueuePtr(
     SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue Chain = Op.getOperand(0);
 
-  MachineFunction &MF = DAG.getMachineFunction();
-  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
-  Register UserSGPR = Info->getQueuePtrUserSGPR();
-
   SDValue QueuePtr;
-  if (UserSGPR == AMDGPU::NoRegister) {
-    // We probably are in a function incorrectly marked with
-    // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the trap,
-    // so just use a null pointer.
-    QueuePtr = DAG.getConstant(0, SL, MVT::i64);
+  // For code object version 5, QueuePtr is passed through implicit kernarg.
+  if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
+    QueuePtr =
+        loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
   } else {
-    QueuePtr = CreateLiveInRegister(
-        DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+    MachineFunction &MF = DAG.getMachineFunction();
+    SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+    Register UserSGPR = Info->getQueuePtrUserSGPR();
+
+    if (UserSGPR == AMDGPU::NoRegister) {
+      // We probably are in a function incorrectly marked with
+      // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
+      // trap, so just use a null pointer.
+      QueuePtr = DAG.getConstant(0, SL, MVT::i64);
+    } else {
+      QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
+                                      MVT::i64);
+    }
   }
 
   SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
@@ -5535,6 +5552,14 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
     return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
   }
 
+  // For code object version 5, private_base and shared_base are passed through
+  // implicit kernargs.
+  if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
+    ImplicitParameter Param =
+        (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
+    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
+  }
+
   MachineFunction &MF = DAG.getMachineFunction();
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   Register UserSGPR = Info->getQueuePtrUserSGPR();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 98e6b9bbc2ebf3..7468d4db0829ef 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -53,6 +53,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                              uint64_t Offset, Align Alignment, bool Signed,
                              const ISD::InputArg *Arg = nullptr) const;
 
+  SDValue loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT, const SDLoc &DL,
+                                     Align Alignment,
+                                     ImplicitParameter Param) const;
   SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
                               const SDLoc &SL, SDValue Chain,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 14d8b1db48090a..155a352194afff 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -149,27 +149,13 @@ unsigned getHostcallImplicitArgPosition() {
   case 4:
     return 24;
   case 5:
-    return 80;
+    return AMDGPU::ImplicitArg::HOSTCALL_PTR_OFFSET;
   default:
     llvm_unreachable("Unexpected code object version");
     return 0;
   }
 }
 
-unsigned getHeapPtrImplicitArgPosition() {
-  if (AmdhsaCodeObjectVersion == 5)
-    return 96;
-  llvm_unreachable("hidden_heap is supported only by code object version 5");
-  return 0;
-}
-
-unsigned getQueuePtrImplicitArgPosition() {
-  if (AmdhsaCodeObjectVersion == 5)
-    return 200;
-  llvm_unreachable("queue_ptr is supported only by code object version 5");
-  return 0;
-}
-
 #define GET_MIMGBaseOpcodesTable_IMPL
 #define GET_MIMGDimInfoTable_IMPL
 #define GET_MIMGInfoTable_IMPL
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index c925f003c96720..118dbbfd5d9a18 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -57,12 +57,6 @@ bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI);
 /// \returns The offset of the hostcall pointer argument from implicitarg_ptr
 unsigned getHostcallImplicitArgPosition();
 
-/// \returns The offset of the heap ptr argument
from implicitarg_ptr -unsigned getHeapPtrImplicitArgPosition(); - -/// \returns The offset of the queue ptr argument from implicitarg_ptr -unsigned getQueuePtrImplicitArgPosition(); - /// \returns Code object version. unsigned getAmdhsaCodeObjectVersion(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll new file mode 100644 index 00000000000000..3e1a23abdda628 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -0,0 +1,546 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefix=GFX8V3 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefix=GFX8V4 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefix=GFX8V5 %s + +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefixes=GFX9V3 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefixes=GFX9V4 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefixes=GFX9V5 %s + +define amdgpu_kernel void @addrspacecast(i32 addrspace(5)* %ptr.private, i32 addrspace(3)* %ptr.local) { +; GFX8V3-LABEL: addrspacecast: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V3-NEXT: s_load_dword s3, s[4:5], 0x44 +; GFX8V3-NEXT: s_load_dword s5, s[4:5], 0x40 +; GFX8V3-NEXT: v_mov_b32_e32 v2, 1 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_mov_b32 s2, s0 +; GFX8V3-NEXT: s_cmp_lg_u32 s0, -1 +; GFX8V3-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX8V3-NEXT: s_mov_b32 s4, s1 +; GFX8V3-NEXT: s_cmp_lg_u32 s1, -1 +; GFX8V3-NEXT: v_mov_b32_e32 v0, s2 +; GFX8V3-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s3 +; GFX8V3-NEXT: flat_store_dword v[0:1], v2 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: v_mov_b32_e32 v2, 2 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V3-NEXT: flat_store_dword v[0:1], v2 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; +; GFX8V4-LABEL: addrspacecast: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V4-NEXT: s_load_dword s3, s[4:5], 0x44 +; GFX8V4-NEXT: s_load_dword s5, s[4:5], 0x40 +; GFX8V4-NEXT: v_mov_b32_e32 v2, 1 +; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V4-NEXT: s_mov_b32 s2, s0 +; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1 +; GFX8V4-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX8V4-NEXT: s_mov_b32 s4, s1 +; GFX8V4-NEXT: s_cmp_lg_u32 s1, -1 +; GFX8V4-NEXT: v_mov_b32_e32 v0, s2 +; GFX8V4-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s3 +; GFX8V4-NEXT: flat_store_dword v[0:1], v2 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT: v_mov_b32_e32 v2, 2 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V4-NEXT: flat_store_dword v[0:1], v2 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: s_endpgm +; +; GFX8V5-LABEL: addrspacecast: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8V5-NEXT: s_load_dword s3, s[4:5], 0xc8 +; GFX8V5-NEXT: 
s_load_dword s5, s[4:5], 0xcc +; GFX8V5-NEXT: v_mov_b32_e32 v2, 1 +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: s_mov_b32 s2, s0 +; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1 +; GFX8V5-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX8V5-NEXT: s_mov_b32 s4, s1 +; GFX8V5-NEXT: s_cmp_lg_u32 s1, -1 +; GFX8V5-NEXT: v_mov_b32_e32 v0, s2 +; GFX8V5-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s3 +; GFX8V5-NEXT: flat_store_dword v[0:1], v2 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V5-NEXT: v_mov_b32_e32 v2, 2 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V5-NEXT: flat_store_dword v[0:1], v2 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: s_endpgm +; +; GFX9V3-LABEL: addrspacecast: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V3-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V3-NEXT: s_lshl_b32 s3, s2, 16 +; GFX9V3-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V3-NEXT: v_mov_b32_e32 v2, 1 +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: s_mov_b32 s2, s0 +; GFX9V3-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V3-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX9V3-NEXT: s_lshl_b32 s5, s4, 16 +; GFX9V3-NEXT: s_mov_b32 s4, s1 +; GFX9V3-NEXT: s_cmp_lg_u32 s1, -1 +; GFX9V3-NEXT: v_mov_b32_e32 v0, s2 +; GFX9V3-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX9V3-NEXT: v_mov_b32_e32 v1, s3 +; GFX9V3-NEXT: flat_store_dword v[0:1], v2 +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V3-NEXT: v_mov_b32_e32 v2, 2 +; GFX9V3-NEXT: v_mov_b32_e32 v1, s1 +; GFX9V3-NEXT: flat_store_dword v[0:1], v2 +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; +; GFX9V4-LABEL: addrspacecast: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V4-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V4-NEXT: s_lshl_b32 s3, s2, 16 +; GFX9V4-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V4-NEXT: v_mov_b32_e32 v2, 1 +; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V4-NEXT: s_mov_b32 s2, s0 +; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V4-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX9V4-NEXT: s_lshl_b32 s5, s4, 16 +; GFX9V4-NEXT: s_mov_b32 s4, s1 +; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s2 +; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX9V4-NEXT: v_mov_b32_e32 v1, s3 +; GFX9V4-NEXT: flat_store_dword v[0:1], v2 +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V4-NEXT: v_mov_b32_e32 v2, 2 +; GFX9V4-NEXT: v_mov_b32_e32 v1, s1 +; GFX9V4-NEXT: flat_store_dword v[0:1], v2 +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: s_endpgm +; +; GFX9V5-LABEL: addrspacecast: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V5-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V5-NEXT: s_lshl_b32 s3, s2, 16 +; GFX9V5-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V5-NEXT: v_mov_b32_e32 v2, 1 +; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V5-NEXT: s_mov_b32 s2, s0 +; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V5-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX9V5-NEXT: s_lshl_b32 s5, s4, 16 +; GFX9V5-NEXT: s_mov_b32 s4, s1 +; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s2 +; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX9V5-NEXT: v_mov_b32_e32 v1, s3 +; GFX9V5-NEXT: flat_store_dword v[0:1], v2 +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V5-NEXT: v_mov_b32_e32 v2, 2 +; GFX9V5-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX9V5-NEXT: flat_store_dword v[0:1], v2 +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: s_endpgm + %flat.private = addrspacecast i32 addrspace(5)* %ptr.private to i32* + %flat.local = addrspacecast i32 addrspace(3)* %ptr.local to i32* + store volatile i32 1, i32* %flat.private + store volatile i32 2, i32* %flat.local + ret void +} + +define amdgpu_kernel void @llvm_amdgcn_is_shared(i8* %ptr) { +; GFX8V3-LABEL: llvm_amdgcn_is_shared: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_load_dword s0, s[4:5], 0x40 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V3-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: flat_store_dword v[0:1], v0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; +; GFX8V4-LABEL: llvm_amdgcn_is_shared: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x40 +; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT: flat_store_dword v[0:1], v0 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: s_endpgm +; +; GFX8V5-LABEL: llvm_amdgcn_is_shared: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: s_load_dword s0, s[4:5], 0xcc +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V5-NEXT: flat_store_dword v[0:1], v0 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: s_endpgm +; +; GFX9V3-LABEL: llvm_amdgcn_is_shared: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V3-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9V3-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V3-NEXT: s_cselect_b32 s0, 1, 0 +; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V3-NEXT: global_store_dword v[0:1], v0, off +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; +; GFX9V4-LABEL: llvm_amdgcn_is_shared: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V4-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V4-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9V4-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V4-NEXT: s_cselect_b32 s0, 1, 0 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V4-NEXT: global_store_dword v[0:1], v0, off +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: s_endpgm +; +; GFX9V5-LABEL: llvm_amdgcn_is_shared: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V5-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V5-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9V5-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V5-NEXT: s_cselect_b32 s0, 1, 0 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V5-NEXT: global_store_dword v[0:1], v0, off +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: s_endpgm + %is.shared = call i1 @llvm.amdgcn.is.shared(i8* %ptr) + %zext = zext i1 %is.shared to i32 + store volatile i32 %zext, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @llvm_amdgcn_is_private(i8* %ptr) { +; GFX8V3-LABEL: llvm_amdgcn_is_private: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; 
GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_load_dword s0, s[4:5], 0x44 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V3-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: flat_store_dword v[0:1], v0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; +; GFX8V4-LABEL: llvm_amdgcn_is_private: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x44 +; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT: flat_store_dword v[0:1], v0 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: s_endpgm +; +; GFX8V5-LABEL: llvm_amdgcn_is_private: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: s_load_dword s0, s[4:5], 0xc8 +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V5-NEXT: flat_store_dword v[0:1], v0 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: s_endpgm +; +; GFX9V3-LABEL: llvm_amdgcn_is_private: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V3-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9V3-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V3-NEXT: s_cselect_b32 s0, 1, 0 +; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V3-NEXT: global_store_dword v[0:1], v0, off +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; +; GFX9V4-LABEL: llvm_amdgcn_is_private: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V4-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V4-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9V4-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V4-NEXT: s_cselect_b32 s0, 1, 0 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V4-NEXT: global_store_dword v[0:1], v0, off +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: s_endpgm +; +; GFX9V5-LABEL: llvm_amdgcn_is_private: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V5-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V5-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9V5-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V5-NEXT: s_cselect_b32 s0, 1, 0 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V5-NEXT: global_store_dword v[0:1], v0, off +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: s_endpgm + %is.private = call i1 @llvm.amdgcn.is.private(i8* %ptr) + %zext = zext i1 %is.private to i32 + store volatile i32 %zext, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @llvm_trap() { +; GFX8V3-LABEL: llvm_trap: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8V3-NEXT: s_trap 2 +; +; GFX8V4-LABEL: llvm_trap: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8V4-NEXT: s_trap 2 +; +; GFX8V5-LABEL: llvm_trap: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xc8 +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: s_trap 2 +; +; GFX9V3-LABEL: llvm_trap: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX9V3-NEXT: s_trap 2 +; +; GFX9V4-LABEL: llvm_trap: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: s_trap 2 +; +; GFX9V5-LABEL: llvm_trap: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: s_trap 2 + call 
void @llvm.trap() + unreachable +} + +define amdgpu_kernel void @llvm_debugtrap() { +; GFX8V3-LABEL: llvm_debugtrap: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_trap 3 +; +; GFX8V4-LABEL: llvm_debugtrap: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_trap 3 +; +; GFX8V5-LABEL: llvm_debugtrap: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_trap 3 +; +; GFX9V3-LABEL: llvm_debugtrap: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_trap 3 +; +; GFX9V4-LABEL: llvm_debugtrap: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: s_trap 3 +; +; GFX9V5-LABEL: llvm_debugtrap: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: s_trap 3 + call void @llvm.debugtrap() + unreachable +} + +define amdgpu_kernel void @llvm_amdgcn_queue_ptr(i64 addrspace(1)* %ptr) { +; GFX8V3-LABEL: llvm_amdgcn_queue_ptr: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: v_mov_b32_e32 v0, s6 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s7 +; GFX8V3-NEXT: s_add_u32 s0, s8, 8 +; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V3-NEXT: s_addc_u32 s1, s9, 0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s4 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s5 +; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s10 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s11 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v3, s1 +; GFX8V3-NEXT: v_mov_b32_e32 v2, s0 +; GFX8V3-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; +; GFX8V4-LABEL: llvm_amdgcn_queue_ptr: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: v_mov_b32_e32 v0, s6 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s7 +; GFX8V4-NEXT: s_add_u32 s0, s8, 8 +; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V4-NEXT: s_addc_u32 s1, s9, 0 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: v_mov_b32_e32 v0, s4 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s5 +; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: v_mov_b32_e32 v0, s10 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s11 +; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V4-NEXT: v_mov_b32_e32 v3, s1 +; GFX8V4-NEXT: v_mov_b32_e32 v2, s0 +; GFX8V4-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: s_endpgm +; +; GFX8V5-LABEL: llvm_amdgcn_queue_ptr: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_add_u32 s0, s6, 8 +; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V5-NEXT: s_addc_u32 s1, s7, 0 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: v_mov_b32_e32 v0, s4 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s5 +; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: v_mov_b32_e32 v0, s8 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s9 +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: v_mov_b32_e32 v3, s1 +; GFX8V5-NEXT: v_mov_b32_e32 v2, s0 +; GFX8V5-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: s_endpgm +; +; GFX9V3-LABEL: llvm_amdgcn_queue_ptr: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX9V3-NEXT: global_load_ubyte v0, v2, s[6:7] glc +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: global_load_ubyte v0, v2, s[4:5] glc +; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: v_mov_b32_e32 v0, s10 +; GFX9V3-NEXT: v_mov_b32_e32 v1, s11 +; GFX9V3-NEXT: ; kill: killed $sgpr6_sgpr7 +; GFX9V3-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; +; GFX9V4-LABEL: llvm_amdgcn_queue_ptr: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: v_mov_b32_e32 v2, 0 +; GFX9V4-NEXT: global_load_ubyte v0, v2, s[6:7] glc +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: global_load_ubyte v0, v2, s[4:5] glc +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: v_mov_b32_e32 v0, s10 +; GFX9V4-NEXT: v_mov_b32_e32 v1, s11 +; GFX9V4-NEXT: ; kill: killed $sgpr6_sgpr7 +; GFX9V4-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: s_endpgm +; +; GFX9V5-LABEL: llvm_amdgcn_queue_ptr: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: v_mov_b32_e32 v2, 0 +; GFX9V5-NEXT: global_load_ubyte v0, v[0:1], off glc +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] offset:8 glc +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: v_mov_b32_e32 v0, s8 +; GFX9V5-NEXT: v_mov_b32_e32 v1, s9 +; GFX9V5-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V5-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: s_endpgm + %queue.ptr = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr() + %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %dispatch.id = call i64 @llvm.amdgcn.dispatch.id() + %queue.load = load volatile i8, i8 addrspace(4)* %queue.ptr + %implicitarg.load = load volatile i8, i8 addrspace(4)* %implicitarg.ptr + %dispatch.load = load volatile i8, i8 addrspace(4)* %dispatch.ptr + store volatile i64 %dispatch.id, i64 addrspace(1)* %ptr + ret void +} + +declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() +declare noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() +declare i64 @llvm.amdgcn.dispatch.id() +declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() +declare i1 @llvm.amdgcn.is.shared(i8*) +declare i1 @llvm.amdgcn.is.private(i8*) +declare void @llvm.trap() +declare void @llvm.debugtrap() diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll new file mode 100644 index 00000000000000..c1e82de132d988 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -0,0 +1,550 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefix=GFX8V3 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 
--amdhsa-code-object-version=4 < %s | FileCheck --check-prefix=GFX8V4 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefix=GFX8V5 %s + +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefixes=GFX9V3 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefixes=GFX9V4 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefixes=GFX9V5 %s + +define amdgpu_kernel void @addrspacecast(i32 addrspace(5)* %ptr.private, i32 addrspace(3)* %ptr.local) { +; GFX8V3-LABEL: addrspacecast: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V3-NEXT: s_load_dword s2, s[4:5], 0x44 +; GFX8V3-NEXT: s_load_dword s3, s[4:5], 0x40 +; GFX8V3-NEXT: v_mov_b32_e32 v4, 1 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_cmp_lg_u32 s0, -1 +; GFX8V3-NEXT: v_mov_b32_e32 v0, s2 +; GFX8V3-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX8V3-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: s_cmp_lg_u32 s1, -1 +; GFX8V3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8V3-NEXT: v_mov_b32_e32 v2, s3 +; GFX8V3-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX8V3-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX8V3-NEXT: v_mov_b32_e32 v2, s1 +; GFX8V3-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX8V3-NEXT: flat_store_dword v[0:1], v4 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, 2 +; GFX8V3-NEXT: flat_store_dword v[2:3], v0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; +; GFX8V4-LABEL: addrspacecast: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V4-NEXT: s_load_dword s2, s[4:5], 0x44 +; GFX8V4-NEXT: s_load_dword s3, s[4:5], 0x40 +; GFX8V4-NEXT: v_mov_b32_e32 v4, 1 +; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1 +; GFX8V4-NEXT: v_mov_b32_e32 v0, s2 +; GFX8V4-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX8V4-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT: s_cmp_lg_u32 s1, -1 +; GFX8V4-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8V4-NEXT: v_mov_b32_e32 v2, s3 +; GFX8V4-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX8V4-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX8V4-NEXT: v_mov_b32_e32 v2, s1 +; GFX8V4-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX8V4-NEXT: flat_store_dword v[0:1], v4 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: v_mov_b32_e32 v0, 2 +; GFX8V4-NEXT: flat_store_dword v[2:3], v0 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: s_endpgm +; +; GFX8V5-LABEL: addrspacecast: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8V5-NEXT: s_load_dword s2, s[4:5], 0xc8 +; GFX8V5-NEXT: s_load_dword s3, s[4:5], 0xcc +; GFX8V5-NEXT: v_mov_b32_e32 v4, 1 +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1 +; GFX8V5-NEXT: v_mov_b32_e32 v0, s2 +; GFX8V5-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX8V5-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V5-NEXT: s_cmp_lg_u32 s1, -1 +; GFX8V5-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8V5-NEXT: v_mov_b32_e32 v2, s3 +; GFX8V5-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX8V5-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX8V5-NEXT: v_mov_b32_e32 v2, s1 +; GFX8V5-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX8V5-NEXT: flat_store_dword v[0:1], v4 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: v_mov_b32_e32 v0, 2 +; 
GFX8V5-NEXT: flat_store_dword v[2:3], v0 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: s_endpgm +; +; GFX9V3-LABEL: addrspacecast: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V3-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V3-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9V3-NEXT: v_mov_b32_e32 v0, s2 +; GFX9V3-NEXT: v_mov_b32_e32 v4, 1 +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V3-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9V3-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V3-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V3-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9V3-NEXT: s_cmp_lg_u32 s1, -1 +; GFX9V3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9V3-NEXT: v_mov_b32_e32 v2, s0 +; GFX9V3-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9V3-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX9V3-NEXT: v_mov_b32_e32 v2, s1 +; GFX9V3-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9V3-NEXT: flat_store_dword v[0:1], v4 +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: v_mov_b32_e32 v0, 2 +; GFX9V3-NEXT: flat_store_dword v[2:3], v0 +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; +; GFX9V4-LABEL: addrspacecast: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V4-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V4-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s2 +; GFX9V4-NEXT: v_mov_b32_e32 v4, 1 +; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V4-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9V4-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V4-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V4-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1 +; GFX9V4-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9V4-NEXT: v_mov_b32_e32 v2, s0 +; GFX9V4-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9V4-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX9V4-NEXT: v_mov_b32_e32 v2, s1 +; GFX9V4-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9V4-NEXT: flat_store_dword v[0:1], v4 +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: v_mov_b32_e32 v0, 2 +; GFX9V4-NEXT: flat_store_dword v[2:3], v0 +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: s_endpgm +; +; GFX9V5-LABEL: addrspacecast: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V5-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V5-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s2 +; GFX9V5-NEXT: v_mov_b32_e32 v4, 1 +; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V5-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9V5-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V5-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V5-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1 +; GFX9V5-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9V5-NEXT: v_mov_b32_e32 v2, s0 +; GFX9V5-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9V5-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX9V5-NEXT: v_mov_b32_e32 v2, s1 +; GFX9V5-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9V5-NEXT: flat_store_dword v[0:1], v4 +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: v_mov_b32_e32 v0, 2 +; GFX9V5-NEXT: flat_store_dword v[2:3], v0 +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: s_endpgm + %flat.private = addrspacecast i32 addrspace(5)* %ptr.private to i32* + %flat.local = addrspacecast i32 addrspace(3)* %ptr.local to 
i32* + store volatile i32 1, i32* %flat.private + store volatile i32 2, i32* %flat.local + ret void +} + +define amdgpu_kernel void @llvm_amdgcn_is_shared(i8* %ptr) { +; GFX8V3-LABEL: llvm_amdgcn_is_shared: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_load_dword s0, s[4:5], 0x40 +; GFX8V3-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V3-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX8V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V3-NEXT: flat_store_dword v[0:1], v0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; +; GFX8V4-LABEL: llvm_amdgcn_is_shared: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x40 +; GFX8V4-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V4-NEXT: flat_store_dword v[0:1], v0 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: s_endpgm +; +; GFX8V5-LABEL: llvm_amdgcn_is_shared: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_load_dword s0, s[4:5], 0xcc +; GFX8V5-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V5-NEXT: flat_store_dword v[0:1], v0 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: s_endpgm +; +; GFX9V3-LABEL: llvm_amdgcn_is_shared: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX9V3-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V3-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V3-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V3-NEXT: global_store_dword v[0:1], v0, off +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; +; GFX9V4-LABEL: llvm_amdgcn_is_shared: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX9V4-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V4-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V4-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V4-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V4-NEXT: global_store_dword v[0:1], v0, off +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: s_endpgm +; +; GFX9V5-LABEL: llvm_amdgcn_is_shared: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX9V5-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V5-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V5-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V5-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V5-NEXT: global_store_dword v[0:1], v0, off +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: s_endpgm + %is.shared = call i1 @llvm.amdgcn.is.shared(i8* %ptr) + %zext = zext i1 %is.shared to i32 + store volatile i32 %zext, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @llvm_amdgcn_is_private(i8* %ptr) { +; GFX8V3-LABEL: llvm_amdgcn_is_private: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_load_dword s0, s[4:5], 0x44 +; GFX8V3-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V3-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX8V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V3-NEXT: flat_store_dword v[0:1], v0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; 
GFX8V3-NEXT: s_endpgm +; +; GFX8V4-LABEL: llvm_amdgcn_is_private: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x44 +; GFX8V4-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V4-NEXT: flat_store_dword v[0:1], v0 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: s_endpgm +; +; GFX8V5-LABEL: llvm_amdgcn_is_private: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_load_dword s0, s[4:5], 0xc8 +; GFX8V5-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V5-NEXT: flat_store_dword v[0:1], v0 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: s_endpgm +; +; GFX9V3-LABEL: llvm_amdgcn_is_private: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX9V3-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V3-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V3-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V3-NEXT: global_store_dword v[0:1], v0, off +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; +; GFX9V4-LABEL: llvm_amdgcn_is_private: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX9V4-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V4-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V4-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V4-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V4-NEXT: global_store_dword v[0:1], v0, off +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: s_endpgm +; +; GFX9V5-LABEL: llvm_amdgcn_is_private: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX9V5-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V5-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V5-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V5-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V5-NEXT: global_store_dword v[0:1], v0, off +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: s_endpgm + %is.private = call i1 @llvm.amdgcn.is.private(i8* %ptr) + %zext = zext i1 %is.private to i32 + store volatile i32 %zext, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @llvm_trap() { +; GFX8V3-LABEL: llvm_trap: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8V3-NEXT: s_trap 2 +; +; GFX8V4-LABEL: llvm_trap: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8V4-NEXT: s_trap 2 +; +; GFX8V5-LABEL: llvm_trap: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xc8 +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: s_trap 2 +; +; GFX9V3-LABEL: llvm_trap: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX9V3-NEXT: s_trap 2 +; +; GFX9V4-LABEL: llvm_trap: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: s_trap 2 +; +; GFX9V5-LABEL: llvm_trap: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: s_trap 2 + call void @llvm.trap() + unreachable +} + +define amdgpu_kernel void @llvm_debugtrap() { +; GFX8V3-LABEL: llvm_debugtrap: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_trap 3 +; +; GFX8V4-LABEL: llvm_debugtrap: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_trap 3 +; +; GFX8V5-LABEL: llvm_debugtrap: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_trap 
3 +; +; GFX9V3-LABEL: llvm_debugtrap: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_trap 3 +; +; GFX9V4-LABEL: llvm_debugtrap: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: s_trap 3 +; +; GFX9V5-LABEL: llvm_debugtrap: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: s_trap 3 + call void @llvm.debugtrap() + unreachable +} + +define amdgpu_kernel void @llvm_amdgcn_queue_ptr(i64 addrspace(1)* %ptr) { +; GFX8V3-LABEL: llvm_amdgcn_queue_ptr: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: v_mov_b32_e32 v0, s6 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s7 +; GFX8V3-NEXT: s_add_u32 s0, s8, 8 +; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V3-NEXT: s_addc_u32 s1, s9, 0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s4 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s5 +; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V3-NEXT: v_mov_b32_e32 v2, s10 +; GFX8V3-NEXT: v_mov_b32_e32 v3, s11 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; +; GFX8V4-LABEL: llvm_amdgcn_queue_ptr: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: v_mov_b32_e32 v0, s6 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s7 +; GFX8V4-NEXT: s_add_u32 s0, s8, 8 +; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V4-NEXT: s_addc_u32 s1, s9, 0 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: v_mov_b32_e32 v0, s4 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s5 +; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V4-NEXT: v_mov_b32_e32 v2, s10 +; GFX8V4-NEXT: v_mov_b32_e32 v3, s11 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V4-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: s_endpgm +; +; GFX8V5-LABEL: llvm_amdgcn_queue_ptr: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_add_u32 s0, s6, 8 +; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V5-NEXT: s_addc_u32 s1, s7, 0 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: v_mov_b32_e32 v0, s4 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s5 +; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V5-NEXT: v_mov_b32_e32 v2, s8 +; GFX8V5-NEXT: v_mov_b32_e32 v3, s9 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V5-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: s_endpgm +; +; GFX9V3-LABEL: llvm_amdgcn_queue_ptr: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: v_mov_b32_e32 v2, 0 +; GFX9V3-NEXT: global_load_ubyte v0, v2, s[6:7] glc +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: global_load_ubyte v0, v2, s[4:5] glc +; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: v_mov_b32_e32 v0, s10 +; GFX9V3-NEXT: 
v_mov_b32_e32 v1, s11 +; GFX9V3-NEXT: ; kill: killed $sgpr6_sgpr7 +; GFX9V3-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; +; GFX9V4-LABEL: llvm_amdgcn_queue_ptr: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: v_mov_b32_e32 v2, 0 +; GFX9V4-NEXT: global_load_ubyte v0, v2, s[6:7] glc +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: global_load_ubyte v0, v2, s[4:5] glc +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: v_mov_b32_e32 v0, s10 +; GFX9V4-NEXT: v_mov_b32_e32 v1, s11 +; GFX9V4-NEXT: ; kill: killed $sgpr6_sgpr7 +; GFX9V4-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: s_endpgm +; +; GFX9V5-LABEL: llvm_amdgcn_queue_ptr: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: v_mov_b32_e32 v2, 0 +; GFX9V5-NEXT: global_load_ubyte v0, v2, s[0:1] glc +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] offset:8 glc +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc +; GFX9V5-NEXT: ; kill: killed $sgpr0_sgpr1 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: v_mov_b32_e32 v0, s8 +; GFX9V5-NEXT: v_mov_b32_e32 v1, s9 +; GFX9V5-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V5-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: s_endpgm + %queue.ptr = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr() + %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %dispatch.id = call i64 @llvm.amdgcn.dispatch.id() + %queue.load = load volatile i8, i8 addrspace(4)* %queue.ptr + %implicitarg.load = load volatile i8, i8 addrspace(4)* %implicitarg.ptr + %dispatch.load = load volatile i8, i8 addrspace(4)* %dispatch.ptr + store volatile i64 %dispatch.id, i64 addrspace(1)* %ptr + ret void +} + +declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() +declare noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() +declare i64 @llvm.amdgcn.dispatch.id() +declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() +declare i1 @llvm.amdgcn.is.shared(i8*) +declare i1 @llvm.amdgcn.is.private(i8*) +declare void @llvm.trap() +declare void @llvm.debugtrap()
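For reference when reading the V5 check lines above, here is the code-object-v5 hidden-argument layout this patch encodes in SIDefines.h, written out as a self-contained summary (field meanings paraphrased from the AMDGPUUsage hidden-argument table; treat this as a reader's note, not normative). The scalar-load offsets in the V5 assembly are these constants plus the kernel's aligned explicit kernarg size: 0xc8 in @addrspacecast is 192 + 8 bytes of explicit arguments, while 0xc8 in @llvm_trap is 200 + 0.

```cpp
#include <cstdint>

namespace ImplicitArg {
enum Offset_COV5 : unsigned {
  HOSTCALL_PTR_OFFSET = 80,  // u64: hostcall buffer pointer
  HEAP_PTR_OFFSET = 96,      // u64: device-side heap pointer
  PRIVATE_BASE_OFFSET = 192, // u32: private (scratch) aperture base
  SHARED_BASE_OFFSET = 196,  // u32: shared (LDS) aperture base
  QUEUE_PTR_OFFSET = 200,    // u64: AQL queue pointer
};
} // namespace ImplicitArg

// The two 32-bit aperture bases are adjacent, and the 64-bit queue pointer
// that follows them stays naturally aligned.
static_assert(ImplicitArg::SHARED_BASE_OFFSET ==
                  ImplicitArg::PRIVATE_BASE_OFFSET + 4,
              "aperture bases are back to back");
static_assert(ImplicitArg::QUEUE_PTR_OFFSET % 8 == 0,
              "queue_ptr is 8-byte aligned");

int main() {}
```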