diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index f010b4a5b52c5..c1b515dae3ebe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -492,24 +492,19 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, return true; } -Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B, - Type *ParamTy, - uint64_t Offset) const { - +void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B, + Type *ParamTy, + uint64_t Offset) const { MachineFunction &MF = B.getMF(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); MachineRegisterInfo &MRI = MF.getRegInfo(); - const Function &F = MF.getFunction(); - const DataLayout &DL = F.getParent()->getDataLayout(); - PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS); - LLT PtrType = getLLTForType(*PtrTy, DL); Register KernArgSegmentPtr = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr); auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset); - return B.buildPtrAdd(PtrType, KernArgSegmentVReg, OffsetReg).getReg(0); + B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg); } void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy, @@ -520,7 +515,10 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy, const DataLayout &DL = F.getParent()->getDataLayout(); MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); unsigned TypeSize = DL.getTypeStoreSize(ParamTy); - Register PtrReg = lowerParameterPtr(B, ParamTy, Offset); + + LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); + Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy); + lowerParameterPtr(PtrReg, B, ParamTy, Offset); MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, @@ -607,12 +605,15 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( // TODO: Align down to dword alignment and extract bits for extending loads. for (auto &Arg : F.args()) { - Type *ArgTy = Arg.getType(); + const bool IsByRef = Arg.hasByRefAttr(); + Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); unsigned AllocSize = DL.getTypeAllocSize(ArgTy); if (AllocSize == 0) continue; - Align ABIAlign = DL.getABITypeAlign(ArgTy); + MaybeAlign ABIAlign = IsByRef ? Arg.getParamAlign() : None; + if (!ABIAlign) + ABIAlign = DL.getABITypeAlign(ArgTy); uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset; ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize; @@ -622,16 +623,34 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( continue; } - ArrayRef<Register> OrigArgRegs = VRegs[i]; - Register ArgReg = - OrigArgRegs.size() == 1 - ?
OrigArgRegs[0] - : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL)); - Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset); - lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg); - if (OrigArgRegs.size() > 1) - unpackRegs(OrigArgRegs, ArgReg, ArgTy, B); + + if (IsByRef) { + unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace(); + + assert(VRegs[i].size() == 1 && + "expected only one register for byval pointers"); + if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) { + lowerParameterPtr(VRegs[i][0], B, ArgTy, ArgOffset); + } else { + const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); + Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy); + lowerParameterPtr(PtrReg, B, ArgTy, ArgOffset); + + B.buildAddrSpaceCast(VRegs[i][0], PtrReg); + } + } else { + ArrayRef<Register> OrigArgRegs = VRegs[i]; + Register ArgReg = + OrigArgRegs.size() == 1 + ? OrigArgRegs[0] + : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL)); + + Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset); + lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg); + if (OrigArgRegs.size() > 1) + unpackRegs(OrigArgRegs, ArgReg, ArgTy, B); + } + ++i; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h index 4aaf0dc55f41c..d11b9801b9716 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -23,8 +23,8 @@ class AMDGPUTargetLowering; class MachineInstrBuilder; class AMDGPUCallLowering: public CallLowering { - Register lowerParameterPtr(MachineIRBuilder &B, Type *ParamTy, - uint64_t Offset) const; + void lowerParameterPtr(Register DstReg, MachineIRBuilder &B, Type *ParamTy, + uint64_t Offset) const; void lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset, Align Alignment, Register DstReg) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index c6f6a3b84e367..5ecb048797963 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -24,6 +24,23 @@ #include "llvm/IR/Module.h" #include "llvm/Support/raw_ostream.h" +using namespace llvm; + +static std::pair<Type *, Align> getArgumentTypeAlign(const Argument &Arg, + const DataLayout &DL) { + Type *Ty = Arg.getType(); + MaybeAlign ArgAlign; + if (Arg.hasByRefAttr()) { + Ty = Arg.getParamByRefType(); + ArgAlign = Arg.getParamAlign(); + } + + if (!ArgAlign) + ArgAlign = DL.getABITypeAlign(Ty); + + return std::make_pair(Ty, *ArgAlign); +} + namespace llvm { static cl::opt<bool> DumpHSAMetadata( @@ -311,23 +328,28 @@ void MetadataStreamerV2::emitKernelArg(const Argument &Arg) { if (Node && ArgNo < Node->getNumOperands()) TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString(); - Type *Ty = Arg.getType(); const DataLayout &DL = Func->getParent()->getDataLayout(); MaybeAlign PointeeAlign; - if (auto PtrTy = dyn_cast<PointerType>(Ty)) { + if (auto PtrTy = dyn_cast<PointerType>(Arg.getType())) { if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + // FIXME: Should report this for all address spaces PointeeAlign = DL.getValueOrABITypeAlignment(Arg.getParamAlign(), PtrTy->getElementType()); } } - emitKernelArg(DL, Ty, getValueKind(Arg.getType(), TypeQual, BaseTypeName), - PointeeAlign, Name, TypeName, BaseTypeName, AccQual, TypeQual); + Type *ArgTy; + Align ArgAlign; + std::tie(ArgTy, ArgAlign) = getArgumentTypeAlign(Arg, DL); + + emitKernelArg(DL, ArgTy, ArgAlign, + getValueKind(ArgTy, TypeQual, BaseTypeName), PointeeAlign, Name, + TypeName, BaseTypeName, AccQual, TypeQual); } void
MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty, - ValueKind ValueKind, + Align Alignment, ValueKind ValueKind, MaybeAlign PointeeAlign, StringRef Name, StringRef TypeName, StringRef BaseTypeName, @@ -338,7 +360,7 @@ void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty, Arg.mName = std::string(Name); Arg.mTypeName = std::string(TypeName); Arg.mSize = DL.getTypeAllocSize(Ty); - Arg.mAlign = DL.getABITypeAlign(Ty).value(); + Arg.mAlign = Alignment.value(); Arg.mValueKind = ValueKind; Arg.mPointeeAlign = PointeeAlign ? PointeeAlign->value() : 0; @@ -374,11 +396,11 @@ void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) { auto Int64Ty = Type::getInt64Ty(Func.getContext()); if (HiddenArgNumBytes >= 8) - emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetX); + emitKernelArg(DL, Int64Ty, Align(8), ValueKind::HiddenGlobalOffsetX); if (HiddenArgNumBytes >= 16) - emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY); + emitKernelArg(DL, Int64Ty, Align(8), ValueKind::HiddenGlobalOffsetY); if (HiddenArgNumBytes >= 24) - emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ); + emitKernelArg(DL, Int64Ty, Align(8), ValueKind::HiddenGlobalOffsetZ); auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS); @@ -387,31 +409,31 @@ void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) { // "none" argument. if (HiddenArgNumBytes >= 32) { if (Func.getParent()->getNamedMetadata("llvm.printf.fmts")) - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer); + emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenPrintfBuffer); else if (Func.getParent()->getFunction("__ockl_hostcall_internal")) { // The printf runtime binding pass should have ensured that hostcall and // printf are not used in the same module. assert(!Func.getParent()->getNamedMetadata("llvm.printf.fmts")); - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenHostcallBuffer); + emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenHostcallBuffer); } else - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone); + emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone); } // Emit "default queue" and "completion action" arguments if enqueue kernel is // used, otherwise emit dummy "none" arguments. if (HiddenArgNumBytes >= 48) { if (Func.hasFnAttribute("calls-enqueue-kernel")) { - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue); - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction); + emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenDefaultQueue); + emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenCompletionAction); } else { - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone); - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone); + emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone); + emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone); } } // Emit the pointer argument for multi-grid object. if (HiddenArgNumBytes >= 56) - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenMultiGridSyncArg); + emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenMultiGridSyncArg); } bool MetadataStreamerV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) { @@ -699,10 +721,12 @@ void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset, if (Node && ArgNo < Node->getNumOperands()) TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString(); - Type *Ty = Arg.getType(); const DataLayout &DL = Func->getParent()->getDataLayout(); MaybeAlign PointeeAlign; + Type *Ty = Arg.hasByRefAttr() ?
Arg.getParamByRefType() : Arg.getType(); + + // FIXME: Need to distinguish in memory alignment from pointer alignment. if (auto PtrTy = dyn_cast<PointerType>(Ty)) { if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { PointeeAlign = DL.getValueOrABITypeAlignment(Arg.getParamAlign(), @@ -710,19 +734,21 @@ void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset, } } - emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(), - getValueKind(Arg.getType(), TypeQual, BaseTypeName), Offset, - Args, PointeeAlign, Name, TypeName, BaseTypeName, AccQual, - TypeQual); + // There's no distinction between byval aggregates and raw aggregates. + Type *ArgTy; + Align ArgAlign; + std::tie(ArgTy, ArgAlign) = getArgumentTypeAlign(Arg, DL); + + emitKernelArg(DL, ArgTy, ArgAlign, + getValueKind(ArgTy, TypeQual, BaseTypeName), Offset, Args, + PointeeAlign, Name, TypeName, BaseTypeName, AccQual, TypeQual); } -void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty, - StringRef ValueKind, unsigned &Offset, - msgpack::ArrayDocNode Args, - MaybeAlign PointeeAlign, StringRef Name, - StringRef TypeName, - StringRef BaseTypeName, - StringRef AccQual, StringRef TypeQual) { +void MetadataStreamerV3::emitKernelArg( + const DataLayout &DL, Type *Ty, Align Alignment, StringRef ValueKind, + unsigned &Offset, msgpack::ArrayDocNode Args, MaybeAlign PointeeAlign, + StringRef Name, StringRef TypeName, StringRef BaseTypeName, + StringRef AccQual, StringRef TypeQual) { auto Arg = Args.getDocument()->getMapNode(); if (!Name.empty()) @@ -730,7 +756,6 @@ void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty, if (!TypeName.empty()) Arg[".type_name"] = Arg.getDocument()->getNode(TypeName, /*Copy=*/true); auto Size = DL.getTypeAllocSize(Ty); - Align Alignment = DL.getABITypeAlign(Ty); Arg[".size"] = Arg.getDocument()->getNode(Size); Offset = alignTo(Offset, Alignment); Arg[".offset"] = Arg.getDocument()->getNode(Offset); @@ -777,11 +802,14 @@ void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func, auto Int64Ty = Type::getInt64Ty(Func.getContext()); if (HiddenArgNumBytes >= 8) - emitKernelArg(DL, Int64Ty, "hidden_global_offset_x", Offset, Args); + emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset, + Args); if (HiddenArgNumBytes >= 16) - emitKernelArg(DL, Int64Ty, "hidden_global_offset_y", Offset, Args); + emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset, + Args); if (HiddenArgNumBytes >= 24) - emitKernelArg(DL, Int64Ty, "hidden_global_offset_z", Offset, Args); + emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset, + Args); auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS); @@ -790,31 +818,36 @@ void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func, // "none" argument. if (HiddenArgNumBytes >= 32) { if (Func.getParent()->getNamedMetadata("llvm.printf.fmts")) - emitKernelArg(DL, Int8PtrTy, "hidden_printf_buffer", Offset, Args); + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset, + Args); else if (Func.getParent()->getFunction("__ockl_hostcall_internal")) { // The printf runtime binding pass should have ensured that hostcall and // printf are not used in the same module.
assert(!Func.getParent()->getNamedMetadata("llvm.printf.fmts")); - emitKernelArg(DL, Int8PtrTy, "hidden_hostcall_buffer", Offset, Args); + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset, + Args); } else - emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args); + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args); } // Emit "default queue" and "completion action" arguments if enqueue kernel is // used, otherwise emit dummy "none" arguments. if (HiddenArgNumBytes >= 48) { if (Func.hasFnAttribute("calls-enqueue-kernel")) { - emitKernelArg(DL, Int8PtrTy, "hidden_default_queue", Offset, Args); - emitKernelArg(DL, Int8PtrTy, "hidden_completion_action", Offset, Args); + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset, + Args); + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset, + Args); } else { - emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args); - emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args); + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args); + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args); } } // Emit the pointer argument for multi-grid object. if (HiddenArgNumBytes >= 56) - emitKernelArg(DL, Int8PtrTy, "hidden_multigrid_sync_arg", Offset, Args); + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset, + Args); } msgpack::MapDocNode diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index 9534fffd228d3..8390ea90e2719 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -87,11 +87,12 @@ class MetadataStreamerV3 final : public MetadataStreamer { void emitKernelArg(const Argument &Arg, unsigned &Offset, msgpack::ArrayDocNode Args); - void emitKernelArg(const DataLayout &DL, Type *Ty, StringRef ValueKind, - unsigned &Offset, msgpack::ArrayDocNode Args, - MaybeAlign PointeeAlign = None, StringRef Name = "", - StringRef TypeName = "", StringRef BaseTypeName = "", - StringRef AccQual = "", StringRef TypeQual = ""); + void emitKernelArg(const DataLayout &DL, Type *Ty, Align Alignment, + StringRef ValueKind, unsigned &Offset, + msgpack::ArrayDocNode Args, MaybeAlign PointeeAlign = None, + StringRef Name = "", StringRef TypeName = "", + StringRef BaseTypeName = "", StringRef AccQual = "", + StringRef TypeQual = ""); void emitHiddenKernelArgs(const Function &Func, unsigned &Offset, msgpack::ArrayDocNode Args); @@ -156,10 +157,11 @@ class MetadataStreamerV2 final : public MetadataStreamer { void emitKernelArg(const Argument &Arg); - void emitKernelArg(const DataLayout &DL, Type *Ty, ValueKind ValueKind, - MaybeAlign PointeeAlign = None, StringRef Name = "", - StringRef TypeName = "", StringRef BaseTypeName = "", - StringRef AccQual = "", StringRef TypeQual = ""); + void emitKernelArg(const DataLayout &DL, Type *Ty, Align Alignment, + ValueKind ValueKind, MaybeAlign PointeeAlign = None, + StringRef Name = "", StringRef TypeName = "", + StringRef BaseTypeName = "", StringRef AccQual = "", + StringRef TypeQual = ""); void emitHiddenKernelArgs(const Function &Func); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 940ec6f31c698..aef51180988d2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1017,10 +1017,14 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( 
unsigned InIndex = 0; for (const Argument &Arg : Fn.args()) { + const bool IsByRef = Arg.hasByRefAttr(); Type *BaseArgTy = Arg.getType(); - Align Alignment = DL.getABITypeAlign(BaseArgTy); - MaxAlign = std::max(Alignment, MaxAlign); - unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy); + Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy; + MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None; + if (!Alignment) + Alignment = DL.getABITypeAlign(MemArgTy); + MaxAlign = max(Alignment, MaxAlign); + uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy); uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset; ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index 62ab5bb55a16a..650c872182eca 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -108,10 +108,14 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { uint64_t ExplicitArgOffset = 0; for (Argument &Arg : F.args()) { - Type *ArgTy = Arg.getType(); - Align ABITypeAlign = DL.getABITypeAlign(ArgTy); - unsigned Size = DL.getTypeSizeInBits(ArgTy); - unsigned AllocSize = DL.getTypeAllocSize(ArgTy); + const bool IsByRef = Arg.hasByRefAttr(); + Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); + MaybeAlign ABITypeAlign = IsByRef ? Arg.getParamAlign() : None; + if (!ABITypeAlign) + ABITypeAlign = DL.getABITypeAlign(ArgTy); + + uint64_t Size = DL.getTypeSizeInBits(ArgTy); + uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset; ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize; @@ -119,6 +123,19 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { if (Arg.use_empty()) continue; + // If this is byval, the loads are already explicit in the function. We just + // need to rewrite the pointer values. + if (IsByRef) { + Value *ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64( + Builder.getInt8Ty(), KernArgSegment, EltOffset, + Arg.getName() + ".byval.kernarg.offset"); + + Value *CastOffsetPtr = Builder.CreatePointerBitCastOrAddrSpaceCast( + ArgOffsetPtr, Arg.getType()); + Arg.replaceAllUsesWith(CastOffsetPtr); + continue; + } + if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) { // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing // modes on SI to know the high bits are 0 so pointer adds don't wrap. We diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 213788ae0f67b..ac30137943bc2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -508,12 +508,15 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, MaxAlign = Align(1); for (const Argument &Arg : F.args()) { - Type *ArgTy = Arg.getType(); + const bool IsByRef = Arg.hasByRefAttr(); + Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); + MaybeAlign Alignment = IsByRef ?
Arg.getParamAlign() : None; + if (!Alignment) + Alignment = DL.getABITypeAlign(ArgTy); - const Align Alignment = DL.getABITypeAlign(ArgTy); uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize; - MaxAlign = std::max(MaxAlign, Alignment); + MaxAlign = max(MaxAlign, Alignment); } return ExplicitArgBytes; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index c2666f3302151..4ad4865a0dcc9 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2254,9 +2254,21 @@ SDValue SITargetLowering::LowerFormalArguments( const uint64_t Offset = VA.getLocMemOffset(); Align Alignment = commonAlignment(KernelArgBaseAlign, Offset); - SDValue Arg = - lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset, Alignment, - Ins[i].Flags.isSExt(), &Ins[i]); + if (Arg.Flags.isByRef()) { + SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset); + + if (!isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS, + Arg.Flags.getPointerAddrSpace())) { + Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS, + Arg.Flags.getPointerAddrSpace()); + } + + InVals.push_back(Ptr); + continue; + } + + SDValue Arg = lowerKernargMemParameter( + DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]); Chains.push_back(Arg.getValue(1)); auto *ParamTy = diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll index 76a6f1732102b..eda521cce827e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll @@ -1265,3 +1265,467 @@ define amdgpu_kernel void @unused_i32_arg(i32 addrspace(1)* nocapture %out, i32 entry: ret void } + +; Byref pointers should only be treated as offsets from kernarg +define amdgpu_kernel void @byref_constant_i8_arg(i32 addrspace(1)* nocapture %out, i8 addrspace(4)* byref(i8) %in.byref) { + ; HSA-VI-LABEL: name: byref_constant_i8_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (load 1 from %ir.in.byref, addrspace 4) + ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8) + ; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_constant_i8_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD 
[[PTR_ADD1]](p4) :: (load 1 from %ir.in.byref, addrspace 4) + ; LEGACY-MESA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8) + ; LEGACY-MESA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i8, i8 addrspace(4)* %in.byref + %ext = zext i8 %in to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_constant_i16_arg(i32 addrspace(1)* nocapture %out, i16 addrspace(4)* byref(i16) align 2 %in.byref) { + ; HSA-VI-LABEL: name: byref_constant_i16_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p4) :: (load 2 from %ir.in.byref, addrspace 4) + ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16) + ; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_constant_i16_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p4) :: (load 2 from %ir.in.byref, addrspace 4) + ; LEGACY-MESA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16) + ; LEGACY-MESA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i16, i16 addrspace(4)* %in.byref + %ext = zext i16 %in to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align 4 %in.byref, i32 %after.offset) { + ; HSA-VI-LABEL: name: byref_constant_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; HSA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load 4, addrspace 4) + ; HSA-VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 4 from %ir.in.byref, addrspace 4) + ; HSA-VI: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: 
G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_constant_i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 + ; LEGACY-MESA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load 4, align 16, addrspace 4) + ; LEGACY-MESA-VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 4 from %ir.in.byref, addrspace 4) + ; LEGACY-MESA-VI: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(4)* %in.byref + store volatile i32 %in, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_constant_v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> addrspace(4)* byref(<4 x i32>) align(16) %in.byref, i32 %after.offset) { + ; HSA-VI-LABEL: name: byref_constant_v4i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; HSA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load 4, align 16, addrspace 4) + ; HSA-VI: [[LOAD2:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (load 16 from %ir.in.byref, addrspace 4) + ; HSA-VI: G_STORE [[LOAD2]](<4 x s32>), [[LOAD]](p1) :: (volatile store 16 into %ir.out, align 4, addrspace 1) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out.cast, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_constant_v4i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 52 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 68 + ; LEGACY-MESA-VI: 
[[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load 4, addrspace 4) + ; LEGACY-MESA-VI: [[LOAD2:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (load 16 from %ir.in.byref, addrspace 4) + ; LEGACY-MESA-VI: G_STORE [[LOAD2]](<4 x s32>), [[LOAD]](p1) :: (volatile store 16 into %ir.out, align 4, addrspace 1) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out.cast, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load <4 x i32>, <4 x i32> addrspace(4)* %in.byref + store volatile <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 + %out.cast = bitcast <4 x i32> addrspace(1)* %out to i32 addrspace(1)* + store volatile i32 %after.offset, i32 addrspace(1)* %out.cast, align 4 + ret void +} + +define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) { + ; HSA-VI-LABEL: name: byref_align_constant_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 256 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 260 + ; HSA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load 4, addrspace 4) + ; HSA-VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 4 from %ir.in.byref, addrspace 4) + ; HSA-VI: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_align_constant_i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 292 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 296 + ; LEGACY-MESA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load 4, align 8, addrspace 4) + ; LEGACY-MESA-VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 4 from %ir.in.byref, addrspace 4) + ; LEGACY-MESA-VI: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(4)* %in.byref + store volatile i32 %in, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret 
void +} + +define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) { + ; HSA-VI-LABEL: name: byref_natural_align_constant_v16i32_arg + ; HSA-VI: bb.1 (%ir-block.1): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 128 + ; HSA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load 4, align 16, addrspace 4) + ; HSA-VI: [[LOAD2:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (load 64 from %ir.in.byref, addrspace 4) + ; HSA-VI: G_STORE [[LOAD2]](<16 x s32>), [[LOAD]](p1) :: (volatile store 64 into %ir.cast.out, align 4, addrspace 1) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_natural_align_constant_v16i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.1): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 100 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 164 + ; LEGACY-MESA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load 4, addrspace 4) + ; LEGACY-MESA-VI: [[LOAD2:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (load 64 from %ir.in.byref, addrspace 4) + ; LEGACY-MESA-VI: G_STORE [[LOAD2]](<16 x s32>), [[LOAD]](p1) :: (volatile store 64 into %ir.cast.out, align 4, addrspace 1) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref + %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)* + store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +; Also accept byref kernel arguments with other global address spaces. 
+define amdgpu_kernel void @byref_global_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* byref(i32) align(4) %in.byref) { + ; HSA-VI-LABEL: name: byref_global_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p1) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; HSA-VI: [[COPY1:%[0-9]+]]:_(p1) = COPY [[ADDRSPACE_CAST]](p1) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[COPY1]](p1) :: (load 4 from %ir.1, addrspace 1) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_global_i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p1) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; LEGACY-MESA-VI: [[COPY1:%[0-9]+]]:_(p1) = COPY [[ADDRSPACE_CAST]](p1) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[COPY1]](p1) :: (load 4 from %ir.1, addrspace 1) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(1)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_flat_i32_arg(i32 addrspace(1)* nocapture %out, i32* byref(i32) align(4) %in.byref) { + ; HSA-VI-LABEL: name: byref_flat_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p0) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p0) :: (load 4 from %ir.in.byref) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_flat_i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; 
LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p0) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p0) :: (load 4 from %ir.in.byref) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i32, i32* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_constant_32bit_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(6)* byref(i32) align(4) %in.byref) { + ; HSA-VI-LABEL: name: byref_constant_32bit_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p6) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p6) :: (load 4 from %ir.in.byref, addrspace 6) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_constant_32bit_i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p6) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p6) :: (load 4 from %ir.in.byref, addrspace 6) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(6)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_unknown_as_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(999)* byref(i32) align(4) %in.byref) { + ; HSA-VI-LABEL: name: byref_unknown_as_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p999) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p999) :: (load 4 from %ir.in.byref, addrspace 999) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 
into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_unknown_as_i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p999) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p999) :: (load 4 from %ir.in.byref, addrspace 999) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(999)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; Invalid, but should not crash. +define amdgpu_kernel void @byref_local_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(3)* byref(i32) align(4) %in.byref) { + ; HSA-VI-LABEL: name: byref_local_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p3) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p3) :: (load 4 from %ir.in.byref, addrspace 3) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_local_i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p3) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p3) :: (load 4 from %ir.in.byref, addrspace 3) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(3)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @multi_byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(4) %in0.byref, i32 addrspace(4)* byref(i32) align(4) %in1.byref, i32 %after.offset) { + ; HSA-VI-LABEL: name: multi_byref_constant_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY 
$sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; HSA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; HSA-VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; HSA-VI: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load 4, align 16, addrspace 4) + ; HSA-VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 4 from %ir.in0.byref, addrspace 4) + ; HSA-VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (load 4 from %ir.in1.byref, addrspace 4) + ; HSA-VI: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: G_STORE [[LOAD3]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: multi_byref_constant_i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 + ; LEGACY-MESA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; LEGACY-MESA-VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 52 + ; LEGACY-MESA-VI: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load 4, addrspace 4) + ; LEGACY-MESA-VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 4 from %ir.in0.byref, addrspace 4) + ; LEGACY-MESA-VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (load 4 from %ir.in1.byref, addrspace 4) + ; LEGACY-MESA-VI: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: G_STORE [[LOAD3]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in0 = load i32, i32 addrspace(4)* %in0.byref + %in1 = load i32, i32 addrspace(4)* %in1.byref + store volatile i32 %in0, i32 addrspace(1)* %out, align 4 + store volatile i32 %in1, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_constant_i32_arg_offset0(i32 addrspace(4)* byref(i32) align(4) %in.byref) { + ; HSA-VI-LABEL: name: byref_constant_i32_arg_offset0 + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: 
[[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; HSA-VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (load 4 from %ir.in.byref, addrspace 4) + ; HSA-VI: G_STORE [[LOAD]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_constant_i32_arg_offset0 + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (load 4 from %ir.in.byref, addrspace 4) + ; LEGACY-MESA-VI: G_STORE [[LOAD]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(4)* %in.byref + store i32 %in, i32 addrspace(1)* undef, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll index 1eb4eb717d94d..2c392c7c31b6c 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll @@ -53,6 +53,80 @@ define amdgpu_kernel void @test_char(i8 %a) #0 ret void } +; CHECK: - .args: +; CHECK-NEXT: - .name: a +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 1 +; CHECK-NEXT: .type_name: char +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: - .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: - .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: - .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NOT: .value_kind: hidden_default_queue +; CHECK-NOT: .value_kind: hidden_completion_action +; CHECK-NOT: .value_kind: hidden_hostcall_buffer +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK: .value_kind: hidden_multigrid_sync_arg +; CHECK: .language: OpenCL C +; CHECK-NEXT: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .name: test_char_byref_constant +; CHECK: .symbol: test_char_byref_constant.kd +define amdgpu_kernel void @test_char_byref_constant(i8 addrspace(4)* byref(i8) %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9 + !kernel_arg_base_type !9 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - .args: +; CHECK-NEXT: - .offset: 0 +; CHECK-NEXT: .size: 1 +; CHECK-NEXT: .type_name: char +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: - .name: a +; CHECK-NEXT: .offset: 512 +; CHECK-NEXT: .size: 1 +; CHECK-NEXT: .type_name: char +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: - .offset: 520 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: - .offset: 528 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: - .offset: 536 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 544 +; CHECK-NEXT: .size: 8 +; CHECK-NOT: .value_kind: hidden_default_queue +; CHECK-NOT: .value_kind: hidden_completion_action +; CHECK-NOT: 
.value_kind: hidden_hostcall_buffer +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK: .value_kind: hidden_multigrid_sync_arg +; CHECK: .language: OpenCL C +; CHECK-NEXT: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .name: test_char_byref_constant_align512 +; CHECK: .symbol: test_char_byref_constant_align512.kd +define amdgpu_kernel void @test_char_byref_constant_align512(i8, i8 addrspace(4)* byref(i8) align(512) %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !111 + !kernel_arg_base_type !9 !kernel_arg_type_qual !4 { + ret void +} + ; CHECK: - .args: ; CHECK-NEXT: - .name: a ; CHECK-NEXT: .offset: 0 @@ -529,6 +603,49 @@ define amdgpu_kernel void @test_struct(%struct.A %a) #0 ret void } +; CHECK: - .args: +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .type_name: struct A +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: - .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: - .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: - .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 40 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_none +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 48 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_none +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 56 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg +; CHECK: .language: OpenCL C +; CHECK-NEXT: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .name: test_struct_byref_constant +; CHECK: .symbol: test_struct_byref_constant.kd +define amdgpu_kernel void @test_struct_byref_constant(%struct.A addrspace(4)* byref(%struct.A) %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20 + !kernel_arg_base_type !20 !kernel_arg_type_qual !4 { + ret void +} + ; CHECK: - .args: ; CHECK-NEXT: .name: a ; CHECK-NEXT: .offset: 0 @@ -572,6 +689,49 @@ define amdgpu_kernel void @test_array([32 x i8] %a) #0 ret void } +; CHECK: - .args: +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 32 +; CHECK-NEXT: .type_name: struct A +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: - .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: - .offset: 40 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: - .offset: 48 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 56 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 64 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_none +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 72 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_none +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 80 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg +; CHECK: .language: OpenCL C +; CHECK-NEXT: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .name: test_array_byref_constant +; 
CHECK: .symbol: test_array_byref_constant.kd +define amdgpu_kernel void @test_array_byref_constant([32 x i8] addrspace(4)* byref([32 x i8]) %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20 + !kernel_arg_base_type !20 !kernel_arg_type_qual !4 { + ret void +} + ; CHECK: - .args: ; CHECK-NEXT: - .name: a ; CHECK-NEXT: .offset: 0 @@ -1792,5 +1952,6 @@ attributes #2 = { "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" } !100 = !{!"1:1:4:%d\5Cn"} !101 = !{!"2:1:8:%g\5Cn"} !110 = !{!"__block_literal"} +!111 = !{!"char", !"char"} ; PARSER: AMDGPU HSA Metadata Parser Test: PASS diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll index 85915cc5140c3..5c5960f8b1fe7 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll @@ -58,6 +58,80 @@ define amdgpu_kernel void @test_char(i8 %a) #0 ret void } +; CHECK: - Name: test_char_byref_constant +; CHECK-NEXT: SymbolName: 'test_char_byref_constant@kd' +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Name: a +; CHECK-NEXT: TypeName: char +; CHECK-NEXT: Size: 1 +; CHECK-NEXT: Align: 1 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NOT: ValueKind: HiddenHostcallBuffer +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NOT: ValueKind: HiddenDefaultQueue +; CHECK-NOT: ValueKind: HiddenCompletionAction +; CHECK: ValueKind: HiddenMultiGridSyncArg +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_char_byref_constant(i8 addrspace(4)* byref(i8) %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9 + !kernel_arg_base_type !9 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_char_byref_constant_align512 +; CHECK-NEXT: SymbolName: 'test_char_byref_constant_align512@kd' +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: TypeName: char +; CHECK-NEXT: Size: 1 +; CHECK-NEXT: Align: 1 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: - Name: a +; CHECK-NEXT: TypeName: char +; CHECK-NEXT: Size: 1 +; CHECK-NEXT: Align: 512 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NOT: ValueKind: HiddenHostcallBuffer +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NOT: ValueKind: HiddenDefaultQueue +; CHECK-NOT: ValueKind: HiddenCompletionAction +; CHECK: ValueKind: HiddenMultiGridSyncArg +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_char_byref_constant_align512(i8, i8 addrspace(4)* byref(i8) align 512 %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !111 + 
!kernel_arg_base_type !9 !kernel_arg_type_qual !4 { + ret void +} + ; CHECK: - Name: test_ushort2 ; CHECK-NEXT: SymbolName: 'test_ushort2@kd' ; CHECK-NEXT: Language: OpenCL C @@ -523,6 +597,48 @@ define amdgpu_kernel void @test_struct(%struct.A %a) #0 ret void } +; CHECK: - Name: test_struct_byref_constant +; CHECK-NEXT: SymbolName: 'test_struct_byref_constant@kd' +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Name: a +; CHECK-NEXT: TypeName: struct A +; CHECK-NEXT: Size: 8 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenNone +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenNone +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenMultiGridSyncArg +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_struct_byref_constant(%struct.A addrspace(4)* byref(%struct.A) %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20 + !kernel_arg_base_type !20 !kernel_arg_type_qual !4 { + ret void +} + ; CHECK: - Name: test_array ; CHECK-NEXT: SymbolName: 'test_array@kd' ; CHECK-NEXT: Language: OpenCL C @@ -565,6 +681,48 @@ define amdgpu_kernel void @test_array([8 x i8] %a) #0 ret void } +; CHECK: - Name: test_array_byref_constant +; CHECK-NEXT: SymbolName: 'test_array_byref_constant@kd' +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Name: a +; CHECK-NEXT: TypeName: struct A +; CHECK-NEXT: Size: 8 +; CHECK-NEXT: Align: 1 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenNone +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenNone +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenMultiGridSyncArg +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_array_byref_constant([8 x i8] addrspace(4)* byref([8 x i8]) %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20 + !kernel_arg_base_type !20 !kernel_arg_type_qual !4 { + ret void +} + ; CHECK: - Name: test_i128 ; CHECK-NEXT: SymbolName: 'test_i128@kd' ; CHECK-NEXT: Language: OpenCL C @@ -1766,5 +1924,5 @@ attributes #2 = { "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" } !100 = !{!"1:1:4:%d\5Cn"} !101 = !{!"2:1:8:%g\5Cn"} !110 = 
!{!"__block_literal"} - +!111 = !{!"char", !"char"} ; PARSER: AMDGPU HSA Metadata Parser Test: PASS diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index 49c2bf08ba3e1..72f8c9cfc55ff 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -920,3 +920,25 @@ define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { store volatile i8 %val, i8 addrspace(1)* undef ret void } + +; GCN-LABEL: {{^}}byref_align_constant_i32_arg: +; HSA-GFX9: kernarg_segment_byte_size = 264 +; HSA-GFX9-DAG: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, s[4:5], 0x100{{$}} +define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) { + %in = load i32, i32 addrspace(4)* %in.byref + store volatile i32 %in, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byref_natural_align_constant_v16i32_arg: +; HSA-GFX9: kernarg_segment_byte_size = 132 +; HSA-GFX9-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x80 +; HSA-GFX9-DAG: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x40{{$}} +define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) %in.byref, i32 %after.offset) { + %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref + %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)* + store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll index 8a33e92557cb1..7250ef262bf16 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll @@ -152,3 +152,137 @@ entry: store <3 x i15> %in, <3 x i15> addrspace(1)* %out, align 4 ret void } + +; Byref pointers should only be treated as offsets from kernarg +; GCN-LABEL: {{^}}byref_constant_i8_arg: +; GCN: kernarg_segment_byte_size = 12 +; GCN: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s4 +; GCN: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], s5 +; GCN: global_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]{{\]}}, off offset:8 +define amdgpu_kernel void @byref_constant_i8_arg(i32 addrspace(1)* nocapture %out, i8 addrspace(4)* byref(i8) %in.byref) { + %in = load i8, i8 addrspace(4)* %in.byref + %ext = zext i8 %in to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byref_constant_i16_arg: +; GCN: kernarg_segment_byte_size = 12 +; GCN: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s4 +; GCN: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], s5 +; GCN: global_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]{{\]}}, off offset:8 +define amdgpu_kernel void @byref_constant_i16_arg(i32 addrspace(1)* nocapture %out, i16 addrspace(4)* byref(i16) %in.byref) { + %in = load i16, i16 addrspace(4)* %in.byref + %ext = zext i16 %in to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byref_constant_i32_arg: +; GCN: kernarg_segment_byte_size = 16 +; GCN: s_load_dword [[IN:s[0-9]+]], s[4:5], 0x8{{$}} +; GCN: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0xc{{$}} +define amdgpu_kernel void @byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) 
%in.byref, i32 %after.offset) { + %in = load i32, i32 addrspace(4)* %in.byref + store volatile i32 %in, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byref_constant_v4i32_arg: +; GCN: kernarg_segment_byte_size = 36 +; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10{{$}} +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x20{{$}} +define amdgpu_kernel void @byref_constant_v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> addrspace(4)* byref(<4 x i32>) %in.byref, i32 %after.offset) { + %in = load <4 x i32>, <4 x i32> addrspace(4)* %in.byref + store volatile <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 + %out.cast = bitcast <4 x i32> addrspace(1)* %out to i32 addrspace(1)* + store volatile i32 %after.offset, i32 addrspace(1)* %out.cast, align 4 + ret void +} + +; GCN-LABEL: {{^}}byref_align_constant_i32_arg: +; GCN: kernarg_segment_byte_size = 264 +; GCN-DAG: s_load_dword [[IN:s[0-9]+]], s[4:5], 0x100{{$}} +; GCN-DAG: s_load_dword [[AFTER_OFFSET:s[0-9]+]], s[4:5], 0x104{{$}} +; GCN-DAG: v_mov_b32_e32 [[V_IN:v[0-9]+]], [[IN]] +; GCN-DAG: v_mov_b32_e32 [[V_AFTER_OFFSET:v[0-9]+]], [[AFTER_OFFSET]] +; GCN: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_IN]] +; GCN: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_AFTER_OFFSET]] +define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) { + %in = load i32, i32 addrspace(4)* %in.byref + store volatile i32 %in, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byref_natural_align_constant_v16i32_arg: +; GCN: kernarg_segment_byte_size = 132 +; GCN-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x80 +; GCN-DAG: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x40{{$}} +define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) { + %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref + %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)* + store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +; Also accept byref kernel arguments with other global address spaces. 
+; GCN-LABEL: {{^}}byref_global_i32_arg: +; GCN: kernarg_segment_byte_size = 12 +; GCN: s_load_dword [[IN:s[0-9]+]], s[4:5], 0x8{{$}} +define amdgpu_kernel void @byref_global_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* byref(i32) %in.byref) { + %in = load i32, i32 addrspace(1)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byref_flat_i32_arg: +; GCN: flat_load_dword [[IN:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}} offset:8{{$}} +define amdgpu_kernel void @byref_flat_i32_arg(i32 addrspace(1)* nocapture %out, i32* byref(i32) %in.byref) { + %in = load i32, i32* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byref_constant_32bit_i32_arg: +; GCN: s_add_i32 s[[PTR_LO:[0-9]+]], s4, 8 +; GCN: s_mov_b32 s[[PTR_HI:[0-9]+]], 0{{$}} +; GCN: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x0{{$}} +define amdgpu_kernel void @byref_constant_32bit_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(6)* byref(i32) %in.byref) { + %in = load i32, i32 addrspace(6)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; define amdgpu_kernel void @byref_unknown_as_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(999)* byref %in.byref) { +; %in = load i32, i32 addrspace(999)* %in.byref +; store i32 %in, i32 addrspace(1)* %out, align 4 +; ret void +; } + +; GCN-LABEL: {{^}}multi_byref_constant_i32_arg: +; GCN: kernarg_segment_byte_size = 20 +; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0x8 +; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0xc +; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0x10 +define amdgpu_kernel void @multi_byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in0.byref, i32 addrspace(4)* byref(i32) %in1.byref, i32 %after.offset) { + %in0 = load i32, i32 addrspace(4)* %in0.byref + %in1 = load i32, i32 addrspace(4)* %in1.byref + store volatile i32 %in0, i32 addrspace(1)* %out, align 4 + store volatile i32 %in1, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byref_constant_i32_arg_offset0: +; GCN: kernarg_segment_byte_size = 4 +; GCN-NOT: s4 +; GCN-NOT: s5 +; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0x0{{$}} +define amdgpu_kernel void @byref_constant_i32_arg_offset0(i32 addrspace(4)* byref(i32) %in.byref) { + %in = load i32, i32 addrspace(4)* %in.byref + store i32 %in, i32 addrspace(1)* undef, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll index 9daaeee89cea5..4d03f827ac2f1 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll @@ -1486,6 +1486,432 @@ define amdgpu_kernel void @dyn_alloca_kernarg_i32(i32 %n) { ret void } +; Byref pointers should only be treated as offsets from kernarg +define amdgpu_kernel void @byref_constant_i8_arg(i32 addrspace(1)* nocapture %out, i8 addrspace(4)* byref(i8) %in.byref) { +; HSA-LABEL: @byref_constant_i8_arg( +; HSA-NEXT: [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 
addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[IN:%.*]] = load i8, i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 1 +; HSA-NEXT: [[EXT:%.*]] = zext i8 [[IN]] to i32 +; HSA-NEXT: store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_constant_i8_arg( +; MESA-NEXT: [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[IN:%.*]] = load i8, i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 1 +; MESA-NEXT: [[EXT:%.*]] = zext i8 [[IN]] to i32 +; MESA-NEXT: store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i8, i8 addrspace(4)* %in.byref + %ext = zext i8 %in to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_constant_i16_arg(i32 addrspace(1)* nocapture %out, i16 addrspace(4)* byref(i16) %in.byref) { +; HSA-LABEL: @byref_constant_i16_arg( +; HSA-NEXT: [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i16 addrspace(4)* +; HSA-NEXT: [[IN:%.*]] = load i16, i16 addrspace(4)* [[TMP1]], align 2 +; HSA-NEXT: [[EXT:%.*]] = zext i16 [[IN]] to i32 +; HSA-NEXT: store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_constant_i16_arg( +; MESA-NEXT: [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* 
[[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i16 addrspace(4)* +; MESA-NEXT: [[IN:%.*]] = load i16, i16 addrspace(4)* [[TMP1]], align 2 +; MESA-NEXT: [[EXT:%.*]] = zext i16 [[IN]] to i32 +; MESA-NEXT: store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i16, i16 addrspace(4)* %in.byref + %ext = zext i16 %in to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in.byref, i32 %after.offset) { +; HSA-LABEL: @byref_constant_i32_arg( +; HSA-NEXT: [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 12 +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4 +; HSA-NEXT: store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_constant_i32_arg( +; MESA-NEXT: [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 48 +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; MESA-NEXT: [[IN:%.*]] = load i32, i32 
addrspace(4)* [[TMP1]], align 4 +; MESA-NEXT: store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(4)* %in.byref + store volatile i32 %in, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_constant_v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> addrspace(4)* byref(<4 x i32>) %in.byref, i32 %after.offset) { +; HSA-LABEL: @byref_constant_v4i32_arg( +; HSA-NEXT: [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(36) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to <4 x i32> addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load <4 x i32> addrspace(1)*, <4 x i32> addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 16 +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to <4 x i32> addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 32 +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP1]], align 16 +; HSA-NEXT: store volatile <4 x i32> [[IN]], <4 x i32> addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: [[OUT_CAST:%.*]] = bitcast <4 x i32> addrspace(1)* [[OUT_LOAD]] to i32 addrspace(1)* +; HSA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_CAST]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_constant_v4i32_arg( +; MESA-NEXT: [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to <4 x i32> addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load <4 x i32> addrspace(1)*, <4 x i32> addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 52 +; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to <4 x i32> addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 68 +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* 
[[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP1]], align 16 +; MESA-NEXT: store volatile <4 x i32> [[IN]], <4 x i32> addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: [[OUT_CAST:%.*]] = bitcast <4 x i32> addrspace(1)* [[OUT_LOAD]] to i32 addrspace(1)* +; MESA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_CAST]], align 4 +; MESA-NEXT: ret void +; + %in = load <4 x i32>, <4 x i32> addrspace(4)* %in.byref + store volatile <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 + %out.cast = bitcast <4 x i32> addrspace(1)* %out to i32 addrspace(1)* + store volatile i32 %after.offset, i32 addrspace(1)* %out.cast, align 4 + ret void +} + +define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) { +; HSA-LABEL: @byref_align_constant_i32_arg( +; HSA-NEXT: [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 256 +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 260 +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4 +; HSA-NEXT: store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_align_constant_i32_arg( +; MESA-NEXT: [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(300) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 292 +; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 296 
+; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 8, !invariant.load !0 +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4 +; MESA-NEXT: store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(4)* %in.byref + store volatile i32 %in, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) %in.byref, i32 %after.offset) { +; HSA-LABEL: @byref_natural_align_constant_v16i32_arg( +; HSA-NEXT: [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(132) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 64 +; HSA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to <16 x i32> addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 128 +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN:%.*]] = load <16 x i32>, <16 x i32> addrspace(4)* [[TMP2]], align 64 +; HSA-NEXT: [[CAST_OUT:%.*]] = bitcast i32 addrspace(1)* [[OUT_LOAD]] to <16 x i32> addrspace(1)* +; HSA-NEXT: store volatile <16 x i32> [[IN]], <16 x i32> addrspace(1)* [[CAST_OUT]], align 4 +; HSA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_natural_align_constant_v16i32_arg( +; MESA-NEXT: [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(168) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 100 +; MESA-NEXT: [[TMP2:%.*]] = 
bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to <16 x i32> addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 164 +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN:%.*]] = load <16 x i32>, <16 x i32> addrspace(4)* [[TMP2]], align 64 +; MESA-NEXT: [[CAST_OUT:%.*]] = bitcast i32 addrspace(1)* [[OUT_LOAD]] to <16 x i32> addrspace(1)* +; MESA-NEXT: store volatile <16 x i32> [[IN]], <16 x i32> addrspace(1)* [[CAST_OUT]], align 4 +; MESA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref + %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)* + store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +; Also accept byref kernel arguments with other global address spaces. +define amdgpu_kernel void @byref_global_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* byref(i32) %in.byref) { +; HSA-LABEL: @byref_global_i32_arg( +; HSA-NEXT: [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(1)* +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(1)* [[TMP1]], align 4 +; HSA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_global_i32_arg( +; MESA-NEXT: [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(1)* +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(1)* [[TMP1]], align 4 +; MESA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(1)* %in.byref + store i32 %in, i32 
addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_flat_i32_arg(i32 addrspace(1)* nocapture %out, i32* byref(i32) %in.byref) { +; HSA-LABEL: @byref_flat_i32_arg( +; HSA-NEXT: [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32* +; HSA-NEXT: [[IN:%.*]] = load i32, i32* [[TMP1]], align 4 +; HSA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_flat_i32_arg( +; MESA-NEXT: [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32* +; MESA-NEXT: [[IN:%.*]] = load i32, i32* [[TMP1]], align 4 +; MESA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_constant_32bit_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(6)* byref(i32) %in.byref) { +; HSA-LABEL: @byref_constant_32bit_i32_arg( +; HSA-NEXT: [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(6)* +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(6)* [[TMP1]], align 4 +; HSA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_constant_32bit_i32_arg( +; MESA-NEXT: 
[[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(6)* +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(6)* [[TMP1]], align 4 +; MESA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(6)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_unknown_as_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(999)* byref(i32) %in.byref) { +; HSA-LABEL: @byref_unknown_as_i32_arg( +; HSA-NEXT: [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(999)* +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(999)* [[TMP1]], align 4 +; HSA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_unknown_as_i32_arg( +; MESA-NEXT: [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(999)* +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(999)* [[TMP1]], align 4 +; MESA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(999)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; Invalid, but should not crash. 
+define amdgpu_kernel void @byref_local_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(3)* byref(i32) %in.byref) { +; HSA-LABEL: @byref_local_i32_arg( +; HSA-NEXT: [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(3)* +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(3)* [[TMP1]], align 4 +; HSA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_local_i32_arg( +; MESA-NEXT: [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(3)* +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(3)* [[TMP1]], align 4 +; MESA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(3)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @multi_byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in0.byref, i32 addrspace(4)* byref(i32) %in1.byref, i32 %after.offset) { +; HSA-LABEL: @multi_byref_constant_i32_arg( +; HSA-NEXT: [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN0_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN0_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[IN1_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 12 +; HSA-NEXT: 
[[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN1_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 16 +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN0:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4 +; HSA-NEXT: [[IN1:%.*]] = load i32, i32 addrspace(4)* [[TMP2]], align 4 +; HSA-NEXT: store volatile i32 [[IN0]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: store volatile i32 [[IN1]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @multi_byref_constant_i32_arg( +; MESA-NEXT: [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(56) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN0_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN0_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[IN1_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 48 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN1_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 52 +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN0:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4 +; MESA-NEXT: [[IN1:%.*]] = load i32, i32 addrspace(4)* [[TMP2]], align 4 +; MESA-NEXT: store volatile i32 [[IN0]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: store volatile i32 [[IN1]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in0 = load i32, i32 addrspace(4)* %in0.byref + %in1 = load i32, i32 addrspace(4)* %in1.byref + store volatile i32 %in0, i32 addrspace(1)* %out, align 4 + store volatile i32 %in1, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_constant_i32_arg_offset0(i32 addrspace(4)* byref(i32) %in.byref) { +; HSA-LABEL: @byref_constant_i32_arg_offset0( +; HSA-NEXT: [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* 
@llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4 +; HSA-NEXT: store i32 [[IN]], i32 addrspace(1)* undef, align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_constant_i32_arg_offset0( +; MESA-NEXT: [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4 +; MESA-NEXT: store i32 [[IN]], i32 addrspace(1)* undef, align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(4)* %in.byref + store i32 %in, i32 addrspace(1)* undef, align 4 + ret void +} + attributes #0 = { nounwind "target-cpu"="kaveri" } attributes #1 = { nounwind "target-cpu"="kaveri" "amdgpu-implicitarg-num-bytes"="40" } attributes #2 = { nounwind "target-cpu"="tahiti" }