-
Notifications
You must be signed in to change notification settings - Fork 15.2k
AMDGPU: Use ConstantPool as source value for DAG lowered kernarg loads #168917
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/arsenm/amdgpu/handle-invariant-load-split
Are you sure you want to change the base?
AMDGPU: Use ConstantPool as source value for DAG lowered kernarg loads #168917
Conversation
|
Warning This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite. Learn more about stacking. |
|
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes: This isn't quite a constant pool, but probably close enough for this purpose. Patch is 218.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168917.diff 10 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 7afaddea164f8..1e935403bbec0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/PseudoSourceValueManager.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#define DEBUG_TYPE "amdgpu-call-lowering"
@@ -414,7 +415,10 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, ArgInfo &OrigArg,
MachineFunction &MF = B.getMF();
const Function &F = MF.getFunction();
const DataLayout &DL = F.getDataLayout();
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+
+ // This isn't really a constant pool but close enough.
+ MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool(),
+ AMDGPUAS::CONSTANT_ADDRESS);
LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index 7caafa16f9043..b7b87674ee658 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -28,8 +28,17 @@ Intrinsic::ID AMDGPU::getIntrinsicID(const MachineInstr &I) {
// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
bool AMDGPU::isUniformMMO(const MachineMemOperand *MMO) {
- // FIXME: null value is should be treated as unknown, not as uniform.
const Value *Ptr = MMO->getValue();
+ if (!Ptr) {
+ if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) {
+ return PSV->isConstantPool() || PSV->isStack() || PSV->isGOT() ||
+ PSV->isJumpTable();
+ }
+
+ // FIXME: A null value should be treated as unknown, not as uniform.
+ return true;
+ }
+
// UndefValue means this is a load of a kernel input. These are uniform.
// Sometimes LDS instructions have constant pointers.
// If Ptr is null, then that means this mem operand contains a
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 1a13b2226ecd6..e58ac85a29079 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -30,6 +30,7 @@
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/PseudoSourceValueManager.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -2321,14 +2322,15 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
return B.buildUnmerge(S32, Dst).getReg(1);
}
- // TODO: can we be smarter about machine pointer info?
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
Register LoadAddr = MRI.createGenericVirtualRegister(
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
// For code object version 5, private_base and shared_base are passed through
// implicit kernargs.
if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
AMDGPU::AMDHSA_COV5) {
+ // TODO: can we be smarter about machine pointer info?
+ MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(B.getMF());
+
AMDGPUTargetLowering::ImplicitParameter Param =
AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
: AMDGPUTargetLowering::PRIVATE_BASE;
@@ -2343,7 +2345,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
return Register();
MachineMemOperand *MMO = MF.getMachineMemOperand(
- PtrInfo,
+ PtrInfo.getWithOffset(Offset),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
LLT::scalar(32), commonAlignment(Align(64), Offset));
@@ -2361,6 +2363,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
return Register();
+ // TODO: can we be smarter about machine pointer info?
+ MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+
// Offset into amd_queue_t for group_segment_aperture_base_hi /
// private_segment_aperture_base_hi.
uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
@@ -4709,6 +4714,14 @@ bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
return true;
}
+MachinePointerInfo
+AMDGPULegalizerInfo::getKernargSegmentPtrInfo(MachineFunction &MF) const {
+ // This isn't really a constant pool but close enough.
+ MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool());
+ PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS;
+ return PtrInfo;
+}
+
Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
int64_t Offset) const {
LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
@@ -4736,8 +4749,8 @@ bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
"unexpected kernarg parameter type");
Register Ptr = getKernargParameterPtr(B, Offset);
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
- B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
+ MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(B.getMF());
+ B.buildLoad(DstReg, Ptr, PtrInfo.getWithOffset(Offset), Align(4),
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
MI.eraseFromParent();
@@ -7260,9 +7273,9 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
return false;
// TODO: can we be smarter about machine pointer info?
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+ MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF);
MachineMemOperand *MMO = MF.getMachineMemOperand(
- PtrInfo,
+ PtrInfo.getWithOffset(Offset),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
LLT::scalar(64), commonAlignment(Align(64), Offset));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index cd44a9ba0807c..31db548d2af88 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -132,6 +132,7 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
+ MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const;
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const;
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B,
uint64_t Offset,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c681d12ba7499..4fa3d6d81c7d0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -35,6 +35,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/PseudoSourceValueManager.h"
#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
@@ -2265,6 +2266,14 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
return TargetLowering::isTypeDesirableForOp(Op, VT);
}
+MachinePointerInfo
+SITargetLowering::getKernargSegmentPtrInfo(MachineFunction &MF) const {
+ // This isn't really a constant pool but close enough.
+ MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool());
+ PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS;
+ return PtrInfo;
+}
+
SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
const SDLoc &SL,
SDValue Chain,
@@ -2341,7 +2350,9 @@ SDValue SITargetLowering::lowerKernargMemParameter(
SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
uint64_t Offset, Align Alignment, bool Signed,
const ISD::InputArg *Arg) const {
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+
+ MachinePointerInfo PtrInfo =
+ getKernargSegmentPtrInfo(DAG.getMachineFunction());
// Try to avoid using an extload by loading earlier than the argument address,
// and extracting the relevant bits. The load should hopefully be merged with
@@ -2356,7 +2367,8 @@ SDValue SITargetLowering::lowerKernargMemParameter(
// TODO: If we passed in the base kernel offset we could have a better
// alignment than 4, but we don't really need it.
SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
- SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
+ SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr,
+ PtrInfo.getWithOffset(AlignDownOffset), Align(4),
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
@@ -2371,9 +2383,9 @@ SDValue SITargetLowering::lowerKernargMemParameter(
}
SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
- SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
+ SDValue Load = DAG.getLoad(
+ MemVT, SL, Chain, Ptr, PtrInfo.getWithOffset(Offset), Alignment,
+ MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
@@ -8109,10 +8121,11 @@ SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
MachineFunction &MF = DAG.getMachineFunction();
uint64_t Offset = getImplicitParameterOffset(MF, Param);
SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
- return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
+ MachinePointerInfo PtrInfo =
+ getKernargSegmentPtrInfo(DAG.getMachineFunction());
+ return DAG.getLoad(
+ VT, DL, DAG.getEntryNode(), Ptr, PtrInfo.getWithOffset(Offset), Alignment,
+ MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
}
SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 74e58f4272e10..c5b5d0d0891f0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -46,6 +46,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
unsigned &NumIntermediates, MVT &RegisterVT) const override;
private:
+ MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const;
+
SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
SDValue Chain, uint64_t Offset) const;
SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
index 11153bbbba612..23d2e7d67fcf8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
@@ -10,10 +10,10 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool + 4, align 4, basealign 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8) from constant-pool + 4, align 4, basealign 8, addrspace 4)
; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
; HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
@@ -25,10 +25,10 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool + 4, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8) from constant-pool + 4, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
; LEGACY-MESA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
@@ -45,10 +45,10 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool + 4, align 4, basealign 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8) from constant-pool + 4, align 4, basealign 8, addrspace 4)
; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
; HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
@@ -60,10 +60,10 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool + 4, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8) from constant-pool + 4, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
; LEGACY-MESA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
@@ -80,10 +80,10 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool + 4, align 4, basealign 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8), align 8, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8) from constant-pool + 4, align 4, basealign 8, addrspace 4)
; HSA-VI-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s8)
; HSA-VI-NEXT: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
@@ -95,10 +95,10 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe
; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36
; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool + 4, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8), align 4, addrspace 4)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s8) from constant-pool + 4, align 4, addrspace 4)
; LEGACY-MESA-VI-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s8)
; LEGACY-MESA-VI-NEXT: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
@@ -115,10 +115,10 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
- ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4)
+ ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1) from constant-pool + 4, align 4, basealign 16, addrspace 4)
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_...
[truncated]
|
🐧 Linux x64 Test Results
|
a406bd2 to
954dc93
Compare
ba69569 to
be8c7c9
Compare
This isn't quite a constant pool, but probably close enough for this purpose. We just need some known invariant value address. The aliasing queries against the real kernarg base pointer will falsely report no aliasing, but for invariant memory it probably doesn't matter.
954dc93 to
4be9e5b
Compare
be8c7c9 to
d406c2c
Compare

This isn't quite a constant pool, but probably close enough for this
purpose. We just need some known invariant value address. The aliasing
queries against the real kernarg base pointer will falsely report
no aliasing, but for invariant memory it probably doesn't matter.