Skip to content

Commit

Permalink
AMDGPU: Directly implement computeKnownBits for workitem intrinsics
Browse files Browse the repository at this point in the history
Currently, metadata is inserted in a late pass and is then lowered
to an AssertZext. The metadata would be more useful if it were
inserted earlier: after inlining, but before codegen.

This probably shouldn't change anything for now. Simply replacing the
late metadata annotation needs more work, since we lose
out on optimizations after these are lowered to CopyFromReg.

Seems to be slightly better than relying on the AssertZext from the
metadata. The test change in cvt_f32_ubyte.ll is a quirk from it using
-start-before=amdgpu-isel instead of running the usual codegen
pipeline.
  • Loading branch information
arsenm committed Apr 22, 2022
1 parent 40bc911 commit 794a0bb
Show file tree
Hide file tree
Showing 11 changed files with 361 additions and 234 deletions.
21 changes: 21 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Expand Up @@ -4578,6 +4578,19 @@ SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
return SDValue();
}

/// Translate a workitem ID intrinsic into the index of the dimension it
/// queries: x -> 0, y -> 1, z -> 2.
///
/// \param ID an intrinsic ID; must be one of the amdgcn_workitem_id_{x,y,z}
///        intrinsics. Any other value reaches llvm_unreachable.
/// \returns the dimension index for the intrinsic.
static unsigned workitemIntrinsicDim(unsigned ID) {
  if (ID == Intrinsic::amdgcn_workitem_id_x)
    return 0;
  if (ID == Intrinsic::amdgcn_workitem_id_y)
    return 1;
  if (ID == Intrinsic::amdgcn_workitem_id_z)
    return 2;

  // Callers are expected to have filtered on the intrinsic ID already.
  llvm_unreachable("not a workitem intrinsic");
}

void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
const SDValue Op, KnownBits &Known,
const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
Expand Down Expand Up @@ -4714,6 +4727,14 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
break;
}
case Intrinsic::amdgcn_workitem_id_x:
case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::amdgcn_workitem_id_z: {
unsigned MaxValue = Subtarget->getMaxWorkitemID(
DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
Known.Zero.setHighBits(countLeadingZeros(MaxValue));
break;
}
default:
break;
}
Expand Down
73 changes: 53 additions & 20 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Expand Up @@ -3004,6 +3004,53 @@ bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
return true;
}

/// Replace \p MI with a constant: materialize \p C into the register held by
/// operand 0 of \p MI, then erase \p MI from its parent.
///
/// \returns true unconditionally, so the call can be used directly as the
///          result of a legalization routine.
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
                                int64_t C) {
  // Read the destination register before the instruction is erased.
  const Register ResultReg = MI.getOperand(0).getReg();
  B.buildConstant(ResultReg, C);
  MI.eraseFromParent();
  return true;
}

/// Legalize a workitem ID intrinsic for dimension \p Dim.
///
/// The result of \p MI is replaced by one of:
///  * the constant 0, when the subtarget reports a maximum workitem ID of 0
///    for this function and dimension;
///  * an undef value, when no preloaded argument descriptor exists for
///    \p ArgType;
///  * the loaded input value, when the argument descriptor is masked;
///  * the loaded input value wrapped in an AssertZExt to the number of bits
///    needed to hold the maximum ID, otherwise.
///
/// \returns false if loading the input value fails (in which case \p MI is
///          left in place), true otherwise.
bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  auto &MF = B.getMF();

  const unsigned MaxID = ST.getMaxWorkitemID(MF.getFunction(), Dim);
  if (MaxID == 0)
    return replaceWithConstant(B, MI, 0);

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;
  std::tie(Arg, ArgRC, ArgTy) = FuncInfo->getPreloadedValue(ArgType);

  Register DstReg = MI.getOperand(0).getReg();
  if (!Arg) {
    // It's undefined behavior if a function marked with the amdgpu-no-*
    // attributes uses the corresponding intrinsic.
    B.buildUndef(DstReg);
    MI.eraseFromParent();
    return true;
  }

  // Don't bother inserting AssertZext for packed IDs since we're emitting the
  // masking operations anyway.
  //
  // TODO: We could assert the top bit is 0 for the source copy.
  const bool IsPacked = Arg->isMasked();

  // Packed IDs are loaded straight into the destination; unpacked IDs go
  // through a temporary so the assertion can define the destination.
  Register LoadReg = DstReg;
  if (!IsPacked)
    LoadReg = MRI.createGenericVirtualRegister(LLT::scalar(32));

  if (!loadInputValue(LoadReg, B, ArgType))
    return false;

  if (!IsPacked) {
    // Number of low bits that can be nonzero for an ID no larger than MaxID.
    const unsigned IDBits = 32 - countLeadingZeros(MaxID);
    B.buildAssertZExt(DstReg, LoadReg, IDBits);
  }

  MI.eraseFromParent();
  return true;
}

Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
int64_t Offset) const {
LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
Expand Down Expand Up @@ -5072,12 +5119,6 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
return true;
}

/// Swap \p MI for a constant definition: build the constant \p C into the
/// register of \p MI's operand 0 and delete \p MI.
///
/// \returns true in all cases.
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
                                int64_t C) {
  // Grab the destination register up front; MI is gone after the erase.
  const Register Dst = MI.getOperand(0).getReg();
  B.buildConstant(Dst, C);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
MachineIRBuilder &B) const {
unsigned Opc;
Expand Down Expand Up @@ -5202,22 +5243,14 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_implicitarg_ptr:
return legalizeImplicitArgPtr(MI, MRI, B);
case Intrinsic::amdgcn_workitem_id_x:
if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
return replaceWithConstant(B, MI, 0);
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKITEM_ID_X);
return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
AMDGPUFunctionArgInfo::WORKITEM_ID_X);
case Intrinsic::amdgcn_workitem_id_y:
if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
return replaceWithConstant(B, MI, 0);

return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
case Intrinsic::amdgcn_workitem_id_z:
if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
return replaceWithConstant(B, MI, 0);

return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
case Intrinsic::amdgcn_workgroup_id_x:
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
Expand Up @@ -96,9 +96,13 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
const TargetRegisterClass *ArgRC, LLT ArgTy) const;
bool loadInputValue(Register DstReg, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;

bool legalizePreloadedArgIntrin(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
bool legalizeWorkitemIDIntrinsic(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;

Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const;
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B,
Expand Down
3 changes: 0 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
Expand Up @@ -156,11 +156,8 @@ bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
Changed = true;
break;

case Intrinsic::amdgcn_workitem_id_x:
case Intrinsic::r600_read_tidig_x:
case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::r600_read_tidig_y:
case Intrinsic::amdgcn_workitem_id_z:
case Intrinsic::r600_read_tidig_z:
case Intrinsic::r600_read_local_size_x:
case Intrinsic::r600_read_local_size_y:
Expand Down
47 changes: 29 additions & 18 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Expand Up @@ -6762,6 +6762,32 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
return Loads[0];
}

/// Lower a workitem ID query for dimension \p Dim whose value is described by
/// \p Arg.
///
/// \returns the i32 constant 0 when the subtarget reports a maximum workitem
///          ID of 0 for this function and dimension; the loaded input value
///          unchanged when \p Arg is masked; otherwise the loaded input value
///          wrapped in an AssertZext to the number of bits needed to hold the
///          maximum ID.
SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
                                          unsigned Dim,
                                          const ArgDescriptor &Arg) const {
  SDLoc DL(Op);
  const unsigned MaxID = Subtarget->getMaxWorkitemID(
      DAG.getMachineFunction().getFunction(), Dim);
  if (MaxID == 0)
    return DAG.getConstant(0, DL, MVT::i32);

  SDValue Input = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
                                 SDLoc(DAG.getEntryNode()), Arg);

  // Don't bother inserting AssertZext for packed IDs since we're emitting the
  // masking operations anyway.
  //
  // TODO: We could assert the top bit is 0 for the source copy.
  if (Arg.isMasked())
    return Input;

  // Preserve the known bits after expansion to a copy: only the low IDBits
  // bits can be nonzero for an ID no larger than MaxID.
  const unsigned IDBits = 32 - countLeadingZeros(MaxID);
  EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), IDBits);
  SDValue NarrowVTNode = DAG.getValueType(NarrowVT);
  return DAG.getNode(ISD::AssertZext, DL, MVT::i32, Input, NarrowVTNode);
}

SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
Expand Down Expand Up @@ -6908,26 +6934,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
case Intrinsic::amdgcn_workitem_id_x:
if (Subtarget->getMaxWorkitemID(MF.getFunction(), 0) == 0)
return DAG.getConstant(0, DL, MVT::i32);

return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDX);
return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
case Intrinsic::amdgcn_workitem_id_y:
if (Subtarget->getMaxWorkitemID(MF.getFunction(), 1) == 0)
return DAG.getConstant(0, DL, MVT::i32);

return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDY);
return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
case Intrinsic::amdgcn_workitem_id_z:
if (Subtarget->getMaxWorkitemID(MF.getFunction(), 2) == 0)
return DAG.getConstant(0, DL, MVT::i32);

return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDZ);
return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
case Intrinsic::amdgcn_wavefrontsize:
return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
SDLoc(Op), MVT::i32);
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
Expand Up @@ -79,6 +79,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
unsigned NewOpcode) const;

SDValue lowerWorkitemID(SelectionDAG &DAG, SDValue Op, unsigned Dim,
const ArgDescriptor &ArgDesc) const;

SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
Expand Down
159 changes: 159 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.workitem.id.mir
@@ -0,0 +1,159 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -run-pass=legalizer -o - %s | FileCheck -check-prefix=GCN %s

--- |
define amdgpu_kernel void @test_workitem_id_x_unpacked() !reqd_work_group_size !0 {
ret void
}

define amdgpu_kernel void @test_workitem_id_y_unpacked() !reqd_work_group_size !0 {
ret void
}

define amdgpu_kernel void @test_workitem_id_z_unpacked() !reqd_work_group_size !0 {
ret void
}

define amdgpu_kernel void @test_workitem_id_x_packed() !reqd_work_group_size !0 {
ret void
}

define amdgpu_kernel void @test_workitem_id_y_packed() !reqd_work_group_size !0 {
ret void
}

define amdgpu_kernel void @test_workitem_id_z_packed() !reqd_work_group_size !0 {
ret void
}

define amdgpu_kernel void @missing_arg_info() "amdgpu-no-workitem-id-x" {
ret void
}

!0 = !{i32 256, i32 8, i32 4}
...
---
name: test_workitem_id_x_unpacked
machineFunctionInfo:
argumentInfo:
workGroupIDX: { reg: '$sgpr2' }
workItemIDX: { reg: '$vgpr0' }
workItemIDY: { reg: '$vgpr1' }
workItemIDZ: { reg: '$vgpr2' }
body: |
bb.0:
; GCN-LABEL: name: test_workitem_id_x_unpacked
; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GCN-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; GCN-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY1]], 8
; GCN-NEXT: S_ENDPGM 0, implicit [[ASSERT_ZEXT]](s32)
%0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
S_ENDPGM 0, implicit %0
...

---
name: test_workitem_id_y_unpacked
machineFunctionInfo:
argumentInfo:
workGroupIDX: { reg: '$sgpr2' }
workItemIDX: { reg: '$vgpr0' }
workItemIDY: { reg: '$vgpr1' }
workItemIDZ: { reg: '$vgpr2' }
body: |
bb.0:
; GCN-LABEL: name: test_workitem_id_y_unpacked
; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1
; GCN-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; GCN-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY1]], 3
; GCN-NEXT: S_ENDPGM 0, implicit [[ASSERT_ZEXT]](s32)
%0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.y)
S_ENDPGM 0, implicit %0
...

---
name: test_workitem_id_z_unpacked
machineFunctionInfo:
argumentInfo:
workGroupIDX: { reg: '$sgpr2' }
workItemIDX: { reg: '$vgpr0' }
workItemIDY: { reg: '$vgpr1' }
workItemIDZ: { reg: '$vgpr2' }
body: |
bb.0:
; GCN-LABEL: name: test_workitem_id_z_unpacked
; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2
; GCN-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; GCN-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY1]], 2
; GCN-NEXT: S_ENDPGM 0, implicit [[ASSERT_ZEXT]](s32)
%0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.z)
S_ENDPGM 0, implicit %0
...

---
name: test_workitem_id_x_packed
machineFunctionInfo:
argumentInfo:
workItemIDX: { reg: '$vgpr0', mask: 1023 }
workItemIDY: { reg: '$vgpr0', mask: 1047552 }
workItemIDZ: { reg: '$vgpr0', mask: 1072693248 }
body: |
bb.0:
; GCN-LABEL: name: test_workitem_id_x_packed
; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1023
; GCN-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
; GCN-NEXT: S_ENDPGM 0, implicit [[AND]](s32)
%0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
S_ENDPGM 0, implicit %0
...

---
name: test_workitem_id_y_packed
machineFunctionInfo:
argumentInfo:
workItemIDX: { reg: '$vgpr0', mask: 1023 }
workItemIDY: { reg: '$vgpr0', mask: 1047552 }
workItemIDZ: { reg: '$vgpr0', mask: 1072693248 }
body: |
bb.0:
; GCN-LABEL: name: test_workitem_id_y_packed
; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
; GCN-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1023
; GCN-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
; GCN-NEXT: S_ENDPGM 0, implicit [[AND]](s32)
%0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.y)
S_ENDPGM 0, implicit %0
...

---
name: test_workitem_id_z_packed
machineFunctionInfo:
argumentInfo:
workItemIDX: { reg: '$vgpr0', mask: 1023 }
workItemIDY: { reg: '$vgpr0', mask: 1047552 }
workItemIDZ: { reg: '$vgpr0', mask: 1072693248 }
body: |
bb.0:
; GCN-LABEL: name: test_workitem_id_z_packed
; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; GCN-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1023
; GCN-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
; GCN-NEXT: S_ENDPGM 0, implicit [[AND]](s32)
%0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.z)
S_ENDPGM 0, implicit %0
...

---
name: missing_arg_info
body: |
bb.0:
; GCN-LABEL: name: missing_arg_info
; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN-NEXT: S_ENDPGM 0, implicit [[DEF]](s32)
%0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.z)
S_ENDPGM 0, implicit %0
...

0 comments on commit 794a0bb

Please sign in to comment.