Reapply "AMDGPU/GlobalISel: Fully handle 0 dmask case during legalize"
This reverts commit 9bca8fc.

Rearrange handling to avoid changing the instruction in the case where
it's going to be erased and replaced with undef.
arsenm committed Mar 18, 2020
1 parent d1a7bfc commit ea4597e
Showing 4 changed files with 104 additions and 148 deletions.
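
The core of the change in AMDGPULegalizerInfo.cpp below is ordering: a load-like image intrinsic with dmask == 0 is now recognized and folded to undef before the legalizer's change observer is notified and before the opcode is rewritten, so an instruction that is about to be erased is never reported as changing. A minimal sketch of that early exit, under the assumption of a hypothetical standalone helper (foldZeroDMaskLoadToUndef is not an LLVM function; only the MachineIRBuilder/MachineInstr calls that also appear in the diff are used):

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstr.h"

using namespace llvm;

// Hypothetical helper showing the early exit this commit introduces: a
// non-TFE, non-store image intrinsic whose dmask is 0 reads nothing, so its
// result is replaced with G_IMPLICIT_DEF and the instruction is erased
// without ever calling Observer.changingInstr() or MI.setDesc().
static bool foldZeroDMaskLoadToUndef(MachineInstr &MI, MachineIRBuilder &B) {
  B.setInstr(MI);                           // build at the intrinsic's position
  B.buildUndef(MI.getOperand(0).getReg());  // %res = G_IMPLICIT_DEF
  MI.eraseFromParent();                     // nothing left to legalize
  return true;                              // intrinsic fully handled
}
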
56 changes: 37 additions & 19 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -3470,6 +3470,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
MachineInstr &MI, MachineIRBuilder &B,
GISelChangeObserver &Observer,
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
B.setInstr(MI);

const int NumDefs = MI.getNumExplicitDefs();
bool IsTFE = NumDefs == 2;
// We are only processing the operands of d16 image operations on subtargets
@@ -3479,18 +3481,6 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

Observer.changingInstr(MI);
auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });


unsigned NewOpcode = NumDefs == 0 ?
AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

// Track that we legalized this
MI.setDesc(B.getTII().get(NewOpcode));

B.setInstr(MI);

MachineRegisterInfo *MRI = B.getMRI();
const LLT S32 = LLT::scalar(32);
const LLT S16 = LLT::scalar(16);
@@ -3506,6 +3496,41 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(

int NumVAddrs, NumGradients;
std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
const int DMaskIdx = BaseOpcode->Atomic ? -1 :
getDMaskIdx(BaseOpcode, NumDefs);
unsigned DMask = 0;

int DMaskLanes = 0;
if (!BaseOpcode->Atomic) {
DMask = MI.getOperand(DMaskIdx).getImm();
if (BaseOpcode->Gather4) {
DMaskLanes = 4;
} else if (DMask != 0) {
DMaskLanes = countPopulation(DMask);
} else if (!IsTFE && !BaseOpcode->Store) {
// If dmask is 0, this is a no-op load. This can be eliminated.
B.buildUndef(MI.getOperand(0));
MI.eraseFromParent();
return true;
}
}

Observer.changingInstr(MI);
auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

unsigned NewOpcode = NumDefs == 0 ?
AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

// Track that we legalized this
MI.setDesc(B.getTII().get(NewOpcode));

// Expecting to get an error flag since TFC is on - and dmask is 0
// Force dmask to be at least 1 otherwise the instruction will fail
if (IsTFE && DMask == 0) {
DMask = 0x1;
DMaskLanes = 1;
MI.getOperand(DMaskIdx).setImm(DMask);
}

// If the register allocator cannot place the address registers contiguously
// without introducing moves, then using the non-sequential address encoding
@@ -3556,13 +3581,6 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
}

int DMaskLanes = 0;
if (!BaseOpcode->Atomic) {
const int DMaskIdx = getDMaskIdx(BaseOpcode, NumDefs);
unsigned DMask = MI.getOperand(DMaskIdx).getImm();
DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
}

if (BaseOpcode->Store) { // No TFE for stores?
// TODO: Handle dmask trim
Register VData = MI.getOperand(1).getReg();
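
Pulled out of the new legalizer code above for reference, the dmask handling that now runs before any rewriting, as a self-contained sketch (classifyDMask and DMaskAction are illustrative names, not LLVM's, and C++20 std::popcount stands in for llvm::countPopulation):

#include <bit>  // std::popcount; the LLVM code uses countPopulation

// Rules mirrored from the diff: atomics carry no dmask; gather4 always
// produces 4 lanes; otherwise the lane count is popcount(dmask); a 0 dmask on
// a plain (non-TFE, non-store) load is a no-op that can be folded to undef;
// and with TFE set, a 0 dmask is bumped to 1 so the instruction stays valid
// while still returning the error flag.
enum class DMaskAction { Legalize, FoldToUndef };

inline DMaskAction classifyDMask(unsigned &DMask, bool IsAtomic, bool IsGather4,
                                 bool IsTFE, bool IsStore, int &DMaskLanes) {
  DMaskLanes = 0;
  if (IsAtomic)
    return DMaskAction::Legalize;
  if (IsGather4) {
    DMaskLanes = 4;
  } else if (DMask != 0) {
    DMaskLanes = std::popcount(DMask);
  } else if (!IsTFE && !IsStore) {
    return DMaskAction::FoldToUndef;  // dmask == 0 load: nothing to read
  }
  if (IsTFE && DMask == 0) {          // TFE needs at least one lane
    DMask = 0x1;
    DMaskLanes = 1;
  }
  return DMaskAction::Legalize;
}
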
@@ -2903,15 +2903,12 @@ define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x floa
; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 0, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX9: $vgpr1 = COPY [[DEF]](s32)
; GFX9: $vgpr2 = COPY [[DEF]](s32)
; GFX9: $vgpr3 = COPY [[DEF]](s32)
; GFX9: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>)
; GFX9: $vgpr0 = COPY [[UV]](s32)
; GFX9: $vgpr1 = COPY [[UV1]](s32)
; GFX9: $vgpr2 = COPY [[UV2]](s32)
; GFX9: $vgpr3 = COPY [[UV3]](s32)
; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
; GFX10NSA-LABEL: name: getresinfo_dmask0
; GFX10NSA: bb.1.main_body:
@@ -2925,15 +2922,12 @@ define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x floa
; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 0, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX10NSA: $vgpr1 = COPY [[DEF]](s32)
; GFX10NSA: $vgpr2 = COPY [[DEF]](s32)
; GFX10NSA: $vgpr3 = COPY [[DEF]](s32)
; GFX10NSA: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>)
; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
; GFX10NSA: $vgpr2 = COPY [[UV2]](s32)
; GFX10NSA: $vgpr3 = COPY [[UV3]](s32)
; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%mip = extractelement <2 x i16> %coords, i32 0
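
In the updated checks for getresinfo_dmask0, the G_AMDGPU_INTRIN_IMAGE_LOAD and the now-dead instructions feeding it (the rsrc G_BUILD_VECTOR and the coordinate G_BITCAST/G_TRUNC) no longer appear; the returned value is instead a single <4 x s32> G_IMPLICIT_DEF that G_UNMERGE_VALUES splits into the four return vgprs, matching the buildUndef path added in the legalizer.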
