Skip to content

Commit

Permalink
[InstCombine] Clean up the TFE/LWE check in AMDGPU SimplifyDemanded
Browse files Browse the repository at this point in the history
Summary:
The fix added in r352904 is not quite correct, or rather misleading:

1. When the texfailctrl (TFC) argument was non-constant, the fix assumed
   non-TFE/LWE, which is incorrect.

2. Regardless, this code path cannot even be hit for correct
   TFE/LWE-enabled calls, because those return a struct. Added
   a test case for those for completeness.

Change-Id: I92d314dbc67a2670f6d7adaab765ef45f56a49cf

Reviewers: hliao, dstuttard, arsenm

Subscribers: kzhuravl, jvesely, wdng, yaxunl, tpr, t-tye, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D57681

llvm-svn: 353097
  • Loading branch information
nhaehnle committed Feb 4, 2019
1 parent 4ca0b85 commit a69146e
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 18 deletions.
3 changes: 1 addition & 2 deletions llvm/lib/Transforms/InstCombine/InstCombineInternal.h
Expand Up @@ -800,8 +800,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner

Value *simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
APInt DemandedElts,
int DmaskIdx = -1,
int TFCIdx = -1);
int DmaskIdx = -1);

Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
APInt &UndefElts, unsigned Depth = 0);
Expand Down
29 changes: 13 additions & 16 deletions llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
Expand Up @@ -966,25 +966,16 @@ InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; TFE/LWE-enabled
/// calls have struct returns and are not handled here.
Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
APInt DemandedElts,
int DMaskIdx,
int TFCIdx) {
int DMaskIdx) {
unsigned VWidth = II->getType()->getVectorNumElements();
if (VWidth == 1)
return nullptr;

// Need to change to new instruction format
bool TFELWEEnabled = false;
if (TFCIdx > 0) {
if (ConstantInt *TFC = dyn_cast<ConstantInt>(II->getArgOperand(TFCIdx)))
TFELWEEnabled = TFC->getZExtValue() & 0x1 // TFE
|| TFC->getZExtValue() & 0x2; // LWE
}

if (TFELWEEnabled)
return nullptr; // TFE not yet supported

ConstantInt *NewDMask = nullptr;

if (DMaskIdx < 0) {
Expand Down Expand Up @@ -1648,9 +1639,15 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
case Intrinsic::amdgcn_struct_buffer_load_format:
return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts);
default: {
if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID()))
return simplifyAMDGCNMemoryIntrinsicDemanded(
II, DemandedElts, 0, II->getNumArgOperands() - 2);
if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID())) {
  // Sanity check (debug builds only): a vector-returning image intrinsic
  // must not have a constant non-zero texfailctrl, because TFE/LWE-enabled
  // calls return a struct and therefore never reach this code path.
  //
  // texfailctrl is the second-to-last *argument* of the call. Index it via
  // getNumArgOperands(): getNumOperands() also counts the callee operand
  // and would select the following argument instead.
  LLVM_DEBUG(
    Value *TFC = II->getArgOperand(II->getNumArgOperands() - 2);
    // isa<> already established the type, so use the asserting cast<>
    // rather than dereferencing an unchecked dyn_cast<> result.
    assert(!isa<ConstantInt>(TFC) ||
           cast<ConstantInt>(TFC)->getZExtValue() == 0);
  );

  // The dmask is argument 0 for all image dmask intrinsics.
  return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, 0);
}

break;
}
Expand Down
Expand Up @@ -2404,6 +2404,21 @@ define protected <4 x half> @__llvm_amdgcn_image_sample_d_1darray_v4f16_f32_f32(
declare <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1
declare <4 x half> @llvm.amdgcn.image.sample.d.1darray.v4f16.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)

; --------------------------------------------------------------------
; TFE / LWE
; --------------------------------------------------------------------

; This image load returns a { <4 x float>, i32 } struct rather than a plain
; vector. Only element 0 of the <4 x float> member is used, but the expected
; output keeps the call exactly as written (dmask still 15), i.e. the
; demanded-elements simplification must not rewrite struct-returning calls.
; CHECK-LABEL: @extract_elt0_tfe_image_load_1d_v4f32i32_i32(
; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
define amdgpu_ps float @extract_elt0_tfe_image_load_1d_v4f32i32_i32(i32 %s, <8 x i32> inreg %rsrc) #0 {
%data = call { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
%rgba = extractvalue { <4 x float>, i32 } %data, 0
%elt0 = extractelement <4 x float> %rgba, i32 0
ret float %elt0
}

declare {<4 x float>, i32} @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32, i32, <8 x i32>, i32, i32) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }

Expand Down

0 comments on commit a69146e

Please sign in to comment.