Skip to content

Commit

Permalink
InstCombine/AMDGPU: Add dimension-aware image intrinsics to SimplifyD…
Browse files Browse the repository at this point in the history
…emanded

Summary:
Use the expanded features of the TableGen generic tables to avoid manually
adding the combinatorially exploded set of intrinsics. The
getAMDGPUImageDMaskIntrinsic lookup function is early-out,
i.e. non-AMDGPU intrinsics will never look at the underlying table.

Use a generic approach for getting the new intrinsic overload to keep the
code simple, and make the image dmask handling more generic:
- handle non-sampler image loads
- handle the case where the set of demanded elements is not a prefix

There is some overlap between this code and an optimization that happens
in the backend during code generation. They currently complement each other:

- only the codegen optimization can generate vec3 loads
- only the InstCombine optimization can handle D16

The InstCombine optimization also likely covers more cases since the
codegen optimization is fairly ad-hoc. Ideally, we'll remove the optimization
in codegen once the infrastructure for vec3 is in place (which will probably
take a long time).

Modify the test cases to use dimension-aware intrinsics. This makes it
easier to see that the test coverage for the new intrinsics is equivalent,
and the old style intrinsics will be removed in a follow-up commit anyway.

Change-Id: I4b91ea661413d13004956fe4ef7d13d41b8ce3ad

Reviewers: arsenm, rampitec, majnemer

Subscribers: kzhuravl, wdng, mgorny, yaxunl, dstuttard, tpr, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D48165

llvm-svn: 335230
  • Loading branch information
nhaehnle committed Jun 21, 2018
1 parent 1045928 commit b29ee70
Show file tree
Hide file tree
Showing 6 changed files with 610 additions and 611 deletions.
30 changes: 22 additions & 8 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Expand Up @@ -803,9 +803,15 @@ class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
!if(P_.IsAtomic, 0, 1)), 1> {
AMDGPUDimProfile P = P_;

AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME);

let TargetPrefix = "amdgcn";
}

// Marker class for intrinsics with a DMask that determines the returned
// channels.
class AMDGPUImageDMaskIntrinsic;

defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {

//////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -839,10 +845,14 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
}
}

defm int_amdgcn_image_load : AMDGPUImageDimIntrinsicsAll<
"LOAD", [llvm_anyfloat_ty], [], [IntrReadMem], [SDNPMemOperand]>;
defm int_amdgcn_image_load_mip : AMDGPUImageDimIntrinsicsNoMsaa<
"LOAD_MIP", [llvm_anyfloat_ty], [], [IntrReadMem], [SDNPMemOperand], 1>;
defm int_amdgcn_image_load
: AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_anyfloat_ty], [], [IntrReadMem],
[SDNPMemOperand]>,
AMDGPUImageDMaskIntrinsic;
defm int_amdgcn_image_load_mip
: AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_anyfloat_ty], [],
[IntrReadMem], [SDNPMemOperand], 1>,
AMDGPUImageDMaskIntrinsic;

defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
"STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
Expand All @@ -866,18 +876,22 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
}

foreach sample = AMDGPUSampleVariants in {
defm int_amdgcn_image_sample # sample.LowerCaseMod :
AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>;
defm int_amdgcn_image_sample # sample.LowerCaseMod
: AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>,
AMDGPUImageDMaskIntrinsic;
}

defm int_amdgcn_image_getlod : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>;
defm int_amdgcn_image_getlod
: AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>,
AMDGPUImageDMaskIntrinsic;

//////////////////////////////////////////////////////////////////////////
// getresinfo intrinsics
//////////////////////////////////////////////////////////////////////////
foreach dim = AMDGPUDims.All in {
def !strconcat("int_amdgcn_image_getresinfo_", dim.Name)
: AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>;
: AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>,
AMDGPUImageDMaskIntrinsic;
}

//////////////////////////////////////////////////////////////////////////
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Transforms/InstCombine/CMakeLists.txt
@@ -1,3 +1,7 @@
# Generate InstCombineTables.inc from InstCombineTables.td using TableGen's
# searchable-tables backend, and register the generation step as the
# InstCombineTableGen target.
set(LLVM_TARGET_DEFINITIONS InstCombineTables.td)
tablegen(LLVM InstCombineTables.inc -gen-searchable-tables)
add_public_tablegen_target(InstCombineTableGen)

add_llvm_library(LLVMInstCombine
InstructionCombining.cpp
InstCombineAddSub.cpp
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Transforms/InstCombine/InstCombineInternal.h
Expand Up @@ -706,6 +706,10 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner
/// demanded bits.
bool SimplifyDemandedInstructionBits(Instruction &Inst);

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image
/// intrinsics. \p DemandedElts has one bit per element of the vector result
/// of \p II. \p DmaskIdx is the index of the dmask argument of \p II; a
/// negative value (the default) means the intrinsic has no dmask argument.
/// Returns a replacement value, or nullptr if no replacement is produced.
Value *simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
APInt DemandedElts,
int DmaskIdx = -1);

Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
APInt &UndefElts, unsigned Depth = 0);

Expand Down
193 changes: 122 additions & 71 deletions llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
Expand Up @@ -23,6 +23,17 @@ using namespace llvm::PatternMatch;

#define DEBUG_TYPE "instcombine"

namespace {

// Row type of the TableGen-generated AMDGPUImageDMaskIntrinsicTable (defined
// in InstCombineTables.td). Its fields must match the table's "Fields" list.
struct AMDGPUImageDMaskIntrinsic {
// Intrinsic ID; this is the table's primary key.
unsigned Intr;
};

// Select the implementation section of the generated file, which provides
// the table data and the getAMDGPUImageDMaskIntrinsic() lookup function.
#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

/// Check to see if the specified operand of the specified instruction is a
/// constant integer. If so, check to see if there are any bits set in the
/// constant that are not demanded. If so, shrink the constant and return true.
Expand Down Expand Up @@ -909,6 +920,110 @@ InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
return nullptr;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Shrinks the vector result of the intrinsic call \p II to the elements that
/// are actually demanded, either by updating the dmask operand in place or by
/// emitting a call to a narrower overload of the same intrinsic.
///
/// \param II            The intrinsic call; its result type must be a vector.
/// \param DemandedElts  Bit i is set if element i of the result is used.
/// \param DMaskIdx      Index of the dmask argument of \p II, or a negative
///                      value if the intrinsic has no dmask.
/// \returns nullptr if no replacement value is produced (the dmask operand of
///          \p II may still have been rewritten in place); an undef value of
///          II's type if no element is demanded; otherwise a value of II's
///          original type built from the narrowed call.
Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
APInt DemandedElts,
int DMaskIdx) {
unsigned VWidth = II->getType()->getVectorNumElements();
// A single-element result cannot be narrowed any further.
if (VWidth == 1)
return nullptr;

// Replacement dmask constant. Stays null when there is no dmask or when the
// dmask does not need to change.
ConstantInt *NewDMask = nullptr;

if (DMaskIdx < 0) {
// Pretend that a prefix of elements is demanded to simplify the code
// below.
DemandedElts = (1 << DemandedElts.getActiveBits()) - 1;
} else {
ConstantInt *DMask = dyn_cast<ConstantInt>(II->getArgOperand(DMaskIdx));
if (!DMask)
return nullptr; // non-constant dmask is not supported by codegen

// Only the low four bits of the dmask are considered.
unsigned DMaskVal = DMask->getZExtValue() & 0xf;

// Mask off values that are undefined because the dmask doesn't cover them
DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

// Walk the four dmask bits. Every set bit corresponds to the next element
// of the original result (OrigLoadIdx); the bit is kept in the new dmask
// only if that result element is demanded.
unsigned NewDMaskVal = 0;
unsigned OrigLoadIdx = 0;
for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
const unsigned Bit = 1 << SrcIdx;
if (!!(DMaskVal & Bit)) {
if (!!(DemandedElts & (1 << OrigLoadIdx)))
NewDMaskVal |= Bit;
OrigLoadIdx++;
}
}

// Only materialize a new constant if the dmask actually changes.
if (DMaskVal != NewDMaskVal)
NewDMask = ConstantInt::get(DMask->getType(), NewDMaskVal);
}

// The new element count is the number of demanded elements rounded up to a
// power of two.
// TODO: Handle 3 vectors when supported in code gen.
unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countPopulation());
// Nothing is demanded: the whole result can be replaced by undef.
if (!NewNumElts)
return UndefValue::get(II->getType());

// The result type cannot shrink and the demanded elements form a low-bit
// mask: keep the call, at most updating its dmask operand in place.
if (NewNumElts >= VWidth && DemandedElts.isMask()) {
if (NewDMask)
II->setArgOperand(DMaskIdx, NewDMask);
return nullptr;
}

// Determine the overload types of the original intrinsic.
auto IID = II->getIntrinsicID();
SmallVector<Intrinsic::IITDescriptor, 16> Table;
getIntrinsicInfoTableEntries(IID, Table);
ArrayRef<Intrinsic::IITDescriptor> TableRef = Table;

// Match the return type first and then each parameter type against the
// descriptor table; the overloaded types are collected in OverloadTys.
FunctionType *FTy = II->getCalledFunction()->getFunctionType();
SmallVector<Type *, 6> OverloadTys;
Intrinsic::matchIntrinsicType(FTy->getReturnType(), TableRef, OverloadTys);
for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
Intrinsic::matchIntrinsicType(FTy->getParamType(i), TableRef, OverloadTys);

// Get the new return type overload of the intrinsic.
Module *M = II->getParent()->getParent()->getParent();
Type *EltTy = II->getType()->getVectorElementType();
Type *NewTy = (NewNumElts == 1) ? EltTy : VectorType::get(EltTy, NewNumElts);

// NOTE(review): assumes the return type is the first overloaded type, which
// holds when the return type is overloaded since it is matched first above.
OverloadTys[0] = NewTy;
Function *NewIntrin = Intrinsic::getDeclaration(M, IID, OverloadTys);

// Reuse all original arguments, substituting the updated dmask if any.
SmallVector<Value *, 16> Args;
for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
Args.push_back(II->getArgOperand(I));

if (NewDMask)
Args[DMaskIdx] = NewDMask;

// Emit the new call at the position of II. The guard is expected to restore
// the builder's previous insertion point when it goes out of scope.
IRBuilderBase::InsertPointGuard Guard(Builder);
Builder.SetInsertPoint(II);

// The new call takes over the name and metadata of the original call.
CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
NewCall->takeName(II);
NewCall->copyMetadata(*II);

// Scalar result: insert it into an undef vector of the original type at the
// position of the single demanded element.
if (NewNumElts == 1) {
return Builder.CreateInsertElement(UndefValue::get(II->getType()), NewCall,
DemandedElts.countTrailingZeros());
}

// Build a shuffle mask with one entry per original element. Demanded
// elements take consecutive elements of the new call; all others use index
// NewNumElts, which selects from the second (undef) shuffle operand.
SmallVector<uint32_t, 8> EltMask;
unsigned NewLoadIdx = 0;
for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
if (!!(DemandedElts & (1 << OrigLoadIdx)))
EltMask.push_back(NewLoadIdx++);
else
EltMask.push_back(NewNumElts);
}

Value *Shuffle =
Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask);

return Shuffle;
}

/// The specified value produces a vector with any number of elements.
/// DemandedElts contains the set of elements that are actually used by the
/// caller. This method analyzes which elements of the operand are undef and
Expand Down Expand Up @@ -1267,8 +1382,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
if (!II) break;
switch (II->getIntrinsicID()) {
default: break;

case Intrinsic::x86_xop_vfrcz_ss:
case Intrinsic::x86_xop_vfrcz_sd:
// The instructions for these intrinsics are speced to zero upper bits not
Expand Down Expand Up @@ -1582,79 +1695,17 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
case Intrinsic::amdgcn_image_sample_c_cd_cl_o:

case Intrinsic::amdgcn_image_getlod: {
if (VWidth == 1 || !DemandedElts.isMask())
return nullptr;

// TODO: Handle 3 vectors when supported in code gen.
unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countTrailingOnes());
if (NewNumElts == VWidth)
return nullptr;

Module *M = II->getParent()->getParent()->getParent();
Type *EltTy = V->getType()->getVectorElementType();

Type *NewTy = (NewNumElts == 1) ? EltTy :
VectorType::get(EltTy, NewNumElts);

auto IID = II->getIntrinsicID();

bool IsBuffer = IID == Intrinsic::amdgcn_buffer_load ||
IID == Intrinsic::amdgcn_buffer_load_format;
return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts,
IsBuffer ? -1 : 3);
}
default: {
if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID()))
return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, 0);

Function *NewIntrin = IsBuffer ?
Intrinsic::getDeclaration(M, IID, NewTy) :
// Samplers have 3 mangled types.
Intrinsic::getDeclaration(M, IID,
{ NewTy, II->getArgOperand(0)->getType(),
II->getArgOperand(1)->getType()});

SmallVector<Value *, 5> Args;
for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
Args.push_back(II->getArgOperand(I));

IRBuilderBase::InsertPointGuard Guard(Builder);
Builder.SetInsertPoint(II);

CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
NewCall->takeName(II);
NewCall->copyMetadata(*II);

if (!IsBuffer) {
ConstantInt *DMask = dyn_cast<ConstantInt>(NewCall->getArgOperand(3));
if (DMask) {
unsigned DMaskVal = DMask->getZExtValue() & 0xf;

unsigned PopCnt = 0;
unsigned NewDMask = 0;
for (unsigned I = 0; I < 4; ++I) {
const unsigned Bit = 1 << I;
if (!!(DMaskVal & Bit)) {
if (++PopCnt > NewNumElts)
break;

NewDMask |= Bit;
}
}

NewCall->setArgOperand(3, ConstantInt::get(DMask->getType(), NewDMask));
}
}


if (NewNumElts == 1) {
return Builder.CreateInsertElement(UndefValue::get(V->getType()),
NewCall, static_cast<uint64_t>(0));
}

SmallVector<uint32_t, 8> EltMask;
for (unsigned I = 0; I < VWidth; ++I)
EltMask.push_back(I);

Value *Shuffle = Builder.CreateShuffleVector(
NewCall, UndefValue::get(NewTy), EltMask);

MadeChange = true;
return Shuffle;
break;
}
}
break;
Expand Down
11 changes: 11 additions & 0 deletions llvm/lib/Transforms/InstCombine/InstCombineTables.td
@@ -0,0 +1,11 @@
include "llvm/TableGen/SearchableTable.td"
include "llvm/IR/Intrinsics.td"

// Searchable table of the records that derive from the
// AMDGPUImageDMaskIntrinsic marker class, with a single column holding the
// intrinsic.
def AMDGPUImageDMaskIntrinsicTable : GenericTable {
let FilterClass = "AMDGPUImageDMaskIntrinsic";
let Fields = ["Intr"];

// The intrinsic is the lookup key; PrimaryKeyName is the name of the
// generated C++ lookup function.
let PrimaryKey = ["Intr"];
let PrimaryKeyName = "getAMDGPUImageDMaskIntrinsic";
// NOTE(review): presumably makes the lookup return early for keys outside
// the range covered by the table, without searching it - confirm against
// SearchableTable.td.
let PrimaryKeyEarlyOut = 1;
}

0 comments on commit b29ee70

Please sign in to comment.