[AMDGPU] Extend permlane16, permlanex16 and permlane64 intrinsic lowering for generic types (#92725)

These are incremental changes over #89217, with the core logic being the same. This patch,
along with #89217 and #91190, should get us ready to enable 64-bit optimizations in the
atomic optimizer.
vikramRH committed Jun 26, 2024
1 parent 89d8df1 commit 35f7b60
Showing 17 changed files with 10,752 additions and 1,063 deletions.
10 changes: 10 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
@@ -18479,6 +18479,16 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
return Builder.CreateCall(F, Args);
}
case AMDGPU::BI__builtin_amdgcn_permlane16:
case AMDGPU::BI__builtin_amdgcn_permlanex16:
return emitBuiltinWithOneOverloadedType<6>(
*this, E,
BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
? Intrinsic::amdgcn_permlane16
: Intrinsic::amdgcn_permlanex16);
case AMDGPU::BI__builtin_amdgcn_permlane64:
return emitBuiltinWithOneOverloadedType<1>(*this, E,
Intrinsic::amdgcn_permlane64);
case AMDGPU::BI__builtin_amdgcn_readlane:
return emitBuiltinWithOneOverloadedType<2>(*this, E,
Intrinsic::amdgcn_readlane);
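For reference, a minimal sketch of the IR Clang now emits for the existing i32 builtins once they go through emitBuiltinWithOneOverloadedType; the overloaded type is mangled into the intrinsic name, matching the updated CHECK lines below (value names are illustrative):

    ; __builtin_amdgcn_permlane16(a, b, c, d, 0, 0) -> overloaded intrinsic, suffix follows the operand type
    %p16  = call i32 @llvm.amdgcn.permlane16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
    ; __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0)
    %px16 = call i32 @llvm.amdgcn.permlanex16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
    ; __builtin_amdgcn_permlane64(a)
    %p64  = call i32 @llvm.amdgcn.permlane64.i32(i32 %a)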
4 changes: 2 additions & 2 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
@@ -8,13 +8,13 @@ typedef unsigned int uint;
typedef unsigned long ulong;

// CHECK-LABEL: @test_permlane16(
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
void test_permlane16(global uint* out, uint a, uint b, uint c, uint d) {
*out = __builtin_amdgcn_permlane16(a, b, c, d, 0, 0);
}

// CHECK-LABEL: @test_permlanex16(
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlanex16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlanex16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
void test_permlanex16(global uint* out, uint a, uint b, uint c, uint d) {
*out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0);
}
2 changes: 1 addition & 1 deletion clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
@@ -37,7 +37,7 @@ void test_ds_bvh_stack_rtn(global uint2* out, uint addr, uint data, uint4 data1)
}

// CHECK-LABEL: @test_permlane64(
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane64(i32 %a)
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane64.i32(i32 %a)
void test_permlane64(global uint* out, uint a) {
*out = __builtin_amdgcn_permlane64(a);
}
20 changes: 20 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
@@ -1225,6 +1225,26 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
reduction will be performed using default iterative strategy.
Intrinsic is currently only implemented for i32.

llvm.amdgcn.permlane16 Provides direct access to v_permlane16_b32. Performs an arbitrary gather-style
operation within a row (16 contiguous lanes) of the second input operand.
The third and fourth inputs must be scalar values. These are combined into
a single 64-bit value representing lane selects used to swizzle within each
row. Currently implemented for i16, i32, float, half, bfloat, <2 x i16>,
<2 x half>, <2 x bfloat>, i64, double, pointers, and multiples of the 32-bit vectors.

llvm.amdgcn.permlanex16 Provides direct access to v_permlanex16_b32. Performs an arbitrary gather-style
operation across two rows of the second input operand (each row is 16 contiguous
lanes). The third and fourth inputs must be scalar values. These are combined
into a single 64-bit value representing lane selects used to swizzle within each
row. Currently implemented for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>,
<2 x bfloat>, i64, double, pointers, and multiples of the 32-bit vectors.

llvm.amdgcn.permlane64 Provides direct access to v_permlane64_b32. Performs a specific permutation across
lanes of the input operand where the high half and low half of a wave64 are swapped.
Performs no operation in wave32 mode. Currently implemented for i16, i32, float, half,
bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double, pointers, and multiples of the
32-bit vectors.

llvm.amdgcn.udot2 Provides direct access to v_dot2_u32_u16 across targets which
support such instructions. This performs unsigned dot product
with two v2i16 operands, summed with the third i32 operand. The
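An illustrative sketch of the wider overloads the documentation above describes; the type suffix in the intrinsic name follows the overloaded operand type, and %sel0/%sel1 stand for the i32 lane-select operands:

    ; 64-bit integer gather within a row
    %r0 = call i64 @llvm.amdgcn.permlane16.i64(i64 %old, i64 %src0, i32 %sel0, i32 %sel1, i1 false, i1 false)
    ; double gather across two rows
    %r1 = call double @llvm.amdgcn.permlanex16.f64(double %old, double %src0, i32 %sel0, i32 %sel1, i1 false, i1 false)
    ; packed-half swap of the wave64 halves
    %r2 = call <2 x half> @llvm.amdgcn.permlane64.v2f16(<2 x half> %src0)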
15 changes: 7 additions & 8 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2355,16 +2355,16 @@ def int_amdgcn_pops_exiting_wave_id :
//===----------------------------------------------------------------------===//

// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">,
Intrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
def int_amdgcn_permlane16 :
Intrinsic<[llvm_any_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">,
Intrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
def int_amdgcn_permlanex16 :
Intrinsic<[llvm_any_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;

@@ -2407,8 +2407,7 @@ def int_amdgcn_image_bvh_intersect_ray :

// llvm.amdgcn.permlane64 <src0>
def int_amdgcn_permlane64 :
ClangBuiltin<"__builtin_amdgcn_permlane64">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_ds_add_gs_reg_rtn :
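As a sketch, the overloaded definitions above imply declarations along these lines: the result, <old> and <src0> share one type (LLVMMatchType<0>), while the lane selects stay i32 and the fi/bound_control flags stay i1 immediates (the ClangBuiltin bindings move into CGBuiltin.cpp above):

    ; result, old and src0 carry the overloaded type; the mangled suffix names that type
    declare i64       @llvm.amdgcn.permlane16.i64(i64, i64, i32, i32, i1 immarg, i1 immarg)
    declare <2 x i16> @llvm.amdgcn.permlanex16.v2i16(<2 x i16>, <2 x i16>, i32, i32, i1 immarg, i1 immarg)
    declare double    @llvm.amdgcn.permlane64.f64(double)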
6 changes: 3 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -404,7 +404,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
assert(ST->hasPermLaneX16());
V = B.CreateBitCast(V, IntNTy);
Value *Permlanex16Call = B.CreateIntrinsic(
Intrinsic::amdgcn_permlanex16, {},
V->getType(), Intrinsic::amdgcn_permlanex16,
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
B.CreateBitCast(Permlanex16Call, AtomicTy));
@@ -416,7 +416,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
// Reduce across the upper and lower 32 lanes.
V = B.CreateBitCast(V, IntNTy);
Value *Permlane64Call =
B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V);
B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
B.CreateBitCast(Permlane64Call, AtomicTy));
}
@@ -472,7 +472,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
assert(ST->hasPermLaneX16());
V = B.CreateBitCast(V, IntNTy);
Value *PermX = B.CreateIntrinsic(
Intrinsic::amdgcn_permlanex16, {},
V->getType(), Intrinsic::amdgcn_permlanex16,
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});

Value *UpdateDPPCall =
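A rough sketch, assuming an i64 add reduction, of the cross-lane sequence these CreateIntrinsic calls can now build without being pinned to the i32 intrinsics (the surrounding bitcasts from the code above are omitted):

    ; reduce across the upper and lower 16 lanes of each row
    %x   = call i64 @llvm.amdgcn.permlanex16.i64(i64 %v, i64 %v, i32 -1, i32 -1, i1 false, i1 false)
    %r16 = add i64 %v, %x
    ; wave64 only: reduce across the upper and lower 32 lanes
    %swp = call i64 @llvm.amdgcn.permlane64.i64(i64 %r16)
    %red = add i64 %r16, %swp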
49 changes: 40 additions & 9 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5438,16 +5438,32 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
MachineIRBuilder &B = Helper.MIRBuilder;
MachineRegisterInfo &MRI = *B.getMRI();

auto createLaneOp = [&IID, &B](Register Src0, Register Src1, Register Src2,
LLT VT) -> Register {
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
IID == Intrinsic::amdgcn_permlanex16;

auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
Register Src2, LLT VT) -> Register {
auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
switch (IID) {
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_permlane64:
return LaneOp.getReg(0);
case Intrinsic::amdgcn_readlane:
return LaneOp.addUse(Src1).getReg(0);
case Intrinsic::amdgcn_writelane:
return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16: {
Register Src3 = MI.getOperand(5).getReg();
int64_t Src4 = MI.getOperand(6).getImm();
int64_t Src5 = MI.getOperand(7).getImm();
return LaneOp.addUse(Src1)
.addUse(Src2)
.addUse(Src3)
.addImm(Src4)
.addImm(Src5)
.getReg(0);
}
default:
llvm_unreachable("unhandled lane op");
}
@@ -5456,9 +5472,10 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
Register DstReg = MI.getOperand(0).getReg();
Register Src0 = MI.getOperand(2).getReg();
Register Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
IsPermLane16) {
Src1 = MI.getOperand(3).getReg();
if (IID == Intrinsic::amdgcn_writelane) {
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
Src2 = MI.getOperand(4).getReg();
}
}
@@ -5473,12 +5490,15 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,

if (Size < 32) {
Src0 = B.buildAnyExt(S32, Src0).getReg(0);
if (Src2.isValid())

if (IsPermLane16)
Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);

if (IID == Intrinsic::amdgcn_writelane)
Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);

Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
B.buildTrunc(DstReg, LaneOpDst);

MI.eraseFromParent();
return true;
}
@@ -5505,15 +5525,23 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
SmallVector<Register, 2> PartialRes;
unsigned NumParts = Size / 32;
MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
MachineInstrBuilder Src2Parts;
MachineInstrBuilder Src1Parts, Src2Parts;

if (IsPermLane16)
Src1Parts = B.buildUnmerge(PartialResTy, Src1);

if (Src2.isValid())
if (IID == Intrinsic::amdgcn_writelane)
Src2Parts = B.buildUnmerge(PartialResTy, Src2);

for (unsigned i = 0; i < NumParts; ++i) {
Src0 = Src0Parts.getReg(i);
if (Src2.isValid())

if (IsPermLane16)
Src1 = Src1Parts.getReg(i);

if (IID == Intrinsic::amdgcn_writelane)
Src2 = Src2Parts.getReg(i);

PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
}

@@ -7465,6 +7493,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_writelane:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
return legalizeLaneOp(Helper, MI, IntrID);
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
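The legalizer works on MIR, but conceptually the 64-bit split it performs for permlane16/permlanex16 is equivalent to the IR-level sketch below: both <old> and <src0> are unmerged into 32-bit halves, the lane op runs per half, and the halves are merged back (names are illustrative):

    %old.v  = bitcast i64 %old to <2 x i32>
    %src.v  = bitcast i64 %src0 to <2 x i32>
    ; low half
    %old.lo = extractelement <2 x i32> %old.v, i32 0
    %src.lo = extractelement <2 x i32> %src.v, i32 0
    %lo = call i32 @llvm.amdgcn.permlane16.i32(i32 %old.lo, i32 %src.lo, i32 %sel0, i32 %sel1, i1 false, i1 false)
    ; high half
    %old.hi = extractelement <2 x i32> %old.v, i32 1
    %src.hi = extractelement <2 x i32> %src.v, i32 1
    %hi = call i32 @llvm.amdgcn.permlane16.i32(i32 %old.hi, i32 %src.hi, i32 %sel0, i32 %sel1, i1 false, i1 false)
    ; merge the halves back into the i64 result
    %res.v0 = insertelement <2 x i32> poison, i32 %lo, i32 0
    %res.v1 = insertelement <2 x i32> %res.v0, i32 %hi, i32 1
    %res    = bitcast <2 x i32> %res.v1 to i64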
68 changes: 49 additions & 19 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6119,28 +6119,38 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
EVT VT = N->getValueType(0);
unsigned ValSize = VT.getSizeInBits();
unsigned IID = N->getConstantOperandVal(0);
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
IID == Intrinsic::amdgcn_permlanex16;
SDLoc SL(N);
MVT IntVT = MVT::getIntegerVT(ValSize);

auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
SDValue Src2, MVT ValT) -> SDValue {
SmallVector<SDValue, 8> Operands;
Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
switch (IID) {
case Intrinsic::amdgcn_readfirstlane:
Operands.push_back(Src0);
break;
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
Operands.push_back(N->getOperand(6));
Operands.push_back(N->getOperand(5));
Operands.push_back(N->getOperand(4));
[[fallthrough]];
case Intrinsic::amdgcn_writelane:
Operands.push_back(Src2);
[[fallthrough]];
case Intrinsic::amdgcn_readlane:
Operands.push_back(Src0);
Operands.push_back(Src1);
break;
case Intrinsic::amdgcn_writelane:
[[fallthrough]];
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_permlane64:
Operands.push_back(Src0);
Operands.push_back(Src1);
Operands.push_back(Src2);
break;
default:
llvm_unreachable("unhandled lane op");
}

Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
std::reverse(Operands.begin(), Operands.end());

if (SDNode *GL = N->getGluedNode()) {
assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
GL = GL->getOperand(0).getNode();
@@ -6153,9 +6163,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,

SDValue Src0 = N->getOperand(1);
SDValue Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
IsPermLane16) {
Src1 = N->getOperand(2);
if (IID == Intrinsic::amdgcn_writelane)
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
Src2 = N->getOperand(3);
}

@@ -6168,10 +6179,17 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
bool IsFloat = VT.isFloatingPoint();
Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
SL, MVT::i32);
if (Src2.getNode()) {

if (IsPermLane16) {
Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
SL, MVT::i32);
}

if (IID == Intrinsic::amdgcn_writelane) {
Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
SL, MVT::i32);
}

SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
@@ -6233,17 +6251,23 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
case MVT::bf16: {
MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
SmallVector<SDValue, 4> Pieces;
SDValue Src0SubVec, Src1SubVec, Src2SubVec;
for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
SDValue Src0SubVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
DAG.getConstant(EltIdx, SL, MVT::i32));
Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
DAG.getConstant(EltIdx, SL, MVT::i32));

SDValue Src2SubVec;
if (Src2)
if (IsPermLane16)
Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
DAG.getConstant(EltIdx, SL, MVT::i32));

if (IID == Intrinsic::amdgcn_writelane)
Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
DAG.getConstant(EltIdx, SL, MVT::i32));

Pieces.push_back(createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
Pieces.push_back(
IsPermLane16
? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
: createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
EltIdx += 2;
}
return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
Expand All @@ -6257,7 +6281,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
Src0 = DAG.getBitcast(VecVT, Src0);

if (Src2)
if (IsPermLane16)
Src1 = DAG.getBitcast(VecVT, Src1);

if (IID == Intrinsic::amdgcn_writelane)
Src2 = DAG.getBitcast(VecVT, Src2);

SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
@@ -8734,6 +8761,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_writelane:
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
return lowerLaneOp(*this, Op.getNode(), DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
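For vectors of 16-bit elements wider than two lanes, the DAG path above extracts <2 x ...> subvectors; for permlane16/permlanex16 both <old> and <src0> are split per piece, while writelane keeps splitting its third operand. A conceptual sketch for a <4 x half> permlanex16:

    ; take the low two elements of old and src0
    %old.lo = shufflevector <4 x half> %old, <4 x half> poison, <2 x i32> <i32 0, i32 1>
    %src.lo = shufflevector <4 x half> %src0, <4 x half> poison, <2 x i32> <i32 0, i32 1>
    %lo = call <2 x half> @llvm.amdgcn.permlanex16.v2f16(<2 x half> %old.lo, <2 x half> %src.lo, i32 %sel0, i32 %sel1, i1 false, i1 false)
    ; take the high two elements
    %old.hi = shufflevector <4 x half> %old, <4 x half> poison, <2 x i32> <i32 2, i32 3>
    %src.hi = shufflevector <4 x half> %src0, <4 x half> poison, <2 x i32> <i32 2, i32 3>
    %hi = call <2 x half> @llvm.amdgcn.permlanex16.v2f16(<2 x half> %old.hi, <2 x half> %src.hi, i32 %sel0, i32 %sel1, i1 false, i1 false)
    ; reassemble the four elements in order
    %res = shufflevector <2 x half> %lo, <2 x half> %hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>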
10 changes: 7 additions & 3 deletions llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -732,9 +732,7 @@ def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1
let SubtargetPredicate = isGFX11Plus in {
// Restrict src0 to be VGPR
def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
getVOP1Pat<int_amdgcn_permlane64,
VOP_MOVRELS>.ret,
/*VOP1Only=*/ 1> {
[], /*VOP1Only=*/ 1> {
let IsInvalidSingleUseConsumer = 1;
let IsInvalidSingleUseProducer = 1;
}
@@ -744,6 +742,12 @@ let SubtargetPredicate = isGFX11Plus in {
defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>;
} // End SubtargetPredicate = isGFX11Plus

foreach vt = Reg32Types.types in {
def : GCNPat<(int_amdgcn_permlane64 (vt VRegSrc_32:$src0)),
(vt (V_PERMLANE64_B32 (vt VRegSrc_32:$src0)))
>;
}

//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
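The foreach pattern above selects V_PERMLANE64_B32 for each 32-bit register type rather than only i32, so calls such as the following hedged examples (assuming the usual 32-bit types in Reg32Types, including 32-bit pointers) all map to the same instruction:

    ; any 32-bit value class uses the one hardware opcode
    %a = call float @llvm.amdgcn.permlane64.f32(float %x)
    %b = call <2 x i16> @llvm.amdgcn.permlane64.v2i16(<2 x i16> %y)
    %c = call ptr addrspace(3) @llvm.amdgcn.permlane64.p3(ptr addrspace(3) %p)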