Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2470,7 +2470,7 @@ def int_amdgcn_s_quadmask :
def int_amdgcn_s_wqm :
DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>;

class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
class AMDGPUWaveReduce<LLVMType data_ty = llvm_any_ty> : Intrinsic<
[data_ty],
[
LLVMMatchType<0>, // llvm value to reduce (SGPR/VGPR)
Expand Down
49 changes: 45 additions & 4 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5338,6 +5338,15 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
return std::numeric_limits<uint32_t>::min();
case AMDGPU::S_AND_B32:
return std::numeric_limits<uint32_t>::max();
case AMDGPU::V_MIN_F32_e64:
case AMDGPU::V_MAX_F32_e64: {
// Modeled similar to llvm.maxnum/minnum intrinsics
float QNaN = std::numeric_limits<float>::quiet_NaN();
uint32_t bits;
assert(sizeof(bits) == sizeof(QNaN) && "Huh?");
std::memcpy(&bits, &QNaN, sizeof(bits));
return bits;
}
default:
llvm_unreachable(
"Unexpected opcode in getIdentityValueFor32BitWaveReduction");
Expand Down Expand Up @@ -5372,7 +5381,12 @@ static bool is32bitWaveReduceOperation(unsigned Opc) {
Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
Opc == AMDGPU::S_XOR_B32;
Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
Opc == AMDGPU::V_MAX_F32_e64;
}

/// Returns true when \p Opc is one of the floating-point opcodes handled by
/// the wave-reduce lowering (currently V_MIN_F32_e64 and V_MAX_F32_e64), and
/// false for every other opcode.
static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
switch (Opc) {
case AMDGPU::V_MIN_F32_e64:
case AMDGPU::V_MAX_F32_e64:
return true;
default:
return false;
}
}

static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
Expand All @@ -5393,8 +5407,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
switch (Opc) {
case AMDGPU::S_MIN_U32:
case AMDGPU::S_MIN_I32:
case AMDGPU::V_MIN_F32_e64:
case AMDGPU::S_MAX_U32:
case AMDGPU::S_MAX_I32:
case AMDGPU::V_MAX_F32_e64:
case AMDGPU::S_AND_B32:
case AMDGPU::S_OR_B32: {
// Idempotent operations.
Expand Down Expand Up @@ -5590,6 +5606,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
MachineBasicBlock::iterator I = BB.end();
Register SrcReg = MI.getOperand(1).getReg();
bool is32BitOpc = is32bitWaveReduceOperation(Opc);
bool isFPOp = isFloatingPointWaveReduceOperation(Opc);

// Create Control flow for loop
// Split MI's Machine Basic block into For loop
Expand Down Expand Up @@ -5649,9 +5666,29 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
LaneValueReg)
.addReg(SrcReg)
.addReg(FF1Reg);
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
.addReg(Accumulator->getOperand(0).getReg())
.addReg(LaneValueReg);
if (isFPOp) {
Register LaneValVreg =
MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
// Get the Lane Value in VGPR to avoid the Constant Bus Restriction
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
LaneValVreg)
.addReg(LaneValueReg);
BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
.addImm(0) // src0 modifier
.addReg(Accumulator->getOperand(0).getReg())
.addImm(0) // src1 modifier
.addReg(LaneValVreg)
.addImm(0) // clamp
.addImm(0); // omod
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
.addReg(DstVreg);
} else {
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
.addReg(Accumulator->getOperand(0).getReg())
.addReg(LaneValueReg);
}
} else {
Register LaneValueLoReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
Expand Down Expand Up @@ -5780,6 +5817,8 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_F32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
Expand All @@ -5788,6 +5827,8 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_F32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
Expand Down
5 changes: 4 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,7 @@ class WaveReduceOp<string OpName, string TypeStr, ValueType Ty,

// Input list : [Operation_name,
// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
// bit-width
// input-type
// output register class,
// input register class]
defvar Operations = [
Expand All @@ -372,6 +372,9 @@ defvar Operations = [
WaveReduceOp<"and", "B64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"or", "B64", i64, SGPR_64, VSrc_b64>,
WaveReduceOp<"xor", "B64", i64, SGPR_64, VSrc_b64>,

WaveReduceOp<"min", "F32", f32, SGPR_32, VSrc_b32>,
WaveReduceOp<"max", "F32", f32, SGPR_32, VSrc_b32>,
];

foreach Op = Operations in {
Expand Down
Loading