Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 65 additions & 1 deletion llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5522,6 +5522,9 @@ static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
return std::numeric_limits<uint64_t>::min();
case AMDGPU::V_CMP_GT_I64_e64: // max.i64
return std::numeric_limits<int64_t>::min();
case AMDGPU::V_MIN_F64_e64:
case AMDGPU::V_MAX_F64_e64:
return 0x7FF8000000000000; // qNAN
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO:
case AMDGPU::S_OR_B64:
Expand All @@ -5547,7 +5550,8 @@ static bool is32bitWaveReduceOperation(unsigned Opc) {

// Returns true if \p Opc is one of the floating-point VALU opcodes used to
// lower wave-reduce pseudos (f32 min/max/add/sub and f64 min/max).
static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::V_MIN_F32_e64:
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_SUB_F32_e64:
  case AMDGPU::V_MIN_F64_e64:
  case AMDGPU::V_MAX_F64_e64:
    return true;
  default:
    return false;
  }
}

static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
Expand Down Expand Up @@ -5583,6 +5587,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
case AMDGPU::V_CMP_LT_I64_e64: // min
case AMDGPU::V_CMP_GT_U64_e64: // umax
case AMDGPU::V_CMP_GT_I64_e64: // max
case AMDGPU::V_MIN_F64_e64:
case AMDGPU::V_MAX_F64_e64:
case AMDGPU::S_AND_B64:
case AMDGPU::S_OR_B64: {
// Idempotent operations.
Expand Down Expand Up @@ -5952,6 +5958,60 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
.addReg(Accumulator->getOperand(0).getReg());
break;
}
case AMDGPU::V_MIN_F64_e64:
case AMDGPU::V_MAX_F64_e64: {
const TargetRegisterClass *VregRC = TRI->getVGPR64Class();
const TargetRegisterClass *VregSubRC =
TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
Register AccumulatorVReg = MRI.createVirtualRegister(VregRC);
Register DstVreg = MRI.createVirtualRegister(VregRC);
Register LaneValLo =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
Register LaneValHi =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B64_PSEUDO),
AccumulatorVReg)
.addReg(Accumulator->getOperand(0).getReg());
if (ST.getGeneration() == AMDGPUSubtarget::Generation::GFX12) {
switch (Opc) {
case AMDGPU::V_MIN_F64_e64:
Opc = AMDGPU::V_MIN_NUM_F64_e64;
break;
case AMDGPU::V_MAX_F64_e64:
Opc = AMDGPU::V_MAX_NUM_F64_e64;
break;
}
}
auto DstVregInst = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
.addImm(0) // src0 modifiers
.addReg(LaneValue->getOperand(0).getReg())
.addImm(0) // src1 modifiers
.addReg(AccumulatorVReg)
.addImm(0) // clamp
.addImm(0); // omod
auto ReadLaneLo =
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
LaneValLo);
auto ReadLaneHi =
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
LaneValHi);
MachineBasicBlock::iterator Iters = *ReadLaneLo;
MachineOperand Op1L =
TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0),
VregRC, AMDGPU::sub0, VregSubRC);
MachineOperand Op1H =
TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0),
VregRC, AMDGPU::sub1, VregSubRC);
ReadLaneLo.add(Op1L);
ReadLaneHi.add(Op1H);
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
.addReg(LaneValLo)
.addImm(AMDGPU::sub0)
.addReg(LaneValHi)
.addImm(AMDGPU::sub1);
break;
}
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO: {
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
Expand Down Expand Up @@ -6009,6 +6069,8 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F64_e64);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
Expand All @@ -6019,6 +6081,8 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F64_e64);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,9 @@ defvar Operations = [
WaveReduceOp<"xor", "B64", i64, SGPR_64, VSrc_b64>,

WaveReduceOp<"fmin", "F32", f32, SGPR_32, VSrc_b32>,
WaveReduceOp<"fmin", "F64", f64, SGPR_64, VSrc_b64>,
WaveReduceOp<"fmax", "F32", f32, SGPR_32, VSrc_b32>,
WaveReduceOp<"fmax", "F64", f64, SGPR_64, VSrc_b64>,
WaveReduceOp<"fadd", "F32", f32, SGPR_32, VSrc_b32>,
WaveReduceOp<"fsub", "F32", f32, SGPR_32, VSrc_b32>,
];
Expand Down
Loading