Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 37 additions & 3 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5330,11 +5330,13 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
case AMDGPU::S_MAX_U32:
return std::numeric_limits<uint32_t>::min();
case AMDGPU::S_MAX_I32:
case AMDGPU::V_SUB_F32_e64: // +0.0
return std::numeric_limits<int32_t>::min();
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32:
case AMDGPU::S_OR_B32:
case AMDGPU::S_XOR_B32:
case AMDGPU::V_ADD_F32_e64: // -0.0
return std::numeric_limits<uint32_t>::min();
case AMDGPU::S_AND_B32:
return std::numeric_limits<uint32_t>::max();
Expand Down Expand Up @@ -5382,11 +5384,13 @@ static bool is32bitWaveReduceOperation(unsigned Opc) {
Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
Opc == AMDGPU::V_MAX_F32_e64;
Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
Opc == AMDGPU::V_SUB_F32_e64;
}

/// Returns true when \p Opc is one of the f32 VALU opcodes that a wave
/// reduction can be lowered with: V_MIN_F32_e64, V_MAX_F32_e64,
/// V_ADD_F32_e64 or V_SUB_F32_e64.
///
/// \param Opc Machine opcode selected for the reduction.
/// \returns true for the four floating-point opcodes above, false otherwise.
static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
  // V_ADD/V_SUB must be listed next to min/max: the F32 add/sub wave-reduce
  // pseudos are lowered with these opcodes, so they have to be classified as
  // floating-point here as well.
  return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
         Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
}

static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
Expand Down Expand Up @@ -5433,8 +5437,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
case AMDGPU::S_XOR_B64:
case AMDGPU::S_ADD_I32:
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::V_ADD_F32_e64:
case AMDGPU::S_SUB_I32:
case AMDGPU::S_SUB_U64_PSEUDO: {
case AMDGPU::S_SUB_U64_PSEUDO:
case AMDGPU::V_SUB_F32_e64: {
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
Expand Down Expand Up @@ -5589,6 +5595,30 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
.addImm(AMDGPU::sub1);
break;
}
case AMDGPU::V_ADD_F32_e64:
case AMDGPU::V_SUB_F32_e64: {
Register ActiveLanesVreg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
// Get number of active lanes as a float val.
BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64),
ActiveLanesVreg)
.addReg(NewAccumulator->getOperand(0).getReg())
.addImm(0) // clamp
.addImm(0); // output-modifier

// Take negation of input for SUB reduction
unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg)
.addImm(srcMod) // src0 modifier
.addReg(SrcReg)
.addImm(0) // src1 modifier
.addReg(ActiveLanesVreg)
.addImm(0) // clamp
.addImm(0); // output-mod
BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
.addReg(DstVreg);
}
}
RetBB = &BB;
}
Expand Down Expand Up @@ -5833,10 +5863,14 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_F32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_F32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,8 @@ defvar Operations = [

WaveReduceOp<"min", "F32", f32, SGPR_32, VSrc_b32>,
WaveReduceOp<"max", "F32", f32, SGPR_32, VSrc_b32>,
WaveReduceOp<"add", "F32", f32, SGPR_32, VSrc_b32>,
WaveReduceOp<"sub", "F32", f32, SGPR_32, VSrc_b32>,
];

foreach Op = Operations in {
Expand Down
Loading
Loading