Skip to content

Commit 9165bca

Browse files
committed
[AMDGPU] Add wave reduce intrinsics for float types - 2
Supported Ops: `fadd`, `fsub`
1 parent 38fcd47 commit 9165bca

File tree

2 files changed

+53
-3
lines changed

2 files changed

+53
-3
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5330,11 +5330,14 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
53305330
case AMDGPU::S_MAX_U32:
53315331
return std::numeric_limits<uint32_t>::min();
53325332
case AMDGPU::S_MAX_I32:
5333+
case AMDGPU::V_SUB_F32_e64: // +0.0
53335334
return std::numeric_limits<int32_t>::min();
53345335
case AMDGPU::S_ADD_I32:
53355336
case AMDGPU::S_SUB_I32:
53365337
case AMDGPU::S_OR_B32:
53375338
case AMDGPU::S_XOR_B32:
5339+
case AMDGPU::V_ADD_F32_e64: // -0.0
5340+
// return 0x00000000; // -0.0
53385341
return std::numeric_limits<uint32_t>::min();
53395342
case AMDGPU::S_AND_B32:
53405343
return std::numeric_limits<uint32_t>::max();
@@ -5346,6 +5349,13 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
53465349
std::memcpy(&bits, &nanf, sizeof(bits));
53475350
return bits;
53485351
}
5352+
// case AMDGPU::V_SUB_F32_e64: {
5353+
// float nanf = std::numeric_limits<float>::zero();
5354+
// uint32_t bits;
5355+
// assert(sizeof(bits) == sizeof(nanf) && "Huh?");
5356+
// std::memcpy(&bits, &nanf, sizeof(bits));
5357+
// return 0x80000000; // +0.0
5358+
// }
53495359
default:
53505360
llvm_unreachable(
53515361
"Unexpected opcode in getIdentityValueFor32BitWaveReduction");
@@ -5381,11 +5391,13 @@ static bool is32bitWaveReduceOperation(unsigned Opc) {
53815391
Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
53825392
Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
53835393
Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5384-
Opc == AMDGPU::V_MAX_F32_e64;
5394+
Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5395+
Opc == AMDGPU::V_SUB_F32_e64;
53855396
}
53865397

53875398
/// Returns true when \p Opc is one of the f32 opcodes compared against below:
/// V_MIN_F32_e64, V_MAX_F32_e64, V_ADD_F32_e64 or V_SUB_F32_e64.
/// In this diff view the line marked `-` is the removed min/max-only return,
/// and the lines marked `+` are its replacement, which also accepts add/sub.
static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
5388-
return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64;
5399+
return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5400+
Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
53895401
}
53905402

53915403
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
@@ -5432,8 +5444,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
54325444
case AMDGPU::S_XOR_B64:
54335445
case AMDGPU::S_ADD_I32:
54345446
case AMDGPU::S_ADD_U64_PSEUDO:
5447+
case AMDGPU::V_ADD_F32_e64:
54355448
case AMDGPU::S_SUB_I32:
5436-
case AMDGPU::S_SUB_U64_PSEUDO: {
5449+
case AMDGPU::S_SUB_U64_PSEUDO:
5450+
case AMDGPU::V_SUB_F32_e64: {
54375451
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
54385452
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
54395453
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
@@ -5588,6 +5602,36 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
55885602
.addImm(AMDGPU::sub1);
55895603
break;
55905604
}
5605+
case AMDGPU::V_ADD_F32_e64:
5606+
case AMDGPU::V_SUB_F32_e64: {
5607+
/// For FP ops: the number of active bits is an int, while src is a float.
5608+
/// Convert the int to float, then multiply. There is only V_MUL_F32, so copy to a VGPR.
5609+
/// /home/aalokdes/dockerx/work/llvm-trunk/llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fadd.s32.mir
5610+
/// Source modifiers, I guess: 1 (01) -> negation, 2 (10) -> abs, 3 (11) -> abs and negation.
5611+
// V_CVT_F32_I32_e64
5612+
// get #active lanes in vgpr
5613+
Register ActiveLanesVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5614+
Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5615+
BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64), ActiveLanesVreg)
5616+
// .addReg(SrcReg)
5617+
.addReg(NewAccumulator->getOperand(0).getReg())
5618+
.addImm(0) // clamp
5619+
.addImm(0); // output-modifier
5620+
5621+
// Multiply numactivelanes * src
5622+
// Take negation of input for SUB reduction
5623+
unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0; // check this to make sure i am taking negation
5624+
BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg)
5625+
.addImm(srcMod) // src0 modifier
5626+
.addReg(SrcReg)
5627+
.addImm(0) // src1 modifier
5628+
.addReg(ActiveLanesVreg)
5629+
.addImm(0) // clamp
5630+
.addImm(0); // output-mod
5631+
BuildMI(BB, MI, DL,
5632+
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5633+
.addReg(DstVreg);
5634+
}
55915635
}
55925636
RetBB = &BB;
55935637
}
@@ -5832,10 +5876,14 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
58325876
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
58335877
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
58345878
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
5879+
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_F32:
5880+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
58355881
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
58365882
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
58375883
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
58385884
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
5885+
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_F32:
5886+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
58395887
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
58405888
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
58415889
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,8 @@ defvar Operations = [
375375

376376
WaveReduceOp<"min", "F32", f32, SGPR_32, VSrc_b32>,
377377
WaveReduceOp<"max", "F32", f32, SGPR_32, VSrc_b32>,
378+
WaveReduceOp<"add", "F32", f32, SGPR_32, VSrc_b32>,
379+
WaveReduceOp<"sub", "F32", f32, SGPR_32, VSrc_b32>,
378380
];
379381

380382
foreach Op = Operations in {

0 commit comments

Comments
 (0)