@@ -5330,11 +5330,14 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
5330
5330
case AMDGPU::S_MAX_U32:
5331
5331
return std::numeric_limits<uint32_t>::min();
5332
5332
case AMDGPU::S_MAX_I32:
5333
+ case AMDGPU::V_SUB_F32_e64: // +0.0
5333
5334
return std::numeric_limits<int32_t>::min();
5334
5335
case AMDGPU::S_ADD_I32:
5335
5336
case AMDGPU::S_SUB_I32:
5336
5337
case AMDGPU::S_OR_B32:
5337
5338
case AMDGPU::S_XOR_B32:
5339
+ case AMDGPU::V_ADD_F32_e64: // -0.0
5340
+ // return 0x00000000; // +0.0 (all-zero bit pattern)
5338
5341
return std::numeric_limits<uint32_t>::min();
5339
5342
case AMDGPU::S_AND_B32:
5340
5343
return std::numeric_limits<uint32_t>::max();
@@ -5346,6 +5349,13 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
5346
5349
std::memcpy(&bits, &nanf, sizeof(bits));
5347
5350
return bits;
5348
5351
}
5352
+ // case AMDGPU::V_SUB_F32_e64: {
5353
+ // float nanf = std::numeric_limits<float>::zero();
5354
+ // uint32_t bits;
5355
+ // assert(sizeof(bits) == sizeof(nanf) && "Huh?");
5356
+ // std::memcpy(&bits, &nanf, sizeof(bits));
5357
+ // return 0x80000000; // -0.0 (sign bit set)
5358
+ // }
5349
5359
default:
5350
5360
llvm_unreachable(
5351
5361
"Unexpected opcode in getIdentityValueFor32BitWaveReduction");
@@ -5381,11 +5391,13 @@ static bool is32bitWaveReduceOperation(unsigned Opc) {
5381
5391
Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5382
5392
Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5383
5393
Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5384
- Opc == AMDGPU::V_MAX_F32_e64;
5394
+ Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5395
+ Opc == AMDGPU::V_SUB_F32_e64;
5385
5396
}
5386
5397
5387
5398
static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
5388
- return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64;
5399
+ return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5400
+ Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
5389
5401
}
5390
5402
5391
5403
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
@@ -5432,8 +5444,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5432
5444
case AMDGPU::S_XOR_B64:
5433
5445
case AMDGPU::S_ADD_I32:
5434
5446
case AMDGPU::S_ADD_U64_PSEUDO:
5447
+ case AMDGPU::V_ADD_F32_e64:
5435
5448
case AMDGPU::S_SUB_I32:
5436
- case AMDGPU::S_SUB_U64_PSEUDO: {
5449
+ case AMDGPU::S_SUB_U64_PSEUDO:
5450
+ case AMDGPU::V_SUB_F32_e64: {
5437
5451
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5438
5452
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5439
5453
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
@@ -5588,6 +5602,36 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5588
5602
.addImm(AMDGPU::sub1);
5589
5603
break;
5590
5604
}
5605
+ case AMDGPU::V_ADD_F32_e64:
5606
+ case AMDGPU::V_SUB_F32_e64: {
5607
+ /// for FPop: #activebits: int, src: float.
5608
+ /// Convert the active-lane count from int to float, then multiply; only V_MUL_F32 exists, so the value is copied to a VGPR.
5609
+ /// See llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fadd.s32.mir
5610
+ /// ig: 1(01) -> negation, 2(10) -> abs, 3(11) -> abs and neg
5611
+ // V_CVT_F32_I32_e64
5612
+ // get #active lanes in vgpr
5613
+ Register ActiveLanesVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5614
+ Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5615
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64), ActiveLanesVreg)
5616
+ // .addReg(SrcReg)
5617
+ .addReg(NewAccumulator->getOperand(0).getReg())
5618
+ .addImm(0) // clamp
5619
+ .addImm(0); // output-modifier
5620
+
5621
+ // Multiply numactivelanes * src
5622
+ // Take negation of input for SUB reduction
5623
+ unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0; // check this to make sure i am taking negation
5624
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg)
5625
+ .addImm(srcMod) // src0 modifier
5626
+ .addReg(SrcReg)
5627
+ .addImm(0) // src1 modifier
5628
+ .addReg(ActiveLanesVreg)
5629
+ .addImm(0) // clamp
5630
+ .addImm(0); // output-mod
5631
+ BuildMI(BB, MI, DL,
5632
+ TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5633
+ .addReg(DstVreg);
5634
+ }
5591
5635
}
5592
5636
RetBB = &BB;
5593
5637
}
@@ -5832,10 +5876,14 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
5832
5876
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5833
5877
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5834
5878
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
5879
+ case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_F32:
5880
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
5835
5881
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5836
5882
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5837
5883
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5838
5884
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
5885
+ case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_F32:
5886
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
5839
5887
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5840
5888
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5841
5889
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
0 commit comments