[AMDGPU] Add wave reduce intrinsics for double types - 2 #170812
base: users/easyonaadit/amdgpu/wave-reduce-intrinsics-double-min-max
Conversation
Warning: This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite.
Force-pushed from 77d3978 to 4a5ed5f
Supported Ops: `add`, `sub`
Force-pushed from fbdfe55 to e163cb4
Force-pushed from 4a5ed5f to 4e5c83b
🐧 Linux x64 Test Results
Failed tests (click on a test name to see its output):
- LLVM.CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll
- LLVM.CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll
If these failures are unrelated to your changes (for example, tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the …
@llvm/pr-subscribers-backend-amdgpu

Author: Aaditya (easyonaadit)

Changes
Supported Ops: `add`, `sub`

Patch is 118.25 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/170812.diff

4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 80978c6a00a9c..239504788fcf5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5532,6 +5532,10 @@ static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
return std::numeric_limits<uint64_t>::min();
case AMDGPU::S_AND_B64:
return std::numeric_limits<uint64_t>::max();
+ case AMDGPU::V_ADD_F64_e64:
+ return 0x8000000000000000; // -0.0
+ case AMDGPU::V_SUB_F16_e64:
+ return 0x0; // +0.0
default:
llvm_unreachable(
"Unexpected opcode in getIdentityValueFor64BitWaveReduction");
@@ -5551,7 +5555,8 @@ static bool is32bitWaveReduceOperation(unsigned Opc) {
static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64 ||
- Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64;
+ Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 ||
+ Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_SUB_F16_e64;
}
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
@@ -5601,9 +5606,11 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
case AMDGPU::S_ADD_I32:
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::V_ADD_F32_e64:
+ case AMDGPU::V_ADD_F64_e64:
case AMDGPU::S_SUB_I32:
case AMDGPU::S_SUB_U64_PSEUDO:
- case AMDGPU::V_SUB_F32_e64: {
+ case AMDGPU::V_SUB_F32_e64:
+ case AMDGPU::V_SUB_F16_e64: {
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
@@ -5759,28 +5766,70 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
break;
}
case AMDGPU::V_ADD_F32_e64:
- case AMDGPU::V_SUB_F32_e64: {
- Register ActiveLanesVreg =
- MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ case AMDGPU::V_ADD_F64_e64:
+ case AMDGPU::V_SUB_F32_e64:
+ case AMDGPU::V_SUB_F16_e64: {
+ bool is32BitOpc = is32bitWaveReduceOperation(Opc);
+ const TargetRegisterClass *VregRC =
+ is32BitOpc ? &AMDGPU::VGPR_32RegClass : TRI->getVGPR64Class();
+ Register ActiveLanesVreg = MRI.createVirtualRegister(VregRC);
+ Register DstVreg = MRI.createVirtualRegister(VregRC);
// Get number of active lanes as a float val.
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64),
+ BuildMI(BB, MI, DL,
+ TII->get(is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
+ : AMDGPU::V_CVT_F64_I32_e64),
ActiveLanesVreg)
.addReg(NewAccumulator->getOperand(0).getReg())
.addImm(0) // clamp
.addImm(0); // output-modifier
// Take negation of input for SUB reduction
- unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg)
- .addImm(srcMod) // src0 modifier
- .addReg(SrcReg)
- .addImm(0) // src1 modifier
- .addReg(ActiveLanesVreg)
- .addImm(0) // clamp
- .addImm(0); // output-mod
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
- .addReg(DstVreg);
+ unsigned srcMod =
+ Opc == AMDGPU::V_SUB_F32_e64 || Opc == AMDGPU::V_SUB_F16_e64 ? 1
+ : 0;
+ unsigned MulOpc =
+ is32BitOpc ? AMDGPU::V_MUL_F32_e64
+ : ST.getGeneration() == AMDGPUSubtarget::Generation::GFX12
+ ? AMDGPU::V_MUL_F64_e64_gfx12
+ : AMDGPU::V_MUL_F64_e64;
+ auto DestVregInst = BuildMI(BB, MI, DL, TII->get(MulOpc),
+ DstVreg)
+ .addImm(srcMod) // src0 modifier
+ .addReg(SrcReg)
+ .addImm(0) // src1 modifier
+ .addReg(ActiveLanesVreg)
+ .addImm(0) // clamp
+ .addImm(0); // output-mod
+ if (is32BitOpc) {
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ .addReg(DstVreg);
+ } else {
+ Register LaneValueLoReg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register LaneValueHiReg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ const TargetRegisterClass *VregSubRC =
+ TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
+ MachineOperand Op1L =
+ TII->buildExtractSubRegOrImm(MI, MRI, DestVregInst->getOperand(0),
+ VregRC, AMDGPU::sub0, VregSubRC);
+ MachineOperand Op1H =
+ TII->buildExtractSubRegOrImm(MI, MRI, DestVregInst->getOperand(0),
+ VregRC, AMDGPU::sub1, VregSubRC);
+ // lane value input should be in an sgpr
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ LaneValueLoReg)
+ .add(Op1L);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ LaneValueHiReg)
+ .add(Op1H);
+ NewAccumulator =
+ BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+ .addReg(LaneValueLoReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(LaneValueHiReg)
+ .addImm(AMDGPU::sub1);
+ }
}
}
RetBB = &BB;
@@ -5959,7 +6008,9 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
break;
}
case AMDGPU::V_MIN_F64_e64:
- case AMDGPU::V_MAX_F64_e64: {
+ case AMDGPU::V_MAX_F64_e64:
+ case AMDGPU::V_ADD_F64_e64:
+ case AMDGPU::V_SUB_F16_e64: {
const TargetRegisterClass *VregRC = TRI->getVGPR64Class();
const TargetRegisterClass *VregSubRC =
TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
@@ -5972,6 +6023,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B64_PSEUDO),
AccumulatorVReg)
.addReg(Accumulator->getOperand(0).getReg());
+ unsigned Modifier = Opc == AMDGPU::V_SUB_F16_e64 ? 1 : 0;
+ Opc = Opc == AMDGPU::V_SUB_F16_e64 ? AMDGPU::V_ADD_F64_e64 : Opc;
if (ST.getGeneration() == AMDGPUSubtarget::Generation::GFX12) {
switch (Opc) {
case AMDGPU::V_MIN_F64_e64:
@@ -5980,10 +6033,14 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
case AMDGPU::V_MAX_F64_e64:
Opc = AMDGPU::V_MAX_NUM_F64_e64;
break;
+ case AMDGPU::V_ADD_F64_e64:
+ case AMDGPU::V_SUB_F16_e64:
+ Opc = AMDGPU::V_ADD_F64_e64_gfx12;
+ break;
}
}
auto DstVregInst = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
- .addImm(0) // src0 modifiers
+ .addImm(Modifier) // src0 modifiers
.addReg(LaneValue->getOperand(0).getReg())
.addImm(0) // src1 modifiers
.addReg(AccumulatorVReg)
@@ -6089,12 +6146,16 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
+ case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F64_e64);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
+ case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F16_e64);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 82a83e12649fb..81fec05d8da30 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -377,7 +377,9 @@ defvar Operations = [
WaveReduceOp<"fmax", "F32", f32, SGPR_32, VSrc_b32>,
WaveReduceOp<"fmax", "F64", f64, SGPR_64, VSrc_b64>,
WaveReduceOp<"fadd", "F32", f32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"fadd", "F64", f64, SGPR_64, VSrc_b64>,
WaveReduceOp<"fsub", "F32", f32, SGPR_32, VSrc_b32>,
+ WaveReduceOp<"fsub", "F64", f64, SGPR_64, VSrc_b64>,
];
foreach Op = Operations in {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll
index 5d408dc65d68b..56a789e8f027a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll
@@ -1014,6 +1014,1117 @@ endif:
store float %combine, ptr addrspace(1) %out
ret void
}
+
+define amdgpu_kernel void @uniform_value_double(ptr addrspace(1) %out, double %in) {
+; GFX8DAGISEL-LABEL: uniform_value_double:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8DAGISEL-NEXT: v_cvt_f64_i32_e32 v[0:1], s4
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8DAGISEL-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
+; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_double:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8GISEL-NEXT: v_cvt_f64_i32_e32 v[0:1], s4
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8GISEL-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
+; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_double:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9DAGISEL-NEXT: v_cvt_f64_i32_e32 v[0:1], s4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
+; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_double:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9GISEL-NEXT: v_cvt_f64_i32_e32 v[0:1], s4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
+; GFX9GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value_double:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX1064DAGISEL-NEXT: v_cvt_f64_i32_e32 v[0:1], s0
+; GFX1064DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
+; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value_double:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX1064GISEL-NEXT: v_cvt_f64_i32_e32 v[0:1], s0
+; GFX1064GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
+; GFX1064GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value_double:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s0, s0
+; GFX1032DAGISEL-NEXT: v_cvt_f64_i32_e32 v[0:1], s0
+; GFX1032DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
+; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value_double:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s0, s0
+; GFX1032GISEL-NEXT: v_cvt_f64_i32_e32 v[0:1], s0
+; GFX1032GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
+; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_double:
+; GFX1164DAGISEL: ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: v_cvt_f64_i32_e32 v[0:1], s0
+; GFX1164DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164DAGISEL-NEXT: s_endpgm
+;
+; GFX1164GISEL-LABEL: uniform_value_double:
+; GFX1164GISEL: ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: v_cvt_f64_i32_e32 v[0:1], s0
+; GFX1164GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1164GISEL-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_double:
+; GFX1132DAGISEL: ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s0, s0
+; GFX1132DAGISEL-NEXT: v_cvt_f64_i32_e32 v[0:1], s0
+; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
+; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132DAGISEL-NEXT: s_endpgm
+;
+; GFX1132GISEL-LABEL: uniform_value_double:
+; GFX1132GISEL: ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s0, s0
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_cvt_f64_i32_e32 v[0:1], s0
+; GFX1132GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1132GISEL-NEXT: s_endpgm
+;
+; GFX12DAGISEL-LABEL: uniform_value_double:
+; GFX12DAGISEL: ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s0, s0
+; GFX12DAGISEL-NEXT: v_cvt_f64_i32_e32 v[0:1], s0
+; GFX12DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12DAGISEL-NEXT...
[truncated]
case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
  return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
  return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F16_e64);
I couldn't find any V_SUB_F64 opcodes, so I have used V_SUB_F16_e64 for the moment. It does not affect the codegen in any way; it is just a placeholder for the switch statements.
I would appreciate suggestions for another opcode to use instead.
As a broader discussion point, should I introduce a V_SUB_F64_Pseudo in the backend?
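For context, the lowering treats the fsub reduction as an fadd reduction of the negated input (the src0 negation modifier in the patch above), which is why an add-family opcode can stand in for the missing sub. A minimal IR sketch of that equivalence follows; the intrinsic name and `(value, i32 strategy)` signature are assumptions based on the existing `llvm.amdgcn.wave.reduce.*` pattern, not taken from this patch.

```llvm
; Not part of the patch: illustrates why the placeholder "sub" opcode can be
; remapped to V_ADD_F64 with a negated source operand.
; Assumed intrinsic name/signature, mirroring the existing wave.reduce intrinsics.
declare double @llvm.amdgcn.wave.reduce.fadd.f64(double, i32)

define double @fsub_reduce_equivalent(double %x) {
  ; Subtracting every lane's %x from +0.0 equals adding every lane's -%x.
  %neg = fneg double %x
  %sum = call double @llvm.amdgcn.wave.reduce.fadd.f64(double %neg, i32 0)
  ret double %sum
}
```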

Supported Ops: `add`, `sub`
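A minimal usage sketch of the newly supported double reductions; the intrinsic name and the constant `i32` strategy operand are assumptions following the `llvm.amdgcn.wave.reduce.*` pattern used by the f32 tests, not taken from this patch.

```llvm
; Each active lane contributes %in; the reduced value is wave-uniform.
declare double @llvm.amdgcn.wave.reduce.fadd.f64(double, i32)

define amdgpu_kernel void @reduce_fadd_f64(ptr addrspace(1) %out, double %in) {
entry:
  ; Second operand is the lowering-strategy hint (assumed: 0 = default).
  %sum = call double @llvm.amdgcn.wave.reduce.fadd.f64(double %in, i32 0)
  store double %sum, ptr addrspace(1) %out
  ret void
}
```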