-
Notifications
You must be signed in to change notification settings - Fork 15.3k
Revert "[AMDGPU] Add wave reduce intrinsics for float types - 2 (#161815)" #168845
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Revert "[AMDGPU] Add wave reduce intrinsics for float types - 2 (#161815)" #168845
Conversation
This reverts commit dcab4cb.
|
@llvm/pr-subscribers-llvm-ir Author: Aaditya (easyonaadit) Changes: Revert "[AMDGPU] Add wave reduce intrinsics for float types - 2 (#161815)". This reverts commit dcab4cb. Patch is 101.27 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168845.diff 6 Files Affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index c2057ac3a14e6..539036b283498 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2482,7 +2482,7 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_any_ty> : Intrinsic<
multiclass AMDGPUWaveReduceOps {
foreach Op =
- ["umin", "fmin", "min", "umax", "fmax", "max", "add", "fadd", "sub", "fsub", "and", "or", "xor"] in {
+ ["umin", "fmin", "min", "umax", "fmax", "max", "add", "sub", "and", "or", "xor"] in {
def Op : AMDGPUWaveReduce;
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index a88e4b2a2a31d..c6131d6b3b050 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5214,9 +5214,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case Intrinsic::amdgcn_wave_reduce_add:
- case Intrinsic::amdgcn_wave_reduce_fadd:
case Intrinsic::amdgcn_wave_reduce_sub:
- case Intrinsic::amdgcn_wave_reduce_fsub:
case Intrinsic::amdgcn_wave_reduce_min:
case Intrinsic::amdgcn_wave_reduce_umin:
case Intrinsic::amdgcn_wave_reduce_fmin:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index fab0e3d84e7e7..875278a3b4f97 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5480,15 +5480,11 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
return std::numeric_limits<uint32_t>::min();
case AMDGPU::S_MAX_I32:
return std::numeric_limits<int32_t>::min();
- case AMDGPU::V_SUB_F32_e64: // +0.0
- return __builtin_bit_cast(uint32_t, +0.0f);
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32:
case AMDGPU::S_OR_B32:
case AMDGPU::S_XOR_B32:
return std::numeric_limits<uint32_t>::min();
- case AMDGPU::V_ADD_F32_e64: // -0.0
- return __builtin_bit_cast(uint32_t, -0.0f);
case AMDGPU::S_AND_B32:
return std::numeric_limits<uint32_t>::max();
case AMDGPU::V_MIN_F32_e64:
@@ -5529,13 +5525,11 @@ static bool is32bitWaveReduceOperation(unsigned Opc) {
Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
- Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
- Opc == AMDGPU::V_SUB_F32_e64;
+ Opc == AMDGPU::V_MAX_F32_e64;
}
static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
- return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
- Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
+ return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64;
}
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
@@ -5582,10 +5576,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
case AMDGPU::S_XOR_B64:
case AMDGPU::S_ADD_I32:
case AMDGPU::S_ADD_U64_PSEUDO:
- case AMDGPU::V_ADD_F32_e64:
case AMDGPU::S_SUB_I32:
- case AMDGPU::S_SUB_U64_PSEUDO:
- case AMDGPU::V_SUB_F32_e64: {
+ case AMDGPU::S_SUB_U64_PSEUDO: {
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
@@ -5740,30 +5732,6 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
.addImm(AMDGPU::sub1);
break;
}
- case AMDGPU::V_ADD_F32_e64:
- case AMDGPU::V_SUB_F32_e64: {
- Register ActiveLanesVreg =
- MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- // Get number of active lanes as a float val.
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64),
- ActiveLanesVreg)
- .addReg(NewAccumulator->getOperand(0).getReg())
- .addImm(0) // clamp
- .addImm(0); // output-modifier
-
- // Take negation of input for SUB reduction
- unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg)
- .addImm(srcMod) // src0 modifier
- .addReg(SrcReg)
- .addImm(0) // src1 modifier
- .addReg(ActiveLanesVreg)
- .addImm(0) // clamp
- .addImm(0); // output-mod
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
- .addReg(DstVreg);
- }
}
RetBB = &BB;
}
@@ -6011,14 +5979,10 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
- case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
- return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
- case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
- return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 3fe37e8217f35..1282ece9dc875 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -374,8 +374,6 @@ defvar Operations = [
WaveReduceOp<"fmin", "F32", f32, SGPR_32, VSrc_b32>,
WaveReduceOp<"fmax", "F32", f32, SGPR_32, VSrc_b32>,
- WaveReduceOp<"fadd", "F32", f32, SGPR_32, VSrc_b32>,
- WaveReduceOp<"fsub", "F32", f32, SGPR_32, VSrc_b32>,
];
foreach Op = Operations in {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
index 2cb1811ff4f09..a7ebf458d2591 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
@@ -2019,1007 +2019,6 @@ endif:
store i64 %combine, ptr addrspace(1) %out
ret void
}
-
-define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) {
-; GFX8DAGISEL-LABEL: uniform_value_float:
-; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8DAGISEL-NEXT: s_endpgm
-;
-; GFX8GISEL-LABEL: uniform_value_float:
-; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8GISEL-NEXT: s_endpgm
-;
-; GFX9DAGISEL-LABEL: uniform_value_float:
-; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9DAGISEL-NEXT: s_endpgm
-;
-; GFX9GISEL-LABEL: uniform_value_float:
-; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX9GISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9GISEL-NEXT: s_endpgm
-;
-; GFX1064DAGISEL-LABEL: uniform_value_float:
-; GFX1064DAGISEL: ; %bb.0: ; %entry
-; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX1064DAGISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1]
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3
-; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1064DAGISEL-NEXT: s_endpgm
-;
-; GFX1064GISEL-LABEL: uniform_value_float:
-; GFX1064GISEL: ; %bb.0: ; %entry
-; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX1064GISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX1064GISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1064GISEL-NEXT: s_endpgm
-;
-; GFX1032DAGISEL-LABEL: uniform_value_float:
-; GFX1032DAGISEL: ; %bb.0: ; %entry
-; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX1032DAGISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3
-; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1032DAGISEL-NEXT: s_endpgm
-;
-; GFX1032GISEL-LABEL: uniform_value_float:
-; GFX1032GISEL: ; %bb.0: ; %entry
-; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX1032GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s0, s0
-; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1032GISEL-NEXT: s_endpgm
-;
-; GFX1164DAGISEL-LABEL: uniform_value_float:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1]
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT: s_endpgm
-;
-; GFX1164GISEL-LABEL: uniform_value_float:
-; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT: s_endpgm
-;
-; GFX1132DAGISEL-LABEL: uniform_value_float:
-; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT: s_endpgm
-;
-; GFX1132GISEL-LABEL: uniform_value_float:
-; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s0, s0
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT: s_endpgm
-;
-; GFX12DAGISEL-LABEL: uniform_value_float:
-; GFX12DAGISEL: ; %bb.0: ; %entry
-; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
-; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3
-; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX12DAGISEL-NEXT: s_wait_alu 0xf1ff
-; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12DAGISEL-NEXT: s_endpgm
-entry:
- %result = call float @llvm.amdgcn.wave.reduce.fadd(float %in, i32 1)
- store float %result, ptr addrspace(1) %out
- ret void
-}
-
-define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) {
-; GFX8DAGISEL-LABEL: divergent_value_float:
-; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX8DAGISEL-NEXT: s_brev_b32 s6, 1
-; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
-; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
-; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8
-; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
-; GFX8DAGISEL-NEXT: v_add_f32_e32 v3, s6, v3
-; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
-; GFX8DAGISEL-NEXT: ; %bb.2:
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
-; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8GISEL-LABEL: divergent_value_float:
-; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX8GISEL-NEXT: s_brev_b32 s6, 1
-; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
-; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
-; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7
-; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8
-; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7
-; GFX8GISEL-NEXT: v_add_f32_e32 v3, s6, v3
-; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1
-; GFX8GISEL-NEXT: ; %bb.2:
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
-; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX8GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9DAGISEL-LABEL: divergent_value_float:
-; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX9DAGISEL-NEXT: s_brev_b32 s6, 1
-; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
-; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
-; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8
-; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
-; GFX9DAGISEL-NEXT: v_add_f32_e32 v3, s6, v3
-; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
-; GFX9DAGISEL-NEXT: ; %bb.2:
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
-; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v2, off
-; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9GISEL-LABEL: divergent_value_float:
-; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX9GISEL-NEXT: s_brev_b32 s6, 1
-; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
-; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
-; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7
-; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8
-; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7
-; GFX9GISEL-NEXT: v_add_f32_e32 v3, s6, v3
-; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1
-; GFX9GISEL-NEXT: ; %bb.2:
-; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6
-; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off
-; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9GISEL-NEXT: s_setpc_b64 s[30:31]
-...
[truncated]
|
|
@llvm/pr-subscribers-backend-amdgpu Author: Aaditya (easyonaadit) Changes: Revert "[AMDGPU] Add wave reduce intrinsics for float types - 2 (#161815)". This reverts commit dcab4cb. Patch is 101.27 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168845.diff 6 Files Affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index c2057ac3a14e6..539036b283498 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2482,7 +2482,7 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_any_ty> : Intrinsic<
multiclass AMDGPUWaveReduceOps {
foreach Op =
- ["umin", "fmin", "min", "umax", "fmax", "max", "add", "fadd", "sub", "fsub", "and", "or", "xor"] in {
+ ["umin", "fmin", "min", "umax", "fmax", "max", "add", "sub", "and", "or", "xor"] in {
def Op : AMDGPUWaveReduce;
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index a88e4b2a2a31d..c6131d6b3b050 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5214,9 +5214,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case Intrinsic::amdgcn_wave_reduce_add:
- case Intrinsic::amdgcn_wave_reduce_fadd:
case Intrinsic::amdgcn_wave_reduce_sub:
- case Intrinsic::amdgcn_wave_reduce_fsub:
case Intrinsic::amdgcn_wave_reduce_min:
case Intrinsic::amdgcn_wave_reduce_umin:
case Intrinsic::amdgcn_wave_reduce_fmin:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index fab0e3d84e7e7..875278a3b4f97 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5480,15 +5480,11 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
return std::numeric_limits<uint32_t>::min();
case AMDGPU::S_MAX_I32:
return std::numeric_limits<int32_t>::min();
- case AMDGPU::V_SUB_F32_e64: // +0.0
- return __builtin_bit_cast(uint32_t, +0.0f);
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32:
case AMDGPU::S_OR_B32:
case AMDGPU::S_XOR_B32:
return std::numeric_limits<uint32_t>::min();
- case AMDGPU::V_ADD_F32_e64: // -0.0
- return __builtin_bit_cast(uint32_t, -0.0f);
case AMDGPU::S_AND_B32:
return std::numeric_limits<uint32_t>::max();
case AMDGPU::V_MIN_F32_e64:
@@ -5529,13 +5525,11 @@ static bool is32bitWaveReduceOperation(unsigned Opc) {
Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
- Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
- Opc == AMDGPU::V_SUB_F32_e64;
+ Opc == AMDGPU::V_MAX_F32_e64;
}
static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
- return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
- Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
+ return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64;
}
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
@@ -5582,10 +5576,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
case AMDGPU::S_XOR_B64:
case AMDGPU::S_ADD_I32:
case AMDGPU::S_ADD_U64_PSEUDO:
- case AMDGPU::V_ADD_F32_e64:
case AMDGPU::S_SUB_I32:
- case AMDGPU::S_SUB_U64_PSEUDO:
- case AMDGPU::V_SUB_F32_e64: {
+ case AMDGPU::S_SUB_U64_PSEUDO: {
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
@@ -5740,30 +5732,6 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
.addImm(AMDGPU::sub1);
break;
}
- case AMDGPU::V_ADD_F32_e64:
- case AMDGPU::V_SUB_F32_e64: {
- Register ActiveLanesVreg =
- MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- // Get number of active lanes as a float val.
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64),
- ActiveLanesVreg)
- .addReg(NewAccumulator->getOperand(0).getReg())
- .addImm(0) // clamp
- .addImm(0); // output-modifier
-
- // Take negation of input for SUB reduction
- unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg)
- .addImm(srcMod) // src0 modifier
- .addReg(SrcReg)
- .addImm(0) // src1 modifier
- .addReg(ActiveLanesVreg)
- .addImm(0) // clamp
- .addImm(0); // output-mod
- BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
- .addReg(DstVreg);
- }
}
RetBB = &BB;
}
@@ -6011,14 +5979,10 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
- case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
- return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
- case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
- return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 3fe37e8217f35..1282ece9dc875 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -374,8 +374,6 @@ defvar Operations = [
WaveReduceOp<"fmin", "F32", f32, SGPR_32, VSrc_b32>,
WaveReduceOp<"fmax", "F32", f32, SGPR_32, VSrc_b32>,
- WaveReduceOp<"fadd", "F32", f32, SGPR_32, VSrc_b32>,
- WaveReduceOp<"fsub", "F32", f32, SGPR_32, VSrc_b32>,
];
foreach Op = Operations in {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
index 2cb1811ff4f09..a7ebf458d2591 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
@@ -2019,1007 +2019,6 @@ endif:
store i64 %combine, ptr addrspace(1) %out
ret void
}
-
-define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) {
-; GFX8DAGISEL-LABEL: uniform_value_float:
-; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8DAGISEL-NEXT: s_endpgm
-;
-; GFX8GISEL-LABEL: uniform_value_float:
-; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8GISEL-NEXT: s_endpgm
-;
-; GFX9DAGISEL-LABEL: uniform_value_float:
-; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9DAGISEL-NEXT: s_endpgm
-;
-; GFX9GISEL-LABEL: uniform_value_float:
-; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX9GISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9GISEL-NEXT: s_endpgm
-;
-; GFX1064DAGISEL-LABEL: uniform_value_float:
-; GFX1064DAGISEL: ; %bb.0: ; %entry
-; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX1064DAGISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1]
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3
-; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1064DAGISEL-NEXT: s_endpgm
-;
-; GFX1064GISEL-LABEL: uniform_value_float:
-; GFX1064GISEL: ; %bb.0: ; %entry
-; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX1064GISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX1064GISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1064GISEL-NEXT: s_endpgm
-;
-; GFX1032DAGISEL-LABEL: uniform_value_float:
-; GFX1032DAGISEL: ; %bb.0: ; %entry
-; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX1032DAGISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3
-; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1032DAGISEL-NEXT: s_endpgm
-;
-; GFX1032GISEL-LABEL: uniform_value_float:
-; GFX1032GISEL: ; %bb.0: ; %entry
-; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX1032GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s0, s0
-; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX1032GISEL-NEXT: s_endpgm
-;
-; GFX1164DAGISEL-LABEL: uniform_value_float:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1]
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1164DAGISEL-NEXT: s_endpgm
-;
-; GFX1164GISEL-LABEL: uniform_value_float:
-; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164GISEL-NEXT: s_endpgm
-;
-; GFX1132DAGISEL-LABEL: uniform_value_float:
-; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1132DAGISEL-NEXT: s_endpgm
-;
-; GFX1132GISEL-LABEL: uniform_value_float:
-; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s0, s0
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1132GISEL-NEXT: s_endpgm
-;
-; GFX12DAGISEL-LABEL: uniform_value_float:
-; GFX12DAGISEL: ; %bb.0: ; %entry
-; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
-; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3
-; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0
-; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
-; GFX12DAGISEL-NEXT: s_wait_alu 0xf1ff
-; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12DAGISEL-NEXT: s_endpgm
-entry:
- %result = call float @llvm.amdgcn.wave.reduce.fadd(float %in, i32 1)
- store float %result, ptr addrspace(1) %out
- ret void
-}
-
-define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) {
-; GFX8DAGISEL-LABEL: divergent_value_float:
-; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX8DAGISEL-NEXT: s_brev_b32 s6, 1
-; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
-; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
-; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8
-; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
-; GFX8DAGISEL-NEXT: v_add_f32_e32 v3, s6, v3
-; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
-; GFX8DAGISEL-NEXT: ; %bb.2:
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
-; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8GISEL-LABEL: divergent_value_float:
-; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX8GISEL-NEXT: s_brev_b32 s6, 1
-; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
-; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
-; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7
-; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8
-; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7
-; GFX8GISEL-NEXT: v_add_f32_e32 v3, s6, v3
-; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1
-; GFX8GISEL-NEXT: ; %bb.2:
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
-; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX8GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9DAGISEL-LABEL: divergent_value_float:
-; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX9DAGISEL-NEXT: s_brev_b32 s6, 1
-; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
-; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
-; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8
-; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
-; GFX9DAGISEL-NEXT: v_add_f32_e32 v3, s6, v3
-; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1
-; GFX9DAGISEL-NEXT: ; %bb.2:
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
-; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v2, off
-; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9GISEL-LABEL: divergent_value_float:
-; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX9GISEL-NEXT: s_brev_b32 s6, 1
-; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
-; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
-; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7
-; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8
-; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7
-; GFX9GISEL-NEXT: v_add_f32_e32 v3, s6, v3
-; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1
-; GFX9GISEL-NEXT: ; %bb.2:
-; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6
-; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off
-; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9GISEL-NEXT: s_setpc_b64 s[30:31]
-...
[truncated]
|
Revert "[AMDGPU] Add wave reduce intrinsics for float types - 2 (#161815)"
This reverts commit dcab4cb.