-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[LLVM] Make use of s_flbit_i32_b64 and s_ff1_i32_b64 #75158
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Acim Maravic (Acim-Maravic) Changes: Update DAG ISel to support the 64-bit versions S_FF1_I32_B64 and S_FLBIT_I32_B64. Patch is 176.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/75158.diff 21 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index fcbdf51b03c1fc..7c0947945c9ad0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3067,6 +3067,17 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons
return NewOpr;
}
+ // 64-bit scalar ctlz/cttz should use S_FLBIT_I32_B64/S_FF1_I32_B64
+ // 64-bit scalar version produce 32-bit result
+ if (!(Src->isDivergent()) && Src.getValueType() == MVT::i64) {
+ SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
+ if (!ZeroUndef) {
+ const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
+ NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
+ }
+ return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
+ }
+
SDValue Lo, Hi;
std::tie(Lo, Hi) = split64BitValue(Src, DAG);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0a06fa88b6b102..88a95ba9b00107 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6849,6 +6849,15 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
Inst.eraseFromParent();
return;
+ case AMDGPU::S_FLBIT_I32_B64:
+ splitScalar64BitCountOp(Worklist, Inst, AMDGPU::S_FLBIT_I32_B32);
+ Inst.eraseFromParent();
+ return;
+ case AMDGPU::S_FF1_I32_B64:
+ splitScalar64BitCountOp(Worklist, Inst, AMDGPU::S_FF1_I32_B32);
+ Inst.eraseFromParent();
+ return;
+
case AMDGPU::S_LSHL_B32:
if (ST.hasOnlyRevVALUShifts()) {
NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
@@ -7834,6 +7843,62 @@ void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
+void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist, MachineInstr &Instr, unsigned Opcode, MachineDominatorTree *MDT) const {
+ MachineBasicBlock &MBB = *Instr.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ MachineBasicBlock::iterator MII = Instr;
+ const DebugLoc &DL = Instr.getDebugLoc();
+
+ MachineOperand &Dest = Instr.getOperand(0);
+ MachineOperand &Src = Instr.getOperand(1);
+
+ const MCInstrDesc &InstDesc = get(Opcode);
+ bool isCtlz = Opcode == AMDGPU::S_FLBIT_I32_B32;
+
+
+ const TargetRegisterClass *SrcRC = Src.isReg() ?
+ MRI.getRegClass(Src.getReg()) :
+ &AMDGPU::SGPR_32RegClass;
+
+ const TargetRegisterClass *SrcSubRC =
+ RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
+
+ MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
+ AMDGPU::sub0, SrcSubRC);
+ MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
+ AMDGPU::sub1, SrcSubRC);
+
+ Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ MachineInstr *M1 = BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
+ MachineInstr *M2 = BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
+ MachineInstr *M3 = nullptr;
+ MachineInstr *M4 = nullptr;
+
+ if (isCtlz)
+ M3 = BuildMI(MBB, MII, DL, get(AMDGPU::S_ADD_I32), MidReg3).addReg(MidReg1).addImm(32);
+ else
+ M3 = BuildMI(MBB, MII, DL, get(AMDGPU::S_ADD_I32), MidReg3).addReg(MidReg2).addImm(32);
+
+ if (isCtlz)
+ M4 = BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN3_U32_e64), MidReg4).addReg(MidReg3).addReg(MidReg2).addImm(64);
+ else
+ M4 = BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN3_U32_e64), MidReg4).addReg(MidReg3).addReg(MidReg1).addImm(64);
+
+ MRI.replaceRegWith(Dest.getReg(), MidReg4);
+
+ Worklist.insert(M1);
+ Worklist.insert(M2);
+ Worklist.insert(M3);
+ Worklist.insert(M4);
+
+ addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
+}
+
void SIInstrInfo::addUsersToMoveToVALUWorklist(
Register DstReg, MachineRegisterInfo &MRI,
SIInstrWorklist &Worklist) const {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 0ce31ac6d54ec5..effd891eb26edc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -147,6 +147,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
void splitScalar64BitBCNT(SIInstrWorklist &Worklist,
MachineInstr &Inst) const;
void splitScalar64BitBFE(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
+ void splitScalar64BitCountOp(SIInstrWorklist &Worklist, MachineInstr &Instr, unsigned Opcode, MachineDominatorTree *MDT = nullptr) const;
void movePackToVALU(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI,
MachineInstr &Inst) const;
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index f8f50c7cb23a5a..12574c41f7cd31 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -458,13 +458,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB2_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_add_i32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -502,13 +499,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB2_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_add_i32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -545,10 +539,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; implicit-def: $vgpr1
; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3
-; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2
-; GFX10W64-NEXT: s_add_i32 s5, s5, 32
-; GFX10W64-NEXT: s_min_u32 s5, s6, s5
+; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5
@@ -627,16 +618,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop
; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3
-; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2
-; GFX11W64-NEXT: s_add_i32 s5, s5, 32
-; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT: s_min_u32 s5, s6, s5
+; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5
; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s4, s4, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -739,13 +726,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB3_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_add_i32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -785,13 +769,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_add_i32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -830,10 +811,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W64-NEXT: ; implicit-def: $vgpr1
; GFX10W64-NEXT: .LBB3_1: ; %ComputeLoop
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3
-; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2
-; GFX10W64-NEXT: s_add_i32 s5, s5, 32
-; GFX10W64-NEXT: s_min_u32 s5, s6, s5
+; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5
@@ -918,16 +896,12 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: .LBB3_1: ; %ComputeLoop
; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3
-; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2
-; GFX11W64-NEXT: s_add_i32 s5, s5, 32
-; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT: s_min_u32 s5, s6, s5
+; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5
; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s4, s4, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1540,13 +1514,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB7_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_add_i32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -1584,13 +1555,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB7_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_add_i32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -1627,10 +1595,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; implicit-def: $vgpr1
; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3
-; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2
-; GFX10W64-NEXT: s_add_i32 s5, s5, 32
-; GFX10W64-NEXT: s_min_u32 s5, s6, s5
+; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5
@@ -1709,16 +1674,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop
; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3
-; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2
-; GFX11W64-NEXT: s_add_i32 s5, s5, 32
-; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT: s_min_u32 s5, s6, s5
+; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5
; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s4, s4, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 81fd166e3779f8..d64112e6972de3 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -525,15 +525,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB2_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s4, s3
-; GFX8-NEXT: s_ff1_i32_b32 s5, s2
-; GFX8-NEXT: s_add_i32 s4, s4, 32
-; GFX8-NEXT: s_min_u32 s7, s5, s4
-; GFX8-NEXT: v_readlane_b32 s8, v0, s7
-; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX8-NEXT: s_mov_b32 m0, s7
+; GFX8-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s4
+; GFX8-NEXT: v_readlane_b32 s7, v0, s4
+; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX8-NEXT: v_writelane_b32 v1, s6, m0
-; GFX8-NEXT: s_add_i32 s6, s6, s8
+; GFX8-NEXT: s_add_i32 s6, s6, s7
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
@@ -574,15 +571,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB2_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s4, s3
-; GFX9-NEXT: s_ff1_i32_b32 s5, s2
-; GFX9-NEXT: s_add_i32 s4, s4, 32
-; GFX9-NEXT: s_min_u32 s7, s5, s4
-; GFX9-NEXT: v_readlane_b32 s8, v0, s7
-; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX9-NEXT: s_mov_b32 m0, s7
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s4
+; GFX9-NEXT: v_readlane_b32 s7, v0, s4
+; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX9-NEXT: v_writelane_b32 v1, s6, m0
-; GFX9-NEXT: s_add_i32 s6, s6, s8
+; GFX9-NEXT: s_add_i32 s6, s6, s7
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
@@ -623,10 +617,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s4, s3
-; GFX1064-NEXT: s_ff1_i32_b32 s5, s2
-; GFX1064-NEXT: s_add_i32 s4, s4, 32
-; GFX1064-NEXT: s_min_u32 s7, s5, s4
+; GFX1064-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s7
; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7
; GFX1064-NEXT: v_writelane_b32 v1, s6, s7
@@ -721,16 +712,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1164-NEXT: s_ctz_i32_b32 s5, s2
-; GFX1164-NEXT: s_add_i32 s4, s4, 32
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_min_u32 s7, s5, s4
+; GFX1164-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7
; GFX1164-NEXT: v_writelane_b32 v1, s6, s7
; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_add_i32 s6, s6, s8
; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1
@@ -2038,15 +2025,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB8_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s4, s3
-; GFX8-NEXT: s_ff1_i32_b32 s5, s2
-; GFX8-NEXT: s_add_i32 s4, s4, 32
-; GFX8-NEXT: s_min_u32 s7, s5, s4
-; GFX8-NEXT: v_readlane_b32 s8, v0, s7
-; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX8-NEXT: s_mov_b32 m0, s7
+; GFX8-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s4
+; GFX8-NEXT: v_readlane_b32 s7, v0, s4
+; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX8-NEXT: v_writelane_b32 v1, s6, m0
-; GFX8-NEXT: s_add_i32 s6, s6, s8
+; GFX8-NEXT: s_add_i32 s6, s6, s7
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB8_1
@@ -2087,15 +2071,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB8_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s4, s3
-; GFX9-NEXT: s_ff1_i32_b32 s5, s2
-; GFX9-NEXT: s_add_i32 s4, s4, 32
-; GFX9-NEXT: s_min_u32 s7, s5, s4
-; GFX9-NEXT: v_readlane_b32 s8, v0, s7
-; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX9-NEXT: s_mov_b32 m0, s7
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s4
+; GFX9-NEXT: v_readlane_b32 s7, v0, s4
+; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX9-NEXT: v_writelane_b32 v1, s6, m0
-; GFX9-NEXT: s_add_i32 s6, s6, s8
+; GFX9-NEXT: s_add_i32 s6, s6, s7
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
@@ -2136,10 +2117,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s4, s3
-; GFX1064-NEXT: s_ff1_i32_b32 s5, ...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
d435def
to
11e8923
Compare
@@ -3067,6 +3067,17 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons | |||
return NewOpr; | |||
} | |||
|
|||
// 64-bit scalar ctlz/cttz should use S_FLBIT_I32_B64/S_FF1_I32_B64 | |||
// 64-bit scalar version produce 32-bit result | |||
if (!(Src->isDivergent()) && Src.getValueType() == MVT::i64) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The outer parentheses around Src->isDivergent()
should not be required.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The type check should be an assert, since these ops should only be legal for i32 and i64.
But anyway this block should be merged with the block above - using "if (i32 or uniform i64) {...}". Only the final zext might need to be conditional.
MachineOperand &Src = Instr.getOperand(1); | ||
|
||
const MCInstrDesc &InstDesc = get(Opcode); | ||
bool isCtlz = Opcode == AMDGPU::S_FLBIT_I32_B32; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should start with capital I
(IsCtlz
)
@@ -3067,6 +3067,17 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons | |||
return NewOpr; | |||
} | |||
|
|||
// 64-bit scalar ctlz/cttz should use S_FLBIT_I32_B64/S_FF1_I32_B64 | |||
// 64-bit scalar version produce 32-bit result | |||
if (!(Src->isDivergent()) && Src.getValueType() == MVT::i64) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The type check should be an assert, since these ops should only be legal for i32 and i64.
But anyway this block should be merged with the block above - using "if (i32 or uniform i64) {...}". Only the final zext might need to be conditional.
@@ -7834,6 +7843,56 @@ void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist, | |||
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
} | |||
|
|||
void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a comment showing what code this generates? I don't understand what the final min3 with 64 is for.
If the input it all zeros then S_FLBIT_I32_B64/S_FF1_I32_B64 would return -1, so this expansion should do the same.
Also are there any tests for this code? If not could you add some? See move-to-valu-* for examples.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added a comment. The final add with 64 was because I thought that when input is all zeroes it should return 64..., I changed that now. I have added tests, I hope that is okay now.
440f05e
to
6cb1e3f
Compare
if (Is64BitScalar) | ||
return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr); | ||
return NewOpr; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think ZERO_EXTEND to the same type is optimized out as a no-op, so you could write this more simply:
if (Is64BitScalar) | |
return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr); | |
return NewOpr; | |
return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr); |
MachineInstr *M1 = BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0); | ||
MachineInstr *M2 = BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1); | ||
|
||
BuildMI(MBB, MII, DL, get(AMDGPU::S_ADD_I32), MidReg3) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Doesn't this need to be an unsigned saturating add?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, you are right I have missed that I should produce expected (-1) result for zero.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After that is called function SIInstrInfo::getVALUOp, in which for S_ADD_I32 it returns V_ADD_U32_e64 or V_ADD_CO_U32_e32, which are both unsigned saturating adds. Correct me if I am wrong?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After that is called function SIInstrInfo::getVALUOp, in which for S_ADD_I32 it returns V_ADD_U32_e64 or V_ADD_CO_U32_e32, which are both unsigned saturating adds. Correct me if I am wrong?
No. They are only saturating adds if you specify the "clamp" output modifier. Otherwise they have normal 2's complement wrap around behaviour.
6cb1e3f
to
777ca1e
Compare
MachineInstr &Inst, unsigned Opcode, | ||
MachineDominatorTree *MDT) const { | ||
// (S_FLBIT_I32_B64 hi:lo) -> | ||
// -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks better than the generic expansion for ctlz/cttz, maybe something similar could also be implemented there?
Also, RegBankSelect should get similar treatment for GlobalISel
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with test tweaks.
Should also do GlobalISel as Matt mentioned, but that can be a separate patch.
@@ -0,0 +1,308 @@ | |||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think you need this file. The .ll
version is simpler and tests the same thing.
; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] | ||
; GFX10-NEXT: s_endpgm | ||
%val = load i64, ptr addrspace(1) %arrayidx, align 1 | ||
%ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you also add a test for the i1 false
version, and the same for cttz?
777ca1e
to
55e35e0
Compare
I updated the tests. I will do GlobalISel in another patch. |
llvm/test/CodeGen/AMDGPU/ctlz.ll
Outdated
; SI-NEXT: s_add_i32 s4, s4, 32 | ||
; SI-NEXT: v_min3_u32 v0, s4, v0, 64 | ||
; SI-NEXT: s_flbit_i32_b64 s4, s[4:5] | ||
; SI-NEXT: s_min_u32 s4, s4, 32 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
32 is wrong here and many other places. Should be 64.
55e35e0
to
4c01063
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks. Just one nit inline.
SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src); | ||
if (!ZeroUndef) { | ||
const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32); | ||
NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32); | ||
if (!Is64BitScalar) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't really need this if
- you could do something like SDValue ConstVal = DAG.getConstant(VT.getScalarSizeInBits(), SL, MVT::i32)
.
4c01063
to
fa355b8
Compare
Thank you for the help and support |
fa355b8
to
a48cd27
Compare
Update DAG ISel to support the 64-bit versions S_FF1_I32_B64 and S_FLBIT_I32_B64