-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Remove unnecessary add instructions in ctlz.i8 #77615
Conversation
Thank you for submitting a Pull Request (PR) to the LLVM Project! This PR will be automatically labeled and the relevant teams will be If you wish to, you can add reviewers by using the "Reviewers" section on this page. If this is not working for you, it is probably because you do not have write If you have received no comments on your PR for a week, you can request a review If you have further questions, they may be answered by the LLVM GitHub User Guide. You can also ask questions in a comment on this PR, on the LLVM Discord or on the forums. |
@llvm/pr-subscribers-backend-amdgpu Author: Leon Clark (PeddleSpam) ChangesAdd custom lowering for ctlz.i8 to avoid multiple add/sub operations. Full diff: https://github.com/llvm/llvm-project/pull/77615.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 0dbcaf5a1b136c..2c36e6e01e5539 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -446,6 +446,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
{ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
MVT::i64, Custom);
+ setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i8, Custom);
+
static const MVT::SimpleValueType VectorIntTypes[] = {
MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
@@ -1397,6 +1399,10 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
Results.push_back(Lowered);
return;
+ case ISD::CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ replaceCTLZResults(SDValue(N, 0u), DAG, Results);
+ return;
default:
return;
}
@@ -3062,6 +3068,25 @@ static bool isCttzOpc(unsigned Opc) {
return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
}
+void AMDGPUTargetLowering::replaceCTLZResults(
+ SDValue Op, SelectionDAG &DAG, SmallVectorImpl<SDValue> &Results) const {
+ auto SL = SDLoc(Op);
+ auto Arg = Op.getOperand(0u);
+ auto ResultVT = Op.getValueType();
+
+ if (ResultVT != MVT::i8)
+ return;
+
+ assert(isCtlzOpc(Op.getOpcode()));
+ assert(ResultVT == Arg.getValueType());
+
+ auto SubVal = DAG.getConstant(24u, SL, MVT::i32);
+ auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
+ NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
+ NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, SubVal);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp));
+}
+
SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 827fb106b55199..4cbd6c3146d5dc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -84,6 +84,9 @@ class AMDGPUTargetLowering : public TargetLowering {
SDNodeFlags Flags) const;
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const;
+ void replaceCTLZResults(SDValue Op, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &Results) const;
+
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const;
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 3d69655111da63..2267aa6b5496c6 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -514,8 +514,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v0, 32, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
-; VI-NEXT: v_add_u16_e32 v0, -8, v0
+; VI-NEXT: v_subrev_u32_e32 v0, vcc, 24, v0
; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@@ -558,8 +557,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ffbh_u32_e32 v1, v1
; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v1, -16, v1
-; GFX10-NEXT: v_add_nc_u16 v1, v1, -8
+; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v1
; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -586,9 +584,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_u32_e32 v1, 32, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v1, -16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u16 v1, v1, -8
+; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 24, v1
; GFX11-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 03f3d04cf8a68f..1c57a64643b92b 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -329,10 +329,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0xff
; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: s_add_i32 s2, s2, -16
+; VI-NEXT: s_sub_i32 s2, s2, 24
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_add_u16_e64 v2, s2, -8
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -606,8 +606,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1
-; VI-NEXT: v_add_u16_e32 v1, -8, v1
+; VI-NEXT: v_subrev_u32_e32 v1, vcc, 24, v1
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1118,8 +1117,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v0, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
-; VI-NEXT: v_add_u16_e32 v2, -8, v0
+; VI-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_byte v[0:1], v2
|
auto Arg = Op.getOperand(0u); | ||
auto ResultVT = Op.getValueType(); | ||
|
||
if (ResultVT != MVT::i8) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This isn't a problem really specific to i8; The same issue arises for i16 when it's not legal. Really should cover any sub-32-bit type
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I didn't see the same issue with i16
. The i8
case happens because we first zero-extend to i16
and then later to i32
. Both times a subtraction is needed to account for the leading zeroes, which is where we get the two add/sub ops.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You will see the same issue for i16 for targets where i16 is not legal. e.g. -mcpu=hawaii
@@ -3062,6 +3068,25 @@ static bool isCttzOpc(unsigned Opc) { | |||
return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF; | |||
} | |||
|
|||
void AMDGPUTargetLowering::replaceCTLZResults( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
lowerCTLZ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought it might be confusing since we already have a LowerCTLZ_CTTZ
function. Other backends use the "replaceXResults" convention which seemed appropriate. Or we could split the difference and call it lowerCTLZResults
.
✅ With the latest revision this PR passed the C/C++ code formatter. |
auto Arg = Op.getOperand(0u); | ||
auto ResultVT = Op.getValueType(); | ||
|
||
if (!(ResultVT == MVT::i8 || ResultVT == MVT::i16)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: this can be an assert, and then this function will always succeed so the caller doesn't need a check.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This function is also called for ctlz.i64
since it's marked for custom lowering.
@@ -3062,6 +3071,26 @@ static bool isCttzOpc(unsigned Opc) { | |||
return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF; | |||
} | |||
|
|||
SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: I know you've already changed the name once, but isn't this really doing integer result promotion (as in DAGTypeLegalizer::PromoteIntegerResult
)? So maybe PromoteIntRes_CTLZ
??
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm happy to change the name if there are no objections? I think "lowerX" was preferred for consistency with the other functions in ReplaceNodeResults
.
auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits(); | ||
auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg); | ||
auto ShiftVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32); | ||
NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, ShiftVal); | ||
NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp); | ||
return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unfortunate duplication of generic legalization logic
Add custom lowering for ctlz.i8 to avoid multiple add/sub operations.
demorgan condition
Remove LSH transform and restore previous lowering. Fixes conformance issue in [77615](#77615) where OpenCL integer_ops tests fail for integer_clz. Co-authored-by: Leon Clark <leoclark@amd.com>
Use LSH to lower ctlz_zero_undef instead of subtracting leading zeros for i8 and i16. Related to [77615](#77615). --------- Co-authored-by: Leon Clark <leoclark@amd.com>
Add custom lowering for ctlz.i8 to avoid multiple add/sub operations.