Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU] Use LSH for lowering ctlz_zero_undef.i8/i16 #88512

Merged
merged 11 commits into from
May 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 16 additions & 6 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3117,20 +3117,30 @@ static bool isCttzOpc(unsigned Opc) {
SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
SelectionDAG &DAG) const {
auto SL = SDLoc(Op);
auto Opc = Op.getOpcode();
auto Arg = Op.getOperand(0u);
auto ResultVT = Op.getValueType();

if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
return {};

assert(isCtlzOpc(Op.getOpcode()));
assert(isCtlzOpc(Opc));
assert(ResultVT == Arg.getValueType());

auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits();
auto SubVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32);
auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, SubVal);
const uint64_t NumBits = ResultVT.getFixedSizeInBits();
SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
SDValue NewOp;

if (Opc == ISD::CTLZ_ZERO_UNDEF) {
NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
} else {
NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
}

return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
}

Expand Down
44 changes: 37 additions & 7 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1270,13 +1270,22 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.custom();

// The 64-bit versions produce 32-bit results, but only on the SALU.
getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
.legalFor({{S32, S32}, {S32, S64}})
.clampScalar(0, S32, S32)
.clampScalar(1, S32, S64)
.scalarize(0)
.widenScalarToNextPow2(0, 32)
.widenScalarToNextPow2(1, 32);
getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
.legalFor({{S32, S32}, {S32, S64}})
.customIf(scalarNarrowerThan(1, 32))
.clampScalar(0, S32, S32)
.clampScalar(1, S32, S64)
.scalarize(0)
.widenScalarToNextPow2(0, 32)
.widenScalarToNextPow2(1, 32);

getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
.legalFor({{S32, S32}, {S32, S64}})
.clampScalar(0, S32, S32)
.clampScalar(1, S32, S64)
.scalarize(0)
.widenScalarToNextPow2(0, 32)
.widenScalarToNextPow2(1, 32);

// S64 is only legal on SALU, and needs to be broken into 32-bit elements in
// RegBankSelect.
Expand Down Expand Up @@ -2128,6 +2137,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(
case TargetOpcode::G_CTLZ:
case TargetOpcode::G_CTTZ:
return legalizeCTLZ_CTTZ(MI, MRI, B);
case TargetOpcode::G_CTLZ_ZERO_UNDEF:
return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
return legalizeFPTruncRound(MI, B);
case TargetOpcode::G_STACKSAVE:
Expand Down Expand Up @@ -4145,6 +4156,25 @@ bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
return true;
}

bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
LLT SrcTy = MRI.getType(Src);
TypeSize NumBits = SrcTy.getSizeInBits();

assert(NumBits < 32u);

auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
auto Shift = B.buildLShr(S32, {Extend}, ShiftAmt);
PeddleSpam marked this conversation as resolved.
Show resolved Hide resolved
auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
B.buildTrunc(Dst, Ctlz);
MI.eraseFromParent();
return true;
}

// Check that this is a G_XOR x, -1
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
if (MI.getOpcode() != TargetOpcode::G_XOR)
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const;
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;

bool loadInputValue(Register DstReg, MachineIRBuilder &B,
const ArgDescriptor *Arg,
Expand Down
47 changes: 20 additions & 27 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
Original file line number Diff line number Diff line change
Expand Up @@ -81,14 +81,12 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C1]]
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
; CHECK-NEXT: $vgpr0 = COPY [[AND1]](s32)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s16) = G_TRUNC %0
%2:_(s16) = G_CTLZ_ZERO_UNDEF %1
Expand Down Expand Up @@ -149,18 +147,15 @@ body: |
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR1]](s32)
; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[C]](s32)
; CHECK-NEXT: [[AMDGPU_FFBH_U321:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR2]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C]]
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[LSHR]](s32)
; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF1]], [[C]]
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32)
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]]
; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32)
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]]
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]]
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U321]], [[C1]]
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; CHECK-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
Expand All @@ -179,14 +174,12 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 25
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C1]]
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
; CHECK-NEXT: $vgpr0 = COPY [[AND1]](s32)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 25
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
; CHECK-NEXT: [[FFBH:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FFBH]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s7) = G_TRUNC %0
%2:_(s7) = G_CTLZ_ZERO_UNDEF %1
Expand Down
Loading
Loading