Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 63 additions & 2 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10577,6 +10577,67 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
return false;

const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
this]() -> bool {
if (CmpValue != 0)
return false;

MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
if (!Def || Def->getParent() != CmpInstr.getParent())
return false;
Comment on lines +10586 to +10587
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't see why the parent would matter, but this also looks untested

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

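Def and CmpInstr need to have the same parent (i.e. be in the same basic block) so that the linear scan from Def to CmpInstr, which checks for intervening SCC modifications, will work.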


bool CanOptimize = false;
MachineOperand *SccDef =
Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);

// For S_OP that set SCC = DST!=0, do the transformation
//
// s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
if (SccDef && Def->getOpcode() != AMDGPU::S_ADD_I32 &&
Def->getOpcode() != AMDGPU::S_ADD_U32 &&
Def->getOpcode() != AMDGPU::S_ADDC_U32 &&
Def->getOpcode() != AMDGPU::S_SUB_I32 &&
Def->getOpcode() != AMDGPU::S_SUB_U32 &&
Def->getOpcode() != AMDGPU::S_SUBB_U32 &&
Def->getOpcode() != AMDGPU::S_MIN_I32 &&
Def->getOpcode() != AMDGPU::S_MIN_U32 &&
Def->getOpcode() != AMDGPU::S_MAX_I32 &&
Def->getOpcode() != AMDGPU::S_MAX_U32 &&
Def->getOpcode() != AMDGPU::S_ADDK_I32)
CanOptimize = true;

// s_cmp_lg_* is redundant because the SCC input value for S_CSELECT* has
// the same value that will be calculated by s_cmp_lg_*
//
// s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
// imm), 0)
if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 ||
Def->getOpcode() == AMDGPU::S_CSELECT_B64) {
bool Op1IsNonZeroImm =
Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0;
bool Op2IsZeroImm =
Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0;
if (Op1IsNonZeroImm && Op2IsZeroImm)
CanOptimize = true;
}

if (!CanOptimize)
return false;

for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
I != E; ++I) {
if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
I->killsRegister(AMDGPU::SCC, &RI))
return false;
}
Comment on lines +10627 to +10632
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think you should need to be scanning around for physreg liveness, I don't see other targets doing that here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

> I don't think you should need to be scanning around for physreg liveness, I don't see other targets doing that here

Scanning for register liveness is also done in the other lambda, optimizeCmpAnd (line 10752), which is used in optimizeCompareInstr. It is necessary to prevent incorrectly deleting an s_cmp if there is an intervening definition. In PR #161582, tests were added that require this scanning. PR #161582 was done to add SCC as an implicit destination of s_quadmask*. Without this fix the scanning would not detect a conflict.


if (SccDef)
SccDef->setIsDead(false);

CmpInstr.eraseFromParent();
return true;
};

const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
this](int64_t ExpectedValue, unsigned SrcSize,
bool IsReversible, bool IsSigned) -> bool {
Expand Down Expand Up @@ -10704,15 +10765,15 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMP_LG_I32:
case AMDGPU::S_CMPK_LG_U32:
case AMDGPU::S_CMPK_LG_I32:
return optimizeCmpAnd(0, 32, true, false);
return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
case AMDGPU::S_CMP_GT_U32:
case AMDGPU::S_CMPK_GT_U32:
return optimizeCmpAnd(0, 32, false, false);
case AMDGPU::S_CMP_GT_I32:
case AMDGPU::S_CMPK_GT_I32:
return optimizeCmpAnd(0, 32, false, true);
case AMDGPU::S_CMP_LG_U64:
return optimizeCmpAnd(0, 64, true, false);
return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
}

return false;
Expand Down
2 changes: 0 additions & 2 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
Expand Down Expand Up @@ -345,7 +344,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
Expand Down
2 changes: 0 additions & 2 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
Expand Down Expand Up @@ -348,7 +347,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
Expand Down
36 changes: 8 additions & 28 deletions llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
Original file line number Diff line number Diff line change
Expand Up @@ -180,11 +180,7 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
; CHECK-LABEL: s_add64_32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, s2
; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s3
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_addc_u32 s2, s4, 0
; CHECK-NEXT: ; return to shader part epilog
%sum64 = add i64 %val64A, %val64B
Expand All @@ -199,14 +195,10 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_v2i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s10, s2, s6
; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
; CHECK-NEXT: s_addc_u32 s8, s3, s7
; CHECK-NEXT: s_add_u32 s6, s2, s6
; CHECK-NEXT: s_addc_u32 s7, s3, s7
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_add_u32 s0, s0, s4
; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s5
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
Expand All @@ -215,8 +207,8 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
; CHECK-NEXT: v_mov_b32_e32 v4, s10
; CHECK-NEXT: v_mov_b32_e32 v5, s8
; CHECK-NEXT: v_mov_b32_e32 v4, s6
; CHECK-NEXT: v_mov_b32_e32 v5, s7
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
Expand All @@ -233,14 +225,10 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_v2i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s10, s2, s6
; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
; CHECK-NEXT: s_subb_u32 s8, s3, s7
; CHECK-NEXT: s_sub_u32 s6, s2, s6
; CHECK-NEXT: s_subb_u32 s7, s3, s7
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_sub_u32 s0, s0, s4
; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_subb_u32 s1, s1, s5
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
Expand All @@ -249,8 +237,8 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
; CHECK-NEXT: v_mov_b32_e32 v4, s10
; CHECK-NEXT: v_mov_b32_e32 v5, s8
; CHECK-NEXT: v_mov_b32_e32 v4, s6
; CHECK-NEXT: v_mov_b32_e32 v5, s7
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
Expand All @@ -268,8 +256,6 @@ define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval)
; CHECK-LABEL: s_uadd_i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, s2
; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s3
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
Expand All @@ -292,8 +278,6 @@ define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, 1
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
Expand Down Expand Up @@ -339,8 +323,6 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, 1
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
Expand All @@ -363,8 +345,6 @@ define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_n1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, -1
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s1, s1, -1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
Expand Down
Loading