[AMDGPU] Aggressively fold immediates in SIShrinkInstructions
Fold immediates regardless of how many uses they have. This is expected
to increase overall code size, but decrease register usage.

Differential Revision: https://reviews.llvm.org/D114644
jayfoad committed May 18, 2022
1 parent bdf2547 commit e292650
Showing 105 changed files with 4,935 additions and 5,592 deletions.
12 changes: 4 additions & 8 deletions llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -93,7 +93,7 @@ bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
MachineOperand &Src0 = MI.getOperand(Src0Idx);
if (Src0.isReg()) {
Register Reg = Src0.getReg();
if (Reg.isVirtual() && MRI->hasOneUse(Reg)) {
if (Reg.isVirtual()) {
MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
if (Def && Def->isMoveImmediate()) {
MachineOperand &MovSrc = Def->getOperand(1);
@@ -115,8 +115,8 @@ bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
}

if (ConstantFolded) {
assert(MRI->use_empty(Reg));
Def->eraseFromParent();
if (MRI->use_nodbg_empty(Reg))
Def->eraseFromParent();
++NumLiteralConstantsFolded;
return true;
}
@@ -739,11 +739,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
}
}

// FIXME: We also need to consider movs of constant operands since
// immediate operands are not folded if they have more than one use, and
// the operand folding pass is unaware if the immediate will be free since
// it won't know if the src == dest constraint will end up being
// satisfied.
// Try to use S_ADDK_I32 and S_MULK_I32.
if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
MI.getOpcode() == AMDGPU::S_MUL_I32) {
const MachineOperand *Dest = &MI.getOperand(0);
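For readers skimming the hunks above: the pass no longer requires `MRI->hasOneUse(Reg)` before folding a move-immediate into a user, and the defining move is erased only once no non-debug uses remain. The standalone C++ sketch below is a simplified model of that decision, not the LLVM implementation; the names `VirtReg` and `tryFoldImmIntoUse` are illustrative only, and the real pass works on `MachineRegisterInfo`/`MachineInstr` as shown in the diff.

```cpp
// Simplified, self-contained model of the folding decision changed by this
// commit: fold an immediate into any user (not just single-use registers),
// and erase the defining move once nothing non-debug reads the register.
#include <cstdint>
#include <iostream>

struct VirtReg {
  bool DefIsMoveImm = false; // def is an s_mov_b32 / v_mov_b32 of a literal
  int64_t Imm = 0;           // the literal value
  int NonDbgUses = 0;        // remaining non-debug uses of the register
  bool DefErased = false;
};

// Returns true if one use was rewritten to take the literal directly.
// Before this commit the fold was gated on NonDbgUses == 1; now any
// move-immediate def is a candidate, whatever its use count.
bool tryFoldImmIntoUse(VirtReg &R, bool AggressiveFold) {
  if (!R.DefIsMoveImm || R.DefErased)
    return false;
  if (!AggressiveFold && R.NonDbgUses != 1)
    return false;           // old behaviour: only single-use immediates
  --R.NonDbgUses;           // this use now encodes the literal inline
  if (R.NonDbgUses == 0)
    R.DefErased = true;     // the mov is dead once the register is unread
  return true;
}

int main() {
  // A register holding 0xff with two users, e.g. two v_and_b32 instructions.
  VirtReg Old{true, 0xff, 2}, New{true, 0xff, 2};

  int FoldedOld = tryFoldImmIntoUse(Old, false) + tryFoldImmIntoUse(Old, false);
  int FoldedNew = tryFoldImmIntoUse(New, true) + tryFoldImmIntoUse(New, true);

  // Old: 0 folds, the mov and its register stay live.
  // New: 2 folds, both users carry the literal (larger encodings) and the mov
  // is erased (one register freed) -- the trade-off from the commit message.
  std::cout << "old folds: " << FoldedOld << ", mov erased: " << Old.DefErased
            << "\nnew folds: " << FoldedNew << ", mov erased: " << New.DefErased
            << '\n';
}
```

The test updates that follow show the same trade-off at the ISA level: a single s_movk_i32/s_mov_b32 plus register operands becomes a repeated inline literal, costing an extra literal dword per use but freeing an SGPR.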
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
@@ -94,9 +94,8 @@ define <2 x i16> @v_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
; GFX8-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s4, 0x80008000
; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0
; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1
; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
; GFX8-NEXT: v_add_u16_e32 v2, v0, v1
; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
80 changes: 34 additions & 46 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
@@ -80,11 +80,10 @@ define i8 @v_uaddo_i8(i8 %a, i8 %b) {
; GFX7-LABEL: v_uaddo_i8:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xff
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT: v_and_b32_e32 v1, s4, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v0
; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
@@ -93,11 +92,10 @@ define i8 @v_uaddo_i8(i8 %a, i8 %b) {
; GFX8-LABEL: v_uaddo_i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_movk_i32 s4, 0xff
; GFX8-NEXT: v_and_b32_e32 v0, s4, v0
; GFX8-NEXT: v_and_b32_e32 v1, s4, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT: v_and_b32_e32 v1, s4, v0
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
@@ -123,11 +121,10 @@ define i7 @v_uaddo_i7(i7 %a, i7 %b) {
; GFX7-LABEL: v_uaddo_i7:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0x7f
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0x7f, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT: v_and_b32_e32 v1, s4, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0x7f, v0
; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
@@ -136,11 +133,10 @@ define i7 @v_uaddo_i7(i7 %a, i7 %b) {
; GFX8-LABEL: v_uaddo_i7:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_movk_i32 s4, 0x7f
; GFX8-NEXT: v_and_b32_e32 v0, s4, v0
; GFX8-NEXT: v_and_b32_e32 v1, s4, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT: v_and_b32_e32 v1, s4, v0
; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v0
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
@@ -149,11 +145,10 @@ define i7 @v_uaddo_i7(i7 %a, i7 %b) {
; GFX9-LABEL: v_uaddo_i7:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_movk_i32 s4, 0x7f
; GFX9-NEXT: v_and_b32_e32 v0, s4, v0
; GFX9-NEXT: v_and_b32_e32 v1, s4, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NEXT: v_and_b32_e32 v1, s4, v0
; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v0
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: v_add_u16_e32 v0, v0, v1
@@ -576,11 +571,10 @@ define i8 @s_uaddo_i8(i8 %a, i8 %b) {
; GFX7-LABEL: s_uaddo_i8:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xff
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT: v_and_b32_e32 v1, s4, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v0
; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
@@ -589,11 +583,10 @@ define i8 @s_uaddo_i8(i8 %a, i8 %b) {
; GFX8-LABEL: s_uaddo_i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_movk_i32 s4, 0xff
; GFX8-NEXT: v_and_b32_e32 v0, s4, v0
; GFX8-NEXT: v_and_b32_e32 v1, s4, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT: v_and_b32_e32 v1, s4, v0
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
@@ -619,11 +612,10 @@ define i7 @s_uaddo_i7(i7 %a, i7 %b) {
; GFX7-LABEL: s_uaddo_i7:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0x7f
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0x7f, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT: v_and_b32_e32 v1, s4, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0x7f, v0
; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
@@ -632,11 +624,10 @@ define i7 @s_uaddo_i7(i7 %a, i7 %b) {
; GFX8-LABEL: s_uaddo_i7:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_movk_i32 s4, 0x7f
; GFX8-NEXT: v_and_b32_e32 v0, s4, v0
; GFX8-NEXT: v_and_b32_e32 v1, s4, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT: v_and_b32_e32 v1, s4, v0
; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v0
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
@@ -645,11 +636,10 @@ define i7 @s_uaddo_i7(i7 %a, i7 %b) {
; GFX9-LABEL: s_uaddo_i7:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_movk_i32 s4, 0x7f
; GFX9-NEXT: v_and_b32_e32 v0, s4, v0
; GFX9-NEXT: v_and_b32_e32 v1, s4, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NEXT: v_and_b32_e32 v1, s4, v0
; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v0
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: v_add_u16_e32 v0, v0, v1
@@ -966,11 +956,10 @@ define amdgpu_ps i32 @uaddo_i32_sv(i32 inreg %a, i32 %b) {
define amdgpu_ps i16 @uaddo_i16_sv(i16 inreg %a, i16 %b) {
; GFX7-LABEL: uaddo_i16_sv:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s1, 0xffff
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: v_and_b32_e32 v0, s1, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; GFX7-NEXT: v_and_b32_e32 v1, s1, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
@@ -979,11 +968,10 @@ define amdgpu_ps i16 @uaddo_i16_sv(i16 inreg %a, i16 %b) {
;
; GFX8-LABEL: uaddo_i16_sv:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s1, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: v_and_b32_e32 v0, s1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_and_b32_e32 v1, s1, v0
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
14 changes: 6 additions & 8 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -559,12 +559,11 @@ define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
; GFX6-LABEL: v_andn2_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, v0, v4
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX6-NEXT: v_and_b32_e32 v2, v2, v4
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX6-NEXT: v_and_b32_e32 v0, v0, v1
@@ -816,18 +815,17 @@ define <4 x i16> @v_andn2_v4i16(<4 x i16> %src0, <4 x i16> %src1) {
; GFX6-LABEL: v_andn2_v4i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v8, 0xffff
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, v0, v8
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX6-NEXT: v_and_b32_e32 v2, v2, v8
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX6-NEXT: v_and_b32_e32 v3, v4, v8
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7
; GFX6-NEXT: v_and_b32_e32 v4, v6, v8
; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6
; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2
; GFX6-NEXT: v_xor_b32_e32 v3, -1, v3
