Skip to content

Commit

Permalink
AMDGPU: Try to eliminate clearing of high bits of 16-bit instructions
Browse files Browse the repository at this point in the history
These used to consistently be zeroed pre-gfx9, but gfx9 made the
situation complicated since now some still do and some don't. This
also manages to pick up a few cases that the pattern fails to optimize
away.

We handle some cases with instruction patterns, but some get
through. In particular this improves the integer cases.
  • Loading branch information
arsenm committed Jun 22, 2021
1 parent 317e92a commit 39f8a79
Show file tree
Hide file tree
Showing 19 changed files with 369 additions and 409 deletions.
99 changes: 99 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Expand Up @@ -353,6 +353,105 @@ unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
return 2;
}

/// Returns true if \p Opcode, when writing a 16-bit result to a 32-bit VGPR,
/// implicitly writes zero to the high 16 bits of the destination on this
/// subtarget.
///
/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  // VOP1/VOP2 16-bit instructions. Pre-gfx10 these zero the high 16 bits of
  // the 32-bit destination register.
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  // Three-operand (VOP3 / mac-style) 16-bit instructions: only the gfx8
  // (Volcanic Islands) behavior of zeroing is retained; gfx9 changed these.
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  // Explicitly listed so readers know these were considered: the mix
  // instructions never zero the high bits, same as the default.
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
const Function &F) const {
if (NWaves == 1)
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Expand Up @@ -286,6 +286,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

unsigned getConstantBusLimit(unsigned Opcode) const;

/// Returns true if the result of this instruction, which has a 16-bit result
/// returned in a 32-bit register, implicitly zeroes the high 16 bits, rather
/// than preserving the original value.
bool zeroesHigh16BitsOfDest(unsigned Opcode) const;

bool hasIntClamp() const {
return HasIntClamp;
}
Expand Down
25 changes: 25 additions & 0 deletions llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
Expand Up @@ -91,6 +91,7 @@ class SIFoldOperands : public MachineFunctionPass {
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

bool tryFoldCndMask(MachineInstr &MI) const;
bool tryFoldZeroHighBits(MachineInstr &MI) const;
void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;

const MachineOperand *isClamp(const MachineInstr &MI) const;
Expand Down Expand Up @@ -1188,6 +1189,27 @@ bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
return true;
}

// Try to remove a V_AND_B32 that masks with 0xffff when its register operand
// is defined by an instruction this subtarget already knows zeroes the high
// 16 bits of its destination; the AND is then a no-op and the def's result
// can replace the AND's result directly.
bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
  const unsigned Opc = MI.getOpcode();
  if (Opc != AMDGPU::V_AND_B32_e64 && Opc != AMDGPU::V_AND_B32_e32)
    return false;

  // The mask must be a literal 0xffff in src0 (possibly materialized through
  // a mov that getImmOrMaterializedImm looks through).
  MachineOperand *MaskOp = getImmOrMaterializedImm(*MRI, MI.getOperand(1));
  if (!MaskOp->isImm() || MaskOp->getImm() != 0xffff)
    return false;

  MachineInstr *SrcDef = MRI->getVRegDef(MI.getOperand(2).getReg());
  if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
    return false;

  // The high bits are already zero; forward the def's result and drop the AND.
  MRI->replaceRegWith(MI.getOperand(0).getReg(),
                      SrcDef->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}

void SIFoldOperands::foldInstOperand(MachineInstr &MI,
MachineOperand &OpToFold) const {
// We need mutate the operands of new mov instructions to add implicit
Expand Down Expand Up @@ -1721,6 +1743,9 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
for (auto &MI : make_early_inc_range(*MBB)) {
tryFoldCndMask(MI);

if (tryFoldZeroHighBits(MI))
continue;

if (MI.isRegSequence() && tryFoldRegSequence(MI))
continue;

Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/fmax3.ll
Expand Up @@ -113,7 +113,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(half addrspace(1)* %out, half ad
; VI-NEXT: v_max_f16_e32 v0, v2, v0
; VI-NEXT: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_max_f16_e32 v0, v0, v3
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64

; GFX9: s_waitcnt
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
Expand Up @@ -97,7 +97,7 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 {
; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NNAN-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v1
; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmax_legacy_ugt_v2f16:
Expand Down Expand Up @@ -178,7 +178,7 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 {
; VI-NNAN-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v2
; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v3
; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v4
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmax_legacy_ugt_v3f16:
Expand Down Expand Up @@ -283,8 +283,8 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 {
; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v3
; VI-NNAN-NEXT: v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v2
; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v5
; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v4
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmax_legacy_ugt_v4f16:
Expand Down Expand Up @@ -437,10 +437,10 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 {
; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v5
; VI-NNAN-NEXT: v_max_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v4
; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v11
; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v10
; VI-NNAN-NEXT: v_or_b32_e32 v2, v2, v9
; VI-NNAN-NEXT: v_or_b32_e32 v3, v3, v8
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmax_legacy_ugt_v8f16:
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/fmin3.ll
Expand Up @@ -102,7 +102,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(half addrspace(1)* %out, half ad
; VI-NEXT: v_min_f16_e32 v0, v2, v0
; VI-NEXT: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_min_f16_e32 v0, v0, v3
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64

; GFX9: s_waitcnt
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
Expand Up @@ -98,7 +98,7 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 {
; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NNAN-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v1
; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmin_legacy_ule_v2f16:
Expand Down Expand Up @@ -179,7 +179,7 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 {
; VI-NNAN-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v2
; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v3
; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v4
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmin_legacy_ule_v3f16:
Expand Down Expand Up @@ -284,8 +284,8 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 {
; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v3
; VI-NNAN-NEXT: v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v2
; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v5
; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v4
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmin_legacy_ule_v4f16:
Expand Down Expand Up @@ -438,10 +438,10 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 {
; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v5
; VI-NNAN-NEXT: v_min_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v4
; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v11
; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v10
; VI-NNAN-NEXT: v_or_b32_e32 v2, v2, v9
; VI-NNAN-NEXT: v_or_b32_e32 v3, v3, v8
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmin_legacy_ule_v8f16:
Expand Down
7 changes: 3 additions & 4 deletions llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
Expand Up @@ -73,8 +73,7 @@ entry:
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]
; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_0]]

; GCN: buffer_store_dword v[[R_V2_F16]]

Expand Down Expand Up @@ -141,7 +140,7 @@ entry:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
; SIVI-NOT: v[[R_F16]]
; GFX9-NEXT: v_and_b32_e32 v[[R_F16]], 0xffff, v[[R_F16]]
; GFX9-NOT: v_and_b32
; GCN: buffer_store_dword v[[R_F16]]
define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
i32 addrspace(1)* %r,
Expand All @@ -159,7 +158,7 @@ entry:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]|
; SIVI-NOT: v[[R_F16]]
; GFX9-NEXT: v_and_b32_e32 v[[R_F16]], 0xffff, v[[R_F16]]
; GFX9-NOT: v_and_b32
; GCN: buffer_store_dword v[[R_F16]]
define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
i32 addrspace(1)* %r,
Expand Down

0 comments on commit 39f8a79

Please sign in to comment.