From d599c7a68f20259327573fee7b11192c8a435ef7 Mon Sep 17 00:00:00 2001 From: Adel Ejjeh Date: Thu, 20 Nov 2025 11:21:12 -0600 Subject: [PATCH 1/5] [AMDGPU] Update log lowering to remove contract for AMDGCN backend --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 +- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 3 + llvm/test/CodeGen/AMDGPU/llvm.log.ll | 331 +++++++++++++++++- llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 331 +++++++++++++++++- 4 files changed, 660 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index db890df7c50f9..ffafe06211109 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2766,7 +2766,9 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, EVT VT = Op.getValueType(); SDNodeFlags Flags = Op->getFlags(); SDLoc DL(Op); - + // Our implementation of LOG is not contract safe, so disable instruction + // contraction. + Flags.setAllowContract(false); const bool IsLog10 = Op.getOpcode() == ISD::FLOG10; assert(IsLog10 || Op.getOpcode() == ISD::FLOG); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 1a13b2226ecd6..5c8b720c54761 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3508,6 +3508,9 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, MachineRegisterInfo &MRI = *B.getMRI(); Register Dst = MI.getOperand(0).getReg(); Register X = MI.getOperand(1).getReg(); + // Our implementation of LOG is not contract safe, so disable contraction in + // the flags before reading the field. + MI.clearFlags(MachineInstr::FmContract); unsigned Flags = MI.getFlags(); const LLT Ty = MRI.getType(X); MachineFunction &MF = B.getMF(); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index fc6b2d95b2af8..88f202adba07e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -316,6 +316,309 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ret void } +define amdgpu_kernel void @s_log_contract_f32(ptr addrspace(1) %out, float %in) { +; SI-SDAG-LABEL: s_log_contract_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1 +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s4, -v2 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x3377d1cf +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s4, v3 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: s_log_contract_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dword s0, 
s[4:5], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3377d1cf +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s0, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3f317217, v0 +; SI-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v4 +; SI-GISEL-NEXT: v_fma_f32 v1, v0, v2, v1 +; SI-GISEL-NEXT: v_add_f32_e32 v1, v4, v1 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v3 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-GISEL-NEXT: s_endpgm +; +; VI-SDAG-LABEL: s_log_contract_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 +; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s2 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: s_log_contract_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_and_b32_e32 v2, 0xfffff000, v0 +; VI-GISEL-NEXT: v_sub_f32_e32 v3, v0, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3 +; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 +; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; VI-GISEL-NEXT: 
v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm +; +; GFX900-SDAG-LABEL: s_log_contract_f32: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s6, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 +; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3f317217 +; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3377d1cf +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v1 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s2, -v3 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s3, v4 +; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 +; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] +; GFX900-SDAG-NEXT: s_endpgm +; +; GFX900-GISEL-LABEL: s_log_contract_f32: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3f317217 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf +; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317217, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v2, v0, v2, -v5 +; GFX900-GISEL-NEXT: v_fma_f32 v2, v0, v3, v2 +; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v5, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x41b17218 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX900-GISEL-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: s_log_contract_f32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x41b17218, s1 +; GFX1100-SDAG-NEXT: s_and_b32 s1, s1, exec_lo +; GFX1100-SDAG-NEXT: s_cselect_b32 s1, 32, 0 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s0, s1 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 
0xfff +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| +; GFX1100-SDAG-NEXT: v_fma_f32 v3, 0x3f317217, v1, -v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v3, v1, 0x3377d1cf, v3 +; GFX1100-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: s_log_contract_f32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1100-GISEL-NEXT: s_endpgm +; +; R600-LABEL: s_log_contract_f32: +; R600: ; %bb.0: +; R600-NEXT: ALU 23, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SETGT * T0.W, literal.x, KC0[2].Z, +; R600-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; R600-NEXT: CNDE * T1.W, PV.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE * T1.W, KC0[2].Z, PV.W, +; R600-NEXT: LOG_IEEE * T0.X, PV.W, +; R600-NEXT: AND_INT * T1.W, PS, literal.x, +; R600-NEXT: -4096(nan), 0(0.000000e+00) +; R600-NEXT: ADD * T2.W, T0.X, -PV.W, +; R600-NEXT: MUL_IEEE * T3.W, PV.W, literal.x, +; R600-NEXT: 939916788(3.194618e-05), 0(0.000000e+00) +; R600-NEXT: MULADD_IEEE * T3.W, T1.W, literal.x, PV.W, +; R600-NEXT: 939916788(3.194618e-05), 0(0.000000e+00) +; R600-NEXT: MULADD_IEEE * T2.W, T2.W, literal.x, PV.W, +; R600-NEXT: 1060204544(6.931152e-01), 0(0.000000e+00) +; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PV.W, +; R600-NEXT: SETGT * T2.W, literal.y, |T0.X|, +; R600-NEXT: 1060204544(6.931152e-01), 2139095040(INF) +; R600-NEXT: CNDE T1.W, PS, T0.X, PV.W, +; R600-NEXT: CNDE * T0.W, T0.W, 0.0, literal.x, +; R600-NEXT: 
1102148120(2.218071e+01), 0(0.000000e+00) +; R600-NEXT: ADD T0.X, PV.W, -PS, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: s_log_contract_f32: +; CM: ; %bb.0: +; CM-NEXT: ALU 26, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: SETGT * T0.W, literal.x, KC0[2].Z, +; CM-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; CM-NEXT: CNDE * T1.W, PV.W, 1.0, literal.x, +; CM-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; CM-NEXT: MUL_IEEE * T1.W, KC0[2].Z, PV.W, +; CM-NEXT: LOG_IEEE T0.X, T1.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T1.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T1.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T1.W, +; CM-NEXT: AND_INT * T1.W, PV.X, literal.x, +; CM-NEXT: -4096(nan), 0(0.000000e+00) +; CM-NEXT: ADD * T2.W, T0.X, -PV.W, +; CM-NEXT: MUL_IEEE * T3.W, PV.W, literal.x, +; CM-NEXT: 939916788(3.194618e-05), 0(0.000000e+00) +; CM-NEXT: MULADD_IEEE * T3.W, T1.W, literal.x, PV.W, +; CM-NEXT: 939916788(3.194618e-05), 0(0.000000e+00) +; CM-NEXT: MULADD_IEEE * T2.W, T2.W, literal.x, PV.W, +; CM-NEXT: 1060204544(6.931152e-01), 0(0.000000e+00) +; CM-NEXT: MULADD_IEEE T0.Z, T1.W, literal.x, PV.W, +; CM-NEXT: SETGT * T1.W, literal.y, |T0.X|, +; CM-NEXT: 1060204544(6.931152e-01), 2139095040(INF) +; CM-NEXT: CNDE T0.Z, PV.W, T0.X, PV.Z, +; CM-NEXT: CNDE * T0.W, T0.W, 0.0, literal.x, +; CM-NEXT: 1102148120(2.218071e+01), 0(0.000000e+00) +; CM-NEXT: ADD * T0.X, PV.Z, -PV.W, +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %result = call contract float @llvm.log.f32(float %in) + store float %result, ptr addrspace(1) %out + ret void +} + ; FIXME: We should be able to merge these packets together on Cayman so we ; have a maximum of 4 instructions. 
define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { @@ -6439,6 +6742,8 @@ define half @v_log_f16_fast(half %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log_f16_fast: @@ -7100,6 +7405,10 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) { ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log_v2f16_fast: @@ -7365,6 +7674,12 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) { ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log_v3f16_fast: @@ -7691,20 +8006,28 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log_v4f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index a141bceb3ce86..6dbb5823a2b0f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -316,6 +316,309 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ret void } +define amdgpu_kernel void @s_log10_contract_f32(ptr addrspace(1) %out, float %in) { +; SI-SDAG-LABEL: s_log10_contract_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b +; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a +; SI-SDAG-NEXT: 
s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1 +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s4, -v2 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x3284fbcf +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s4, v3 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: s_log10_contract_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3284fbcf +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s0, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v0 +; SI-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v4 +; SI-GISEL-NEXT: v_fma_f32 v1, v0, v2, v1 +; SI-GISEL-NEXT: v_add_f32_e32 v1, v4, v1 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v3 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-GISEL-NEXT: s_endpgm +; +; VI-SDAG-LABEL: s_log10_contract_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 +; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s2 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: s_log10_contract_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dword 
s0, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_and_b32_e32 v2, 0xfffff000, v0 +; VI-GISEL-NEXT: v_sub_f32_e32 v3, v0, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3 +; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 +; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm +; +; GFX900-SDAG-LABEL: s_log10_contract_f32: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s6, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 +; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3e9a209a +; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3284fbcf +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v1 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s2, -v3 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s3, v4 +; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 +; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] +; GFX900-SDAG-NEXT: s_endpgm +; +; GFX900-GISEL-LABEL: s_log10_contract_f32: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3e9a209a +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf +; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v2, v0, v2, -v5 +; GFX900-GISEL-NEXT: v_fma_f32 v2, v0, v3, v2 +; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v5, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX900-GISEL-NEXT: 
v_mov_b32_e32 v2, 0x411a209b +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX900-GISEL-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: s_log10_contract_f32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x411a209b, s1 +; GFX1100-SDAG-NEXT: s_and_b32 s1, s1, exec_lo +; GFX1100-SDAG-NEXT: s_cselect_b32 s1, 32, 0 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s0, s1 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| +; GFX1100-SDAG-NEXT: v_fma_f32 v3, 0x3e9a209a, v1, -v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v3, v1, 0x3284fbcf, v3 +; GFX1100-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: s_log10_contract_f32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1100-GISEL-NEXT: s_endpgm +; +; R600-LABEL: s_log10_contract_f32: +; R600: ; %bb.0: +; R600-NEXT: ALU 23, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SETGT * T0.W, literal.x, KC0[2].Z, +; 
R600-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; R600-NEXT: CNDE * T1.W, PV.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE * T1.W, KC0[2].Z, PV.W, +; R600-NEXT: LOG_IEEE * T0.X, PV.W, +; R600-NEXT: AND_INT * T1.W, PS, literal.x, +; R600-NEXT: -4096(nan), 0(0.000000e+00) +; R600-NEXT: ADD * T2.W, T0.X, -PV.W, +; R600-NEXT: MUL_IEEE * T3.W, PV.W, literal.x, +; R600-NEXT: 916096251(4.605039e-06), 0(0.000000e+00) +; R600-NEXT: MULADD_IEEE * T3.W, T1.W, literal.x, PV.W, +; R600-NEXT: 916096251(4.605039e-06), 0(0.000000e+00) +; R600-NEXT: MULADD_IEEE * T2.W, T2.W, literal.x, PV.W, +; R600-NEXT: 1050288128(3.010254e-01), 0(0.000000e+00) +; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PV.W, +; R600-NEXT: SETGT * T2.W, literal.y, |T0.X|, +; R600-NEXT: 1050288128(3.010254e-01), 2139095040(INF) +; R600-NEXT: CNDE T1.W, PS, T0.X, PV.W, +; R600-NEXT: CNDE * T0.W, T0.W, 0.0, literal.x, +; R600-NEXT: 1092231323(9.632960e+00), 0(0.000000e+00) +; R600-NEXT: ADD T0.X, PV.W, -PS, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: s_log10_contract_f32: +; CM: ; %bb.0: +; CM-NEXT: ALU 26, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: SETGT * T0.W, literal.x, KC0[2].Z, +; CM-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; CM-NEXT: CNDE * T1.W, PV.W, 1.0, literal.x, +; CM-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; CM-NEXT: MUL_IEEE * T1.W, KC0[2].Z, PV.W, +; CM-NEXT: LOG_IEEE T0.X, T1.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T1.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T1.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T1.W, +; CM-NEXT: AND_INT * T1.W, PV.X, literal.x, +; CM-NEXT: -4096(nan), 0(0.000000e+00) +; CM-NEXT: ADD * T2.W, T0.X, -PV.W, +; CM-NEXT: MUL_IEEE * T3.W, PV.W, literal.x, +; CM-NEXT: 916096251(4.605039e-06), 0(0.000000e+00) +; CM-NEXT: MULADD_IEEE * T3.W, T1.W, literal.x, PV.W, +; CM-NEXT: 916096251(4.605039e-06), 0(0.000000e+00) +; CM-NEXT: MULADD_IEEE * T2.W, T2.W, literal.x, PV.W, +; CM-NEXT: 1050288128(3.010254e-01), 0(0.000000e+00) +; CM-NEXT: MULADD_IEEE T0.Z, T1.W, literal.x, PV.W, +; CM-NEXT: SETGT * T1.W, literal.y, |T0.X|, +; CM-NEXT: 1050288128(3.010254e-01), 2139095040(INF) +; CM-NEXT: CNDE T0.Z, PV.W, T0.X, PV.Z, +; CM-NEXT: CNDE * T0.W, T0.W, 0.0, literal.x, +; CM-NEXT: 1092231323(9.632960e+00), 0(0.000000e+00) +; CM-NEXT: ADD * T0.X, PV.Z, -PV.W, +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %result = call contract float @llvm.log10.f32(float %in) + store float %result, ptr addrspace(1) %out + ret void +} + ; FIXME: We should be able to merge these packets together on Cayman so we ; have a maximum of 4 instructions. 
define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) { @@ -6439,6 +6742,8 @@ define half @v_log10_f16_fast(half %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log10_f16_fast: @@ -7100,6 +7405,10 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) { ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log10_v2f16_fast: @@ -7365,6 +7674,12 @@ define <3 x half> @v_log10_v3f16_fast(<3 x half> %in) { ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log10_v3f16_fast: @@ -7691,20 +8006,28 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log10_v4f16_fast: From 04c43c9998f4267009a9383ec5a6aca93da72f0a Mon Sep 17 00:00:00 2001 From: Adel Ejjeh Date: Thu, 20 Nov 2025 12:57:52 -0600 Subject: [PATCH 2/5] Move flag modification placement --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index ffafe06211109..3705b8e12c6b7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2766,9 +2766,6 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, EVT VT = Op.getValueType(); SDNodeFlags Flags = Op->getFlags(); SDLoc DL(Op); - // 
Our implementation of LOG is not contract safe, so disable instruction - // contraction. - Flags.setAllowContract(false); const bool IsLog10 = Op.getOpcode() == ISD::FLOG10; assert(IsLog10 || Op.getOpcode() == ISD::FLOG); @@ -2807,7 +2804,9 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT); SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT); - + // Our implementation of LOG is not contract safe, so disable instruction + // contraction. + Flags.setAllowContract(false); R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags); SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags); SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags); @@ -2830,7 +2829,9 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst); SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt); SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags); - + // Our implementation of LOG is not contract safe, so disable instruction + // contraction. + Flags.setAllowContract(false); SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags); SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags); SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags); From e61c7b87499d04e6c79143bdf9e9dbaf940be855 Mon Sep 17 00:00:00 2001 From: Adel Ejjeh Date: Thu, 20 Nov 2025 13:51:52 -0600 Subject: [PATCH 3/5] Update comment, apply same fix to global isel, and fix lit tests --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 14 +++++--- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 34 ++++++++++++------- llvm/test/CodeGen/AMDGPU/llvm.log.ll | 28 +++------------ llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 28 +++------------ 4 files changed, 39 insertions(+), 65 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 3705b8e12c6b7..2288d3196d2de 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2804,8 +2804,11 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT); SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT); - // Our implementation of LOG is not contract safe, so disable instruction - // contraction. + // Our implementation of LOG is not contract safe because we generate + // error-correcting summations based on the rounding error of the first + // multiplication below, so contracting the multiply with the final add will + // lead to inaccurate final results. Disable contraction for the expanded + // instructions. Flags.setAllowContract(false); R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags); SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags); @@ -2829,8 +2832,11 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst); SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt); SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags); - // Our implementation of LOG is not contract safe, so disable instruction - // contraction. + // Our implementation of LOG is not contract safe because we generate + // error-correcting summations based on the rounding error of the first + // multiplication below, so contracting the multiply with the final add will + // lead to inaccurate final results. Disable contraction for the expanded + // instructions. 
Flags.setAllowContract(false); SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags); SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 5c8b720c54761..77f0e6448458e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3508,9 +3508,6 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, MachineRegisterInfo &MRI = *B.getMRI(); Register Dst = MI.getOperand(0).getReg(); Register X = MI.getOperand(1).getReg(); - // Our implementation of LOG is not contract safe, so disable contraction in - // the flags before reading the field. - MI.clearFlags(MachineInstr::FmContract); unsigned Flags = MI.getFlags(); const LLT Ty = MRI.getType(X); MachineFunction &MF = B.getMF(); @@ -3554,12 +3551,17 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log); auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log); - - R = B.buildFMul(Ty, Y, C, Flags).getReg(0); - auto NegR = B.buildFNeg(Ty, R, Flags); - auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags); - auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags); - R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0); + // Our implementation of LOG is not contract safe because we generate + // error-correcting summations based on the rounding error of the first + // multiplication below, so contracting the multiply with the final add will + // lead to inaccurate final results. Disable contraction for the expanded + // instructions. + auto NewFlags = Flags & ~(MachineInstr::FmContract); + R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0); + auto NegR = B.buildFNeg(Ty, R, NewFlags); + auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags); + auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags); + R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0); } else { // ch+ct is ln(2)/ln(10) to more than 36 bits const float ch_log10 = 0x1.344000p-2f; @@ -3575,12 +3577,18 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, auto MaskConst = B.buildConstant(Ty, 0xfffff000); auto YH = B.buildAnd(Ty, Y, MaskConst); auto YT = B.buildFSub(Ty, Y, YH, Flags); - auto YTCT = B.buildFMul(Ty, YT, CT, Flags); + // Our implementation of LOG is not contract safe because we generate + // error-correcting summations based on the rounding error of the first + // multiplication below, so contracting the multiply with the final add will + // lead to inaccurate final results. Disable contraction for the expanded + // instructions. 
+ auto NewFlags = Flags & ~(MachineInstr::FmContract); + auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags); Register Mad0 = - getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags); - Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags); - R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags); + getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags); + Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags); + R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags); } const bool IsFiniteOnly = diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index 88f202adba07e..33c8829a6d42c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -6742,8 +6742,6 @@ define half @v_log_f16_fast(half %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log_f16_fast: @@ -7405,10 +7403,6 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) { ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log_v2f16_fast: @@ -7674,12 +7668,6 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) { ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log_v3f16_fast: @@ -8006,28 +7994,20 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log_v4f16_fast: diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 6dbb5823a2b0f..16a784f534f8c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -6742,8 +6742,6 @@ define half @v_log10_f16_fast(half %in) { ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log10_f16_fast: @@ -7405,10 +7403,6 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) { ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log10_v2f16_fast: @@ -7674,12 +7668,6 @@ define <3 x half> @v_log10_v3f16_fast(<3 x half> %in) { ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log10_v3f16_fast: @@ -8006,28 +7994,20 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) { ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2 +; SI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log10_v4f16_fast: From 19c16487fba7f2f58e028fb773d0160933c5ff1d Mon Sep 17 00:00:00 2001 From: Adel Ejjeh Date: Thu, 20 Nov 2025 17:22:11 -0600 Subject: [PATCH 4/5] Simplify comments --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10 ++++------ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 14 ++++++-------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 2288d3196d2de..b87c8fc53915a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2805,9 +2805,8 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT); SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT); // Our implementation of LOG is not contract safe because we generate - // error-correcting summations based on the rounding error of the first - // multiplication below, so contracting the multiply with the final add will - // lead to inaccurate final results. Disable contraction for the expanded + // error-correcting summations for which contraction may lead to an increase + // in the error of the approximation. Disable contraction for the expanded // instructions. Flags.setAllowContract(false); R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags); @@ -2833,9 +2832,8 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt); SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags); // Our implementation of LOG is not contract safe because we generate - // error-correcting summations based on the rounding error of the first - // multiplication below, so contracting the multiply with the final add will - // lead to inaccurate final results. Disable contraction for the expanded + // error-correcting summations for which contraction may lead to an increase + // in the error of the approximation. Disable contraction for the expanded // instructions. Flags.setAllowContract(false); SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 77f0e6448458e..3fbc77c65b876 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3552,10 +3552,9 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log); auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log); // Our implementation of LOG is not contract safe because we generate - // error-correcting summations based on the rounding error of the first - // multiplication below, so contracting the multiply with the final add will - // lead to inaccurate final results. Disable contraction for the expanded - // instructions. + // error-correcting summations for which contraction may lead to an + // increase in the error of the approximation. Disable contraction for the + // expanded instructions. auto NewFlags = Flags & ~(MachineInstr::FmContract); R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0); auto NegR = B.buildFNeg(Ty, R, NewFlags); @@ -3578,10 +3577,9 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, auto YH = B.buildAnd(Ty, Y, MaskConst); auto YT = B.buildFSub(Ty, Y, YH, Flags); // Our implementation of LOG is not contract safe because we generate - // error-correcting summations based on the rounding error of the first - // multiplication below, so contracting the multiply with the final add will - // lead to inaccurate final results. Disable contraction for the expanded - // instructions. + // error-correcting summations for which contraction may lead to an + // increase in the error of the approximation. Disable contraction for the + // expanded instructions. 
auto NewFlags = Flags & ~(MachineInstr::FmContract); auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags); From 1b6c2589f66bc1a49a9c6fc198522fc3b7be7257 Mon Sep 17 00:00:00 2001 From: Adel Ejjeh Date: Fri, 21 Nov 2025 08:51:11 -0600 Subject: [PATCH 5/5] Update comments --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 14 ++++++-------- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 14 ++++++-------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index b87c8fc53915a..f1f6ff619cc71 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2804,10 +2804,9 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT); SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT); - // Our implementation of LOG is not contract safe because we generate - // error-correcting summations for which contraction may lead to an increase - // in the error of the approximation. Disable contraction for the expanded - // instructions. + // Our implementation of LOG is not contract safe because we add correction + // terms for which contraction may lead to an increase in the error of the + // approximation. Disable contraction for the expanded instructions. Flags.setAllowContract(false); R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags); SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags); @@ -2831,10 +2830,9 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst); SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt); SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags); - // Our implementation of LOG is not contract safe because we generate - // error-correcting summations for which contraction may lead to an increase - // in the error of the approximation. Disable contraction for the expanded - // instructions. + // Our implementation of LOG is not contract safe because we add correction + // terms for which contraction may lead to an increase in the error of the + // approximation. Disable contraction for the expanded instructions. Flags.setAllowContract(false); SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags); SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 3fbc77c65b876..7443ba05961f1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3551,10 +3551,9 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log); auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log); - // Our implementation of LOG is not contract safe because we generate - // error-correcting summations for which contraction may lead to an - // increase in the error of the approximation. Disable contraction for the - // expanded instructions. + // Our implementation of LOG is not contract safe because we add correction + // terms for which contraction may lead to an increase in the error of the + // approximation. Disable contraction for the expanded instructions. 
auto NewFlags = Flags & ~(MachineInstr::FmContract); R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0); auto NegR = B.buildFNeg(Ty, R, NewFlags); @@ -3576,10 +3575,9 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, auto MaskConst = B.buildConstant(Ty, 0xfffff000); auto YH = B.buildAnd(Ty, Y, MaskConst); auto YT = B.buildFSub(Ty, Y, YH, Flags); - // Our implementation of LOG is not contract safe because we generate - // error-correcting summations for which contraction may lead to an - // increase in the error of the approximation. Disable contraction for the - // expanded instructions. + // Our implementation of LOG is not contract safe because we add correction + // terms for which contraction may lead to an increase in the error of the + // approximation. Disable contraction for the expanded instructions. auto NewFlags = Flags & ~(MachineInstr::FmContract); auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
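
The rationale in the patch comments can be seen in a small standalone example. The sketch below is not part of the patch series; it mirrors the compensated multiply that LowerFLOGCommon and legalizeFlogCommon expand to, using the f32 constants that appear in the llvm.log SDAG checks (0x3f317217 and 0x3377d1cf for ln(2)); the helper names are illustrative. Build it with -ffp-contract=off so the compiler does not contract the final add on its own: once that add is fused, the exact product Y*C re-enters the sum and the recovered rounding error is no longer cancelled.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Reinterpret an IEEE-754 bit pattern as a float, so the constants below
// match the literals in the test checks exactly.
static float from_bits(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

// Compensated multiply used by the log expansion: approximate Y * (C + CC),
// where C carries the high bits of the conversion constant and CC the low
// bits. Err recovers the exact rounding error of the leading product.
static float logScaleCompensated(float Y, float C, float CC) {
  float R = Y * C;                    // rounded leading product
  float Err = std::fmaf(Y, C, -R);    // exact rounding error of R
  float Corr = std::fmaf(Y, CC, Err); // low-order term plus that error
  return R + Corr;                    // the add that must not be contracted
}

// What a contract-enabled combiner may turn the final add into: the exact
// product Y*C re-enters the sum, so Err no longer cancels the rounding
// error of R and the result can move by up to that error.
static float logScaleContracted(float Y, float C, float CC) {
  float R = Y * C;
  float Err = std::fmaf(Y, C, -R);
  float Corr = std::fmaf(Y, CC, Err);
  return std::fmaf(Y, C, Corr);
}

int main() {
  const float C = from_bits(0x3f317217u);  // high part of ln(2), as in the f32 checks
  const float CC = from_bits(0x3377d1cfu); // low-order correction term
  // Report inputs where contraction changes the last bit of the result.
  for (float Y = 1.0f; Y < 2.0f; Y += 0x1.0p-6f) {
    float A = logScaleCompensated(Y, C, CC);
    float B = logScaleContracted(Y, C, CC);
    if (A != B)
      std::printf("Y=%a: %a vs %a\n", Y, A, B);
  }
  return 0;
}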