diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index db890df7c50f9..b87c8fc53915a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2766,7 +2766,6 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, EVT VT = Op.getValueType(); SDNodeFlags Flags = Op->getFlags(); SDLoc DL(Op); - const bool IsLog10 = Op.getOpcode() == ISD::FLOG10; assert(IsLog10 || Op.getOpcode() == ISD::FLOG); @@ -2805,7 +2804,11 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT); SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT); - + // Our implementation of LOG is not contract safe because we generate + // error-correcting summations for which contraction may lead to an increase + // in the error of the approximation. Disable contraction for the expanded + // instructions. + Flags.setAllowContract(false); R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags); SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags); SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags); @@ -2828,7 +2831,11 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst); SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt); SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags); - + // Our implementation of LOG is not contract safe because we generate + // error-correcting summations for which contraction may lead to an increase + // in the error of the approximation. Disable contraction for the expanded + // instructions. + Flags.setAllowContract(false); SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags); SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags); SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 1a13b2226ecd6..3fbc77c65b876 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3551,12 +3551,16 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log); auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log); - - R = B.buildFMul(Ty, Y, C, Flags).getReg(0); - auto NegR = B.buildFNeg(Ty, R, Flags); - auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags); - auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags); - R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0); + // Our implementation of LOG is not contract safe because we generate + // error-correcting summations for which contraction may lead to an + // increase in the error of the approximation. Disable contraction for the + // expanded instructions. + auto NewFlags = Flags & ~(MachineInstr::FmContract); + R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0); + auto NegR = B.buildFNeg(Ty, R, NewFlags); + auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags); + auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags); + R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0); } else { // ch+ct is ln(2)/ln(10) to more than 36 bits const float ch_log10 = 0x1.344000p-2f; @@ -3572,12 +3576,17 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, auto MaskConst = B.buildConstant(Ty, 0xfffff000); auto YH = B.buildAnd(Ty, Y, MaskConst); auto YT = B.buildFSub(Ty, Y, YH, Flags); - auto YTCT = B.buildFMul(Ty, YT, CT, Flags); + // Our implementation of LOG is not contract safe because we generate + // error-correcting summations for which contraction may lead to an + // increase in the error of the approximation. Disable contraction for the + // expanded instructions. + auto NewFlags = Flags & ~(MachineInstr::FmContract); + auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags); Register Mad0 = - getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags); - Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags); - R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags); + getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags); + Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags); + R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags); } const bool IsFiniteOnly = diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index fc6b2d95b2af8..33c8829a6d42c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -316,6 +316,309 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ret void } +define amdgpu_kernel void @s_log_contract_f32(ptr addrspace(1) %out, float %in) { +; SI-SDAG-LABEL: s_log_contract_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1 +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s4, -v2 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x3377d1cf +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s4, v3 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: s_log_contract_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3377d1cf +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s0, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3f317217, v0 +; SI-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v4 +; SI-GISEL-NEXT: v_fma_f32 v1, v0, v2, v1 +; SI-GISEL-NEXT: v_add_f32_e32 v1, v4, v1 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v3 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-GISEL-NEXT: s_endpgm +; +; VI-SDAG-LABEL: s_log_contract_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 +; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s2 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: s_log_contract_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_and_b32_e32 v2, 0xfffff000, v0 +; VI-GISEL-NEXT: v_sub_f32_e32 v3, v0, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3 +; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 +; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm +; +; GFX900-SDAG-LABEL: s_log_contract_f32: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s6, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 +; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3f317217 +; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3377d1cf +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v1 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s2, -v3 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s3, v4 +; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 +; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] +; GFX900-SDAG-NEXT: s_endpgm +; +; GFX900-GISEL-LABEL: s_log_contract_f32: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3f317217 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf +; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317217, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v2, v0, v2, -v5 +; GFX900-GISEL-NEXT: v_fma_f32 v2, v0, v3, v2 +; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v5, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x41b17218 +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX900-GISEL-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: s_log_contract_f32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x41b17218, s1 +; GFX1100-SDAG-NEXT: s_and_b32 s1, s1, exec_lo +; GFX1100-SDAG-NEXT: s_cselect_b32 s1, 32, 0 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s0, s1 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| +; GFX1100-SDAG-NEXT: v_fma_f32 v3, 0x3f317217, v1, -v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v3, v1, 0x3377d1cf, v3 +; GFX1100-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: s_log_contract_f32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1100-GISEL-NEXT: s_endpgm +; +; R600-LABEL: s_log_contract_f32: +; R600: ; %bb.0: +; R600-NEXT: ALU 23, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SETGT * T0.W, literal.x, KC0[2].Z, +; R600-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; R600-NEXT: CNDE * T1.W, PV.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE * T1.W, KC0[2].Z, PV.W, +; R600-NEXT: LOG_IEEE * T0.X, PV.W, +; R600-NEXT: AND_INT * T1.W, PS, literal.x, +; R600-NEXT: -4096(nan), 0(0.000000e+00) +; R600-NEXT: ADD * T2.W, T0.X, -PV.W, +; R600-NEXT: MUL_IEEE * T3.W, PV.W, literal.x, +; R600-NEXT: 939916788(3.194618e-05), 0(0.000000e+00) +; R600-NEXT: MULADD_IEEE * T3.W, T1.W, literal.x, PV.W, +; R600-NEXT: 939916788(3.194618e-05), 0(0.000000e+00) +; R600-NEXT: MULADD_IEEE * T2.W, T2.W, literal.x, PV.W, +; R600-NEXT: 1060204544(6.931152e-01), 0(0.000000e+00) +; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PV.W, +; R600-NEXT: SETGT * T2.W, literal.y, |T0.X|, +; R600-NEXT: 1060204544(6.931152e-01), 2139095040(INF) +; R600-NEXT: CNDE T1.W, PS, T0.X, PV.W, +; R600-NEXT: CNDE * T0.W, T0.W, 0.0, literal.x, +; R600-NEXT: 1102148120(2.218071e+01), 0(0.000000e+00) +; R600-NEXT: ADD T0.X, PV.W, -PS, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: s_log_contract_f32: +; CM: ; %bb.0: +; CM-NEXT: ALU 26, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: SETGT * T0.W, literal.x, KC0[2].Z, +; CM-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; CM-NEXT: CNDE * T1.W, PV.W, 1.0, literal.x, +; CM-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; CM-NEXT: MUL_IEEE * T1.W, KC0[2].Z, PV.W, +; CM-NEXT: LOG_IEEE T0.X, T1.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T1.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T1.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T1.W, +; CM-NEXT: AND_INT * T1.W, PV.X, literal.x, +; CM-NEXT: -4096(nan), 0(0.000000e+00) +; CM-NEXT: ADD * T2.W, T0.X, -PV.W, +; CM-NEXT: MUL_IEEE * T3.W, PV.W, literal.x, +; CM-NEXT: 939916788(3.194618e-05), 0(0.000000e+00) +; CM-NEXT: MULADD_IEEE * T3.W, T1.W, literal.x, PV.W, +; CM-NEXT: 939916788(3.194618e-05), 0(0.000000e+00) +; CM-NEXT: MULADD_IEEE * T2.W, T2.W, literal.x, PV.W, +; CM-NEXT: 1060204544(6.931152e-01), 0(0.000000e+00) +; CM-NEXT: MULADD_IEEE T0.Z, T1.W, literal.x, PV.W, +; CM-NEXT: SETGT * T1.W, literal.y, |T0.X|, +; CM-NEXT: 1060204544(6.931152e-01), 2139095040(INF) +; CM-NEXT: CNDE T0.Z, PV.W, T0.X, PV.Z, +; CM-NEXT: CNDE * T0.W, T0.W, 0.0, literal.x, +; CM-NEXT: 1102148120(2.218071e+01), 0(0.000000e+00) +; CM-NEXT: ADD * T0.X, PV.Z, -PV.W, +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %result = call contract float @llvm.log.f32(float %in) + store float %result, ptr addrspace(1) %out + ret void +} + ; FIXME: We should be able to merge these packets together on Cayman so we ; have a maximum of 4 instructions. define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index a141bceb3ce86..16a784f534f8c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -316,6 +316,309 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ret void } +define amdgpu_kernel void @s_log10_contract_f32(ptr addrspace(1) %out, float %in) { +; SI-SDAG-LABEL: s_log10_contract_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b +; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; SI-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1 +; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1 +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s4, -v2 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x3284fbcf +; SI-SDAG-NEXT: v_fma_f32 v3, v1, s4, v3 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 +; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: s_log10_contract_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3284fbcf +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s0, v0 +; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3e9a209a, v0 +; SI-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v4 +; SI-GISEL-NEXT: v_fma_f32 v1, v0, v2, v1 +; SI-GISEL-NEXT: v_add_f32_e32 v1, v4, v1 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v3 +; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-GISEL-NEXT: s_endpgm +; +; VI-SDAG-LABEL: s_log10_contract_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 +; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s2 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: s_log10_contract_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 +; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_and_b32_e32 v2, 0xfffff000, v0 +; VI-GISEL-NEXT: v_sub_f32_e32 v3, v0, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3 +; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 +; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b +; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm +; +; GFX900-SDAG-LABEL: s_log10_contract_f32: +; GFX900-SDAG: ; %bb.0: +; GFX900-SDAG-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b +; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s6, v1 +; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 +; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3e9a209a +; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3284fbcf +; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209a, v1 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s2, -v3 +; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s3, v4 +; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 +; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s2 +; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] +; GFX900-SDAG-NEXT: s_endpgm +; +; GFX900-GISEL-LABEL: s_log10_contract_f32: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3e9a209a +; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf +; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 +; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, 0x3e9a209a, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v2, v0, v2, -v5 +; GFX900-GISEL-NEXT: v_fma_f32 v2, v0, v3, v2 +; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v5, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4 +; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x411a209b +; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX900-GISEL-NEXT: s_endpgm +; +; GFX1100-SDAG-LABEL: s_log10_contract_f32: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s0 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x411a209b, s1 +; GFX1100-SDAG-NEXT: s_and_b32 s1, s1, exec_lo +; GFX1100-SDAG-NEXT: s_cselect_b32 s1, 32, 0 +; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s0, s1 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 +; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v1 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| +; GFX1100-SDAG-NEXT: v_fma_f32 v3, 0x3e9a209a, v1, -v2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_fmamk_f32 v3, v1, 0x3284fbcf, v3 +; GFX1100-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1100-SDAG-NEXT: s_endpgm +; +; GFX1100-GISEL-LABEL: s_log10_contract_f32: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 +; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s2 +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-GISEL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1100-GISEL-NEXT: s_endpgm +; +; R600-LABEL: s_log10_contract_f32: +; R600: ; %bb.0: +; R600-NEXT: ALU 23, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SETGT * T0.W, literal.x, KC0[2].Z, +; R600-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; R600-NEXT: CNDE * T1.W, PV.W, 1.0, literal.x, +; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; R600-NEXT: MUL_IEEE * T1.W, KC0[2].Z, PV.W, +; R600-NEXT: LOG_IEEE * T0.X, PV.W, +; R600-NEXT: AND_INT * T1.W, PS, literal.x, +; R600-NEXT: -4096(nan), 0(0.000000e+00) +; R600-NEXT: ADD * T2.W, T0.X, -PV.W, +; R600-NEXT: MUL_IEEE * T3.W, PV.W, literal.x, +; R600-NEXT: 916096251(4.605039e-06), 0(0.000000e+00) +; R600-NEXT: MULADD_IEEE * T3.W, T1.W, literal.x, PV.W, +; R600-NEXT: 916096251(4.605039e-06), 0(0.000000e+00) +; R600-NEXT: MULADD_IEEE * T2.W, T2.W, literal.x, PV.W, +; R600-NEXT: 1050288128(3.010254e-01), 0(0.000000e+00) +; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PV.W, +; R600-NEXT: SETGT * T2.W, literal.y, |T0.X|, +; R600-NEXT: 1050288128(3.010254e-01), 2139095040(INF) +; R600-NEXT: CNDE T1.W, PS, T0.X, PV.W, +; R600-NEXT: CNDE * T0.W, T0.W, 0.0, literal.x, +; R600-NEXT: 1092231323(9.632960e+00), 0(0.000000e+00) +; R600-NEXT: ADD T0.X, PV.W, -PS, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: s_log10_contract_f32: +; CM: ; %bb.0: +; CM-NEXT: ALU 26, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: SETGT * T0.W, literal.x, KC0[2].Z, +; CM-NEXT: 8388608(1.175494e-38), 0(0.000000e+00) +; CM-NEXT: CNDE * T1.W, PV.W, 1.0, literal.x, +; CM-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00) +; CM-NEXT: MUL_IEEE * T1.W, KC0[2].Z, PV.W, +; CM-NEXT: LOG_IEEE T0.X, T1.W, +; CM-NEXT: LOG_IEEE T0.Y (MASKED), T1.W, +; CM-NEXT: LOG_IEEE T0.Z (MASKED), T1.W, +; CM-NEXT: LOG_IEEE * T0.W (MASKED), T1.W, +; CM-NEXT: AND_INT * T1.W, PV.X, literal.x, +; CM-NEXT: -4096(nan), 0(0.000000e+00) +; CM-NEXT: ADD * T2.W, T0.X, -PV.W, +; CM-NEXT: MUL_IEEE * T3.W, PV.W, literal.x, +; CM-NEXT: 916096251(4.605039e-06), 0(0.000000e+00) +; CM-NEXT: MULADD_IEEE * T3.W, T1.W, literal.x, PV.W, +; CM-NEXT: 916096251(4.605039e-06), 0(0.000000e+00) +; CM-NEXT: MULADD_IEEE * T2.W, T2.W, literal.x, PV.W, +; CM-NEXT: 1050288128(3.010254e-01), 0(0.000000e+00) +; CM-NEXT: MULADD_IEEE T0.Z, T1.W, literal.x, PV.W, +; CM-NEXT: SETGT * T1.W, literal.y, |T0.X|, +; CM-NEXT: 1050288128(3.010254e-01), 2139095040(INF) +; CM-NEXT: CNDE T0.Z, PV.W, T0.X, PV.Z, +; CM-NEXT: CNDE * T0.W, T0.W, 0.0, literal.x, +; CM-NEXT: 1092231323(9.632960e+00), 0(0.000000e+00) +; CM-NEXT: ADD * T0.X, PV.Z, -PV.W, +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %result = call contract float @llvm.log10.f32(float %in) + store float %result, ptr addrspace(1) %out + ret void +} + ; FIXME: We should be able to merge these packets together on Cayman so we ; have a maximum of 4 instructions. define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) {