diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 5c1f717694a4c..216caf5db3dc6 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -94,6 +94,13 @@ static cl::opt<bool> UsePrecSqrtF32( cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true)); +/// Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it +/// does NOT use lg2.approx for log2, so this is disabled by default. +static cl::opt<bool> UseApproxLog2F32( + "nvptx-approx-log2f32", + cl::desc("NVPTX Specific: whether to use lg2.approx for log2"), + cl::init(false)); + static cl::opt<bool> ForceMinByValParamAlign( "nvptx-force-min-byval-param-align", cl::Hidden, cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" @@ -520,6 +527,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, case ISD::FMINIMUM: IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70; break; + case ISD::FEXP2: + IsOpSupported &= STI.getSmVersion() >= 75 && STI.getPTXVersion() >= 70; + break; } setOperationAction(Op, VT, IsOpSupported ? Action : NoF16Action); }; @@ -968,7 +978,26 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::CopyToReg, MVT::i128, Custom); setOperationAction(ISD::CopyFromReg, MVT::i128, Custom); - // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate. + // FEXP2 support: + // - f32 + // - f16/f16x2 (sm_75+, PTX 7.0+) + // - bf16/bf16x2 (sm_90+, PTX 7.8+) + // When f16/bf16 types aren't supported, they are promoted/expanded to f32.
+ setOperationAction(ISD::FEXP2, MVT::f32, Legal); + setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote); + setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand); + setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote); + setBF16OperationAction(ISD::FEXP2, MVT::v2bf16, Legal, Expand); + + // FLOG2 supports f32 only + // f16/bf16 types aren't supported, but they are promoted/expanded to f32. + if (UseApproxLog2F32) { + setOperationAction(ISD::FLOG2, MVT::f32, Legal); + setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32); + setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32); + setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16}, Expand); + } + // No FPOW or FREM in PTX. // Now deduce the information based on the above mentioned diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index c3e72d6ce3a3f..ceb6d55b384ea 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -562,6 +562,18 @@ multiclass F2_Support_Half { } +// Variant where only .ftz.bf16 is supported. +multiclass F2_Support_Half_BF<string OpcStr, SDNode OpNode> { + def bf16_ftz : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a), + OpcStr # ".ftz.bf16 \t$dst, $a;", + [(set bf16:$dst, (OpNode bf16:$a))]>, + Requires<[hasSM<90>, hasPTX<78>]>; + def bf16x2_ftz: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a), + OpcStr # ".ftz.bf16x2 \t$dst, $a;", + [(set v2bf16:$dst, (OpNode v2bf16:$a))]>, + Requires<[hasSM<90>, hasPTX<78>]>; +} + //===----------------------------------------------------------------------===// // NVPTX Instructions.
//===----------------------------------------------------------------------===// @@ -1193,6 +1205,8 @@ defm FNEG_H: F2_Support_Half<"neg", fneg>; defm FSQRT : F2<"sqrt.rn", fsqrt>; +defm FEXP2_H: F2_Support_Half_BF<"ex2.approx", fexp2>; + // // F16 NEG // diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 8ede1ec4f20dc..9f754b7b2dfcd 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1255,11 +1255,21 @@ def INT_NVVM_EX2_APPROX_F : F_MATH_1<"ex2.approx.f32 \t$dst, $src0;", Float32Regs, Float32Regs, int_nvvm_ex2_approx_f>; def INT_NVVM_EX2_APPROX_D : F_MATH_1<"ex2.approx.f64 \t$dst, $src0;", Float64Regs, Float64Regs, int_nvvm_ex2_approx_d>; + def INT_NVVM_EX2_APPROX_F16 : F_MATH_1<"ex2.approx.f16 \t$dst, $src0;", Int16Regs, Int16Regs, int_nvvm_ex2_approx_f16, [hasPTX<70>, hasSM<75>]>; def INT_NVVM_EX2_APPROX_F16X2 : F_MATH_1<"ex2.approx.f16x2 \t$dst, $src0;", Int32Regs, Int32Regs, int_nvvm_ex2_approx_f16x2, [hasPTX<70>, hasSM<75>]>; +def : Pat<(fexp2 f32:$a), + (INT_NVVM_EX2_APPROX_FTZ_F $a)>, Requires<[doF32FTZ]>; +def : Pat<(fexp2 f32:$a), + (INT_NVVM_EX2_APPROX_F $a)>, Requires<[doNoF32FTZ]>; +def : Pat<(fexp2 f16:$a), + (INT_NVVM_EX2_APPROX_F16 $a)>, Requires<[useFP16Math]>; +def : Pat<(fexp2 v2f16:$a), + (INT_NVVM_EX2_APPROX_F16X2 $a)>, Requires<[useFP16Math]>; + def INT_NVVM_LG2_APPROX_FTZ_F : F_MATH_1<"lg2.approx.ftz.f32 \t$dst, $src0;", Float32Regs, Float32Regs, int_nvvm_lg2_approx_ftz_f>; def INT_NVVM_LG2_APPROX_F : F_MATH_1<"lg2.approx.f32 \t$dst, $src0;", @@ -1267,6 +1277,11 @@ def INT_NVVM_LG2_APPROX_F : F_MATH_1<"lg2.approx.f32 \t$dst, $src0;", def INT_NVVM_LG2_APPROX_D : F_MATH_1<"lg2.approx.f64 \t$dst, $src0;", Float64Regs, Float64Regs, int_nvvm_lg2_approx_d>; +def : Pat<(flog2 f32:$a), (INT_NVVM_LG2_APPROX_FTZ_F $a)>, + Requires<[doF32FTZ]>; +def : Pat<(flog2 f32:$a), (INT_NVVM_LG2_APPROX_F $a)>, + Requires<[doNoF32FTZ]>; + // // Sin Cos // diff 
--git a/llvm/test/CodeGen/NVPTX/f16-ex2.ll b/llvm/test/CodeGen/NVPTX/f16-ex2.ll index df3a36db52b1a..ae70946b4b1dc 100644 --- a/llvm/test/CodeGen/NVPTX/f16-ex2.ll +++ b/llvm/test/CodeGen/NVPTX/f16-ex2.ll @@ -1,21 +1,37 @@ -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_75 -mattr=+ptx70 | FileCheck %s -; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %} +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK-FP16 %s +; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %} +target triple = "nvptx64-nvidia-cuda" declare half @llvm.nvvm.ex2.approx.f16(half) declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>) -; CHECK-LABEL: exp2_half -define half @exp2_half(half %0) { - ; CHECK-NOT: call - ; CHECK: ex2.approx.f16 - %res = call half @llvm.nvvm.ex2.approx.f16(half %0); +; CHECK-LABEL: ex2_half +define half @ex2_half(half %0) { +; CHECK-FP16-LABEL: ex2_half( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .b16 %rs<3>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: +; CHECK-FP16-NEXT: ld.param.b16 %rs1, [ex2_half_param_0]; +; CHECK-FP16-NEXT: ex2.approx.f16 %rs2, %rs1; +; CHECK-FP16-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-FP16-NEXT: ret; + %res = call half @llvm.nvvm.ex2.approx.f16(half %0) ret half %res } -; CHECK-LABEL: exp2_2xhalf -define <2 x half> @exp2_2xhalf(<2 x half> %0) { - ; CHECK-NOT: call - ; CHECK: ex2.approx.f16x2 - %res = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> %0); +; CHECK-LABEL: ex2_2xhalf +define <2 x half> @ex2_2xhalf(<2 x half> %0) { +; CHECK-FP16-LABEL: ex2_2xhalf( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .b32 %r<3>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: +; CHECK-FP16-NEXT: ld.param.b32 %r1, [ex2_2xhalf_param_0]; +; CHECK-FP16-NEXT: ex2.approx.f16x2 %r2, %r1; +; CHECK-FP16-NEXT: st.param.b32 
[func_retval0], %r2; +; CHECK-FP16-NEXT: ret; + %res = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> %0) ret <2 x half> %res } diff --git a/llvm/test/CodeGen/NVPTX/f32-ex2.ll b/llvm/test/CodeGen/NVPTX/f32-ex2.ll new file mode 100644 index 0000000000000..c9eff2a8ff17d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/f32-ex2.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %} +target triple = "nvptx-nvidia-cuda" + +declare float @llvm.nvvm.ex2.approx.f(float) + +; CHECK-LABEL: ex2_float +define float @ex2_float(float %0) { +; CHECK-LABEL: ex2_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [ex2_float_param_0]; +; CHECK-NEXT: ex2.approx.f32 %f2, %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %res = call float @llvm.nvvm.ex2.approx.f(float %0) + ret float %res +} + +; CHECK-LABEL: ex2_float_ftz +define float @ex2_float_ftz(float %0) { +; CHECK-LABEL: ex2_float_ftz( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [ex2_float_ftz_param_0]; +; CHECK-NEXT: ex2.approx.ftz.f32 %f2, %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %res = call float @llvm.nvvm.ex2.approx.ftz.f(float %0) + ret float %res +} diff --git a/llvm/test/CodeGen/NVPTX/f32-lg2.ll b/llvm/test/CodeGen/NVPTX/f32-lg2.ll new file mode 100644 index 0000000000000..13324c6860926 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/f32-lg2.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_20 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas %{ llc < 
%s -mcpu=sm_20 -mattr=+ptx32 | %ptxas-verify %} +target triple = "nvptx-nvidia-cuda" + +declare float @llvm.nvvm.lg2.approx.f(float) +declare float @llvm.nvvm.lg2.approx.ftz.f(float) + +; CHECK-LABEL: lg2_float +define float @lg2_float(float %0) { +; CHECK-LABEL: lg2_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [lg2_float_param_0]; +; CHECK-NEXT: lg2.approx.f32 %f2, %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %res = call float @llvm.nvvm.lg2.approx.f(float %0) + ret float %res +} + +; CHECK-LABEL: lg2_float_ftz +define float @lg2_float_ftz(float %0) { +; CHECK-LABEL: lg2_float_ftz( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [lg2_float_ftz_param_0]; +; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %res = call float @llvm.nvvm.lg2.approx.ftz.f(float %0) + ret float %res +} diff --git a/llvm/test/CodeGen/NVPTX/fexp2.ll b/llvm/test/CodeGen/NVPTX/fexp2.ll new file mode 100644 index 0000000000000..7e485dca65764 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fexp2.ll @@ -0,0 +1,414 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s +; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK-FP16 %s +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK-BF16 %s +; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %} +; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %} +; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} +target triple = "nvptx64-nvidia-cuda" + +; --- f32 --- + +; CHECK-LABEL: exp2_test +define float @exp2_test(float %in) { +; 
CHECK-LABEL: exp2_test( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.f32 %f1, [exp2_test_param_0]; +; CHECK-NEXT: ex2.approx.f32 %f2, %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; +; +; CHECK-FP16-LABEL: exp2_test( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .f32 %f<3>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: // %entry +; CHECK-FP16-NEXT: ld.param.f32 %f1, [exp2_test_param_0]; +; CHECK-FP16-NEXT: ex2.approx.f32 %f2, %f1; +; CHECK-FP16-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-FP16-NEXT: ret; +; +; CHECK-BF16-LABEL: exp2_test( +; CHECK-BF16: { +; CHECK-BF16-NEXT: .reg .f32 %f<3>; +; CHECK-BF16-EMPTY: +; CHECK-BF16-NEXT: // %bb.0: // %entry +; CHECK-BF16-NEXT: ld.param.f32 %f1, [exp2_test_param_0]; +; CHECK-BF16-NEXT: ex2.approx.f32 %f2, %f1; +; CHECK-BF16-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-BF16-NEXT: ret; +entry: + %exp2 = call float @llvm.exp2.f32(float %in) + ret float %exp2 +} + +; CHECK-LABEL: exp2_ftz_test +define float @exp2_ftz_test(float %in) #0 { +; CHECK-LABEL: exp2_ftz_test( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.f32 %f1, [exp2_ftz_test_param_0]; +; CHECK-NEXT: ex2.approx.ftz.f32 %f2, %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; +; +; CHECK-FP16-LABEL: exp2_ftz_test( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .f32 %f<3>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: // %entry +; CHECK-FP16-NEXT: ld.param.f32 %f1, [exp2_ftz_test_param_0]; +; CHECK-FP16-NEXT: ex2.approx.ftz.f32 %f2, %f1; +; CHECK-FP16-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-FP16-NEXT: ret; +; +; CHECK-BF16-LABEL: exp2_ftz_test( +; CHECK-BF16: { +; CHECK-BF16-NEXT: .reg .f32 %f<3>; +; CHECK-BF16-EMPTY: +; CHECK-BF16-NEXT: // %bb.0: // %entry +; CHECK-BF16-NEXT: ld.param.f32 %f1, [exp2_ftz_test_param_0]; +; CHECK-BF16-NEXT: ex2.approx.ftz.f32 
%f2, %f1; +; CHECK-BF16-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-BF16-NEXT: ret; +entry: + %exp2 = call float @llvm.exp2.f32(float %in) + ret float %exp2 +} + +; CHECK-LABEL: exp2_test_v +define <2 x float> @exp2_test_v(<2 x float> %in) { +; CHECK-LABEL: exp2_test_v( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [exp2_test_v_param_0]; +; CHECK-NEXT: ex2.approx.f32 %f3, %f2; +; CHECK-NEXT: ex2.approx.f32 %f4, %f1; +; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3}; +; CHECK-NEXT: ret; +; +; CHECK-FP16-LABEL: exp2_test_v( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .f32 %f<5>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: // %entry +; CHECK-FP16-NEXT: ld.param.v2.f32 {%f1, %f2}, [exp2_test_v_param_0]; +; CHECK-FP16-NEXT: ex2.approx.f32 %f3, %f2; +; CHECK-FP16-NEXT: ex2.approx.f32 %f4, %f1; +; CHECK-FP16-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3}; +; CHECK-FP16-NEXT: ret; +; +; CHECK-BF16-LABEL: exp2_test_v( +; CHECK-BF16: { +; CHECK-BF16-NEXT: .reg .f32 %f<5>; +; CHECK-BF16-EMPTY: +; CHECK-BF16-NEXT: // %bb.0: // %entry +; CHECK-BF16-NEXT: ld.param.v2.f32 {%f1, %f2}, [exp2_test_v_param_0]; +; CHECK-BF16-NEXT: ex2.approx.f32 %f3, %f2; +; CHECK-BF16-NEXT: ex2.approx.f32 %f4, %f1; +; CHECK-BF16-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3}; +; CHECK-BF16-NEXT: ret; +entry: + %exp2 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in) + ret <2 x float> %exp2 +} + +; --- f16 --- + +; CHECK-LABEL: exp2_f16_test +define half @exp2_f16_test(half %in) { +; CHECK-LABEL: exp2_f16_test( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b16 %rs1, [exp2_f16_test_param_0]; +; CHECK-NEXT: cvt.f32.f16 %f1, %rs1; +; CHECK-NEXT: ex2.approx.f32 %f2, %f1; +; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %f2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-NEXT: ret; +; +; 
CHECK-FP16-LABEL: exp2_f16_test( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .b16 %rs<3>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: // %entry +; CHECK-FP16-NEXT: ld.param.b16 %rs1, [exp2_f16_test_param_0]; +; CHECK-FP16-NEXT: ex2.approx.f16 %rs2, %rs1; +; CHECK-FP16-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-FP16-NEXT: ret; +; +; CHECK-BF16-LABEL: exp2_f16_test( +; CHECK-BF16: { +; CHECK-BF16-NEXT: .reg .b16 %rs<3>; +; CHECK-BF16-EMPTY: +; CHECK-BF16-NEXT: // %bb.0: // %entry +; CHECK-BF16-NEXT: ld.param.b16 %rs1, [exp2_f16_test_param_0]; +; CHECK-BF16-NEXT: ex2.approx.f16 %rs2, %rs1; +; CHECK-BF16-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-BF16-NEXT: ret; +entry: + %exp2 = call half @llvm.exp2.f16(half %in) + ret half %exp2 +} + +; COM: we should never have .ftz for f16 +; CHECK-LABEL: exp2_f16_ftz_test +define half @exp2_f16_ftz_test(half %in) #0 { +; CHECK-LABEL: exp2_f16_ftz_test( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b16 %rs1, [exp2_f16_ftz_test_param_0]; +; CHECK-NEXT: cvt.ftz.f32.f16 %f1, %rs1; +; CHECK-NEXT: ex2.approx.ftz.f32 %f2, %f1; +; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %f2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-NEXT: ret; +; +; CHECK-FP16-LABEL: exp2_f16_ftz_test( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .b16 %rs<3>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: // %entry +; CHECK-FP16-NEXT: ld.param.b16 %rs1, [exp2_f16_ftz_test_param_0]; +; CHECK-FP16-NEXT: ex2.approx.f16 %rs2, %rs1; +; CHECK-FP16-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-FP16-NEXT: ret; +; +; CHECK-BF16-LABEL: exp2_f16_ftz_test( +; CHECK-BF16: { +; CHECK-BF16-NEXT: .reg .b16 %rs<3>; +; CHECK-BF16-EMPTY: +; CHECK-BF16-NEXT: // %bb.0: // %entry +; CHECK-BF16-NEXT: ld.param.b16 %rs1, [exp2_f16_ftz_test_param_0]; +; CHECK-BF16-NEXT: ex2.approx.f16 %rs2, %rs1; +; CHECK-BF16-NEXT: st.param.b16 [func_retval0], %rs2; +; 
CHECK-BF16-NEXT: ret; +entry: + %exp2 = call half @llvm.exp2.f16(half %in) + ret half %exp2 +} + +; CHECK-LABEL: exp2_f16_test_v +define <2 x half> @exp2_f16_test_v(<2 x half> %in) { +; CHECK-LABEL: exp2_f16_test_v( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b32 %r1, [exp2_f16_test_v_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NEXT: ex2.approx.f32 %f2, %f1; +; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %f2; +; CHECK-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NEXT: ex2.approx.f32 %f4, %f3; +; CHECK-NEXT: cvt.rn.f16.f32 %rs4, %f4; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; +; +; CHECK-FP16-LABEL: exp2_f16_test_v( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .b32 %r<3>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: // %entry +; CHECK-FP16-NEXT: ld.param.b32 %r1, [exp2_f16_test_v_param_0]; +; CHECK-FP16-NEXT: ex2.approx.f16x2 %r2, %r1; +; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-FP16-NEXT: ret; +; +; CHECK-BF16-LABEL: exp2_f16_test_v( +; CHECK-BF16: { +; CHECK-BF16-NEXT: .reg .b32 %r<3>; +; CHECK-BF16-EMPTY: +; CHECK-BF16-NEXT: // %bb.0: // %entry +; CHECK-BF16-NEXT: ld.param.b32 %r1, [exp2_f16_test_v_param_0]; +; CHECK-BF16-NEXT: ex2.approx.f16x2 %r2, %r1; +; CHECK-BF16-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-BF16-NEXT: ret; +entry: + %exp2 = call <2 x half> @llvm.exp2.v2f16(<2 x half> %in) + ret <2 x half> %exp2 +} + +; --- bf16 --- + +; COM: we should always have .ftz for bf16 +; CHECK-LABEL: exp2_bf16_test +define bfloat @exp2_bf16_test(bfloat %in) { +; CHECK-LABEL: exp2_bf16_test( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: 
ld.param.u16 %r1, [exp2_bf16_test_param_0]; +; CHECK-NEXT: shl.b32 %r2, %r1, 16; +; CHECK-NEXT: mov.b32 %f1, %r2; +; CHECK-NEXT: ex2.approx.f32 %f2, %f1; +; CHECK-NEXT: mov.b32 %r3, %f2; +; CHECK-NEXT: bfe.u32 %r4, %r3, 16, 1; +; CHECK-NEXT: add.s32 %r5, %r4, %r3; +; CHECK-NEXT: add.s32 %r6, %r5, 32767; +; CHECK-NEXT: setp.nan.f32 %p1, %f2, %f2; +; CHECK-NEXT: or.b32 %r7, %r3, 4194304; +; CHECK-NEXT: selp.b32 %r8, %r7, %r6, %p1; +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; } +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; +; +; CHECK-FP16-LABEL: exp2_bf16_test( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .pred %p<2>; +; CHECK-FP16-NEXT: .reg .b16 %rs<2>; +; CHECK-FP16-NEXT: .reg .b32 %r<9>; +; CHECK-FP16-NEXT: .reg .f32 %f<3>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: // %entry +; CHECK-FP16-NEXT: ld.param.u16 %r1, [exp2_bf16_test_param_0]; +; CHECK-FP16-NEXT: shl.b32 %r2, %r1, 16; +; CHECK-FP16-NEXT: mov.b32 %f1, %r2; +; CHECK-FP16-NEXT: ex2.approx.f32 %f2, %f1; +; CHECK-FP16-NEXT: mov.b32 %r3, %f2; +; CHECK-FP16-NEXT: bfe.u32 %r4, %r3, 16, 1; +; CHECK-FP16-NEXT: add.s32 %r5, %r4, %r3; +; CHECK-FP16-NEXT: add.s32 %r6, %r5, 32767; +; CHECK-FP16-NEXT: setp.nan.f32 %p1, %f2, %f2; +; CHECK-FP16-NEXT: or.b32 %r7, %r3, 4194304; +; CHECK-FP16-NEXT: selp.b32 %r8, %r7, %r6, %p1; +; CHECK-FP16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; } +; CHECK-FP16-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-FP16-NEXT: ret; +; +; CHECK-BF16-LABEL: exp2_bf16_test( +; CHECK-BF16: { +; CHECK-BF16-NEXT: .reg .b16 %rs<3>; +; CHECK-BF16-EMPTY: +; CHECK-BF16-NEXT: // %bb.0: // %entry +; CHECK-BF16-NEXT: ld.param.b16 %rs1, [exp2_bf16_test_param_0]; +; CHECK-BF16-NEXT: ex2.approx.ftz.bf16 %rs2, %rs1; +; CHECK-BF16-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-BF16-NEXT: ret; +entry: + %exp2 = call bfloat @llvm.exp2.bf16(bfloat %in) + ret bfloat %exp2 +} + +; CHECK-LABEL: exp2_bf16_test_v +define <2 x bfloat> @exp2_bf16_test_v(<2 x bfloat> 
%in) { +; CHECK-LABEL: exp2_bf16_test_v( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b32 %r1, [exp2_bf16_test_v_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs2; +; CHECK-NEXT: shl.b32 %r3, %r2, 16; +; CHECK-NEXT: mov.b32 %f1, %r3; +; CHECK-NEXT: ex2.approx.f32 %f2, %f1; +; CHECK-NEXT: mov.b32 %r4, %f2; +; CHECK-NEXT: bfe.u32 %r5, %r4, 16, 1; +; CHECK-NEXT: add.s32 %r6, %r5, %r4; +; CHECK-NEXT: add.s32 %r7, %r6, 32767; +; CHECK-NEXT: setp.nan.f32 %p1, %f2, %f2; +; CHECK-NEXT: or.b32 %r8, %r4, 4194304; +; CHECK-NEXT: selp.b32 %r9, %r8, %r7, %p1; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs1; +; CHECK-NEXT: shl.b32 %r11, %r10, 16; +; CHECK-NEXT: mov.b32 %f3, %r11; +; CHECK-NEXT: ex2.approx.f32 %f4, %f3; +; CHECK-NEXT: mov.b32 %r12, %f4; +; CHECK-NEXT: bfe.u32 %r13, %r12, 16, 1; +; CHECK-NEXT: add.s32 %r14, %r13, %r12; +; CHECK-NEXT: add.s32 %r15, %r14, 32767; +; CHECK-NEXT: setp.nan.f32 %p2, %f4, %f4; +; CHECK-NEXT: or.b32 %r16, %r12, 4194304; +; CHECK-NEXT: selp.b32 %r17, %r16, %r15, %p2; +; CHECK-NEXT: prmt.b32 %r18, %r17, %r9, 0x7632U; +; CHECK-NEXT: st.param.b32 [func_retval0], %r18; +; CHECK-NEXT: ret; +; +; CHECK-FP16-LABEL: exp2_bf16_test_v( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .pred %p<3>; +; CHECK-FP16-NEXT: .reg .b16 %rs<3>; +; CHECK-FP16-NEXT: .reg .b32 %r<19>; +; CHECK-FP16-NEXT: .reg .f32 %f<5>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: // %entry +; CHECK-FP16-NEXT: ld.param.b32 %r1, [exp2_bf16_test_v_param_0]; +; CHECK-FP16-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-FP16-NEXT: cvt.u32.u16 %r2, %rs2; +; CHECK-FP16-NEXT: shl.b32 %r3, %r2, 16; +; CHECK-FP16-NEXT: mov.b32 %f1, %r3; +; CHECK-FP16-NEXT: ex2.approx.f32 %f2, %f1; +; CHECK-FP16-NEXT: mov.b32 %r4, %f2; +; CHECK-FP16-NEXT: bfe.u32 %r5, %r4, 16, 1; +; CHECK-FP16-NEXT: add.s32 %r6, 
%r5, %r4; +; CHECK-FP16-NEXT: add.s32 %r7, %r6, 32767; +; CHECK-FP16-NEXT: setp.nan.f32 %p1, %f2, %f2; +; CHECK-FP16-NEXT: or.b32 %r8, %r4, 4194304; +; CHECK-FP16-NEXT: selp.b32 %r9, %r8, %r7, %p1; +; CHECK-FP16-NEXT: cvt.u32.u16 %r10, %rs1; +; CHECK-FP16-NEXT: shl.b32 %r11, %r10, 16; +; CHECK-FP16-NEXT: mov.b32 %f3, %r11; +; CHECK-FP16-NEXT: ex2.approx.f32 %f4, %f3; +; CHECK-FP16-NEXT: mov.b32 %r12, %f4; +; CHECK-FP16-NEXT: bfe.u32 %r13, %r12, 16, 1; +; CHECK-FP16-NEXT: add.s32 %r14, %r13, %r12; +; CHECK-FP16-NEXT: add.s32 %r15, %r14, 32767; +; CHECK-FP16-NEXT: setp.nan.f32 %p2, %f4, %f4; +; CHECK-FP16-NEXT: or.b32 %r16, %r12, 4194304; +; CHECK-FP16-NEXT: selp.b32 %r17, %r16, %r15, %p2; +; CHECK-FP16-NEXT: prmt.b32 %r18, %r17, %r9, 0x7632U; +; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r18; +; CHECK-FP16-NEXT: ret; +; +; CHECK-BF16-LABEL: exp2_bf16_test_v( +; CHECK-BF16: { +; CHECK-BF16-NEXT: .reg .b32 %r<3>; +; CHECK-BF16-EMPTY: +; CHECK-BF16-NEXT: // %bb.0: // %entry +; CHECK-BF16-NEXT: ld.param.b32 %r1, [exp2_bf16_test_v_param_0]; +; CHECK-BF16-NEXT: ex2.approx.ftz.bf16x2 %r2, %r1; +; CHECK-BF16-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-BF16-NEXT: ret; +entry: + %exp2 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> %in) + ret <2 x bfloat> %exp2 +} + +declare float @llvm.exp2.f32(float %val) + +declare <2 x float> @llvm.exp2.v2f32(<2 x float> %val) + +declare half @llvm.exp2.f16(half %val) + +declare <2 x half> @llvm.exp2.v2f16(<2 x half> %val) + +declare bfloat @llvm.exp2.bf16(bfloat %val) + +declare <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> %val) + +attributes #0 = {"denormal-fp-math"="preserve-sign"} diff --git a/llvm/test/CodeGen/NVPTX/flog2.ll b/llvm/test/CodeGen/NVPTX/flog2.ll new file mode 100644 index 0000000000000..ff762dcf74b2f --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/flog2.ll @@ -0,0 +1,234 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_50 
-mattr=+ptx32 -nvptx-approx-log2f32 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_50 -mattr=+ptx32 -nvptx-approx-log2f32 | %ptxas-verify -arch=sm_50 %} +target triple = "nvptx64-nvidia-cuda" + +; CHECK-LABEL: log2_test +define float @log2_test(float %in) { +; CHECK-LABEL: log2_test( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.f32 %f1, [log2_test_param_0]; +; CHECK-NEXT: lg2.approx.f32 %f2, %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; +entry: + %log2 = call float @llvm.log2.f32(float %in) + ret float %log2 +} + +; CHECK-LABEL: log2_ftz_test +define float @log2_ftz_test(float %in) #0 { +; CHECK-LABEL: log2_ftz_test( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.f32 %f1, [log2_ftz_test_param_0]; +; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; +entry: + %log2 = call float @llvm.log2.f32(float %in) + ret float %log2 +} + +; CHECK-LABEL: log2_test_v +define <2 x float> @log2_test_v(<2 x float> %in) { +; CHECK-LABEL: log2_test_v( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [log2_test_v_param_0]; +; CHECK-NEXT: lg2.approx.f32 %f3, %f2; +; CHECK-NEXT: lg2.approx.f32 %f4, %f1; +; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3}; +; CHECK-NEXT: ret; +entry: + %log2 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in) + ret <2 x float> %log2 +} + +; --- f16 --- + +; CHECK-LABEL: log2_f16_test +define half @log2_f16_test(half %in) { +; CHECK-LABEL: log2_f16_test( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b16 %rs1, [log2_f16_test_param_0]; +; CHECK-NEXT: cvt.f32.f16 %f1, %rs1; +; CHECK-NEXT: 
lg2.approx.f32 %f2, %f1; +; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %f2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-NEXT: ret; +entry: + %log2 = call half @llvm.log2.f16(half %in) + ret half %log2 +} + +; CHECK-LABEL: log2_f16_ftz_test +define half @log2_f16_ftz_test(half %in) #0 { +; CHECK-LABEL: log2_f16_ftz_test( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b16 %rs1, [log2_f16_ftz_test_param_0]; +; CHECK-NEXT: cvt.ftz.f32.f16 %f1, %rs1; +; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1; +; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %f2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-NEXT: ret; +entry: + %log2 = call half @llvm.log2.f16(half %in) + ret half %log2 +} + +; CHECK-LABEL: log2_f16_test_v +define <2 x half> @log2_f16_test_v(<2 x half> %in) { +; CHECK-LABEL: log2_f16_test_v( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b32 %r1, [log2_f16_test_v_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: cvt.f32.f16 %f1, %rs2; +; CHECK-NEXT: lg2.approx.f32 %f2, %f1; +; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %f2; +; CHECK-NEXT: cvt.f32.f16 %f3, %rs1; +; CHECK-NEXT: lg2.approx.f32 %f4, %f3; +; CHECK-NEXT: cvt.rn.f16.f32 %rs4, %f4; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; +entry: + %log2 = call <2 x half> @llvm.log2.v2f16(<2 x half> %in) + ret <2 x half> %log2 +} + +; --- bf16 --- + +; CHECK-LABEL: log2_bf16_test +define bfloat @log2_bf16_test(bfloat %in) { +; CHECK-LABEL: log2_bf16_test( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.u16 %r1, [log2_bf16_test_param_0]; +; 
CHECK-NEXT: shl.b32 %r2, %r1, 16; +; CHECK-NEXT: mov.b32 %f1, %r2; +; CHECK-NEXT: lg2.approx.f32 %f2, %f1; +; CHECK-NEXT: mov.b32 %r3, %f2; +; CHECK-NEXT: bfe.u32 %r4, %r3, 16, 1; +; CHECK-NEXT: add.s32 %r5, %r4, %r3; +; CHECK-NEXT: add.s32 %r6, %r5, 32767; +; CHECK-NEXT: setp.nan.f32 %p1, %f2, %f2; +; CHECK-NEXT: or.b32 %r7, %r3, 4194304; +; CHECK-NEXT: selp.b32 %r8, %r7, %r6, %p1; +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; } +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; +entry: + %log2 = call bfloat @llvm.log2.bf16(bfloat %in) + ret bfloat %log2 +} + +; CHECK-LABEL: log2_bf16_ftz_test +define bfloat @log2_bf16_ftz_test(bfloat %in) #0 { +; CHECK-LABEL: log2_bf16_ftz_test( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.u16 %r1, [log2_bf16_ftz_test_param_0]; +; CHECK-NEXT: shl.b32 %r2, %r1, 16; +; CHECK-NEXT: mov.b32 %f1, %r2; +; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1; +; CHECK-NEXT: mov.b32 %r3, %f2; +; CHECK-NEXT: bfe.u32 %r4, %r3, 16, 1; +; CHECK-NEXT: add.s32 %r5, %r4, %r3; +; CHECK-NEXT: add.s32 %r6, %r5, 32767; +; CHECK-NEXT: setp.nan.ftz.f32 %p1, %f2, %f2; +; CHECK-NEXT: or.b32 %r7, %r3, 4194304; +; CHECK-NEXT: selp.b32 %r8, %r7, %r6, %p1; +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; } +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; +entry: + %log2 = call bfloat @llvm.log2.bf16(bfloat %in) + ret bfloat %log2 +} + +; CHECK-LABEL: log2_bf16_test_v +define <2 x bfloat> @log2_bf16_test_v(<2 x bfloat> %in) { +; CHECK-LABEL: log2_bf16_test_v( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .f32 %f<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b32 %r1, [log2_bf16_test_v_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, 
%rs2}, %r1; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs2; +; CHECK-NEXT: shl.b32 %r3, %r2, 16; +; CHECK-NEXT: mov.b32 %f1, %r3; +; CHECK-NEXT: lg2.approx.f32 %f2, %f1; +; CHECK-NEXT: mov.b32 %r4, %f2; +; CHECK-NEXT: bfe.u32 %r5, %r4, 16, 1; +; CHECK-NEXT: add.s32 %r6, %r5, %r4; +; CHECK-NEXT: add.s32 %r7, %r6, 32767; +; CHECK-NEXT: setp.nan.f32 %p1, %f2, %f2; +; CHECK-NEXT: or.b32 %r8, %r4, 4194304; +; CHECK-NEXT: selp.b32 %r9, %r8, %r7, %p1; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs1; +; CHECK-NEXT: shl.b32 %r11, %r10, 16; +; CHECK-NEXT: mov.b32 %f3, %r11; +; CHECK-NEXT: lg2.approx.f32 %f4, %f3; +; CHECK-NEXT: mov.b32 %r12, %f4; +; CHECK-NEXT: bfe.u32 %r13, %r12, 16, 1; +; CHECK-NEXT: add.s32 %r14, %r13, %r12; +; CHECK-NEXT: add.s32 %r15, %r14, 32767; +; CHECK-NEXT: setp.nan.f32 %p2, %f4, %f4; +; CHECK-NEXT: or.b32 %r16, %r12, 4194304; +; CHECK-NEXT: selp.b32 %r17, %r16, %r15, %p2; +; CHECK-NEXT: prmt.b32 %r18, %r17, %r9, 0x7632U; +; CHECK-NEXT: st.param.b32 [func_retval0], %r18; +; CHECK-NEXT: ret; +entry: + %log2 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %in) + ret <2 x bfloat> %log2 +} + +declare float @llvm.log2.f32(float %val) + +declare <2 x float> @llvm.log2.v2f32(<2 x float> %val) + +declare half @llvm.log2.f16(half %val) + +declare <2 x half> @llvm.log2.v2f16(<2 x half> %val) + +declare bfloat @llvm.log2.bf16(bfloat %val) + +declare <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %val) + +attributes #0 = {"denormal-fp-math"="preserve-sign"}