diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index a6ba6e518899f..3f6ac9d1e6404 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18862,27 +18862,38 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) { static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const TargetLowering &TLI) { - // We only do this if the target has legal ftrunc. Otherwise, we'd likely be - // replacing casts with a libcall. We also must be allowed to ignore -0.0 - // because FTRUNC will return -0.0 for (-1.0, -0.0), but using integer - // conversions would return +0.0. + // We can fold the fpto[us]i -> [us]itofp pattern into a single ftrunc. + // If NoSignedZerosFPMath is enabled, this is a direct replacement. + // Otherwise, for strict math, we must handle edge cases: + // 1. For unsigned conversions, use FABS to handle negative cases. Take -0.0 + // as an example: it first becomes integer 0, and is converted back to +0.0. + // FTRUNC on its own could produce -0.0. + // FIXME: We should be able to use node-level FMF here. - // TODO: If strict math, should we use FABS (+ range check for signed cast)? 
EVT VT = N->getValueType(0); - if (!TLI.isOperationLegal(ISD::FTRUNC, VT) || - !DAG.getTarget().Options.NoSignedZerosFPMath) + if (!TLI.isOperationLegal(ISD::FTRUNC, VT)) return SDValue(); // fptosi/fptoui round towards zero, so converting from FP to integer and // back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X SDValue N0 = N->getOperand(0); if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT && - N0.getOperand(0).getValueType() == VT) - return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0)); + N0.getOperand(0).getValueType() == VT) { + if (DAG.getTarget().Options.NoSignedZerosFPMath) + return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0)); + } if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT && - N0.getOperand(0).getValueType() == VT) - return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0)); + N0.getOperand(0).getValueType() == VT) { + if (DAG.getTarget().Options.NoSignedZerosFPMath) + return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0)); + + // Strict math: use FABS to handle negative inputs correctly. 
+ if (TLI.isFAbsFree(VT)) { + SDValue Abs = DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0)); + return DAG.getNode(ISD::FTRUNC, DL, VT, Abs); + } + } return SDValue(); } diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll index f4ae66a3b2259..4ad5b38b256fe 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll @@ -70,6 +70,12 @@ define half @t3(half %x) { ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret ; +; USE-NEON-NO-GPRS-LABEL: t3: +; USE-NEON-NO-GPRS: // %bb.0: // %entry +; USE-NEON-NO-GPRS-NEXT: fcvtzs h0, h0 +; USE-NEON-NO-GPRS-NEXT: scvtf h0, h0 +; USE-NEON-NO-GPRS-NEXT: ret +; ; NONEON-NOSVE-LABEL: t3: ; NONEON-NOSVE: // %bb.0: // %entry ; NONEON-NOSVE-NEXT: fcvt s0, h0 @@ -147,6 +153,12 @@ define half @t6(half %x) { ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret ; +; USE-NEON-NO-GPRS-LABEL: t6: +; USE-NEON-NO-GPRS: // %bb.0: // %entry +; USE-NEON-NO-GPRS-NEXT: fcvtzu h0, h0 +; USE-NEON-NO-GPRS-NEXT: ucvtf h0, h0 +; USE-NEON-NO-GPRS-NEXT: ret +; ; NONEON-NOSVE-LABEL: t6: ; NONEON-NOSVE: // %bb.0: // %entry ; NONEON-NOSVE-NEXT: fcvt s0, h0 diff --git a/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll b/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll new file mode 100644 index 0000000000000..49204f84acb85 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll @@ -0,0 +1,296 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s + +define amdgpu_kernel void @fptoui_f32_to_i16_to_f32(ptr addrspace(1) %out, float %x) { +; GFX6-LABEL: fptoui_f32_to_i16_to_f32: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_trunc_f32_e64 v0, |s6| +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f32_to_i16_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e64 v1, |s2| +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui float %x to i16 + %fp = uitofp i16 %ui to float + store float %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f32_to_i32_to_f32(ptr addrspace(1) %out, float %x) { +; GFX6-LABEL: fptoui_f32_to_i32_to_f32: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_trunc_f32_e64 v0, |s6| +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f32_to_i32_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e64 v1, |s2| +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui float %x to i32 + %fp = uitofp i32 %ui to float + store float %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f32_to_i64_to_f32(ptr addrspace(1) %out, float %x) { +; GFX6-LABEL: fptoui_f32_to_i64_to_f32: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX6-NEXT: v_trunc_f32_e64 v0, |s6| +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f32_to_i64_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e64 v1, |s2| +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui float %x to i64 + %fp = uitofp i64 %ui to float + store float %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f16_to_i16_to_f16(ptr addrspace(1) %out, half %x) { +; GFX6-LABEL: fptoui_f16_to_i16_to_f16: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f16_to_i16_to_f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f16_e64 v1, |s2| +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui half %x to i16 + %fp = uitofp i16 %ui to half + store half %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f16_to_i32_to_f16(ptr addrspace(1) %out, half %x) { +; GFX6-LABEL: fptoui_f16_to_i32_to_f16: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: 
v_cvt_f32_f16_e64 v0, |s0| +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: v_trunc_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f16_to_i32_to_f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f16_e64 v1, |s2| +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui half %x to i32 + %fp = uitofp i32 %ui to half + store half %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f16_to_i64_to_f16(ptr addrspace(1) %out, half %x) { +; GFX6-LABEL: fptoui_f16_to_i64_to_f16: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e64 v0, |s0| +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: v_trunc_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f16_to_i64_to_f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f16_e64 v1, |s2| +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui half %x to i64 + %fp = uitofp i64 %ui to half + store half %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f64_to_i16_to_f64(ptr addrspace(1) %out, double %x) { +; GFX6-LABEL: fptoui_f64_to_i16_to_f64: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_u32_f64_e32 v0, s[2:3] +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f64_to_i16_to_f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]| +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui double %x to i16 + %fp = uitofp i16 %ui to double + store double %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f64_to_i32_to_f64(ptr addrspace(1) %out, double %x) { +; GFX6-LABEL: fptoui_f64_to_i32_to_f64: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_u32_f64_e32 v0, s[2:3] +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f64_to_i32_to_f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]| +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui double %x to i32 + %fp = uitofp i32 %ui to double + store double %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f64_to_i64_to_f64(ptr addrspace(1) %out, double %x) { +; GFX6-LABEL: fptoui_f64_to_i64_to_f64: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s5, 0xfffff +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_not_b32_e32 v0, 31 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX6-NEXT: s_bfe_u32 s7, s3, 0xb0014 +; GFX6-NEXT: s_addk_i32 s7, 0xfc01 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s7 +; GFX6-NEXT: s_and_b32 s8, s3, 0x80000000 +; GFX6-NEXT: s_andn2_b64 s[4:5], s[2:3], s[4:5] +; GFX6-NEXT: s_cmp_lt_i32 s7, 0 +; GFX6-NEXT: s_cselect_b32 s4, 0, s4 +; GFX6-NEXT: s_cselect_b32 s5, s8, s5 +; GFX6-NEXT: s_cmp_gt_i32 s7, 51 +; GFX6-NEXT: s_cselect_b32 s3, s3, s5 +; GFX6-NEXT: s_cselect_b32 s2, s2, s4 +; GFX6-NEXT: v_ldexp_f64 v[0:1], s[2:3], v0 +; GFX6-NEXT: v_mov_b32_e32 v4, -1 +; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v5, 0x3fefffff +; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX6-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 3 +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: s_mov_b32 s5, 0xc1f00000 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_fma_f64 v[2:3], v[0:1], s[4:5], v[2:3] +; GFX6-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] +; GFX6-NEXT: v_cvt_u32_f64_e32 v2, v[2:3] +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX6-NEXT: v_cvt_f64_u32_e32 v[2:3], v2 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 +; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f64_to_i64_to_f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]| +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui double %x to i64 + %fp = uitofp i64 %ui to double + store double %fp, ptr addrspace(1) %out + ret void +}