33 changes: 22 additions & 11 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18862,27 +18862,38 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) {
 
 static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
                                const TargetLowering &TLI) {
-  // We only do this if the target has legal ftrunc. Otherwise, we'd likely be
-  // replacing casts with a libcall. We also must be allowed to ignore -0.0
-  // because FTRUNC will return -0.0 for (-1.0, -0.0), but using integer
-  // conversions would return +0.0.
+  // We can fold the fpto[us]i -> [us]itofp pattern into a single ftrunc.
+  // If NoSignedZerosFPMath is enabled, this is a direct replacement.
+  // Otherwise, for strict math, we must handle edge cases:
+  // 1. For unsigned conversions, use FABS to handle negative cases. Take -0.0
+  //    as an example: it first becomes the integer 0 and is converted back to
+  //    +0.0, whereas FTRUNC on its own could produce -0.0.
+
   // FIXME: We should be able to use node-level FMF here.
-  // TODO: If strict math, should we use FABS (+ range check for signed cast)?
   EVT VT = N->getValueType(0);
-  if (!TLI.isOperationLegal(ISD::FTRUNC, VT) ||
-      !DAG.getTarget().Options.NoSignedZerosFPMath)
+  if (!TLI.isOperationLegal(ISD::FTRUNC, VT))
     return SDValue();
 
   // fptosi/fptoui round towards zero, so converting from FP to integer and
   // back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X
   SDValue N0 = N->getOperand(0);
   if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT &&
-      N0.getOperand(0).getValueType() == VT)
-    return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+      N0.getOperand(0).getValueType() == VT) {
+    if (DAG.getTarget().Options.NoSignedZerosFPMath)
+      return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+  }
 
   if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT &&
-      N0.getOperand(0).getValueType() == VT)
-    return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+      N0.getOperand(0).getValueType() == VT) {
+    if (DAG.getTarget().Options.NoSignedZerosFPMath)
+      return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+
+    // Strict math: use FABS to handle negative inputs correctly.
+    if (TLI.isFAbsFree(VT)) {
+      SDValue Abs = DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0));
+      return DAG.getNode(ISD::FTRUNC, DL, VT, Abs);
+    }
+  }
 
   return SDValue();
 }
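For reference, here is a minimal LLVM IR sketch of the unsigned round-trip this combine targets (illustrative only; the function name is not from the patch or its tests). Under NoSignedZerosFPMath the pair folds directly to ftrunc; otherwise, when FABS is free, ftrunc(fabs(x)) is still exact: the only negative inputs for which fptoui is well defined lie in (-1.0, 0.0], and for those both the round-trip and fabs+ftrunc produce +0.0, whereas a bare ftrunc would produce -0.0.

; Illustrative sketch, not part of the patch or its tests.
define float @uitofp_fptoui_roundtrip(float %x) {
  %i = fptoui float %x to i32   ; rounds toward zero; (-1.0, 0.0] gives 0
  %f = uitofp i32 %i to float   ; 0 converts back to +0.0, never -0.0
  ret float %f
}

The GFX9 checks in the new AMDGPU test below show this lowering in practice: each round-trip becomes a single v_trunc with the |...| source modifier supplying the fabs.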
12 changes: 12 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
@@ -70,6 +70,12 @@ define half @t3(half %x) {
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT: ret
;
; USE-NEON-NO-GPRS-LABEL: t3:
; USE-NEON-NO-GPRS: // %bb.0: // %entry
; USE-NEON-NO-GPRS-NEXT: fcvtzs h0, h0
; USE-NEON-NO-GPRS-NEXT: scvtf h0, h0
; USE-NEON-NO-GPRS-NEXT: ret
;
; NONEON-NOSVE-LABEL: t3:
; NONEON-NOSVE: // %bb.0: // %entry
; NONEON-NOSVE-NEXT: fcvt s0, h0
@@ -147,6 +153,12 @@ define half @t6(half %x) {
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT: ret
;
; USE-NEON-NO-GPRS-LABEL: t6:
; USE-NEON-NO-GPRS: // %bb.0: // %entry
; USE-NEON-NO-GPRS-NEXT: fcvtzu h0, h0
; USE-NEON-NO-GPRS-NEXT: ucvtf h0, h0
; USE-NEON-NO-GPRS-NEXT: ret
;
; NONEON-NOSVE-LABEL: t6:
; NONEON-NOSVE: // %bb.0: // %entry
; NONEON-NOSVE-NEXT: fcvt s0, h0
296 changes: 296 additions & 0 deletions llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll
@@ -0,0 +1,296 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s

define amdgpu_kernel void @fptoui_f32_to_i16_to_f32(ptr addrspace(1) %out, float %x) {
; GFX6-LABEL: fptoui_f32_to_i16_to_f32:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_trunc_f32_e64 v0, |s6|
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: fptoui_f32_to_i16_to_f32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_trunc_f32_e64 v1, |s2|
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
entry:
%ui = fptoui float %x to i16
%fp = uitofp i16 %ui to float
store float %fp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fptoui_f32_to_i32_to_f32(ptr addrspace(1) %out, float %x) {
; GFX6-LABEL: fptoui_f32_to_i32_to_f32:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_trunc_f32_e64 v0, |s6|
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: fptoui_f32_to_i32_to_f32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_trunc_f32_e64 v1, |s2|
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
entry:
%ui = fptoui float %x to i32
%fp = uitofp i32 %ui to float
store float %fp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fptoui_f32_to_i64_to_f32(ptr addrspace(1) %out, float %x) {
; GFX6-LABEL: fptoui_f32_to_i64_to_f32:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_trunc_f32_e64 v0, |s6|
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: fptoui_f32_to_i64_to_f32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_trunc_f32_e64 v1, |s2|
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
entry:
%ui = fptoui float %x to i64
%fp = uitofp i64 %ui to float
store float %fp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fptoui_f16_to_i16_to_f16(ptr addrspace(1) %out, half %x) {
; GFX6-LABEL: fptoui_f16_to_i16_to_f16:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: fptoui_f16_to_i16_to_f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_trunc_f16_e64 v1, |s2|
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
entry:
%ui = fptoui half %x to i16
%fp = uitofp i16 %ui to half
store half %fp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fptoui_f16_to_i32_to_f16(ptr addrspace(1) %out, half %x) {
; GFX6-LABEL: fptoui_f16_to_i32_to_f16:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e64 v0, |s0|
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: v_trunc_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: fptoui_f16_to_i32_to_f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_trunc_f16_e64 v1, |s2|
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
entry:
%ui = fptoui half %x to i32
%fp = uitofp i32 %ui to half
store half %fp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fptoui_f16_to_i64_to_f16(ptr addrspace(1) %out, half %x) {
; GFX6-LABEL: fptoui_f16_to_i64_to_f16:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e64 v0, |s0|
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: v_trunc_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: fptoui_f16_to_i64_to_f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_trunc_f16_e64 v1, |s2|
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
entry:
%ui = fptoui half %x to i64
%fp = uitofp i64 %ui to half
store half %fp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fptoui_f64_to_i16_to_f64(ptr addrspace(1) %out, double %x) {
; GFX6-LABEL: fptoui_f64_to_i16_to_f64:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cvt_u32_f64_e32 v0, s[2:3]
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: fptoui_f64_to_i16_to_f64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]|
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
entry:
%ui = fptoui double %x to i16
%fp = uitofp i16 %ui to double
store double %fp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fptoui_f64_to_i32_to_f64(ptr addrspace(1) %out, double %x) {
; GFX6-LABEL: fptoui_f64_to_i32_to_f64:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cvt_u32_f64_e32 v0, s[2:3]
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: fptoui_f64_to_i32_to_f64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]|
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
entry:
%ui = fptoui double %x to i32
%fp = uitofp i32 %ui to double
store double %fp, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @fptoui_f64_to_i64_to_f64(ptr addrspace(1) %out, double %x) {
; GFX6-LABEL: fptoui_f64_to_i64_to_f64:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_mov_b32 s5, 0xfffff
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: v_not_b32_e32 v0, 31
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_bfe_u32 s7, s3, 0xb0014
; GFX6-NEXT: s_addk_i32 s7, 0xfc01
; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s7
; GFX6-NEXT: s_and_b32 s8, s3, 0x80000000
; GFX6-NEXT: s_andn2_b64 s[4:5], s[2:3], s[4:5]
; GFX6-NEXT: s_cmp_lt_i32 s7, 0
; GFX6-NEXT: s_cselect_b32 s4, 0, s4
; GFX6-NEXT: s_cselect_b32 s5, s8, s5
; GFX6-NEXT: s_cmp_gt_i32 s7, 51
; GFX6-NEXT: s_cselect_b32 s3, s3, s5
; GFX6-NEXT: s_cselect_b32 s2, s2, s4
; GFX6-NEXT: v_ldexp_f64 v[0:1], s[2:3], v0
; GFX6-NEXT: v_mov_b32_e32 v4, -1
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
; GFX6-NEXT: v_mov_b32_e32 v5, 0x3fefffff
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX6-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 3
; GFX6-NEXT: s_mov_b32 s4, 0
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: s_mov_b32 s5, 0xc1f00000
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: v_fma_f64 v[2:3], v[0:1], s[4:5], v[2:3]
; GFX6-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
; GFX6-NEXT: v_cvt_u32_f64_e32 v2, v[2:3]
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GFX6-NEXT: v_cvt_f64_u32_e32 v[2:3], v2
; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32
; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: fptoui_f64_to_i64_to_f64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]|
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
entry:
%ui = fptoui double %x to i64
%fp = uitofp i64 %ui to double
store double %fp, ptr addrspace(1) %out
ret void
}