Skip to content

Commit

Permalink
AMDGPU: Fix not using v_cvt_f16_[iu]16
Browse files Browse the repository at this point in the history
We weren't treating i16->f16 casts as legal on targets with these
instructions, and always using a pair of casts through i32.
  • Loading branch information
arsenm committed Jan 7, 2020
1 parent 640d0ba commit 68e70fb
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 31 deletions.
39 changes: 31 additions & 8 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Expand Up @@ -2503,15 +2503,25 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,

SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOperand(0).getValueType() == MVT::i64 &&
"operation should be legal");

// TODO: Factor out code common with LowerSINT_TO_FP.

EVT DestVT = Op.getValueType();
SDValue Src = Op.getOperand(0);
EVT SrcVT = Src.getValueType();

if (SrcVT == MVT::i16) {
if (DestVT == MVT::f16)
return Op;
SDLoc DL(Op);

// Promote src to i32
SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
}

assert(SrcVT == MVT::i64 && "operation should be legal");

if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
SDLoc DL(Op);
SDValue Src = Op.getOperand(0);

SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
Expand All @@ -2530,12 +2540,25 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,

SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOperand(0).getValueType() == MVT::i64 &&
"operation should be legal");
EVT DestVT = Op.getValueType();

SDValue Src = Op.getOperand(0);
EVT SrcVT = Src.getValueType();

if (SrcVT == MVT::i16) {
if (DestVT == MVT::f16)
return Op;

SDLoc DL(Op);
// Promote src to i32
SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
}

assert(SrcVT == MVT::i64 && "operation should be legal");

// TODO: Factor out code common with LowerUINT_TO_FP.

EVT DestVT = Op.getValueType();
if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
SDLoc DL(Op);
SDValue Src = Op.getOperand(0);
Expand Down
6 changes: 4 additions & 2 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Expand Up @@ -492,8 +492,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,

setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

// F16 - Constant Actions.
setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
Expand All @@ -508,6 +506,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::FCOS, MVT::f16, Promote);
setOperationAction(ISD::FSIN, MVT::f16, Promote);

setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i16, Custom);

setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
Expand Down
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
Expand Up @@ -365,15 +365,15 @@ entry:
}

; GCN-LABEL: {{^}}sitofp_v2i16_to_v2f16:
; NOSDWA-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; NOSDWA-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA-DAG: v_cvt_f32_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-DAG: v_cvt_f32_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_cvt_f32_i32_sdwa
; NOSDWA-DAG: v_cvt_f16_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; NOSDWA-DAG: v_cvt_f16_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_cvt_f16_i16_sdwa

; SDWA-DAG: v_cvt_f32_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; SDWA-DAG: v_cvt_f32_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; SDWA-DAG: v_cvt_f16_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}
; SDWA-DAG: v_cvt_f16_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}} dst_sel:{{(WORD_1|DWORD)?}} dst_unused:UNUSED_PAD src0_sel:WORD_1

; FIXME: Should be able to avoid or
define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
<2 x half> addrspace(1)* %r,
<2 x i16> addrspace(1)* %a) {
Expand Down
14 changes: 8 additions & 6 deletions llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
Expand Up @@ -3,8 +3,12 @@

; GCN-LABEL: {{^}}sitofp_i16_to_f16
; GCN: buffer_load_{{sshort|ushort}} v[[A_I16:[0-9]+]]
; GCN: v_cvt_f32_i32_e32 v[[A_F32:[0-9]+]], v[[A_I16]]
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]

; SI: v_cvt_f32_i32_e32 v[[A_F32:[0-9]+]], v[[A_I16]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]

; VI: v_cvt_f16_i16_e32 v[[R_F16:[0-9]+]], v[[A_I16]]

; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @sitofp_i16_to_f16(
Expand Down Expand Up @@ -45,10 +49,8 @@ entry:
; SI-DAG: v_lshlrev_b32_e32
; SI: v_or_b32_e32

; VI-DAG: v_cvt_f32_i32_sdwa
; VI-DAG: v_cvt_f32_i32_sdwa
; VI-DAG: v_cvt_f16_f32_e32
; VI-DAG: v_cvt_f16_f32_sdwa
; VI-DAG: v_cvt_f16_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-DAG: v_cvt_f16_i16_e32
; VI: v_or_b32_e32

; GCN: buffer_store_dword
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
Expand Up @@ -104,7 +104,7 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out,
; SI: v_cvt_f64_u32_e32 v{{\[[0-9]+:[0-9]+\]}}, [[ZEXT]]

; VI: s_and_b32 [[ZEXT:s[0-9]+]], [[VAL]], 0xff{{$}}
; VI: v_cvt_f64_i32_e32 v{{\[[0-9]+:[0-9]+\]}}, [[ZEXT]]
; VI: v_cvt_f64_u32_e32 v{{\[[0-9]+:[0-9]+\]}}, [[ZEXT]]
define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(double addrspace(1)* %out, i8 %in) {
%fp = uitofp i8 %in to double
store double %fp, double addrspace(1)* %out
Expand All @@ -118,7 +118,7 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(double addrspace(1)* %out, i8

; VI: v_mov_b32_e32 v{{[0-9]+}}
; VI: v_and_b32_sdwa
; VI: v_cvt_f64_i32_e32 v{{\[[0-9]+:[0-9]+\]}},
; VI: v_cvt_f64_u32_e32 v{{\[[0-9]+:[0-9]+\]}},
define double @v_uint_to_fp_i8_to_f64(i8 %in) {
%fp = uitofp i8 %in to double
ret double %fp
Expand Down
13 changes: 7 additions & 6 deletions llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
Expand Up @@ -4,8 +4,10 @@
; GCN-LABEL: {{^}}uitofp_i16_to_f16
; GCN: buffer_load_ushort v[[A_I16:[0-9]+]]
; SI: v_cvt_f32_u32_e32 v[[A_F32:[0-9]+]], v[[A_I16]]
; VI: v_cvt_f32_i32_e32 v[[A_F32:[0-9]+]], v[[A_I16]]
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]

; VI: v_cvt_f16_u16_e32 v[[R_F16:[0-9]+]], v[[A_I16]]

; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @uitofp_i16_to_f16(
Expand Down Expand Up @@ -46,10 +48,9 @@ entry:
; SI-DAG: v_lshlrev_b32_e32
; SI: v_or_b32_e32

; VI-DAG: v_cvt_f16_f32_e32
; VI-DAG: v_cvt_f32_i32_sdwa
; VI-DAG: v_cvt_f32_i32_sdwa
; VI-DAG: v_cvt_f16_f32_sdwa

; VI-DAG: v_cvt_f16_u16_e32
; VI-DAG: v_cvt_f16_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI: v_or_b32_e32

; GCN: buffer_store_dword
Expand Down

0 comments on commit 68e70fb

Please sign in to comment.