Skip to content

Commit

Permalink
AMDGPU: Combine down fcopysign f64 magnitude
Browse files Browse the repository at this point in the history
Copy the low 32 bits of the f64 magnitude through unchanged and
apply an f32 copysign only to the high half. This is effectively
what codegen already does, but performing it as a DAG combine
exposes further combine opportunities. The cases involving
constants show some small improvements.

https://reviews.llvm.org/D142682
  • Loading branch information
arsenm committed Mar 6, 2023
1 parent 9f493be commit 9f4746b
Show file tree
Hide file tree
Showing 4 changed files with 493 additions and 488 deletions.
27 changes: 24 additions & 3 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Expand Up @@ -9468,13 +9468,34 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,

SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SDValue MagnitudeOp = N->getOperand(0);
SDValue SignOp = N->getOperand(1);
if (SignOp.getValueType() != MVT::f64)
return SDValue();

SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);

// f64 fcopysign is really an f32 copysign on the high bits, so replace the
// lower half with a copy.
// fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
if (MagnitudeOp.getValueType() == MVT::f64) {
SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
SDValue MagLo =
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
DAG.getConstant(0, DL, MVT::i32));
SDValue MagHi =
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
DAG.getConstant(1, DL, MVT::i32));

SDValue HiOp =
DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);

SDValue Vector = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);

return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
}

if (SignOp.getValueType() != MVT::f64)
return SDValue();

// Reduce width of sign operand, we only need the highest bit.
//
// fcopysign f64:x, f64:y ->
Expand Down
20 changes: 11 additions & 9 deletions llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
Expand Up @@ -872,22 +872,24 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1)
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s14, 0
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[2:3], s[14:15]
; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[0:3], 0 addr64
; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[12:15], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[14:15]
; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; SI-NEXT: buffer_load_ushort v0, v[1:2], s[0:3], 0 addr64
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[12:15], 0 addr64
; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s8, s4
; SI-NEXT: s_mov_b32 s9, s5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_bfi_b32 v4, s0, v4, v0
; SI-NEXT: buffer_store_dwordx2 v[3:4], off, s[8:11], 0
; SI-NEXT: v_bfi_b32 v1, s0, v1, v2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
Expand Down

0 comments on commit 9f4746b

Please sign in to comment.