Skip to content

Commit

Permalink
DAG: Improve fminimum/fmaximum vector expansion logic (#93579)
Browse files Browse the repository at this point in the history
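First, expandFMINIMUM_FMAXIMUM should be a never-fail API. The client
wanted it expanded, and it can always be expanded. Previously it returned
an empty SDValue for vector types whose scalar operation was legal,
custom, or promoted; that bail-out existed only to serve what the
VectorLegalizer wanted.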
    
Prefer using the min/max opcodes, and unroll the vector operation only
when none of them is available and vselect is neither legal nor custom
for the vector type.
This seems to produce better code in all the changed tests.
  • Loading branch information
arsenm committed Jun 6, 2024
1 parent ab33fa5 commit 212b78a
Show file tree
Hide file tree
Showing 11 changed files with 4,758 additions and 16,271 deletions.
7 changes: 2 additions & 5 deletions llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -992,11 +992,8 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
break;
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
if (SDValue Expanded = TLI.expandFMINIMUM_FMAXIMUM(Node, DAG)) {
Results.push_back(Expanded);
return;
}
break;
Results.push_back(TLI.expandFMINIMUM_FMAXIMUM(Node, DAG));
return;
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
Expand Down
7 changes: 3 additions & 4 deletions llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8427,10 +8427,6 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
bool IsMax = Opc == ISD::FMAXIMUM;
SDNodeFlags Flags = N->getFlags();

if (VT.isVector() &&
isOperationLegalOrCustomOrPromote(Opc, VT.getScalarType()))
return SDValue();

// First, implement comparison not propagating NaN. If no native fmin or fmax
// available, use plain select with setcc instead.
SDValue MinMax;
Expand All @@ -8447,6 +8443,9 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
} else if (isOperationLegalOrCustom(CompOpc, VT)) {
MinMax = DAG.getNode(CompOpc, DL, VT, LHS, RHS, Flags);
} else {
if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT))
return DAG.UnrollVectorOp(N);

// NaN (if exists) will be propagated later, so orderness doesn't matter.
SDValue Compare =
DAG.getSetCC(DL, CCVT, LHS, RHS, IsMax ? ISD::SETGT : ISD::SETLT);
Expand Down
600 changes: 120 additions & 480 deletions llvm/test/CodeGen/AMDGPU/fmaximum3.ll

Large diffs are not rendered by default.

600 changes: 120 additions & 480 deletions llvm/test/CodeGen/AMDGPU/fminimum3.ll

Large diffs are not rendered by default.

88 changes: 17 additions & 71 deletions llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -554,28 +554,14 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
; GFX8-NEXT: v_max_f16_e32 v4, v3, v2
; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v3, 64
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v2, 64
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v4
; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
; GFX8-NEXT: v_max_f16_e32 v3, v0, v1
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v0, 64
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v1, 64
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
Expand Down Expand Up @@ -669,26 +655,9 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) {
; GFX8-LABEL: v_maximum_v2f16__nnan:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v3, 64
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v2, 64
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v4
; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v0, 64
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v1, 64
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maximum_v2f16__nnan:
Expand Down Expand Up @@ -754,13 +723,11 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
; GFX8-NEXT: v_max_f16_e32 v4, v3, v2
; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
; GFX8-NEXT: v_max_f16_e32 v3, v0, v1
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc
Expand Down Expand Up @@ -857,14 +824,9 @@ define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1)
; GFX8-LABEL: v_maximum_v2f16__nnan_nsz:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v0, v0, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_maximum_v2f16__nnan_nsz:
Expand Down Expand Up @@ -938,31 +900,15 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX8-NEXT: s_lshr_b32 s6, s5, 16
; GFX8-NEXT: s_lshr_b32 s7, s4, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, s7, v0
; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00
; GFX8-NEXT: v_max_f16_e32 v1, s7, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s7, v0
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, s7, 64
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, s6, 64
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, s4, v1
; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc
; GFX8-NEXT: v_max_f16_e32 v3, s4, v1
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s4, v1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, s4, 64
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX8-NEXT: v_cmp_class_f16_e64 vcc, s5, 64
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use v0
Expand Down
Loading

0 comments on commit 212b78a

Please sign in to comment.