Skip to content

Commit

Permalink
Enable Custom Lowering for fabs.v8f16 on AVX (#71730)
Browse files Browse the repository at this point in the history
[X86]: Enable custom lowering for fabs.v8f16 on AVX

Currently, custom lowering of fabs.v8f16 requires AVX512FP16, which is
too restrictive. For v8f16 fabs lowering, no instructions in AVX512FP16
are needed. Without the fix, horribly inefficient code is generated
without AVX512FP16. Note that instcombine generates calls to the intrinsic
@llvm.fabs.v8f16 when simplifying AND <8 x half> operations.
  • Loading branch information
david-xl committed Nov 16, 2023
1 parent 73e9633 commit ac3779e
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 0 deletions.
2 changes: 2 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1396,6 +1396,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMINIMUM, VT, Custom);
}

setOperationAction(ISD::FABS, MVT::v8f16, Custom);

// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
// even though v8i16 is a legal type.
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
Expand Down
82 changes: 82 additions & 0 deletions llvm/test/CodeGen/X86/vec_fabs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512VL
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=X86,X86-AVX512FP16
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512VLDQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=X64,X64-AVX512FP16
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512VLDQ

define <2 x double> @fabs_v2f64(<2 x double> %p) {
Expand Down Expand Up @@ -137,6 +139,86 @@ define <4 x double> @fabs_v4f64(<4 x double> %p) {
}
declare <4 x double> @llvm.fabs.v4f64(<4 x double> %p)

; Loads an <8 x half> vector from %p (16-byte aligned), applies the
; llvm.fabs.v8f16 intrinsic to it and returns the result.
; The checks below cover the AVX1, AVX2 and (64-bit only) AVX512VL runs and
; expect a short sequence built around one AND with a constant-pool operand,
; rather than a longer expansion.
define <8 x half> @fabs_v8f16(ptr %p) {
; X86-AVX1-LABEL: fabs_v8f16:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: movl 4(%esp), [[ADDRREG:%.*]]
; X86-AVX1-NEXT: vmovaps ([[ADDRREG]]), %xmm0
; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX1-NEXT: retl

; X86-AVX2-LABEL: fabs_v8f16:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: movl 4(%esp), [[REG:%.*]]
; X86-AVX2-NEXT: vpbroadcastw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-AVX2-NEXT: vpand ([[REG]]), %xmm0, %xmm0
; X86-AVX2-NEXT: retl

; X64-AVX512VL-LABEL: fabs_v8f16:
; X64-AVX512VL: # %bb.0:
; X64-AVX512VL-NEXT: vpbroadcastw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-AVX512VL-NEXT: vpand (%rdi), %xmm0, %xmm0
; X64-AVX512VL-NEXT: retq

; X64-AVX1-LABEL: fabs_v8f16:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: retq

; X64-AVX2-LABEL: fabs_v8f16:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpbroadcastw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-AVX2-NEXT: vpand (%rdi), %xmm0, %xmm0
; X64-AVX2-NEXT: retq

; The vector is loaded from memory instead of being passed by value, so the
; expected code reads its operand through the pointer argument.
%v = load <8 x half>, ptr %p, align 16
%nnv = call <8 x half> @llvm.fabs.v8f16(<8 x half> %v)
ret <8 x half> %nnv
}
declare <8 x half> @llvm.fabs.v8f16(<8 x half> %p)

; Loads a <16 x half> vector from %p (32-byte aligned), applies the
; llvm.fabs.v16f16 intrinsic to it and returns the result.
; Only the AVX512FP16 runs (32-bit and 64-bit) are checked here. They expect a
; word broadcast from the constant pool into a ymm register, followed by a
; vpand with the memory operand.
define <16 x half> @fabs_v16f16(ptr %p) {
; X86-AVX512FP16-LABEL: fabs_v16f16:
; X86-AVX512FP16: # %bb.0:
; X86-AVX512FP16-NEXT: movl 4(%esp), [[REG:%.*]]
; X86-AVX512FP16-NEXT: vpbroadcastw {{\.?LCPI[0-9]+_[0-9]+}}, [[YMM:%ymm[0-9]+]]
; X86-AVX512FP16-NEXT: vpand ([[REG]]), [[YMM]], [[YMM]]
; X86-AVX512FP16-NEXT: retl

; X64-AVX512FP16-LABEL: fabs_v16f16:
; X64-AVX512FP16: # %bb.0:
; X64-AVX512FP16-NEXT: vpbroadcastw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), [[YMM:%ymm[0-9]+]]
; X64-AVX512FP16-NEXT: vpand (%rdi), [[YMM]], [[YMM]]
; X64-AVX512FP16-NEXT: retq
;
%v = load <16 x half>, ptr %p, align 32
%nnv = call <16 x half> @llvm.fabs.v16f16(<16 x half> %v)
ret <16 x half> %nnv
}
declare <16 x half> @llvm.fabs.v16f16(<16 x half> %p)

; Loads a <32 x half> vector from %p (64-byte aligned), applies the
; llvm.fabs.v32f16 intrinsic to it and returns the result.
; Only the AVX512FP16 runs (32-bit and 64-bit) are checked here. They expect a
; word broadcast from the constant pool into a zmm register, followed by a
; vpandq with the memory operand.
define <32 x half> @fabs_v32f16(ptr %p) {
; X86-AVX512FP16-LABEL: fabs_v32f16:
; X86-AVX512FP16: # %bb.0:
; X86-AVX512FP16-NEXT: movl 4(%esp), [[REG:%.*]]
; X86-AVX512FP16-NEXT: vpbroadcastw {{\.?LCPI[0-9]+_[0-9]+}}, [[ZMM:%zmm[0-9]+]]
; X86-AVX512FP16-NEXT: vpandq ([[REG]]), [[ZMM]], [[ZMM]]
; X86-AVX512FP16-NEXT: retl

; X64-AVX512FP16-LABEL: fabs_v32f16:
; X64-AVX512FP16: # %bb.0:
; X64-AVX512FP16-NEXT: vpbroadcastw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), [[ZMM:%zmm[0-9]+]]
; X64-AVX512FP16-NEXT: vpandq (%rdi), [[ZMM]], [[ZMM]]
; X64-AVX512FP16-NEXT: retq

%v = load <32 x half>, ptr %p, align 64
%nnv = call <32 x half> @llvm.fabs.v32f16(<32 x half> %v)
ret <32 x half> %nnv
}
declare <32 x half> @llvm.fabs.v32f16(<32 x half> %p)


define <8 x float> @fabs_v8f32(<8 x float> %p) {
; X86-AVX1-LABEL: fabs_v8f32:
; X86-AVX1: # %bb.0:
Expand Down

0 comments on commit ac3779e

Please sign in to comment.