Skip to content

Commit

Permalink
[AArch64] Add patterns for FMADD, FMSUB
Browse files Browse the repository at this point in the history
The scalar FMADD and FMSUB instructions perform at least as well as the
indexed ("by element") FMLA and FMLS instructions.

For example, the Arm Cortex-A55 Software Optimization Guide lists the "FP
multiply accumulate" instructions (FMADD, FMSUB) with a throughput of 2
IPC, whereas it lists the "ASIMD FP multiply accumulate, by element"
instructions (FMLA, FMLS) with a throughput of 1 IPC.

The Arm Cortex-A77 Software Optimization Guide, however, does not list
the "by element" variants separately: the "ASIMD FP multiply accumulate"
instructions are listed with a throughput of 2 IPC, the same as the "FP
multiply accumulate" instructions.

Reviewed By: samtebbs, dzhidzhoev

Differential Revision: https://reviews.llvm.org/D158008
  • Loading branch information
overmighty authored and dzhidzhoev committed Aug 30, 2023
1 parent 0563725 commit 38c92c1
Show file tree
Hide file tree
Showing 5 changed files with 470 additions and 83 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
// COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
// UNCONSTRAINED: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
// CONSTRAINED: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict")
// CHECK-ASM: fmla d{{[0-9]+}}, d{{[0-9]+}}, v{{[0-9]+}}.d[{{[0-9]+}}]
// CHECK-ASM: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
// COMMONIR: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
// COMMONIR: ret <1 x double> [[TMP7]]
float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
Expand All @@ -122,7 +122,7 @@ float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
// COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
// UNCONSTRAINED: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
// CONSTRAINED: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict")
// CHECK-ASM: fmla d{{[0-9]+}}, d{{[0-9]+}}, v{{[0-9]+}}.d[{{[0-9]+}}]
// CHECK-ASM: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
// COMMONIR: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
// COMMONIR: ret <1 x double> [[TMP7]]
float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
Expand Down
38 changes: 38 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrFormats.td
Original file line number Diff line number Diff line change
Expand Up @@ -5409,6 +5409,44 @@ multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
(node (f64 FPR64:$Rn), (f64 FPR64:$Rm), (f64 FPR64:$Ra)))]> {
let Inst{23-22} = 0b01; // 64-bit size flag
}

let Predicates = [HasFullFP16] in {
def : Pat<(f16 (node (f16 FPR16:$Rn),
(f16 (extractelt (v8f16 V128:$Rm), (i64 0))),
(f16 FPR16:$Ra))),
(!cast<Instruction>(NAME # Hrrr)
FPR16:$Rn, (f16 (EXTRACT_SUBREG V128:$Rm, hsub)), FPR16:$Ra)>;

def : Pat<(f16 (node (f16 (extractelt (v8f16 V128:$Rn), (i64 0))),
(f16 FPR16:$Rm),
(f16 FPR16:$Ra))),
(!cast<Instruction>(NAME # Hrrr)
(f16 (EXTRACT_SUBREG V128:$Rn, hsub)), FPR16:$Rm, FPR16:$Ra)>;
}

def : Pat<(f32 (node (f32 FPR32:$Rn),
(f32 (extractelt (v4f32 V128:$Rm), (i64 0))),
(f32 FPR32:$Ra))),
(!cast<Instruction>(NAME # Srrr)
FPR32:$Rn, (EXTRACT_SUBREG V128:$Rm, ssub), FPR32:$Ra)>;

def : Pat<(f32 (node (f32 (extractelt (v4f32 V128:$Rn), (i64 0))),
(f32 FPR32:$Rm),
(f32 FPR32:$Ra))),
(!cast<Instruction>(NAME # Srrr)
(EXTRACT_SUBREG V128:$Rn, ssub), FPR32:$Rm, FPR32:$Ra)>;

def : Pat<(f64 (node (f64 FPR64:$Rn),
(f64 (extractelt (v2f64 V128:$Rm), (i64 0))),
(f64 FPR64:$Ra))),
(!cast<Instruction>(NAME # Drrr)
FPR64:$Rn, (EXTRACT_SUBREG V128:$Rm, dsub), FPR64:$Ra)>;

def : Pat<(f64 (node (f64 (extractelt (v2f64 V128:$Rn), (i64 0))),
(f64 FPR64:$Rm),
(f64 FPR64:$Ra))),
(!cast<Instruction>(NAME # Drrr)
(EXTRACT_SUBREG V128:$Rn, dsub), FPR64:$Rm, FPR64:$Ra)>;
}

//---
Expand Down
17 changes: 8 additions & 9 deletions llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,15 @@ target triple = "aarch64"
define <2 x half> @complex_mul_v2f16(<2 x half> %a, <2 x half> %b) {
; CHECK-LABEL: complex_mul_v2f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov h3, v0.h[1]
; CHECK-NEXT: mov h2, v1.h[1]
; CHECK-NEXT: fmul h4, h0, v1.h[1]
; CHECK-NEXT: fnmul h2, h3, h2
; CHECK-NEXT: fmla h4, h3, v1.h[0]
; CHECK-NEXT: fmla h2, h0, v1.h[0]
; CHECK-NEXT: mov v2.h[1], v4.h[0]
; CHECK-NEXT: fmov d0, d2
; CHECK-NEXT: mov h2, v0.h[1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmul h3, h0, v1.h[1]
; CHECK-NEXT: fmul h4, h2, v1.h[1]
; CHECK-NEXT: fmadd h2, h1, h2, h3
; CHECK-NEXT: fnmsub h0, h1, h0, h4
; CHECK-NEXT: mov v0.h[1], v2.h[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
entry:
%a.real = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> <i32 0>
Expand Down
120 changes: 108 additions & 12 deletions llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
Original file line number Diff line number Diff line change
Expand Up @@ -80,29 +80,75 @@ entry:
ret <8 x half> %0
}

define dso_local half @t_vfmah_lane_f16(half %a, half %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmah_lane_f16:
define dso_local half @t_vfmah_lane_f16_0(half %a, half %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmah_lane_f16_0:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmla h0, h1, v2.h[0]
; CHECK-NEXT: fmadd h0, h1, h2, h0
; CHECK-NEXT: ret
entry:
%extract = extractelement <4 x half> %c, i32 0
%0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
ret half %0
}

define dso_local half @t_vfmah_laneq_f16(half %a, half %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmah_laneq_f16:
define dso_local half @t_vfmah_lane_f16_0_swap(half %a, half %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmah_lane_f16_0_swap:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmla h0, h1, v2.h[0]
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmadd h0, h2, h1, h0
; CHECK-NEXT: ret
entry:
%extract = extractelement <4 x half> %c, i32 0
%0 = tail call half @llvm.fma.f16(half %extract, half %b, half %a)
ret half %0
}

define dso_local half @t_vfmah_lane_f16_3(half %a, half %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmah_lane_f16_3:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmla h0, h1, v2.h[3]
; CHECK-NEXT: ret
entry:
%extract = extractelement <4 x half> %c, i32 3
%0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
ret half %0
}

define dso_local half @t_vfmah_laneq_f16_0(half %a, half %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmah_laneq_f16_0:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmadd h0, h1, h2, h0
; CHECK-NEXT: ret
entry:
%extract = extractelement <8 x half> %c, i32 0
%0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
ret half %0
}

define dso_local half @t_vfmah_laneq_f16_0_swap(half %a, half %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmah_laneq_f16_0_swap:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmadd h0, h2, h1, h0
; CHECK-NEXT: ret
entry:
%extract = extractelement <8 x half> %c, i32 0
%0 = tail call half @llvm.fma.f16(half %extract, half %b, half %a)
ret half %0
}

define dso_local half @t_vfmah_laneq_f16_7(half %a, half %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmah_laneq_f16_7:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmla h0, h1, v2.h[7]
; CHECK-NEXT: ret
entry:
%extract = extractelement <8 x half> %c, i32 7
%0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
ret half %0
}

define dso_local <4 x half> @t_vfms_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfms_lane_f16:
; CHECK: // %bb.0: // %entry
Expand Down Expand Up @@ -181,23 +227,49 @@ entry:
ret <8 x half> %0
}

define dso_local half @t_vfmsh_lane_f16(half %a, half %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmsh_lane_f16:
define dso_local half @t_vfmsh_lane_f16_0(half %a, half %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmsh_lane_f16_0:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmsub h0, h2, h1, h0
; CHECK-NEXT: ret
entry:
%0 = fsub half 0xH8000, %b
%extract = extractelement <4 x half> %c, i32 0
%1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
ret half %1
}

define dso_local half @t_vfmsh_lane_f16_0_swap(half %a, half %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmsh_lane_f16_0_swap:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmls h0, h1, v2.h[0]
; CHECK-NEXT: fmsub h0, h2, h1, h0
; CHECK-NEXT: ret
entry:
%0 = fsub half 0xH8000, %b
%extract = extractelement <4 x half> %c, i32 0
%1 = tail call half @llvm.fma.f16(half %extract, half %0, half %a)
ret half %1
}

define dso_local half @t_vfmsh_lane_f16_3(half %a, half %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmsh_lane_f16_3:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmls h0, h1, v2.h[3]
; CHECK-NEXT: ret
entry:
%0 = fsub half 0xH8000, %b
%extract = extractelement <4 x half> %c, i32 3
%1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
ret half %1
}

define dso_local half @t_vfmsh_laneq_f16(half %a, half %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmsh_laneq_f16:
define dso_local half @t_vfmsh_laneq_f16_0(half %a, half %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmsh_laneq_f16_0:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmls h0, h1, v2.h[0]
; CHECK-NEXT: fmsub h0, h2, h1, h0
; CHECK-NEXT: ret
entry:
%0 = fsub half 0xH8000, %b
Expand All @@ -206,6 +278,30 @@ entry:
ret half %1
}

define dso_local half @t_vfmsh_laneq_f16_0_swap(half %a, half %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmsh_laneq_f16_0_swap:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmsub h0, h2, h1, h0
; CHECK-NEXT: ret
entry:
%0 = fsub half 0xH8000, %b
%extract = extractelement <8 x half> %c, i32 0
%1 = tail call half @llvm.fma.f16(half %extract, half %0, half %a)
ret half %1
}

define dso_local half @t_vfmsh_laneq_f16_7(half %a, half %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmsh_laneq_f16_7:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmls h0, h1, v2.h[7]
; CHECK-NEXT: ret
entry:
%0 = fsub half 0xH8000, %b
%extract = extractelement <8 x half> %c, i32 7
%1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
ret half %1
}

define dso_local <4 x half> @t_vmul_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
; CHECK-LABEL: t_vmul_laneq_f16:
; CHECK: // %bb.0: // %entry
Expand Down

0 comments on commit 38c92c1

Please sign in to comment.