Skip to content

Commit

Permalink
[ARM] Add scheduling info for VFMS
Browse files Browse the repository at this point in the history
The scalar VFMS instructions did not have scheduling information attached (but
VFMA did), which was causing assertion failures with the Cortex-A57 scheduling
model and -fp-contract=fast.

Differential Revision: https://reviews.llvm.org/D34040

llvm-svn: 305064
  • Loading branch information
ostannard committed Jun 9, 2017
1 parent 1b47ff7 commit ad09735
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 8 deletions.
9 changes: 6 additions & 3 deletions llvm/lib/Target/ARM/ARMInstrVFP.td
Expand Up @@ -1958,15 +1958,17 @@ def VFMSD : ADbI<0b11101, 0b10, 1, 0,
[(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>,
Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;

def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
IIC_fpFMAC32, "vfms", ".f32\t$Sd, $Sn, $Sm",
[(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
SPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines.
}
Expand All @@ -1976,7 +1978,8 @@ def VFMSH : AHbI<0b11101, 0b10, 1, 0,
IIC_fpFMAC16, "vfms", ".f16\t$Sd, $Sn, $Sm",
[]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>;
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;

def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
(VFMSD DPR:$dstin, DPR:$a, DPR:$b)>,
Expand Down
91 changes: 86 additions & 5 deletions llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
@@ -1,5 +1,6 @@
; REQUIRES: asserts
; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT
; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null -fp-contract=fast | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FAST
; Check latencies of vmul/vfma accumulate chains.

define float @Test1(float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) {
Expand All @@ -14,15 +15,17 @@ define float @Test1(float %f1, float %f2, float %f3, float %f4, float %f5, float
; > VMULS read-advanced latency to VMLAS = 0
; CHECK-SAME: Latency=0

; CHECK: VMLAS
; CHECK-DEFAULT: VMLAS
; CHECK-FAST: VFMAS
; > VMLAS common latency = 9
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: data
; > VMLAS read-advanced latency to the next VMLAS = 4
; CHECK-SAME: Latency=4

; CHECK: VMLAS
; CHECK-DEFAULT: VMLAS
; CHECK-FAST: VFMAS
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: data
Expand Down Expand Up @@ -51,15 +54,17 @@ define <2 x float> @Test2(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2
; > VMULfd read-advanced latency to VMLAfd = 0
; CHECK-SAME: Latency=0

; CHECK: VMLAfd
; CHECK-DEFAULT: VMLAfd
; CHECK-FAST: VFMAfd
; > VMLAfd common latency = 9
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: data
; > VMLAfd read-advanced latency to the next VMLAfd = 4
; CHECK-SAME: Latency=4

; CHECK: VMLAfd
; CHECK-DEFAULT: VMLAfd
; CHECK-FAST: VFMAfd
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: data
Expand All @@ -75,3 +80,79 @@ define <2 x float> @Test2(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2
ret <2 x float> %add2
}

; Scalar multiply-subtract accumulate chain: f1*f2 - f3*f4 - f5*f6.
; The default run expects VMLSS for the two subtract steps; the run with
; -fp-contract=fast expects VFMSS instead.
define float @Test3(float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) {
; CHECK: ********** MI Scheduling **********
; CHECK: Test3:BB#0

; CHECK: VMULS
; > VMULS common latency = 5
; CHECK: Latency : 5
; CHECK: Successors:
; CHECK: data
; > VMULS read-advanced latency to VMLSS = 0
; CHECK-SAME: Latency=0

; CHECK-DEFAULT: VMLSS
; CHECK-FAST: VFMSS
; > VMLSS common latency = 9
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: data
; > VMLSS read-advanced latency to the next VMLSS = 4
; CHECK-SAME: Latency=4

; CHECK-DEFAULT: VMLSS
; CHECK-FAST: VFMSS
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: data
; > VMLSS not-optimized latency to VMOVRS = 9
; CHECK-SAME: Latency=9

; f1 * f2 - f3 * f4 - f5 * f6 ==> VMULS, VMLSS, VMLSS (VFMSS with -fp-contract=fast)
%mul1 = fmul float %f1, %f2
%mul2 = fmul float %f3, %f4
%mul3 = fmul float %f5, %f6
%sub1 = fsub float %mul1, %mul2
%sub2 = fsub float %sub1, %mul3
ret float %sub2
}

; ASIMD form
; Vector (<2 x float>) multiply-subtract accumulate chain: f1*f2 - f3*f4 - f5*f6.
; The default run expects VMLSfd for the two subtract steps; the run with
; -fp-contract=fast expects VFMSfd instead.
define <2 x float> @Test4(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2 x float> %f4, <2 x float> %f5, <2 x float> %f6) {
; CHECK: ********** MI Scheduling **********
; CHECK: Test4:BB#0

; CHECK: VMULfd
; > VMULfd common latency = 5
; CHECK: Latency : 5
; CHECK: Successors:
; CHECK: data
; > VMULfd read-advanced latency to VMLSfd = 0
; CHECK-SAME: Latency=0

; CHECK-DEFAULT: VMLSfd
; CHECK-FAST: VFMSfd
; > VMLSfd common latency = 9
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: data
; > VMLSfd read-advanced latency to the next VMLSfd = 4
; CHECK-SAME: Latency=4

; CHECK-DEFAULT: VMLSfd
; CHECK-FAST: VFMSfd
; CHECK: Latency : 9
; CHECK: Successors:
; CHECK: data
; > VMLSfd not-optimized latency to VMOVRRD = 9
; CHECK-SAME: Latency=9

; f1 * f2 - f3 * f4 - f5 * f6 ==> VMULfd, VMLSfd, VMLSfd (VFMSfd with -fp-contract=fast)
%mul1 = fmul <2 x float> %f1, %f2
%mul2 = fmul <2 x float> %f3, %f4
%mul3 = fmul <2 x float> %f5, %f6
%sub1 = fsub <2 x float> %mul1, %mul2
%sub2 = fsub <2 x float> %sub1, %mul3
ret <2 x float> %sub2
}

0 comments on commit ad09735

Please sign in to comment.