25 changes: 16 additions & 9 deletions llvm/lib/Target/PowerPC/PPCInstrVSX.td
Original file line number Diff line number Diff line change
Expand Up @@ -906,16 +906,13 @@ let hasSideEffects = 0 in {
// Rounding Instructions respecting current rounding mode
def XSRDPIC : XX2Form<60, 107,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xsrdpic $XT, $XB", IIC_VecFP,
[(set f64:$XT, (fnearbyint f64:$XB))]>;
"xsrdpic $XT, $XB", IIC_VecFP, []>;
def XVRDPIC : XX2Form<60, 235,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrdpic $XT, $XB", IIC_VecFP,
[(set v2f64:$XT, (fnearbyint v2f64:$XB))]>;
"xvrdpic $XT, $XB", IIC_VecFP, []>;
def XVRSPIC : XX2Form<60, 171,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrspic $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (fnearbyint v4f32:$XB))]>;
"xvrspic $XT, $XB", IIC_VecFP, []>;
// Max/Min Instructions
let isCommutable = 1 in {
def XSMAXDP : XX3Form<60, 160,
Expand Down Expand Up @@ -2783,9 +2780,6 @@ def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be ForceXForm:$src)), (LXVD2X ForceXForm:$s
def : Pat<(f32 (any_fround f32:$S)),
(f32 (COPY_TO_REGCLASS (XSRDPI
(COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
def : Pat<(f32 (fnearbyint f32:$S)),
(f32 (COPY_TO_REGCLASS (XSRDPIC
(COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
def : Pat<(f32 (any_ffloor f32:$S)),
(f32 (COPY_TO_REGCLASS (XSRDPIM
(COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
Expand All @@ -2804,6 +2798,19 @@ def : Pat<(v4f32 (any_frint v4f32:$S)), (v4f32 (XVRSPIC $S))>;
def : Pat<(f64 (any_frint f64:$S)), (f64 (XSRDPIC $S))>;
def : Pat<(v2f64 (any_frint v2f64:$S)), (v2f64 (XVRDPIC $S))>;

// Rounding without exceptions (nearbyint). Due to strange tblgen behaviour,
// these need to be defined after the any_frint versions so ISEL will correctly
// add the chain to the strict versions.
// f32 nearbyint: XSRDPIC is a double-precision scalar instruction, so the
// single-precision value is moved into VSFRC, rounded, and moved back.
def : Pat<(f32 (fnearbyint f32:$S)),
(f32 (COPY_TO_REGCLASS (XSRDPIC
(COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
// f64 nearbyint maps directly onto the scalar round-to-current-mode
// instruction (no pattern with a chain, unlike the any_frint form above).
def : Pat<(f64 (fnearbyint f64:$S)),
(f64 (XSRDPIC $S))>;
// v2f64 nearbyint: vector double-precision round using current rounding mode.
def : Pat<(v2f64 (fnearbyint v2f64:$S)),
(v2f64 (XVRDPIC $S))>;
// v4f32 nearbyint: vector single-precision round using current rounding mode.
def : Pat<(v4f32 (fnearbyint v4f32:$S)),
(v4f32 (XVRSPIC $S))>;

// Materialize a zero-vector of long long
def : Pat<(v2i64 immAllZerosV),
(v2i64 (XXLXORz))>;
Expand Down
127 changes: 127 additions & 0 deletions llvm/test/CodeGen/PowerPC/cse-despite-rounding-mode.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
; The non-strictfp version of test/CodeGen/PowerPC/respect-rounding-mode.ll
; Without strictfp, CSE should be free to eliminate the repeated multiply
; and conversion instructions.
; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2
; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \
; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2
; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2

; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2
; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \
; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2
; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2
@IndirectCallPtr = dso_local local_unnamed_addr global void (...)* null, align 8

; func1: rounds the same constant vector with constrained rint (dynamic
; rounding, exceptions ignored) after each of two direct calls. Without
; strictfp (see attributes #0 below), the file-header comment says CSE is
; free to merge the two rint operations into one xvrdpic.
define dso_local signext i32 @func1() local_unnamed_addr #0 {
entry:
tail call void bitcast (void (...)* @directCall to void ()*)() #0
%0 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> <double -9.990000e+01, double 9.990000e+01>, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext = extractelement <2 x double> %0, i32 0
%sub = tail call double @llvm.experimental.constrained.fsub.f64(double %vecext, double -9.900000e+01, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %sub, metadata !"fpexcept.ignore") #0
; Second direct call followed by an identical rint of the same constant.
tail call void bitcast (void (...)* @directCall to void ()*)() #0
%1 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> <double -9.990000e+01, double 9.990000e+01>, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext3 = extractelement <2 x double> %1, i32 1
%cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext3, double 9.900000e+01, metadata !"une", metadata !"fpexcept.ignore") #0
br i1 %cmp, label %if.then, label %if.end

if.then: ; preds = %entry
tail call void @exit(i32 signext 2) #0
unreachable

if.end: ; preds = %entry
ret i32 %conv
}

declare void @directCall(...) local_unnamed_addr

declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata)

declare i32 @llvm.experimental.constrained.fptosi.i32.f64(double, metadata)

declare i1 @llvm.experimental.constrained.fcmp.f64(double, double, metadata, metadata)

declare void @exit(i32 signext) local_unnamed_addr

; func2: performs the same constrained fmul of two call results before and
; after a direct call. Without strictfp, the file-header comment says CSE
; may eliminate the repeated multiply (hence the xvmuldp grep count of 2).
define dso_local signext i32 @func2() local_unnamed_addr #0 {
entry:
%call = tail call <2 x double> bitcast (<2 x double> (...)* @getvector1 to <2 x double> ()*)() #0
%call1 = tail call <2 x double> bitcast (<2 x double> (...)* @getvector2 to <2 x double> ()*)() #0
tail call void bitcast (void (...)* @directCall to void ()*)() #0
%mul = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext = extractelement <2 x double> %mul, i32 0
%cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext, double 4.000000e+00, metadata !"oeq", metadata !"fpexcept.ignore") #0
br i1 %cmp, label %cleanup, label %if.end

if.end: ; preds = %entry
; Repeat of the multiply with identical operands after another call.
tail call void bitcast (void (...)* @directCall to void ()*)() #0
%mul10 = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%0 = tail call i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32 2, <2 x double> %mul, <2 x double> %mul10) #0
br label %cleanup

cleanup: ; preds = %entry, %if.end
%retval.0 = phi i32 [ %0, %if.end ], [ 11, %entry ]
ret i32 %retval.0
}

declare <2 x double> @getvector1(...) local_unnamed_addr

declare <2 x double> @getvector2(...) local_unnamed_addr

declare <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double>, <2 x double>, metadata, metadata)

declare i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32, <2 x double>, <2 x double>)

; func3: same shape as func1 but the intervening calls are made indirectly
; through @IndirectCallPtr instead of directly to @directCall.
define dso_local signext i32 @func3() local_unnamed_addr #0 {
entry:
%0 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8
tail call void %0() #0
%1 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> <double -9.990000e+01, double 9.990000e+01>, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext = extractelement <2 x double> %1, i32 0
%sub = tail call double @llvm.experimental.constrained.fsub.f64(double %vecext, double -9.900000e+01, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %sub, metadata !"fpexcept.ignore") #0
; Second indirect call, then an identical rint of the same constant.
%2 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8
tail call void %2() #0
%3 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> <double -9.990000e+01, double 9.990000e+01>, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext4 = extractelement <2 x double> %3, i32 1
%cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext4, double 9.900000e+01, metadata !"une", metadata !"fpexcept.ignore") #0
br i1 %cmp, label %if.then, label %if.end

if.then: ; preds = %entry
tail call void @exit(i32 signext 2) #0
unreachable

if.end: ; preds = %entry
ret i32 %conv
}

; func4: same shape as func2 but the intervening calls are made indirectly
; through @IndirectCallPtr instead of directly to @directCall.
define dso_local signext i32 @func4() local_unnamed_addr #0 {
entry:
%call = tail call <2 x double> bitcast (<2 x double> (...)* @getvector1 to <2 x double> ()*)() #0
%call1 = tail call <2 x double> bitcast (<2 x double> (...)* @getvector2 to <2 x double> ()*)() #0
%0 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8
tail call void %0() #0
%mul = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext = extractelement <2 x double> %mul, i32 0
%cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext, double 4.000000e+00, metadata !"oeq", metadata !"fpexcept.ignore") #0
br i1 %cmp, label %cleanup, label %if.end

if.end: ; preds = %entry
; Repeat of the multiply with identical operands after another indirect call.
%1 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8
tail call void %1() #0
%mul11 = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%2 = tail call i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32 2, <2 x double> %mul, <2 x double> %mul11) #0
br label %cleanup

cleanup: ; preds = %entry, %if.end
%retval.0 = phi i32 [ %2, %if.end ], [ 11, %entry ]
ret i32 %retval.0
}

declare <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>, metadata, metadata)

attributes #0 = { nounwind }
128 changes: 128 additions & 0 deletions llvm/test/CodeGen/PowerPC/respect-rounding-mode.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
; The strictfp version of test/CodeGen/PowerPC/cse-despite-rounding-mode.ll
; With strictfp, the MachineIR optimizations need to assume that a call
; can change the rounding mode and must not move/eliminate the repeated
; multiply/convert instructions in this test.
; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4
; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \
; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4
; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4

; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4
; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \
; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4
; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4
@IndirectCallPtr = dso_local local_unnamed_addr global void (...)* null, align 8

; func1: rounds the same constant vector with constrained rint after each of
; two direct calls. With strictfp (attributes #0 below), the file-header
; comment says the two rint operations must NOT be merged, since a call can
; change the dynamic rounding mode (hence the xvrdpic grep count of 4).
define dso_local signext i32 @func1() local_unnamed_addr #0 {
entry:
tail call void bitcast (void (...)* @directCall to void ()*)() #0
%0 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> <double -9.990000e+01, double 9.990000e+01>, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext = extractelement <2 x double> %0, i32 0
%sub = tail call double @llvm.experimental.constrained.fsub.f64(double %vecext, double -9.900000e+01, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %sub, metadata !"fpexcept.ignore") #0
; Second direct call followed by an identical rint that must be kept.
tail call void bitcast (void (...)* @directCall to void ()*)() #0
%1 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> <double -9.990000e+01, double 9.990000e+01>, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext3 = extractelement <2 x double> %1, i32 1
%cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext3, double 9.900000e+01, metadata !"une", metadata !"fpexcept.ignore") #0
br i1 %cmp, label %if.then, label %if.end

if.then: ; preds = %entry
tail call void @exit(i32 signext 2) #0
unreachable

if.end: ; preds = %entry
ret i32 %conv
}

declare void @directCall(...) local_unnamed_addr

declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata)

declare i32 @llvm.experimental.constrained.fptosi.i32.f64(double, metadata)

declare i1 @llvm.experimental.constrained.fcmp.f64(double, double, metadata, metadata)

declare void @exit(i32 signext) local_unnamed_addr

; func2: performs the same constrained fmul of two call results before and
; after a direct call. With strictfp, the file-header comment says the
; repeated multiply must NOT be eliminated (xvmuldp grep count of 4).
define dso_local signext i32 @func2() local_unnamed_addr #0 {
entry:
%call = tail call <2 x double> bitcast (<2 x double> (...)* @getvector1 to <2 x double> ()*)() #0
%call1 = tail call <2 x double> bitcast (<2 x double> (...)* @getvector2 to <2 x double> ()*)() #0
tail call void bitcast (void (...)* @directCall to void ()*)() #0
%mul = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext = extractelement <2 x double> %mul, i32 0
%cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext, double 4.000000e+00, metadata !"oeq", metadata !"fpexcept.ignore") #0
br i1 %cmp, label %cleanup, label %if.end

if.end: ; preds = %entry
; Repeat of the multiply with identical operands after another call;
; must remain a separate instruction under strictfp.
tail call void bitcast (void (...)* @directCall to void ()*)() #0
%mul10 = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%0 = tail call i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32 2, <2 x double> %mul, <2 x double> %mul10) #0
br label %cleanup

cleanup: ; preds = %entry, %if.end
%retval.0 = phi i32 [ %0, %if.end ], [ 11, %entry ]
ret i32 %retval.0
}

declare <2 x double> @getvector1(...) local_unnamed_addr

declare <2 x double> @getvector2(...) local_unnamed_addr

declare <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double>, <2 x double>, metadata, metadata)

declare i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32, <2 x double>, <2 x double>)

; func3: same shape as func1 but the intervening calls are made indirectly
; through @IndirectCallPtr; both rint operations must be kept under strictfp.
define dso_local signext i32 @func3() local_unnamed_addr #0 {
entry:
%0 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8
tail call void %0() #0
%1 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> <double -9.990000e+01, double 9.990000e+01>, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext = extractelement <2 x double> %1, i32 0
%sub = tail call double @llvm.experimental.constrained.fsub.f64(double %vecext, double -9.900000e+01, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %sub, metadata !"fpexcept.ignore") #0
; Second indirect call, then an identical rint that must be kept.
%2 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8
tail call void %2() #0
%3 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> <double -9.990000e+01, double 9.990000e+01>, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext4 = extractelement <2 x double> %3, i32 1
%cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext4, double 9.900000e+01, metadata !"une", metadata !"fpexcept.ignore") #0
br i1 %cmp, label %if.then, label %if.end

if.then: ; preds = %entry
tail call void @exit(i32 signext 2) #0
unreachable

if.end: ; preds = %entry
ret i32 %conv
}

; func4: same shape as func2 but the intervening calls are made indirectly
; through @IndirectCallPtr; both multiplies must be kept under strictfp.
define dso_local signext i32 @func4() local_unnamed_addr #0 {
entry:
%call = tail call <2 x double> bitcast (<2 x double> (...)* @getvector1 to <2 x double> ()*)() #0
%call1 = tail call <2 x double> bitcast (<2 x double> (...)* @getvector2 to <2 x double> ()*)() #0
%0 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8
tail call void %0() #0
%mul = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext = extractelement <2 x double> %mul, i32 0
%cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext, double 4.000000e+00, metadata !"oeq", metadata !"fpexcept.ignore") #0
br i1 %cmp, label %cleanup, label %if.end

if.end: ; preds = %entry
; Repeat of the multiply with identical operands after another indirect
; call; must remain a separate instruction under strictfp.
%1 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8
tail call void %1() #0
%mul11 = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%2 = tail call i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32 2, <2 x double> %mul, <2 x double> %mul11) #0
br label %cleanup

cleanup: ; preds = %entry, %if.end
%retval.0 = phi i32 [ %2, %if.end ], [ 11, %entry ]
ret i32 %retval.0
}

declare <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>, metadata, metadata)

attributes #0 = { nounwind strictfp }
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4631,14 +4631,14 @@ entry:
define <4 x double> @constrained_vector_rint_v4f64(<4 x double> %x) #0 {
; PC64LE-LABEL: constrained_vector_rint_v4f64:
; PC64LE: # %bb.0: # %entry
; PC64LE-NEXT: xvrdpic 34, 34
; PC64LE-NEXT: xvrdpic 35, 35
; PC64LE-NEXT: xvrdpic 34, 34
; PC64LE-NEXT: blr
;
; PC64LE9-LABEL: constrained_vector_rint_v4f64:
; PC64LE9: # %bb.0: # %entry
; PC64LE9-NEXT: xvrdpic 34, 34
; PC64LE9-NEXT: xvrdpic 35, 35
; PC64LE9-NEXT: xvrdpic 34, 34
; PC64LE9-NEXT: blr
entry:
%rint = call <4 x double> @llvm.experimental.constrained.rint.v4f64(
Expand Down