[x86, SSE/AVX] allow 128/256-bit lowering for copysign vector intrinsics (PR30433)

This should fix:
https://llvm.org/bugs/show_bug.cgi?id=30433

There are a couple of open questions about the codegen:
1. Should we let scalar ops be scalars and avoid vector constant loads/splats?
2. Should we have a pass to combine constants such as the inverted pair that we have here?

Differential Revision: https://reviews.llvm.org/D25165
 

llvm-svn: 283119
rotateright committed Oct 3, 2016
1 parent 45e8ba8 commit d27a218
Showing 4 changed files with 260 additions and 451 deletions.
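
Before the diffs, a note on the technique: the lowering computes copysign entirely with bitwise FP logic (the FAND/FOR nodes in the X86ISelLowering.cpp changes below), which is exact for ±0, infinities, and NaNs. A minimal C++ sketch of that identity for f64 (the helper name is mine, not LLVM code):

    #include <cstdint>
    #include <cstring>

    // copysign(mag, sign) == (mag & ~SIGN_MASK) | (sign & SIGN_MASK),
    // where SIGN_MASK has only the IEEE-754 sign bit set.
    double copysignBits(double Mag, double Sign) {
      uint64_t M, S;
      std::memcpy(&M, &Mag, sizeof(M));
      std::memcpy(&S, &Sign, sizeof(S));
      const uint64_t SignMask = 1ULL << 63; // APInt::getSignBit(64)
      const uint64_t R = (M & ~SignMask) | (S & SignMask);
      std::memcpy(&Mag, &R, sizeof(R));
      return Mag;
    }

For vector types, the same two masks are splatted across the lanes and the AND/OR sequence handles the whole register at once, which is what this patch enables for 128/256-bit types.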
44 changes: 27 additions & 17 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -730,6 +730,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
     setOperationAction(ISD::FABS, MVT::v4f32, Custom);
+    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
     setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
@@ -765,6 +766,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::MUL, MVT::v8i16, Legal);
     setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
     setOperationAction(ISD::FABS, MVT::v2f64, Custom);
+    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

     setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
     setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
@@ -980,6 +982,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FNEARBYINT, VT, Legal);
       setOperationAction(ISD::FNEG, VT, Custom);
       setOperationAction(ISD::FABS, VT, Custom);
+      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
     }

     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
@@ -14662,31 +14665,39 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
   // At this point the operands and the result should have the same
   // type, and that won't be f80 since that is not custom lowered.
   bool IsF128 = (VT == MVT::f128);
-  assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) &&
+  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
+          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
+          VT == MVT::v8f32) &&
          "Unexpected type in LowerFCOPYSIGN");

+  MVT EltVT = VT.getScalarType();
   const fltSemantics &Sem =
-      VT == MVT::f64 ? APFloat::IEEEdouble :
-          (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle);
-  const unsigned SizeInBits = VT.getSizeInBits();
+      EltVT == MVT::f64 ? APFloat::IEEEdouble
+                        : (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle);

-  // Perform all logic operations as 16-byte vectors because there are no
+  // Perform all scalar logic operations as 16-byte vectors because there are no
   // scalar FP logic instructions in SSE.
-  MVT LogicVT =
-      (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32);
+  // TODO: This isn't necessary. If we used scalar types, we might avoid some
+  // unnecessary splats, but we might miss load folding opportunities. Should
+  // this decision be based on OptimizeForSize?
+  bool IsFakeVector = !VT.isVector() && !IsF128;
+  MVT LogicVT = VT;
+  if (IsFakeVector)
+    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

+  // The mask constants are automatically splatted for vector types.
+  unsigned EltSizeInBits = VT.getScalarSizeInBits();
   SDValue SignMask = DAG.getConstantFP(
-      APFloat(Sem, APInt::getSignBit(SizeInBits)), dl, LogicVT);
+      APFloat(Sem, APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
+  SDValue MagMask = DAG.getConstantFP(
+      APFloat(Sem, ~APInt::getSignBit(EltSizeInBits)), dl, LogicVT);

   // First, clear all bits but the sign bit from the second operand (sign).
-  if (!IsF128)
+  if (IsFakeVector)
     Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);

   // Next, clear the sign bit from the first operand (magnitude).
   // If it's a constant, we can clear it here.
-  SDValue MagMask = DAG.getConstantFP(
-      APFloat(Sem, ~APInt::getSignBit(SizeInBits)), dl, LogicVT);
-
   // TODO: If we had general constant folding for FP logic ops, this check
   // wouldn't be necessary.
   SDValue MagBits;
@@ -14696,16 +14707,15 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
     MagBits = DAG.getConstantFP(APF, dl, LogicVT);
   } else {
     // If the magnitude operand wasn't a constant, we need to AND out the sign.
-    if (!IsF128)
+    if (IsFakeVector)
       Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
     MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
   }

   // OR the magnitude value with the sign bit.
   SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
-  return IsF128 ? Or :
-      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
-                  DAG.getIntPtrConstant(0, dl));
+  return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
+                                          DAG.getIntPtrConstant(0, dl));
 }

 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
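
To make the IsFakeVector path above concrete, here is a rough C++ model (my own sketch, not LLVM code; plain arrays stand in for the 128-bit register): the scalar float is placed in lane 0, the masking runs across all four lanes, and lane 0 is extracted at the end. Genuine vector types skip the widening and extraction, and the splatted masks cover every lane.

    #include <cstdint>
    #include <cstring>

    // Sketch of the scalar-f32 "fake vector" lowering: SCALAR_TO_VECTOR,
    // whole-register FAND/FAND/FOR, then EXTRACT_VECTOR_ELT of lane 0.
    float copysignViaFakeVector(float Mag, float Sign) {
      uint32_t MagV[4] = {}, SignV[4] = {}; // unused lanes are undef in codegen
      std::memcpy(&MagV[0], &Mag, sizeof(float));
      std::memcpy(&SignV[0], &Sign, sizeof(float));
      const uint32_t SignMask = 0x80000000u; // APInt::getSignBit(32)
      uint32_t OrV[4];
      for (int I = 0; I != 4; ++I) // the logic ops act on the full register
        OrV[I] = (MagV[I] & ~SignMask) | (SignV[I] & SignMask);
      float Res;
      std::memcpy(&Res, &OrV[0], sizeof(float));
      return Res;
    }

The widening mirrors the comment in the code: SSE has no scalar FP logic instructions, so scalar copysign borrows the 128-bit ones.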
56 changes: 28 additions & 28 deletions llvm/test/Analysis/CostModel/X86/arith-fp.ll
@@ -401,22 +401,22 @@ define i32 @fcopysign(i32 %arg) {
 ; AVX2: cost of 2 {{.*}} %F32 = call float @llvm.copysign.f32
 ; AVX512: cost of 2 {{.*}} %F32 = call float @llvm.copysign.f32
   %F32 = call float @llvm.copysign.f32(float undef, float undef)
-; SSE2: cost of 17 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
-; SSE42: cost of 17 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
-; AVX: cost of 17 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
-; AVX2: cost of 17 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
-; AVX512: cost of 17 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
+; SSE2: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
+; SSE42: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
+; AVX: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
+; AVX2: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
+; AVX512: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
   %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
-; SSE2: cost of 34 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
-; SSE42: cost of 34 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
-; AVX: cost of 37 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
-; AVX2: cost of 37 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
-; AVX512: cost of 37 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
+; SSE2: cost of 4 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
+; SSE42: cost of 4 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
+; AVX: cost of 2 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
+; AVX2: cost of 2 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
+; AVX512: cost of 2 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
   %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
-; SSE2: cost of 68 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
-; SSE42: cost of 68 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
-; AVX: cost of 74 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
-; AVX2: cost of 74 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
+; SSE2: cost of 8 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
+; SSE42: cost of 8 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
+; AVX: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
+; AVX2: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
 ; AVX512: cost of 77 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
   %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)

@@ -426,22 +426,22 @@ define i32 @fcopysign(i32 %arg) {
 ; AVX2: cost of 2 {{.*}} %F64 = call double @llvm.copysign.f64
 ; AVX512: cost of 2 {{.*}} %F64 = call double @llvm.copysign.f64
   %F64 = call double @llvm.copysign.f64(double undef, double undef)
-; SSE2: cost of 7 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
-; SSE42: cost of 7 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
-; AVX: cost of 7 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
-; AVX2: cost of 7 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
-; AVX512: cost of 7 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
+; SSE2: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
+; SSE42: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
+; AVX: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
+; AVX2: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
+; AVX512: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
   %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
-; SSE2: cost of 14 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
-; SSE42: cost of 14 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
-; AVX: cost of 17 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
-; AVX2: cost of 17 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
-; AVX512: cost of 17 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
+; SSE2: cost of 4 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
+; SSE42: cost of 4 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
+; AVX: cost of 2 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
+; AVX2: cost of 2 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
+; AVX512: cost of 2 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
   %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
-; SSE2: cost of 28 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
-; SSE42: cost of 28 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
-; AVX: cost of 34 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
-; AVX2: cost of 34 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
+; SSE2: cost of 8 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
+; SSE42: cost of 8 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
+; AVX: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
+; AVX2: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
 ; AVX512: cost of 37 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
   %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)

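The updated costs follow a simple splitting pattern; the sketch below is my reading of the checks above, not a quote of the in-tree cost tables. A copysign on a legal-width vector is modeled at 2, and wider vectors pay that per legal-width piece; the AVX512 rows for 512-bit types keep the old scalarized costs (77 and 37), consistent with 512-bit FCOPYSIGN not being custom-lowered by this patch.

    // Hypothetical helper reproducing the pattern in the updated checks:
    // e.g. SSE2 (128-bit legal): v4f32 -> 2, v8f32 -> 4, v16f32 -> 8;
    // AVX/AVX2 (256-bit legal): v8f32 -> 2, v16f32 -> 4.
    unsigned copysignCost(unsigned VecBits, unsigned LegalBits) {
      unsigned Pieces = VecBits <= LegalBits ? 1 : VecBits / LegalBits;
      return 2 * Pieces;
    }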
