[X86] Improve v2i64->v2f32 and v4i64->v4f32 uint_to_fp on avx and avx2 targets.

Summary:
Based on Simon's D52965, but extended to handle strict fp and to improve some of the shuffling.

Rather than use v2i1/v4i1 and let type legalization continue, just generate all the code with legal types and use an explicit shuffle.

I also added an explicit setcc to the v4i64 code to match the semantics of vselect, which doesn't just use the sign bit. I'm also using a v4i64->v4i32 truncate instead of the shuffle in Simon's original code; with the setcc this will become a pack.

Future work can look into using X86ISD::BLENDV and a different shuffle that only moves the sign bit.
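
For reference, the scalar trick both code paths rely on, written out as plain C++ (an illustrative model only, not code from this patch; the function name is made up):

#include <cstdint>

// u64 -> f32 model: a value with the sign bit set is halved with
// round-to-odd ((x >> 1) | (x & 1)), converted as signed, then doubled.
// OR-ing in the low bit keeps the final rounding correct.
static float uitofp_u64_to_f32(uint64_t X) {
  int64_t AsSigned = (int64_t)X;
  int64_t Halved = (int64_t)((X >> 1) | (X & 1));
  int64_t Src = AsSigned < 0 ? Halved : AsSigned; // the select on the setcc
  float Cvt = (float)Src;                         // per-element sint_to_fp
  return AsSigned < 0 ? Cvt + Cvt : Cvt;          // fadd plus the final blend
}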

Reviewers: RKSimon, spatel

Reviewed By: RKSimon

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D71956
topperc committed Jan 6, 2020
1 parent ca3bf28 commit 9584086
Showing 5 changed files with 514 additions and 638 deletions.
149 changes: 125 additions & 24 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1176,6 +1176,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
 
+    if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
+      // We need to mark SINT_TO_FP as Custom even though we want to expand it
+      // so that DAG combine doesn't try to turn it into uint_to_fp.
+      setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
+      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
+      setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
+      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
+    }
+
     setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
     setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
     setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
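
For context, a simplified sketch (an assumption about the surrounding mechanism, not part of this diff) of how a node marked Custom reaches target code: legalization routes it to the target's LowerOperation hook, and a custom handler that returns SDValue() falls back to the generic expansion, which is what the comment above relies on for the signed v4i64 cases.

// Hedged sketch of the Custom dispatch; the real X86 hook handles far more
// opcodes, and the lowering functions may return SDValue() to expand.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  case ISD::SINT_TO_FP:
  case ISD::STRICT_SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP:
  case ISD::STRICT_UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  // ...
  default: llvm_unreachable("Should not custom lower this!");
  }
}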
@@ -18620,42 +18629,91 @@ static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
 
 static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
-  assert(Subtarget.hasDQI() && !Subtarget.hasVLX() && "Unexpected features");
-
   SDLoc DL(Op);
   bool IsStrict = Op->isStrictFPOpcode();
   MVT VT = Op->getSimpleValueType(0);
   SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
-  assert((Src.getSimpleValueType() == MVT::v2i64 ||
-          Src.getSimpleValueType() == MVT::v4i64) &&
-         "Unsupported custom type");
-
-  // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
-  assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
-         "Unexpected VT!");
-  MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
+  if (Subtarget.hasDQI()) {
+    assert(!Subtarget.hasVLX() && "Unexpected features");
 
-  // Need to concat with zero vector for strict fp to avoid spurious
-  // exceptions.
-  SDValue Tmp =
-      IsStrict ? DAG.getConstant(0, DL, MVT::v8i64) : DAG.getUNDEF(MVT::v8i64);
-  Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
-                    DAG.getIntPtrConstant(0, DL));
-  SDValue Res, Chain;
+    assert((Src.getSimpleValueType() == MVT::v2i64 ||
+            Src.getSimpleValueType() == MVT::v4i64) &&
+           "Unsupported custom type");
+
+    // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
+    assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
+           "Unexpected VT!");
+    MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
+
+    // Need to concat with zero vector for strict fp to avoid spurious
+    // exceptions.
+    SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
+                           : DAG.getUNDEF(MVT::v8i64);
+    Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
+                      DAG.getIntPtrConstant(0, DL));
+    SDValue Res, Chain;
+    if (IsStrict) {
+      Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
+                        {Op->getOperand(0), Src});
+      Chain = Res.getValue(1);
+    } else {
+      Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
+    }
+
+    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+                      DAG.getIntPtrConstant(0, DL));
+
+    if (IsStrict)
+      return DAG.getMergeValues({Res, Chain}, DL);
+    return Res;
+  }
+
+  bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
+                  Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
+  if (VT != MVT::v4f32 || IsSigned)
+    return SDValue();
+
+  SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
+  SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
+  SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
+                             DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
+                             DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
+  SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
+  SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
+  SmallVector<SDValue, 4> SignCvts(4);
+  SmallVector<SDValue, 4> Chains(4);
+  for (int i = 0; i != 4; ++i) {
+    SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
+                              DAG.getIntPtrConstant(i, DL));
+    if (IsStrict) {
+      SignCvts[i] =
+          DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
+                      {Op.getOperand(0), Src});
+      Chains[i] = SignCvts[i].getValue(1);
+    } else {
+      SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Src);
+    }
+  }
+  SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
+
+  SDValue Slow, Chain;
   if (IsStrict) {
-    Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
-                      {Op->getOperand(0), Src});
-    Chain = Res.getValue(1);
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+    Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
+                       {Chain, SignCvt, SignCvt});
+    Chain = Slow.getValue(1);
   } else {
-    Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
+    Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
   }
 
-  Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
-                    DAG.getIntPtrConstant(0, DL));
+  IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
+  SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
 
   if (IsStrict)
-    return DAG.getMergeValues({Res, Chain}, DL);
-  return Res;
+    return DAG.getMergeValues({Cvt, Chain}, DL);
+
+  return Cvt;
 }
 
 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
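
Lane by lane, the new non-DQI path above computes the following (an illustrative C++ model reusing the patch's variable names, not code from the patch). For strict FP the four scalar conversions are additionally chained through a TokenFactor.

#include <cstdint>

// Model of the v4i64 -> v4f32 slow path: IsNeg is a full-width mask per
// lane (the explicit setcc), SignSrc picks the halved value for negative
// lanes, and the final select doubles exactly those lanes.
static void uitofpV4i64ToV4f32(const uint64_t Src[4], float Cvt[4]) {
  for (int i = 0; i != 4; ++i) {
    bool IsNeg = (int64_t)Src[i] < 0;                   // SETLT against Zero
    uint64_t Sign = (Src[i] >> 1) | (Src[i] & 1);       // SRL/AND/OR with One
    int64_t SignSrc = (int64_t)(IsNeg ? Sign : Src[i]); // vselect
    float SignCvt = (float)SignSrc;                     // scalar SINT_TO_FP
    float Slow = SignCvt + SignCvt;                     // FADD
    Cvt[i] = IsNeg ? Slow : SignCvt;                    // select on IsNeg
  }
}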
@@ -29011,6 +29069,49 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
       }
       return;
     }
+    if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
+        Subtarget.hasAVX() && !Subtarget.hasAVX512()) {
+      // TODO Any SSE41+ subtarget should work here but BLENDV codegen ends up
+      // a lot worse than it should be.
+      SDValue Zero = DAG.getConstant(0, dl, SrcVT);
+      SDValue One = DAG.getConstant(1, dl, SrcVT);
+      SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
+                                 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
+                                 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
+      SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
+      SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
+      SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
+      for (int i = 0; i != 2; ++i) {
+        SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
+                                  SignSrc, DAG.getIntPtrConstant(i, dl));
+        if (IsStrict)
+          SignCvts[i] =
+              DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
+                          {N->getOperand(0), Src});
+        else
+          SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Src);
+      }
+      SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
+      SDValue Slow, Chain;
+      if (IsStrict) {
+        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                            SignCvts[0].getValue(1), SignCvts[1].getValue(1));
+        Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
+                           {Chain, SignCvt, SignCvt});
+        Chain = Slow.getValue(1);
+      } else {
+        Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
+      }
+      IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
+      IsNeg =
+          DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
+      SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
+      Results.push_back(Cvt);
+      if (IsStrict)
+        Results.push_back(Chain);
+      return;
+    }
+
     if (SrcVT != MVT::v2i32)
       return;
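
One detail worth spelling out: the vselect on v4f32 needs a v4i32 mask, but the setcc above produced a v2i64 mask. The bitcast plus {1, 3, -1, -1} shuffle keeps the high 32-bit half of each 64-bit mask element, which carries the sign bit. A C++ model (illustrative only; the helper name is made up):

#include <cstdint>
#include <cstring>

// Narrows a v2i64 all-ones/all-zeros compare mask into the low two lanes of
// a v4i32 mask, mirroring the {1, 3, -1, -1} shuffle in the code above.
static void narrowMaskV2i64ToV4i32(const uint64_t Mask64[2],
                                   uint32_t Mask32[4]) {
  uint32_t Halves[4];
  std::memcpy(Halves, Mask64, sizeof(Halves)); // the bitcast to v4i32
  Mask32[0] = Halves[1]; // shuffle index 1: high half of 64-bit lane 0
  Mask32[1] = Halves[3]; // shuffle index 3: high half of 64-bit lane 1
  Mask32[2] = 0;         // shuffle index -1 is undef; zero in this model
  Mask32[3] = 0;
}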

41 changes: 14 additions & 27 deletions llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
@@ -311,33 +311,20 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
 ;
 ; AVX1-64-LABEL: uitofp_v2i64_v2f32:
 ; AVX1-64:       # %bb.0:
-; AVX1-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-64-NEXT:    movq %rax, %rcx
-; AVX1-64-NEXT:    shrq %rcx
-; AVX1-64-NEXT:    movl %eax, %edx
-; AVX1-64-NEXT:    andl $1, %edx
-; AVX1-64-NEXT:    orq %rcx, %rdx
-; AVX1-64-NEXT:    testq %rax, %rax
-; AVX1-64-NEXT:    cmovnsq %rax, %rdx
-; AVX1-64-NEXT:    vcvtsi2ss %rdx, %xmm1, %xmm1
-; AVX1-64-NEXT:    jns .LBB3_2
-; AVX1-64-NEXT:  # %bb.1:
-; AVX1-64-NEXT:    vaddss %xmm1, %xmm1, %xmm1
-; AVX1-64-NEXT:  .LBB3_2:
-; AVX1-64-NEXT:    vmovq %xmm0, %rax
-; AVX1-64-NEXT:    movq %rax, %rcx
-; AVX1-64-NEXT:    shrq %rcx
-; AVX1-64-NEXT:    movl %eax, %edx
-; AVX1-64-NEXT:    andl $1, %edx
-; AVX1-64-NEXT:    orq %rcx, %rdx
-; AVX1-64-NEXT:    testq %rax, %rax
-; AVX1-64-NEXT:    cmovnsq %rax, %rdx
-; AVX1-64-NEXT:    vcvtsi2ss %rdx, %xmm2, %xmm0
-; AVX1-64-NEXT:    jns .LBB3_4
-; AVX1-64-NEXT:  # %bb.3:
-; AVX1-64-NEXT:    vaddss %xmm0, %xmm0, %xmm0
-; AVX1-64-NEXT:  .LBB3_4:
-; AVX1-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX1-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-64-NEXT:    vpsrlq $1, %xmm0, %xmm2
+; AVX1-64-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX1-64-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm1
+; AVX1-64-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX1-64-NEXT:    vmovq %xmm1, %rax
+; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
+; AVX1-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
+; AVX1-64-NEXT:    vaddps %xmm1, %xmm1, %xmm2
+; AVX1-64-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-64-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
+; AVX1-64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX1-64-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX1-64-NEXT:    retq
 ;
 ; AVX512F-64-LABEL: uitofp_v2i64_v2f32:
150 changes: 43 additions & 107 deletions llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
@@ -1058,123 +1058,59 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 {
 ;
 ; AVX1-64-LABEL: uitofp_v4i64_v4f32:
 ; AVX1-64:       # %bb.0:
-; AVX1-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-64-NEXT:    movq %rax, %rcx
-; AVX1-64-NEXT:    shrq %rcx
-; AVX1-64-NEXT:    movl %eax, %edx
-; AVX1-64-NEXT:    andl $1, %edx
-; AVX1-64-NEXT:    orq %rcx, %rdx
-; AVX1-64-NEXT:    testq %rax, %rax
-; AVX1-64-NEXT:    cmovnsq %rax, %rdx
-; AVX1-64-NEXT:    vcvtsi2ss %rdx, %xmm1, %xmm1
-; AVX1-64-NEXT:    jns .LBB19_2
-; AVX1-64-NEXT:  # %bb.1:
-; AVX1-64-NEXT:    vaddss %xmm1, %xmm1, %xmm1
-; AVX1-64-NEXT:  .LBB19_2:
-; AVX1-64-NEXT:    vmovq %xmm0, %rax
-; AVX1-64-NEXT:    movq %rax, %rcx
-; AVX1-64-NEXT:    shrq %rcx
-; AVX1-64-NEXT:    movl %eax, %edx
-; AVX1-64-NEXT:    andl $1, %edx
-; AVX1-64-NEXT:    orq %rcx, %rdx
-; AVX1-64-NEXT:    testq %rax, %rax
-; AVX1-64-NEXT:    cmovnsq %rax, %rdx
-; AVX1-64-NEXT:    vcvtsi2ss %rdx, %xmm2, %xmm2
-; AVX1-64-NEXT:    jns .LBB19_4
-; AVX1-64-NEXT:  # %bb.3:
-; AVX1-64-NEXT:    vaddss %xmm2, %xmm2, %xmm2
-; AVX1-64-NEXT:  .LBB19_4:
-; AVX1-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX1-64-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-64-NEXT:    vmovq %xmm0, %rax
-; AVX1-64-NEXT:    movq %rax, %rcx
-; AVX1-64-NEXT:    shrq %rcx
-; AVX1-64-NEXT:    movl %eax, %edx
-; AVX1-64-NEXT:    andl $1, %edx
-; AVX1-64-NEXT:    orq %rcx, %rdx
-; AVX1-64-NEXT:    testq %rax, %rax
-; AVX1-64-NEXT:    cmovnsq %rax, %rdx
-; AVX1-64-NEXT:    vcvtsi2ss %rdx, %xmm3, %xmm2
-; AVX1-64-NEXT:    jns .LBB19_6
-; AVX1-64-NEXT:  # %bb.5:
-; AVX1-64-NEXT:    vaddss %xmm2, %xmm2, %xmm2
-; AVX1-64-NEXT:  .LBB19_6:
-; AVX1-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX1-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-64-NEXT:    movq %rax, %rcx
-; AVX1-64-NEXT:    shrq %rcx
-; AVX1-64-NEXT:    movl %eax, %edx
-; AVX1-64-NEXT:    andl $1, %edx
-; AVX1-64-NEXT:    orq %rcx, %rdx
-; AVX1-64-NEXT:    testq %rax, %rax
-; AVX1-64-NEXT:    cmovnsq %rax, %rdx
-; AVX1-64-NEXT:    vcvtsi2ss %rdx, %xmm3, %xmm0
-; AVX1-64-NEXT:    jns .LBB19_8
-; AVX1-64-NEXT:  # %bb.7:
-; AVX1-64-NEXT:    vaddss %xmm0, %xmm0, %xmm0
-; AVX1-64-NEXT:  .LBB19_8:
-; AVX1-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-64-NEXT:    vpsrlq $1, %xmm0, %xmm1
+; AVX1-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-64-NEXT:    vpsrlq $1, %xmm2, %xmm3
+; AVX1-64-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-64-NEXT:    vandpd {{.*}}(%rip), %ymm0, %ymm3
+; AVX1-64-NEXT:    vorpd %ymm3, %ymm1, %ymm1
+; AVX1-64-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm3
+; AVX1-64-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm4
+; AVX1-64-NEXT:    vmovq %xmm3, %rax
+; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
+; AVX1-64-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
+; AVX1-64-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-64-NEXT:    vblendvpd %xmm2, %xmm1, %xmm2, %xmm1
+; AVX1-64-NEXT:    vmovq %xmm1, %rax
+; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
+; AVX1-64-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
+; AVX1-64-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm1
+; AVX1-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
+; AVX1-64-NEXT:    vaddps %xmm1, %xmm1, %xmm3
+; AVX1-64-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; AVX1-64-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
+; AVX1-64-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-64-NEXT:    vblendvps %xmm0, %xmm3, %xmm1, %xmm0
 ; AVX1-64-NEXT:    vzeroupper
 ; AVX1-64-NEXT:    retq
 ;
 ; AVX2-64-LABEL: uitofp_v4i64_v4f32:
 ; AVX2-64:       # %bb.0:
+; AVX2-64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-64-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm1
+; AVX2-64-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-64-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX2-64-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
+; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm2
+; AVX2-64-NEXT:    vpsrlq $1, %ymm0, %ymm3
+; AVX2-64-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX2-64-NEXT:    vblendvpd %ymm0, %ymm2, %ymm0, %ymm0
 ; AVX2-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-64-NEXT:    movq %rax, %rcx
-; AVX2-64-NEXT:    shrq %rcx
-; AVX2-64-NEXT:    movl %eax, %edx
-; AVX2-64-NEXT:    andl $1, %edx
-; AVX2-64-NEXT:    orq %rcx, %rdx
-; AVX2-64-NEXT:    testq %rax, %rax
-; AVX2-64-NEXT:    cmovnsq %rax, %rdx
-; AVX2-64-NEXT:    vcvtsi2ss %rdx, %xmm1, %xmm1
-; AVX2-64-NEXT:    jns .LBB19_2
-; AVX2-64-NEXT:  # %bb.1:
-; AVX2-64-NEXT:    vaddss %xmm1, %xmm1, %xmm1
-; AVX2-64-NEXT:  .LBB19_2:
+; AVX2-64-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm2
 ; AVX2-64-NEXT:    vmovq %xmm0, %rax
-; AVX2-64-NEXT:    movq %rax, %rcx
-; AVX2-64-NEXT:    shrq %rcx
-; AVX2-64-NEXT:    movl %eax, %edx
-; AVX2-64-NEXT:    andl $1, %edx
-; AVX2-64-NEXT:    orq %rcx, %rdx
-; AVX2-64-NEXT:    testq %rax, %rax
-; AVX2-64-NEXT:    cmovnsq %rax, %rdx
-; AVX2-64-NEXT:    vcvtsi2ss %rdx, %xmm2, %xmm2
-; AVX2-64-NEXT:    jns .LBB19_4
-; AVX2-64-NEXT:  # %bb.3:
-; AVX2-64-NEXT:    vaddss %xmm2, %xmm2, %xmm2
-; AVX2-64-NEXT:  .LBB19_4:
-; AVX2-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX2-64-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm3
+; AVX2-64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
 ; AVX2-64-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX2-64-NEXT:    vmovq %xmm0, %rax
-; AVX2-64-NEXT:    movq %rax, %rcx
-; AVX2-64-NEXT:    shrq %rcx
-; AVX2-64-NEXT:    movl %eax, %edx
-; AVX2-64-NEXT:    andl $1, %edx
-; AVX2-64-NEXT:    orq %rcx, %rdx
-; AVX2-64-NEXT:    testq %rax, %rax
-; AVX2-64-NEXT:    cmovnsq %rax, %rdx
-; AVX2-64-NEXT:    vcvtsi2ss %rdx, %xmm3, %xmm2
-; AVX2-64-NEXT:    jns .LBB19_6
-; AVX2-64-NEXT:  # %bb.5:
-; AVX2-64-NEXT:    vaddss %xmm2, %xmm2, %xmm2
-; AVX2-64-NEXT:  .LBB19_6:
-; AVX2-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX2-64-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm3
+; AVX2-64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
 ; AVX2-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-64-NEXT:    movq %rax, %rcx
-; AVX2-64-NEXT:    shrq %rcx
-; AVX2-64-NEXT:    movl %eax, %edx
-; AVX2-64-NEXT:    andl $1, %edx
-; AVX2-64-NEXT:    orq %rcx, %rdx
-; AVX2-64-NEXT:    testq %rax, %rax
-; AVX2-64-NEXT:    cmovnsq %rax, %rdx
-; AVX2-64-NEXT:    vcvtsi2ss %rdx, %xmm3, %xmm0
-; AVX2-64-NEXT:    jns .LBB19_8
-; AVX2-64-NEXT:  # %bb.7:
-; AVX2-64-NEXT:    vaddss %xmm0, %xmm0, %xmm0
-; AVX2-64-NEXT:  .LBB19_8:
-; AVX2-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX2-64-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm0
+; AVX2-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; AVX2-64-NEXT:    vaddps %xmm0, %xmm0, %xmm2
+; AVX2-64-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
 ; AVX2-64-NEXT:    vzeroupper
 ; AVX2-64-NEXT:    retq
 ;
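
The v4i64 lowerings above narrow their compare mask with vpackssdw rather than a shuffle, as the commit message anticipated. A small C++ model of why that is safe (illustrative only; the function names are made up):

#include <cstdint>

// Signed-saturating pack of two v4i32 mask registers into v8i16, as
// vpackssdw does. Every input dword of a compare mask is 0 or -1, and both
// survive saturation unchanged, so each 32-bit pair of the result is again
// all-zeros or all-ones and can drive vblendvps directly.
static int16_t packDword(int32_t D) {
  if (D > INT16_MAX) return INT16_MAX;
  if (D < INT16_MIN) return INT16_MIN;
  return (int16_t)D; // mask inputs: 0 -> 0, -1 -> -1
}

static void packssdw(const int32_t Lo[4], const int32_t Hi[4],
                     int16_t Out[8]) {
  for (int i = 0; i != 4; ++i) {
    Out[i] = packDword(Lo[i]);     // low 4 words from the first operand
    Out[i + 4] = packDword(Hi[i]); // high 4 words from the second operand
  }
}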