Skip to content
75 changes: 73 additions & 2 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2098,9 +2098,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// pre-AVX512 equivalents. Without VLX we use 512-bit operations for
// narrower widths.
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
for (MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v8f16, MVT::v4f32,
MVT::v2f64, MVT::v16f16, MVT::v8f32, MVT::v4f64, MVT::v32f16,
MVT::v16f32, MVT::v8f64})
setOperationAction(ISD::FLDEXP, VT, Custom);

// These operations are handled on non-VLX by artificially widening in
// isel patterns.

setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
Expand Down Expand Up @@ -19220,6 +19224,72 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
return SDValue();
}

/// Custom-lower ISD::FLDEXP onto the X86ISD::SCALEF / X86ISD::SCALEFS nodes.
///
/// Operand 0 of \p Op is the floating-point value, operand 1 is the integer
/// exponent. The scale nodes take both operands in the floating-point type,
/// so on every path the exponent must be converted with ISD::SINT_TO_FP
/// before it is handed to the scale node.
///
/// \param Op        The FLDEXP node to lower.
/// \param Subtarget Queried for FP16 and VLX availability.
/// \param DAG       DAG in which the replacement nodes are created.
/// \returns The lowered value, or an empty SDValue for value types this
///          routine does not handle.
static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue X = Op.getOperand(0);
  MVT XTy = X.getSimpleValueType();
  SDValue Exp = Op.getOperand(1);

  switch (XTy.SimpleTy) {
  default:
    return SDValue();
  case MVT::f16:
    // Without FP16 support, do the scalar operation in f32 and round the
    // result back to f16 at the end of the scalar path.
    if (!Subtarget.hasFP16())
      X = DAG.getFPExtendOrRound(X, DL, MVT::f32);
    [[fallthrough]];
  case MVT::f32:
  case MVT::f64: {
    // Scalars are placed in element 0 of a 128-bit vector, scaled with
    // SCALEFS, and the result is extracted from element 0 again.
    MVT VT = MVT::getVectorVT(X.getSimpleValueType(),
                              128 / X.getSimpleValueType().getSizeInBits());
    Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp);
    SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, X);
    SDValue VExp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Exp);
    SDValue Scalefs = DAG.getNode(X86ISD::SCALEFS, DL, VT, VX, VExp);
    SDValue Final = DAG.getExtractVectorElt(DL, X.getValueType(), Scalefs, 0);
    // No-op unless X was extended from f16 above.
    return DAG.getFPExtendOrRound(Final, DL, XTy);
  }
  case MVT::v4f32:
  case MVT::v2f64:
  case MVT::v8f32:
  case MVT::v4f64:
  case MVT::v16f32:
  case MVT::v8f64:
    // 512-bit vectors, or any width with VLX, are scaled at native width.
    if (XTy.getSizeInBits() == 512 || Subtarget.hasVLX()) {
      Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
      return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
    }
    // Otherwise fall out of the switch to the 512-bit widening path.
    break;
  case MVT::v8f16:
  case MVT::v16f16:
    if (Subtarget.hasFP16()) {
      if (Subtarget.hasVLX()) {
        Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
        return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
      }
      // FP16 without VLX: widen the f16 vector itself to 512 bits below.
      break;
    }
    // No FP16: promote the value to an f32 vector and resize the exponent to
    // the matching integer vector type, then use the widening path below.
    X = DAG.getFPExtendOrRound(X, DL, XTy.changeVectorElementType(MVT::f32));
    Exp = DAG.getSExtOrTrunc(Exp, DL,
                             X.getSimpleValueType().changeTypeToInteger());
    break;
  case MVT::v32f16:
    if (Subtarget.hasFP16()) {
      Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
      return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
    }
    return splitVectorOp(Op, DAG, DL);
  }

  // Non-VLX path: perform the operation on 512-bit vectors and extract the
  // low subvector afterwards.
  SDValue WideX = widenSubVector(X, true, Subtarget, DAG, DL, 512);
  SDValue WideExp = widenSubVector(Exp, true, Subtarget, DAG, DL, 512);
  // The widened exponent is still an integer vector; convert it to the
  // floating-point type of WideX so SCALEF does not reinterpret raw integer
  // bits as floating-point values.
  WideExp =
      DAG.getNode(ISD::SINT_TO_FP, DL, WideX.getSimpleValueType(), WideExp);
  SDValue Scalef =
      DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(), WideX, WideExp);
  SDValue Final =
      DAG.getExtractSubvector(DL, X.getSimpleValueType(), Scalef, 0);
  // Rounds back to the original f16 vector type when X was promoted to f32
  // above; otherwise the types already match.
  return DAG.getFPExtendOrRound(Final, DL, XTy);
}

static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
Expand Down Expand Up @@ -33734,7 +33804,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
// clang-format on
case ISD::FLDEXP: return LowerFLDEXP(Op, Subtarget, DAG);
// clang-format on
}
}

Expand Down
158 changes: 53 additions & 105 deletions llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
Original file line number Diff line number Diff line change
Expand Up @@ -79,38 +79,54 @@ define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) {
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fmul_pow2_ldexp_4xfloat:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: subq $40, %rsp
; CHECK-AVX-NEXT: .cfi_def_cfa_offset 48
; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX-NEXT: vextractps $1, %xmm0, %edi
; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX-NEXT: callq ldexpf@PLT
; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-AVX-NEXT: vmovd %xmm0, %edi
; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX-NEXT: callq ldexpf@PLT
; CHECK-AVX-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-AVX-NEXT: vextractps $2, %xmm0, %edi
; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX-NEXT: callq ldexpf@PLT
; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-AVX-NEXT: vextractps $3, %xmm0, %edi
; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX-NEXT: callq ldexpf@PLT
; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; CHECK-AVX-NEXT: addq $40, %rsp
; CHECK-AVX-NEXT: .cfi_def_cfa_offset 8
; CHECK-AVX-NEXT: retq
; CHECK-AVX2-LABEL: fmul_pow2_ldexp_4xfloat:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: subq $40, %rsp
; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 48
; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX2-NEXT: vextractps $1, %xmm0, %edi
; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: callq ldexpf@PLT
; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-AVX2-NEXT: vmovd %xmm0, %edi
; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: callq ldexpf@PLT
; CHECK-AVX2-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-AVX2-NEXT: vextractps $2, %xmm0, %edi
; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: callq ldexpf@PLT
; CHECK-AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-AVX2-NEXT: vextractps $3, %xmm0, %edi
; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: callq ldexpf@PLT
; CHECK-AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; CHECK-AVX2-NEXT: addq $40, %rsp
; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
; CHECK-AVX2-NEXT: retq
;
; CHECK-ONLY-AVX512F-LABEL: fmul_pow2_ldexp_4xfloat:
; CHECK-ONLY-AVX512F: # %bb.0:
; CHECK-ONLY-AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0]
; CHECK-ONLY-AVX512F-NEXT: vmovaps %xmm0, %xmm0
; CHECK-ONLY-AVX512F-NEXT: vscalefps %zmm0, %zmm1, %zmm0
; CHECK-ONLY-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-ONLY-AVX512F-NEXT: vzeroupper
; CHECK-ONLY-AVX512F-NEXT: retq
;
; CHECK-SKX-LABEL: fmul_pow2_ldexp_4xfloat:
; CHECK-SKX: # %bb.0:
; CHECK-SKX-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-SKX-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0]
; CHECK-SKX-NEXT: vscalefps %xmm0, %xmm1, %xmm0
; CHECK-SKX-NEXT: retq
%r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, <4 x i32> %i)
ret <4 x float> %r
}
Expand Down Expand Up @@ -562,79 +578,11 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
;
; CHECK-AVX512F-LABEL: fmul_pow2_ldexp_8xhalf:
; CHECK-AVX512F: # %bb.0:
; CHECK-AVX512F-NEXT: subq $72, %rsp
; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 80
; CHECK-AVX512F-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; CHECK-AVX512F-NEXT: vpextrw $7, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vpextrw $6, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vpextrw $5, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vpextrw $4, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vpextrw $3, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vpextrw $2, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vpextrw $1, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-AVX512F-NEXT: addq $72, %rsp
; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 8
; CHECK-AVX512F-NEXT: vbroadcastss {{.*#+}} ymm1 = [8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3]
; CHECK-AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
; CHECK-AVX512F-NEXT: vscalefps %zmm0, %zmm1, %zmm0
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; CHECK-AVX512F-NEXT: vzeroupper
; CHECK-AVX512F-NEXT: retq
%r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, <8 x i16> %i)
ret <8 x half> %r
Expand Down
Loading
Loading