From 9f7bddbc1fa02a841c2083aadc9cac5a569a388e Mon Sep 17 00:00:00 2001 From: Kavin Gnanapandithan Date: Thu, 6 Nov 2025 14:44:06 -0500 Subject: [PATCH 1/8] [X86] Lower mathlib call ldexp into scalef when avx512 is enabled #165694 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 73 +++ .../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 271 ++++++---- llvm/test/CodeGen/X86/ldexp-avx512.ll | 510 +++++------------- 3 files changed, 365 insertions(+), 489 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index fa3dce256046f..ac885964f9a28 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2591,6 +2591,26 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom); } + if (Subtarget.hasAVX512()) { + for (MVT VT : { MVT::f32, MVT::f64, MVT::v16f32, MVT::v8f64}) + setOperationAction(ISD::FLDEXP, VT, Custom); + + if (Subtarget.hasVLX()) { + for (MVT VT : { MVT::v4f32, MVT::v2f64, MVT::v8f32, MVT::v4f64 }) + setOperationAction(ISD::FLDEXP, VT, Custom); + + if (Subtarget.hasFP16()) { + for (MVT VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16 }) + setOperationAction(ISD::FLDEXP, VT, Custom); + } + } + + if (Subtarget.hasFP16()) { + for (MVT VT : { MVT::f16, MVT::v32f16 }) + setOperationAction(ISD::FLDEXP, VT, Custom); + } + } + // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` // is. We should promote the value to 64-bits to solve this. 
// This is what the CRT headers do - `fmodf` is an inline header
@@ -19149,6 +19169,72 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
   return SDValue();
 }
 
+/// Custom-lower ISD::FLDEXP, i.e. ldexp(X, Exp) = X * 2^Exp, to the AVX-512
+/// scale nodes.
+///
+/// The integer exponent is converted to floating point with SINT_TO_FP and
+/// passed to X86ISD::SCALEF for the packed types or X86ISD::SCALEFS for the
+/// scalar types. A scalar is inserted into element 0 of a 128-bit vector,
+/// scaled there and extracted again. An f16 input on a subtarget without
+/// FP16 support is extended to f32 first and the result rounded back to f16.
+///
+/// \returns the lowered value, or an empty SDValue for any other value type.
+static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget,
+                           SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  SDValue X = Op.getOperand(0);
+  SDValue Exp = Op.getOperand(1);
+  MVT XTy = X.getSimpleValueType();
+  MVT VecVT;
+
+  switch (XTy.SimpleTy) {
+  default:
+    return SDValue();
+  case MVT::f16:
+    if (Subtarget.hasFP16()) {
+      // Only element 0 is used, so a 128-bit vector is enough, as for f32/f64.
+      VecVT = MVT::v8f16;
+      break;
+    }
+    X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
+    [[fallthrough]];
+  case MVT::f32:
+    VecVT = MVT::v4f32;
+    break;
+  case MVT::f64:
+    VecVT = MVT::v2f64;
+    break;
+  case MVT::v4f32:
+  case MVT::v2f64:
+  case MVT::v8f32:
+  case MVT::v4f64:
+  case MVT::v16f32:
+  case MVT::v8f64:
+    // Packed types map directly onto the vector node.
+    Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp);
+    return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
+  }
+
+  // X may have been extended to f32 above, so use its current type from here.
+  EVT EltVT = X.getValueType();
+  // Index with the target's vector-index type rather than a fixed i64.
+  SDValue Zero = DAG.getVectorIdxConstant(0, DL);
+  Exp = DAG.getNode(ISD::SINT_TO_FP, DL, EltVT, Exp);
+  SDValue VX = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
+                           DAG.getUNDEF(VecVT), X, Zero);
+  SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
+                             DAG.getUNDEF(VecVT), Exp, Zero);
+  SDValue Scalefs = DAG.getNode(X86ISD::SCALEFS, DL, VecVT, VX, VExp);
+  SDValue Final =
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Scalefs, Zero);
+  // The f32 result may overflow or need rounding in f16, so the truncation
+  // must not be flagged as value-preserving (flag operand 0).
+  if (EltVT != XTy)
+    Final = DAG.getNode(ISD::FP_ROUND, DL, XTy, Final,
+                        DAG.getIntPtrConstant(0, DL));
+  return Final;
+}
+
 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
   SDLoc dl(Op);
@@ -33681,6 +33767,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
   case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
   case ISD::PREFETCH:
return LowerPREFETCH(Op, Subtarget, DAG); + case ISD::FLDEXP: return LowerFLDEXP(Op, Subtarget, DAG); // clang-format on } } diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 81529aff39ff1..499695f408396 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -79,38 +79,64 @@ define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) { ; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fmul_pow2_ldexp_4xfloat: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: subq $40, %rsp -; CHECK-AVX-NEXT: .cfi_def_cfa_offset 48 -; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX-NEXT: vextractps $1, %xmm0, %edi -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: callq ldexpf@PLT -; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-AVX-NEXT: vmovd %xmm0, %edi -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: callq ldexpf@PLT -; CHECK-AVX-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] -; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-AVX-NEXT: vextractps $2, %xmm0, %edi -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: callq ldexpf@PLT -; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-AVX-NEXT: vextractps $3, %xmm0, %edi -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = 
[9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: callq ldexpf@PLT -; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; CHECK-AVX-NEXT: addq $40, %rsp -; CHECK-AVX-NEXT: .cfi_def_cfa_offset 8 -; CHECK-AVX-NEXT: retq +; CHECK-AVX2-LABEL: fmul_pow2_ldexp_4xfloat: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: subq $40, %rsp +; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 48 +; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX2-NEXT: vextractps $1, %xmm0, %edi +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: callq ldexpf@PLT +; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-AVX2-NEXT: vmovd %xmm0, %edi +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: callq ldexpf@PLT +; CHECK-AVX2-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] +; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-AVX2-NEXT: vextractps $2, %xmm0, %edi +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: callq ldexpf@PLT +; CHECK-AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-AVX2-NEXT: vextractps $3, %xmm0, %edi +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: callq ldexpf@PLT +; CHECK-AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-AVX2-NEXT: addq $40, %rsp +; CHECK-AVX2-NEXT: 
.cfi_def_cfa_offset 8 +; CHECK-AVX2-NEXT: retq +; +; CHECK-ONLY-AVX512F-LABEL: fmul_pow2_ldexp_4xfloat: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %xmm0, %xmm1 +; CHECK-ONLY-AVX512F-NEXT: vmovss {{.*#+}} xmm2 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm1, %xmm2, %xmm1 +; CHECK-ONLY-AVX512F-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1,1,1] +; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %xmm3, %xmm3 +; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm3, %xmm2, %xmm3 +; CHECK-ONLY-AVX512F-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-ONLY-AVX512F-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %xmm3, %xmm3 +; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm3, %xmm2, %xmm3 +; CHECK-ONLY-AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; CHECK-ONLY-AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm0, %xmm2, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-ONLY-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow2_ldexp_4xfloat: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-SKX-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0] +; CHECK-SKX-NEXT: vscalefps %xmm0, %xmm1, %xmm0 +; CHECK-SKX-NEXT: retq %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> , <4 x i32> %i) ret <4 x float> %r } @@ -560,82 +586,109 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8 ; CHECK-AVX2-NEXT: retq ; -; CHECK-AVX512F-LABEL: fmul_pow2_ldexp_8xhalf: -; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: subq $72, %rsp -; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 80 -; CHECK-AVX512F-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-AVX512F-NEXT: vpextrw $7, %xmm0, %eax -; CHECK-AVX512F-NEXT: movswl %ax, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = 
[8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vpextrw $6, %xmm0, %eax -; CHECK-AVX512F-NEXT: movswl %ax, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vpextrw $5, %xmm0, %eax -; CHECK-AVX512F-NEXT: movswl %ax, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vpextrw $4, %xmm0, %eax -; CHECK-AVX512F-NEXT: movswl %ax, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; 
CHECK-AVX512F-NEXT: vpextrw $3, %xmm0, %eax -; CHECK-AVX512F-NEXT: movswl %ax, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vpextrw $2, %xmm0, %eax -; CHECK-AVX512F-NEXT: movswl %ax, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vpextrw $1, %xmm0, %eax -; CHECK-AVX512F-NEXT: movswl %ax, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax -; CHECK-AVX512F-NEXT: movswl %ax, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX512F-NEXT: 
vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-AVX512F-NEXT: addq $72, %rsp -; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 8 -; CHECK-AVX512F-NEXT: retq +; CHECK-ONLY-AVX512F-LABEL: fmul_pow2_ldexp_8xhalf: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vpextrw $7, %xmm0, %eax +; CHECK-ONLY-AVX512F-NEXT: cwtl +; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm2 +; CHECK-ONLY-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm2, %xmm1, %xmm2 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; CHECK-ONLY-AVX512F-NEXT: vpextrw $6, %xmm0, %eax +; CHECK-ONLY-AVX512F-NEXT: cwtl +; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3 +; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm3, %xmm1, %xmm3 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; CHECK-ONLY-AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; CHECK-ONLY-AVX512F-NEXT: vpextrw $5, %xmm0, %eax +; CHECK-ONLY-AVX512F-NEXT: cwtl +; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3 +; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm3, %xmm1, %xmm3 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; CHECK-ONLY-AVX512F-NEXT: vpextrw $4, %xmm0, %eax +; CHECK-ONLY-AVX512F-NEXT: cwtl +; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4 +; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm4, %xmm1, %xmm4 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; CHECK-ONLY-AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; CHECK-ONLY-AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-ONLY-AVX512F-NEXT: vpextrw $3, %xmm0, %eax +; CHECK-ONLY-AVX512F-NEXT: cwtl +; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3 +; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm3, %xmm1, %xmm3 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm3, 
%xmm3 +; CHECK-ONLY-AVX512F-NEXT: vpextrw $2, %xmm0, %eax +; CHECK-ONLY-AVX512F-NEXT: cwtl +; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4 +; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm4, %xmm1, %xmm4 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; CHECK-ONLY-AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; CHECK-ONLY-AVX512F-NEXT: vpextrw $1, %xmm0, %eax +; CHECK-ONLY-AVX512F-NEXT: cwtl +; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4 +; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm4, %xmm1, %xmm4 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; CHECK-ONLY-AVX512F-NEXT: vmovd %xmm0, %eax +; CHECK-ONLY-AVX512F-NEXT: cwtl +; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm0, %xmm1, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; CHECK-ONLY-AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; CHECK-ONLY-AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-ONLY-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow2_ldexp_8xhalf: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpextrw $7, %xmm0, %eax +; CHECK-SKX-NEXT: cwtl +; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm1 +; CHECK-SKX-NEXT: vmovss {{.*#+}} xmm2 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] +; CHECK-SKX-NEXT: vscalefss %xmm1, %xmm2, %xmm1 +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; CHECK-SKX-NEXT: vpextrw $6, %xmm0, %eax +; CHECK-SKX-NEXT: cwtl +; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3 +; CHECK-SKX-NEXT: vscalefss %xmm3, %xmm2, %xmm3 +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; CHECK-SKX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; CHECK-SKX-NEXT: vpextrw $5, %xmm0, %eax +; CHECK-SKX-NEXT: cwtl +; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3 +; 
CHECK-SKX-NEXT: vscalefss %xmm3, %xmm2, %xmm3 +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; CHECK-SKX-NEXT: vpextrw $4, %xmm0, %eax +; CHECK-SKX-NEXT: cwtl +; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4 +; CHECK-SKX-NEXT: vscalefss %xmm4, %xmm2, %xmm4 +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; CHECK-SKX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; CHECK-SKX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; CHECK-SKX-NEXT: vpextrw $3, %xmm0, %eax +; CHECK-SKX-NEXT: cwtl +; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3 +; CHECK-SKX-NEXT: vscalefss %xmm3, %xmm2, %xmm3 +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; CHECK-SKX-NEXT: vpextrw $2, %xmm0, %eax +; CHECK-SKX-NEXT: cwtl +; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4 +; CHECK-SKX-NEXT: vscalefss %xmm4, %xmm2, %xmm4 +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; CHECK-SKX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; CHECK-SKX-NEXT: vpextrw $1, %xmm0, %eax +; CHECK-SKX-NEXT: cwtl +; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4 +; CHECK-SKX-NEXT: vscalefss %xmm4, %xmm2, %xmm4 +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; CHECK-SKX-NEXT: vmovd %xmm0, %eax +; CHECK-SKX-NEXT: cwtl +; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 +; CHECK-SKX-NEXT: vscalefss %xmm0, %xmm2, %xmm0 +; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; CHECK-SKX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; CHECK-SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-SKX-NEXT: retq %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> , <8 x i16> %i) ret <8 x half> %r } @@ -1769,3 +1822,5 @@ define x86_fp80 @pr128528(i1 %cond) { %mul = fmul x86_fp80 %conv, 0xK4007D055555555555800 ret x86_fp80 %mul } +;; NOTE: These prefixes are unused and the list is 
autogenerated. Do not add tests below this line: +; CHECK-AVX512F: {{.*}} diff --git a/llvm/test/CodeGen/X86/ldexp-avx512.ll b/llvm/test/CodeGen/X86/ldexp-avx512.ll index 21491bc2cc8f5..75c829a6e97ce 100644 --- a/llvm/test/CodeGen/X86/ldexp-avx512.ll +++ b/llvm/test/CodeGen/X86/ldexp-avx512.ll @@ -1,16 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512fp16 | FileCheck %s --check-prefixes=CHECK,AVX512VL define half @test_half(half %x, i32 %exp) nounwind { -; CHECK-LABEL: test_half: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NEXT: popq %rax -; CHECK-NEXT: retq +; AVX512-LABEL: test_half: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtsi2ss %edi, %xmm15, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vscalefss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_half: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vcvtsi2sh %edi, %xmm31, %xmm1 +; AVX512VL-NEXT: vscalefsh %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq entry: %r = tail call fast half @llvm.ldexp.f16.i32(half %x, i32 %exp) ret half %r @@ -20,7 +25,9 @@ declare half @llvm.ldexp.f16.i32(half, i32) memory(none) define float @test_float(float %x, i32 %exp) nounwind { ; CHECK-LABEL: test_float: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: jmp ldexpf@PLT # TAILCALL +; CHECK-NEXT: vcvtsi2ss %edi, %xmm15, %xmm1 +; CHECK-NEXT: vscalefss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq entry: %r = tail call fast float @ldexpf(float %x, i32 %exp) ret float %r @@ -30,7 +37,9 @@ declare float @ldexpf(float, i32) 
memory(none) define double @test_double(double %x, i32 %exp) nounwind { ; CHECK-LABEL: test_double: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: jmp ldexp@PLT # TAILCALL +; CHECK-NEXT: vcvtsi2sd %edi, %xmm15, %xmm1 +; CHECK-NEXT: vscalefsd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq entry: %r = tail call fast double @ldexp(double %x, i32 %exp) ret double %r @@ -229,39 +238,32 @@ define <8 x half> @test_ldexp_8xhalf(<8 x half> %x, <8 x i16> %exp) nounwind { declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>) define <4 x float> @test_ldexp_4xfloat(<4 x float> %x, <4 x i32> %exp) nounwind { -; CHECK-LABEL: test_ldexp_4xfloat: -; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovd %xmm1, %edi -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vextractps $1, %xmm0, %edi -; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vextractps $2, %xmm0, %edi -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vextractps $3, %xmm0, %edi -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 
# 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; CHECK-NEXT: addq $56, %rsp -; CHECK-NEXT: retq +; AVX512-LABEL: test_ldexp_4xfloat: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtdq2ps %xmm1, %xmm2 +; AVX512-NEXT: vscalefss %xmm2, %xmm0, %xmm2 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1,1,1] +; AVX512-NEXT: vcvtdq2ps %xmm4, %xmm4 +; AVX512-NEXT: vscalefss %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX512-NEXT: vcvtdq2ps %xmm4, %xmm4 +; AVX512-NEXT: vscalefss %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vcvtdq2ps %xmm1, %xmm1 +; AVX512-NEXT: vscalefss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_4xfloat: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtdq2ps %xmm1, %xmm1 +; AVX512VL-NEXT: vscalefps %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> %x, <4 x i32> %exp) ret <4 x float> %r } @@ -270,20 +272,13 @@ declare <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>) define <2 x double> @test_ldexp_2xdouble(<2 x double> %x, <2 x i32> %exp) nounwind { ; CHECK-LABEL: test_ldexp_2xdouble: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovd %xmm1, %edi -; CHECK-NEXT: callq ldexp@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vextractps $1, %xmm0, %edi -; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq ldexp@PLT -; CHECK-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: vcvtdq2pd %xmm1, %xmm2 +; CHECK-NEXT: vscalefsd %xmm2, %xmm0, %xmm2 +; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; CHECK-NEXT: vcvtdq2pd %xmm1, %xmm1 +; CHECK-NEXT: vscalefsd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] ; CHECK-NEXT: retq %r = call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> %x, <2 x i32> %exp) ret <2 x double> %r @@ -666,121 +661,84 @@ define <16 x half> @test_ldexp_16xhalf(<16 x half> %x, <16 x i16> %exp) nounwind declare <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half>, <16 x i16>) define <8 x float> @test_ldexp_8xfloat(<8 x float> %x, <8 x i32> %exp) nounwind { -; CHECK-LABEL: test_ldexp_8xfloat: -; CHECK: # %bb.0: -; CHECK-NEXT: subq $120, %rsp -; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 -; CHECK-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovd %xmm1, %edi -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: vextractps $1, %xmm0, %edi -; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: vextractps $2, %xmm0, %edi -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: vextractps $3, %xmm0, %edi -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vmovd %xmm0, %edi -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractps $1, %xmm0, %edi -; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractps $2, %xmm0, %edi -; 
CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractps $3, %xmm0, %edi -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; CHECK-NEXT: addq $120, %rsp -; CHECK-NEXT: retq +; AVX512-LABEL: test_ldexp_8xfloat: +; AVX512: # %bb.0: +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX512-NEXT: vcvtdq2ps %xmm3, %xmm4 +; AVX512-NEXT: vscalefss %xmm4, %xmm2, %xmm4 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm6 = xmm3[1,1,1,1] +; AVX512-NEXT: vcvtdq2ps %xmm6, %xmm6 +; AVX512-NEXT: vscalefss %xmm6, %xmm5, %xmm5 +; AVX512-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX512-NEXT: vshufpd {{.*#+}} xmm5 = xmm2[1,0] +; AVX512-NEXT: vshufps {{.*#+}} xmm6 = xmm3[2,3,2,3] +; AVX512-NEXT: vcvtdq2ps %xmm6, %xmm6 +; AVX512-NEXT: vscalefss %xmm6, %xmm5, %xmm5 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX512-NEXT: vcvtdq2ps %xmm3, %xmm3 +; AVX512-NEXT: vscalefss %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[0] +; AVX512-NEXT: vcvtdq2ps %xmm1, %xmm3 +; AVX512-NEXT: vscalefss %xmm3, %xmm0, %xmm3 +; 
AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm5 = xmm1[1,1,1,1] +; AVX512-NEXT: vcvtdq2ps %xmm5, %xmm5 +; AVX512-NEXT: vscalefss %xmm5, %xmm4, %xmm4 +; AVX512-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX512-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX512-NEXT: vshufps {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX512-NEXT: vcvtdq2ps %xmm5, %xmm5 +; AVX512-NEXT: vscalefss %xmm5, %xmm4, %xmm4 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vcvtdq2ps %xmm1, %xmm1 +; AVX512-NEXT: vscalefss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] +; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_8xfloat: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX512VL-NEXT: vscalefps %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %r = call <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> %x, <8 x i32> %exp) ret <8 x float> %r } declare <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float>, <8 x i32>) define <4 x double> @test_ldexp_4xdouble(<4 x double> %x, <4 x i32> %exp) nounwind { -; CHECK-LABEL: test_ldexp_4xdouble: -; CHECK: # %bb.0: -; CHECK-NEXT: subq $88, %rsp -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vextractps $2, %xmm1, %edi -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq ldexp@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vextractps $3, %xmm0, %edi -; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq 
ldexp@PLT -; CHECK-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; CHECK-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovd %xmm0, %edi -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq ldexp@PLT -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vextractps $1, %xmm0, %edi -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq ldexp@PLT -; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; CHECK-NEXT: addq $88, %rsp -; CHECK-NEXT: retq +; AVX512-LABEL: test_ldexp_4xdouble: +; AVX512: # %bb.0: +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX512-NEXT: vcvtdq2pd %xmm3, %xmm3 +; AVX512-NEXT: vscalefsd %xmm3, %xmm2, %xmm3 +; AVX512-NEXT: vcvtdq2pd %xmm1, %xmm4 +; AVX512-NEXT: vscalefsd %xmm4, %xmm0, %xmm4 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX512-NEXT: vshufps {{.*#+}} xmm4 = xmm1[3,3,3,3] +; AVX512-NEXT: vcvtdq2pd %xmm4, %xmm4 +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512-NEXT: vscalefsd %xmm4, %xmm2, %xmm2 +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512-NEXT: vcvtdq2pd %xmm1, %xmm1 +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vscalefsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX512-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX512-NEXT: retq +; +; 
AVX512VL-LABEL: test_ldexp_4xdouble: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtdq2pd %xmm1, %ymm1 +; AVX512VL-NEXT: vscalefpd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %r = call <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> %x, <4 x i32> %exp) ret <4 x double> %r } @@ -1518,148 +1476,8 @@ declare <32 x half> @llvm.ldexp.v32f16.v32i16(<32 x half>, <32 x i16>) define <16 x float> @test_ldexp_16xfloat(<16 x float> %x, <16 x i32> %exp) nounwind { ; CHECK-LABEL: test_ldexp_16xfloat: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $216, %rsp -; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vextracti32x4 $3, %zmm1, %xmm1 -; CHECK-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovd %xmm1, %edi -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: vextractps $1, %xmm0, %edi -; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: vextractps $2, %xmm0, %edi -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps 
(%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: vextractps $3, %xmm0, %edi -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1 -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovd %xmm0, %edi -; CHECK-NEXT: vmovaps %xmm1, %xmm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: vextractps $1, %xmm0, %edi -; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: vextractps $2, %xmm0, %edi -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: vextractps $3, %xmm0, 
%edi -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovd %xmm0, %edi -; CHECK-NEXT: vmovaps %xmm1, %xmm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: vextractps $1, %xmm0, %edi -; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: vextractps $2, %xmm0, %edi -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: 
vextractps $3, %xmm0, %edi -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; CHECK-NEXT: vmovd %xmm0, %edi -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; CHECK-NEXT: vextractps $1, %xmm0, %edi -; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; CHECK-NEXT: vextractps $2, %xmm0, %edi -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; CHECK-NEXT: vextractps $3, %xmm0, %edi -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq ldexpf@PLT -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 
16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; CHECK-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; CHECK-NEXT: addq $216, %rsp +; CHECK-NEXT: vcvtdq2ps %zmm1, %zmm1 +; CHECK-NEXT: vscalefps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %r = call <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float> %x, <16 x i32> %exp) ret <16 x float> %r @@ -1669,78 +1487,8 @@ declare <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float>, <16 x i32>) define <8 x double> @test_ldexp_8xdouble(<8 x double> %x, <8 x i32> %exp) nounwind { ; CHECK-LABEL: test_ldexp_8xdouble: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $184, %rsp -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1 -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vextractps $2, %xmm1, %edi -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq ldexp@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vextractps $3, %xmm0, %edi -; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq ldexp@PLT -; CHECK-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; CHECK-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1 -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 
-; CHECK-NEXT: vmovd %xmm0, %edi -; CHECK-NEXT: vmovaps %xmm1, %xmm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq ldexp@PLT -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vextractps $1, %xmm0, %edi -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq ldexp@PLT -; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; CHECK-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractps $2, %xmm0, %edi -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq ldexp@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractps $3, %xmm0, %edi -; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq ldexp@PLT -; CHECK-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; CHECK-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vmovd %xmm0, %edi -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq ldexp@PLT -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload -; CHECK-NEXT: vextractps $1, %xmm0, %edi -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq ldexp@PLT -; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; CHECK-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; CHECK-NEXT: addq $184, %rsp +; CHECK-NEXT: vcvtdq2pd %ymm1, %zmm1 +; CHECK-NEXT: vscalefpd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %r = call <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double> %x, <8 x i32> %exp) ret <8 x double> %r From 484cea4f06e5d19a70bef870ee0439ada0aeb7df Mon Sep 17 00:00:00 2001 From: Kavin Gnanapandithan Date: Fri, 7 Nov 2025 21:50:07 -0500 Subject: [PATCH 2/8] Widen 128/256 bit vector types when AVX512VL is not available. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 123 ++++++++------- .../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 18 +-- llvm/test/CodeGen/X86/ldexp-avx512.ll | 140 ++++++++---------- 3 files changed, 129 insertions(+), 152 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index ac885964f9a28..508ec24846511 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1829,6 +1829,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FCOPYSIGN, VT, Custom); setOperationAction(ISD::FCANONICALIZE, VT, Custom); } + + for (MVT VT : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64, MVT::v8f32, + MVT::v4f64, MVT::v16f32, MVT::v8f64}) + setOperationAction(ISD::FLDEXP, VT, Custom); + + if (Subtarget.hasFP16()) { + for (MVT VT : {MVT::f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}) + setOperationAction(ISD::FLDEXP, VT, Custom); + } + setOperationAction(ISD::LRINT, 
MVT::v16f32, Subtarget.hasDQI() ? Legal : Custom); setOperationAction(ISD::LRINT, MVT::v8f64, @@ -2591,26 +2601,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom); } - if (Subtarget.hasAVX512()) { - for (MVT VT : { MVT::f32, MVT::f64, MVT::v16f32, MVT::v8f64}) - setOperationAction(ISD::FLDEXP, VT, Custom); - - if (Subtarget.hasVLX()) { - for (MVT VT : { MVT::v4f32, MVT::v2f64, MVT::v8f32, MVT::v4f64 }) - setOperationAction(ISD::FLDEXP, VT, Custom); - - if (Subtarget.hasFP16()) { - for (MVT VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16 }) - setOperationAction(ISD::FLDEXP, VT, Custom); - } - } - - if (Subtarget.hasFP16()) { - for (MVT VT : { MVT::f16, MVT::v32f16 }) - setOperationAction(ISD::FLDEXP, VT, Custom); - } - } - // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` // is. We should promote the value to 64-bits to solve this. // This is what the CRT headers do - `fmodf` is an inline header @@ -19177,48 +19167,67 @@ static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget, SDValue Exp = Op.getOperand(1); MVT XVT, ExpVT; - switch (XTy.SimpleTy) { - default: - return SDValue(); - case MVT::f16: - if (Subtarget.hasFP16()) { - XVT = Subtarget.hasVLX() ? MVT::v8f16 : MVT::v32f16; - ExpVT = XVT; - break; - } - X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X); - [[fallthrough]]; - case MVT::f32: - XVT = MVT::v4f32; - ExpVT = MVT::v4f32; + switch (XTy.SimpleTy) { + default: + return SDValue(); + case MVT::f16: + if (Subtarget.hasFP16()) { + XVT = MVT::v8f16; + ExpVT = XVT; + break; + } + X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X); + [[fallthrough]]; + case MVT::f32: + XVT = MVT::v4f32; + ExpVT = MVT::v4f32; + break; + case MVT::f64: + XVT = MVT::v2f64; + ExpVT = MVT::v2f64; + break; + case MVT::v4f32: + case MVT::v2f64: + if (!Subtarget.hasVLX()) { + XVT = XTy == MVT::v4f32 ? 
MVT::v16f32 : MVT::v8f64; + ExpVT = XVT; break; - case MVT::f64: - XVT = MVT::v2f64; - ExpVT = MVT::v2f64; + } + [[fallthrough]]; + case MVT::v8f32: + case MVT::v4f64: + if (!Subtarget.hasVLX()) { + XVT = XTy == MVT::v8f32 ? MVT::v16f32 : MVT::v8f64; + ExpVT = XVT; break; - case MVT::v4f32: - case MVT::v2f64: - case MVT::v8f32: - case MVT::v4f64: - case MVT::v16f32: - case MVT::v8f64: - Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); - return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X); + } + [[fallthrough]]; + case MVT::v16f32: + case MVT::v8f64: + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); + return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X); } - SDValue Zero = DAG.getConstant(0, DL, MVT::i64); Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp); - SDValue VX = - DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero); - SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT, - DAG.getUNDEF(ExpVT), Exp, Zero); - SDValue Scalef = DAG.getNode(X86ISD::SCALEFS, DL, XVT, VX, VExp, VX); - SDValue Final = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), Scalef, Zero); - if (X.getValueType() != XTy) - Final = DAG.getNode(ISD::FP_ROUND, DL, XTy, Final, - DAG.getIntPtrConstant(1, SDLoc(Op))); - return Final; + if (XTy.isVector()) { + SDValue WideX = + DAG.getInsertSubvector(DL, DAG.getUNDEF(XVT), X, 0); + SDValue WideExp = + DAG.getInsertSubvector(DL, DAG.getUNDEF(ExpVT), Exp, 0); + SDValue Scalef = + DAG.getNode(X86ISD::SCALEF, DL, XVT, WideX, WideExp, WideX); + SDValue Final = DAG.getExtractSubvector(DL, XTy, Scalef, 0); + return Final; + } else { + SDValue VX = DAG.getInsertVectorElt(DL, DAG.getUNDEF(XVT), X, 0); + SDValue VExp = DAG.getInsertVectorElt(DL, DAG.getUNDEF(ExpVT), Exp, 0); + SDValue Scalefs = DAG.getNode(X86ISD::SCALEFS, DL, XVT, VX, VExp, VX); + SDValue Final = DAG.getExtractVectorElt(DL, X.getValueType(), Scalefs, 0); + if (X.getValueType() != XTy) + Final = DAG.getNode(ISD::FP_ROUND, DL, XTy, 
Final, + DAG.getIntPtrConstant(1, SDLoc(Op))); + return Final; + } } static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 499695f408396..bc530e83593f2 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -114,21 +114,11 @@ define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) { ; ; CHECK-ONLY-AVX512F-LABEL: fmul_pow2_ldexp_4xfloat: ; CHECK-ONLY-AVX512F: # %bb.0: -; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %xmm0, %xmm1 -; CHECK-ONLY-AVX512F-NEXT: vmovss {{.*#+}} xmm2 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm1, %xmm2, %xmm1 -; CHECK-ONLY-AVX512F-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1,1,1] -; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %xmm3, %xmm3 -; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm3, %xmm2, %xmm3 -; CHECK-ONLY-AVX512F-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; CHECK-ONLY-AVX512F-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %xmm3, %xmm3 -; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm3, %xmm2, %xmm3 -; CHECK-ONLY-AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; CHECK-ONLY-AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-ONLY-AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0] ; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %xmm0, %xmm0 -; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm0, %xmm2, %xmm0 -; CHECK-ONLY-AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-ONLY-AVX512F-NEXT: vscalefps %zmm0, %zmm1, %zmm0 +; CHECK-ONLY-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-ONLY-AVX512F-NEXT: vzeroupper ; CHECK-ONLY-AVX512F-NEXT: retq ; ; CHECK-SKX-LABEL: fmul_pow2_ldexp_4xfloat: diff --git a/llvm/test/CodeGen/X86/ldexp-avx512.ll b/llvm/test/CodeGen/X86/ldexp-avx512.ll index 
75c829a6e97ce..8acbfb83e0e17 100644 --- a/llvm/test/CodeGen/X86/ldexp-avx512.ll +++ b/llvm/test/CodeGen/X86/ldexp-avx512.ll @@ -1,21 +1,37 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512fp16 | FileCheck %s --check-prefixes=CHECK,AVX512VL +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512fp16 | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512FP16 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL,AVX512VLF +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512fp16 | FileCheck %s --check-prefixes=CHECK,AVX512VLFP16 define half @test_half(half %x, i32 %exp) nounwind { -; AVX512-LABEL: test_half: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtsi2ss %edi, %xmm15, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vscalefss %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_half: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vcvtsi2ss %edi, %xmm15, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vscalefss %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512FP16-LABEL: test_half: +; AVX512FP16: # %bb.0: # %entry +; AVX512FP16-NEXT: vcvtsi2sh %edi, %xmm31, %xmm1 +; AVX512FP16-NEXT: vscalefsh %xmm1, %xmm0, %xmm0 +; AVX512FP16-NEXT: retq ; ; AVX512VL-LABEL: test_half: ; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vcvtsi2sh %edi, %xmm31, %xmm1 -; AVX512VL-NEXT: vscalefsh %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vcvtsi2ss %edi, %xmm15, %xmm1 +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vscalefss %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; 
AVX512VL-NEXT: retq +; +; AVX512VLFP16-LABEL: test_half: +; AVX512VLFP16: # %bb.0: # %entry +; AVX512VLFP16-NEXT: vcvtsi2sh %edi, %xmm31, %xmm1 +; AVX512VLFP16-NEXT: vscalefsh %xmm1, %xmm0, %xmm0 +; AVX512VLFP16-NEXT: retq entry: %r = tail call fast half @llvm.ldexp.f16.i32(half %x, i32 %exp) ret half %r @@ -240,23 +256,11 @@ declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>) define <4 x float> @test_ldexp_4xfloat(<4 x float> %x, <4 x i32> %exp) nounwind { ; AVX512-LABEL: test_ldexp_4xfloat: ; AVX512: # %bb.0: -; AVX512-NEXT: vcvtdq2ps %xmm1, %xmm2 -; AVX512-NEXT: vscalefss %xmm2, %xmm0, %xmm2 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX512-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1,1,1] -; AVX512-NEXT: vcvtdq2ps %xmm4, %xmm4 -; AVX512-NEXT: vscalefss %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX512-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,3,2,3] -; AVX512-NEXT: vcvtdq2ps %xmm4, %xmm4 -; AVX512-NEXT: vscalefss %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512-NEXT: vcvtdq2ps %xmm1, %xmm1 -; AVX512-NEXT: vscalefss %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; AVX512-NEXT: vscalefps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: test_ldexp_4xfloat: @@ -264,6 +268,12 @@ define <4 x float> @test_ldexp_4xfloat(<4 x float> %x, <4 x i32> %exp) nounwind ; AVX512VL-NEXT: vcvtdq2ps %xmm1, %xmm1 ; AVX512VL-NEXT: vscalefps %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq +; +; AVX512VLFP16-LABEL: test_ldexp_4xfloat: +; AVX512VLFP16: # %bb.0: +; AVX512VLFP16-NEXT: vcvtdq2ps %xmm1, %xmm1 +; AVX512VLFP16-NEXT: vscalefps 
%xmm1, %xmm0, %xmm0 +; AVX512VLFP16-NEXT: retq %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> %x, <4 x i32> %exp) ret <4 x float> %r } @@ -663,43 +673,10 @@ declare <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half>, <16 x i16>) define <8 x float> @test_ldexp_8xfloat(<8 x float> %x, <8 x i32> %exp) nounwind { ; AVX512-LABEL: test_ldexp_8xfloat: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX512-NEXT: vcvtdq2ps %xmm3, %xmm4 -; AVX512-NEXT: vscalefss %xmm4, %xmm2, %xmm4 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] -; AVX512-NEXT: vshufps {{.*#+}} xmm6 = xmm3[1,1,1,1] -; AVX512-NEXT: vcvtdq2ps %xmm6, %xmm6 -; AVX512-NEXT: vscalefss %xmm6, %xmm5, %xmm5 -; AVX512-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX512-NEXT: vshufpd {{.*#+}} xmm5 = xmm2[1,0] -; AVX512-NEXT: vshufps {{.*#+}} xmm6 = xmm3[2,3,2,3] -; AVX512-NEXT: vcvtdq2ps %xmm6, %xmm6 -; AVX512-NEXT: vscalefss %xmm6, %xmm5, %xmm5 -; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX512-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,3,3,3] -; AVX512-NEXT: vcvtdq2ps %xmm3, %xmm3 -; AVX512-NEXT: vscalefss %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[0] -; AVX512-NEXT: vcvtdq2ps %xmm1, %xmm3 -; AVX512-NEXT: vscalefss %xmm3, %xmm0, %xmm3 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; AVX512-NEXT: vshufps {{.*#+}} xmm5 = xmm1[1,1,1,1] -; AVX512-NEXT: vcvtdq2ps %xmm5, %xmm5 -; AVX512-NEXT: vscalefss %xmm5, %xmm4, %xmm4 -; AVX512-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX512-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] -; AVX512-NEXT: vshufps {{.*#+}} xmm5 = xmm1[2,3,2,3] -; AVX512-NEXT: vcvtdq2ps %xmm5, %xmm5 -; AVX512-NEXT: vscalefss %xmm5, %xmm4, %xmm4 -; AVX512-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX512-NEXT: 
vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX512-NEXT: vcvtdq2ps %xmm1, %xmm1 -; AVX512-NEXT: vscalefss %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] -; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX512-NEXT: vscalefps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: test_ldexp_8xfloat: @@ -707,6 +684,12 @@ define <8 x float> @test_ldexp_8xfloat(<8 x float> %x, <8 x i32> %exp) nounwind ; AVX512VL-NEXT: vcvtdq2ps %ymm1, %ymm1 ; AVX512VL-NEXT: vscalefps %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; AVX512VLFP16-LABEL: test_ldexp_8xfloat: +; AVX512VLFP16: # %bb.0: +; AVX512VLFP16-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX512VLFP16-NEXT: vscalefps %ymm1, %ymm0, %ymm0 +; AVX512VLFP16-NEXT: retq %r = call <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> %x, <8 x i32> %exp) ret <8 x float> %r } @@ -715,23 +698,10 @@ declare <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float>, <8 x i32>) define <4 x double> @test_ldexp_4xdouble(<4 x double> %x, <4 x i32> %exp) nounwind { ; AVX512-LABEL: test_ldexp_4xdouble: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX512-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX512-NEXT: vcvtdq2pd %xmm3, %xmm3 -; AVX512-NEXT: vscalefsd %xmm3, %xmm2, %xmm3 -; AVX512-NEXT: vcvtdq2pd %xmm1, %xmm4 -; AVX512-NEXT: vscalefsd %xmm4, %xmm0, %xmm4 -; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX512-NEXT: vshufps {{.*#+}} xmm4 = xmm1[3,3,3,3] -; AVX512-NEXT: vcvtdq2pd %xmm4, %xmm4 -; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512-NEXT: vscalefsd %xmm4, %xmm2, %xmm2 -; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX512-NEXT: vcvtdq2pd %xmm1, %xmm1 -; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vscalefsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; 
AVX512-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vcvtdq2pd %xmm1, %ymm1 +; AVX512-NEXT: vscalefpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: test_ldexp_4xdouble: @@ -739,6 +709,12 @@ define <4 x double> @test_ldexp_4xdouble(<4 x double> %x, <4 x i32> %exp) nounwi ; AVX512VL-NEXT: vcvtdq2pd %xmm1, %ymm1 ; AVX512VL-NEXT: vscalefpd %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; AVX512VLFP16-LABEL: test_ldexp_4xdouble: +; AVX512VLFP16: # %bb.0: +; AVX512VLFP16-NEXT: vcvtdq2pd %xmm1, %ymm1 +; AVX512VLFP16-NEXT: vscalefpd %ymm1, %ymm0, %ymm0 +; AVX512VLFP16-NEXT: retq %r = call <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> %x, <4 x i32> %exp) ret <4 x double> %r } @@ -1495,3 +1471,5 @@ define <8 x double> @test_ldexp_8xdouble(<8 x double> %x, <8 x i32> %exp) nounwi } declare <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double>, <8 x i32>) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX512VLF: {{.*}} From 8649bce62db216d21f881e21bc7e97cabf591500 Mon Sep 17 00:00:00 2001 From: Kavin Gnanapandithan Date: Tue, 11 Nov 2025 21:57:26 -0500 Subject: [PATCH 3/8] Added fp16 vector handling to LowerFLDEXP. 
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 122 +- .../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 116 +- llvm/test/CodeGen/X86/ldexp-avx512.ll | 1365 ++--------------- 3 files changed, 193 insertions(+), 1410 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 508ec24846511..d39921552add7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1830,15 +1830,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FCANONICALIZE, VT, Custom); } - for (MVT VT : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64, MVT::v8f32, - MVT::v4f64, MVT::v16f32, MVT::v8f64}) - setOperationAction(ISD::FLDEXP, VT, Custom); - - if (Subtarget.hasFP16()) { - for (MVT VT : {MVT::f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}) - setOperationAction(ISD::FLDEXP, VT, Custom); - } - setOperationAction(ISD::LRINT, MVT::v16f32, Subtarget.hasDQI() ? Legal : Custom); setOperationAction(ISD::LRINT, MVT::v8f64, @@ -2111,6 +2102,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // These operations are handled on non-VLX by artificially widening in // isel patterns. 
+ for (MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v8f16, MVT::v4f32, + MVT::v2f64, MVT::v16f16, MVT::v8f32, MVT::v4f64, MVT::v32f16, + MVT::v16f32, MVT::v8f64}) + setOperationAction(ISD::FLDEXP, VT, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom); @@ -19160,72 +19156,110 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, } static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { + SelectionDAG &DAG) { SDLoc DL(Op); SDValue X = Op.getOperand(0); MVT XTy = X.getSimpleValueType(); SDValue Exp = Op.getOperand(1); - MVT XVT, ExpVT; switch (XTy.SimpleTy) { default: return SDValue(); case MVT::f16: - if (Subtarget.hasFP16()) { - XVT = MVT::v8f16; - ExpVT = XVT; - break; + if (!Subtarget.hasFP16()) { + X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X); } - X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X); - [[fallthrough]]; - case MVT::f32: - XVT = MVT::v4f32; - ExpVT = MVT::v4f32; break; + case MVT::f32: case MVT::f64: - XVT = MVT::v2f64; - ExpVT = MVT::v2f64; break; case MVT::v4f32: case MVT::v2f64: - if (!Subtarget.hasVLX()) { - XVT = XTy == MVT::v4f32 ? MVT::v16f32 : MVT::v8f64; - ExpVT = XVT; + if (Subtarget.hasVLX()) { + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); + return DAG.getNode(X86ISD::SCALEFS, DL, XTy, X, Exp, X); + } + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp); + break; + case MVT::v8f16: + if (Subtarget.hasFP16()) { + if (Subtarget.hasVLX()) { + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); + return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X); + } break; } - [[fallthrough]]; + X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::v8f32, X); + Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::v8i32, Exp); + break; case MVT::v8f32: case MVT::v4f64: - if (!Subtarget.hasVLX()) { - XVT = XTy == MVT::v8f32 ? 
MVT::v16f32 : MVT::v8f64; - ExpVT = XVT; + if (Subtarget.hasVLX()) { + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); + return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X); + } + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp); + break; + case MVT::v16f16: + if (Subtarget.hasFP16()) { + if (Subtarget.hasVLX()) { + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); + return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X); + } break; } - [[fallthrough]]; + X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::v16f32, X); + Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::v16i32, Exp); + break; case MVT::v16f32: case MVT::v8f64: Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X); + case MVT::v32f16: + if (Subtarget.hasFP16()) { + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); + return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X); + } + SDValue Low = DAG.getExtractSubvector(DL, MVT::v16f16, X, 0); + SDValue High = DAG.getExtractSubvector(DL, MVT::v16f16, X, 16); + SDValue ExpLow = DAG.getExtractSubvector(DL, MVT::v16i16, Exp, 0); + SDValue ExpHigh = DAG.getExtractSubvector(DL, MVT::v16i16, Exp, 16); + + SDValue OpLow = DAG.getNode(ISD::FLDEXP, DL, MVT::v16f16, Low, ExpLow); + SDValue OpHigh = DAG.getNode(ISD::FLDEXP, DL, MVT::v16f16, High, ExpHigh); + SDValue ScaledLow = LowerFLDEXP(OpLow, Subtarget, DAG); + SDValue ScaledHigh = LowerFLDEXP(OpHigh, Subtarget, DAG); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v32f16, ScaledLow, + ScaledHigh); } - Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp); if (XTy.isVector()) { - SDValue WideX = - DAG.getInsertSubvector(DL, DAG.getUNDEF(XVT), X, 0); - SDValue WideExp = - DAG.getInsertSubvector(DL, DAG.getUNDEF(ExpVT), Exp, 0); - SDValue Scalef = - DAG.getNode(X86ISD::SCALEF, DL, XVT, WideX, WideExp, WideX); - SDValue Final = DAG.getExtractSubvector(DL, XTy, Scalef, 0); - return Final; + SDValue WideX = widenSubVector(X, true, Subtarget, DAG, DL, 
512); + SDValue WideExp = widenSubVector(Exp, true, Subtarget, DAG, DL, 512); + if (XTy.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) { + WideExp = + DAG.getNode(ISD::SINT_TO_FP, DL, WideX.getSimpleValueType(), WideExp); + SDValue Scalef = DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(), + WideX, WideExp, WideX); + MVT ExtractVT = XTy == MVT::v8f16 ? MVT::v8f32 : MVT::v16f32; + SDValue LowHalf = DAG.getExtractSubvector(DL, ExtractVT, Scalef, 0); + return DAG.getNode(ISD::FP_ROUND, DL, XTy, LowHalf, + DAG.getTargetConstant(0, DL, MVT::i32)); + } + SDValue Scalef = DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(), + WideX, WideExp, WideX); + return DAG.getExtractSubvector(DL, XTy, Scalef, 0); } else { - SDValue VX = DAG.getInsertVectorElt(DL, DAG.getUNDEF(XVT), X, 0); - SDValue VExp = DAG.getInsertVectorElt(DL, DAG.getUNDEF(ExpVT), Exp, 0); - SDValue Scalefs = DAG.getNode(X86ISD::SCALEFS, DL, XVT, VX, VExp, VX); + MVT VT = MVT::getVectorVT(X.getSimpleValueType(), + 128 / X.getSimpleValueType().getSizeInBits()); + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp); + SDValue VX = DAG.getInsertVectorElt(DL, DAG.getUNDEF(VT), X, 0); + SDValue VExp = DAG.getInsertVectorElt(DL, DAG.getUNDEF(VT), Exp, 0); + SDValue Scalefs = DAG.getNode(X86ISD::SCALEFS, DL, VT, VX, VExp, VX); SDValue Final = DAG.getExtractVectorElt(DL, X.getValueType(), Scalefs, 0); if (X.getValueType() != XTy) Final = DAG.getNode(ISD::FP_ROUND, DL, XTy, Final, - DAG.getIntPtrConstant(1, SDLoc(Op))); + DAG.getTargetConstant(0, DL, MVT::i32)); return Final; } } @@ -33763,7 +33797,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG); case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG); case ISD::FLDEXP: return LowerFLDEXP(Op, Subtarget, DAG); - // clang-format on + // clang-format on } } diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll 
b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index bc530e83593f2..a04efe9f02f2d 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -125,7 +125,7 @@ define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) { ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-SKX-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0] -; CHECK-SKX-NEXT: vscalefps %xmm0, %xmm1, %xmm0 +; CHECK-SKX-NEXT: vscalefss %xmm0, %xmm1, %xmm0 ; CHECK-SKX-NEXT: retq %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> , <4 x i32> %i) ret <4 x float> %r @@ -576,109 +576,15 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8 ; CHECK-AVX2-NEXT: retq ; -; CHECK-ONLY-AVX512F-LABEL: fmul_pow2_ldexp_8xhalf: -; CHECK-ONLY-AVX512F: # %bb.0: -; CHECK-ONLY-AVX512F-NEXT: vpextrw $7, %xmm0, %eax -; CHECK-ONLY-AVX512F-NEXT: cwtl -; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm2 -; CHECK-ONLY-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm2, %xmm1, %xmm2 -; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; CHECK-ONLY-AVX512F-NEXT: vpextrw $6, %xmm0, %eax -; CHECK-ONLY-AVX512F-NEXT: cwtl -; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3 -; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm3, %xmm1, %xmm3 -; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; CHECK-ONLY-AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; CHECK-ONLY-AVX512F-NEXT: vpextrw $5, %xmm0, %eax -; CHECK-ONLY-AVX512F-NEXT: cwtl -; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3 -; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm3, %xmm1, %xmm3 -; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; CHECK-ONLY-AVX512F-NEXT: vpextrw $4, %xmm0, %eax -; CHECK-ONLY-AVX512F-NEXT: cwtl -; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, 
%xmm15, %xmm4 -; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm4, %xmm1, %xmm4 -; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; CHECK-ONLY-AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; CHECK-ONLY-AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; CHECK-ONLY-AVX512F-NEXT: vpextrw $3, %xmm0, %eax -; CHECK-ONLY-AVX512F-NEXT: cwtl -; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3 -; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm3, %xmm1, %xmm3 -; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; CHECK-ONLY-AVX512F-NEXT: vpextrw $2, %xmm0, %eax -; CHECK-ONLY-AVX512F-NEXT: cwtl -; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4 -; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm4, %xmm1, %xmm4 -; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; CHECK-ONLY-AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; CHECK-ONLY-AVX512F-NEXT: vpextrw $1, %xmm0, %eax -; CHECK-ONLY-AVX512F-NEXT: cwtl -; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4 -; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm4, %xmm1, %xmm4 -; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; CHECK-ONLY-AVX512F-NEXT: vmovd %xmm0, %eax -; CHECK-ONLY-AVX512F-NEXT: cwtl -; CHECK-ONLY-AVX512F-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 -; CHECK-ONLY-AVX512F-NEXT: vscalefss %xmm0, %xmm1, %xmm0 -; CHECK-ONLY-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-ONLY-AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; CHECK-ONLY-AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; CHECK-ONLY-AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; CHECK-ONLY-AVX512F-NEXT: retq -; -; CHECK-SKX-LABEL: fmul_pow2_ldexp_8xhalf: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: vpextrw $7, %xmm0, %eax -; CHECK-SKX-NEXT: cwtl -; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm1 -; CHECK-SKX-NEXT: vmovss 
{{.*#+}} xmm2 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-SKX-NEXT: vscalefss %xmm1, %xmm2, %xmm1 -; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; CHECK-SKX-NEXT: vpextrw $6, %xmm0, %eax -; CHECK-SKX-NEXT: cwtl -; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3 -; CHECK-SKX-NEXT: vscalefss %xmm3, %xmm2, %xmm3 -; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; CHECK-SKX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; CHECK-SKX-NEXT: vpextrw $5, %xmm0, %eax -; CHECK-SKX-NEXT: cwtl -; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3 -; CHECK-SKX-NEXT: vscalefss %xmm3, %xmm2, %xmm3 -; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; CHECK-SKX-NEXT: vpextrw $4, %xmm0, %eax -; CHECK-SKX-NEXT: cwtl -; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4 -; CHECK-SKX-NEXT: vscalefss %xmm4, %xmm2, %xmm4 -; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; CHECK-SKX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; CHECK-SKX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SKX-NEXT: vpextrw $3, %xmm0, %eax -; CHECK-SKX-NEXT: cwtl -; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm3 -; CHECK-SKX-NEXT: vscalefss %xmm3, %xmm2, %xmm3 -; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; CHECK-SKX-NEXT: vpextrw $2, %xmm0, %eax -; CHECK-SKX-NEXT: cwtl -; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4 -; CHECK-SKX-NEXT: vscalefss %xmm4, %xmm2, %xmm4 -; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; CHECK-SKX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; CHECK-SKX-NEXT: vpextrw $1, %xmm0, %eax -; CHECK-SKX-NEXT: cwtl -; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm4 -; CHECK-SKX-NEXT: vscalefss %xmm4, %xmm2, %xmm4 -; CHECK-SKX-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; CHECK-SKX-NEXT: vmovd %xmm0, %eax -; CHECK-SKX-NEXT: cwtl -; CHECK-SKX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 -; CHECK-SKX-NEXT: vscalefss %xmm0, %xmm2, %xmm0 -; CHECK-SKX-NEXT: 
vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-SKX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; CHECK-SKX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; CHECK-SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-SKX-NEXT: retq +; CHECK-AVX512F-LABEL: fmul_pow2_ldexp_8xhalf: +; CHECK-AVX512F: # %bb.0: +; CHECK-AVX512F-NEXT: vbroadcastss {{.*#+}} ymm1 = [8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3] +; CHECK-AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 +; CHECK-AVX512F-NEXT: vcvtdq2ps %zmm0, %zmm0 +; CHECK-AVX512F-NEXT: vscalefps %zmm0, %zmm1, %zmm0 +; CHECK-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; CHECK-AVX512F-NEXT: vzeroupper +; CHECK-AVX512F-NEXT: retq %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> , <8 x i16> %i) ret <8 x half> %r } @@ -1812,5 +1718,3 @@ define x86_fp80 @pr128528(i1 %cond) { %mul = fmul x86_fp80 %conv, 0xK4007D055555555555800 ret x86_fp80 %mul } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CHECK-AVX512F: {{.*}} diff --git a/llvm/test/CodeGen/X86/ldexp-avx512.ll b/llvm/test/CodeGen/X86/ldexp-avx512.ll index 8acbfb83e0e17..4d086a56a2f3a 100644 --- a/llvm/test/CodeGen/X86/ldexp-avx512.ll +++ b/llvm/test/CodeGen/X86/ldexp-avx512.ll @@ -73,181 +73,41 @@ entry: declare fp128 @ldexpl(fp128, i32) memory(none) define <8 x half> @test_ldexp_8xhalf(<8 x half> %x, <8 x i16> %exp) nounwind { -; AVX512-LABEL: test_ldexp_8xhalf: -; AVX512: # %bb.0: -; AVX512-NEXT: subq $88, %rsp -; AVX512-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm0 -; AVX512-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[3,3,3,3] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $6, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $5, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi 
-; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,0] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $4, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $3, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,1,3,3] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $2, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $1, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX512-NEXT: addq $88, %rsp -; AVX512-NEXT: retq +; AVX512F-LABEL: test_ldexp_8xhalf: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vcvtph2ps %xmm0, %ymm0 +; AVX512F-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX512F-NEXT: vcvtdq2ps %zmm1, %zmm1 +; AVX512F-NEXT: vscalefps %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512FP16-LABEL: test_ldexp_8xhalf: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512FP16-NEXT: vinsertf32x4 $0, %xmm0, %zmm2, %zmm0 +; AVX512FP16-NEXT: vmovaps %xmm1, %xmm1 +; AVX512FP16-NEXT: vscalefph %zmm1, %zmm0, %zmm0 +; AVX512FP16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512FP16-NEXT: vzeroupper +; AVX512FP16-NEXT: retq ; ; AVX512VL-LABEL: test_ldexp_8xhalf: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: subq $88, %rsp -; AVX512VL-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte 
Spill -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm0 -; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[1,0] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, 
%xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vmovdqa (%rsp), 
%xmm1 # 16-byte Reload -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX512VL-NEXT: addq $88, %rsp +; AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm0 +; AVX512VL-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX512VL-NEXT: vcvtdq2ps %zmm1, %zmm1 +; AVX512VL-NEXT: vscalefps %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq +; +; AVX512VLFP16-LABEL: test_ldexp_8xhalf: +; AVX512VLFP16: # %bb.0: +; AVX512VLFP16-NEXT: vcvtw2ph %xmm1, %xmm1 +; AVX512VLFP16-NEXT: vscalefph %xmm1, %xmm0, %xmm0 +; AVX512VLFP16-NEXT: retq %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> %x, <8 x i16> %exp) ret <8 x half> %r } @@ -256,8 +116,8 @@ declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>) define <4 x float> @test_ldexp_4xfloat(<4 x float> %x, <4 x i32> %exp) nounwind { ; AVX512-LABEL: test_ldexp_4xfloat: ; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512-NEXT: vcvtdq2ps %xmm1, %xmm1 +; AVX512-NEXT: vmovaps %xmm0, %xmm0 ; AVX512-NEXT: vscalefps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper @@ -266,13 +126,13 @@ define <4 x float> @test_ldexp_4xfloat(<4 x float> %x, <4 x i32> %exp) nounwind ; AVX512VL-LABEL: test_ldexp_4xfloat: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvtdq2ps %xmm1, %xmm1 -; AVX512VL-NEXT: vscalefps %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vscalefss %xmm1, 
%xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512VLFP16-LABEL: test_ldexp_4xfloat: ; AVX512VLFP16: # %bb.0: ; AVX512VLFP16-NEXT: vcvtdq2ps %xmm1, %xmm1 -; AVX512VLFP16-NEXT: vscalefps %xmm1, %xmm0, %xmm0 +; AVX512VLFP16-NEXT: vscalefss %xmm1, %xmm0, %xmm0 ; AVX512VLFP16-NEXT: retq %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> %x, <4 x i32> %exp) ret <4 x float> %r @@ -296,375 +156,38 @@ define <2 x double> @test_ldexp_2xdouble(<2 x double> %x, <2 x i32> %exp) nounwi declare <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double>, <2 x i32>) define <16 x half> @test_ldexp_16xhalf(<16 x half> %x, <16 x i16> %exp) nounwind { -; AVX512-LABEL: test_ldexp_16xhalf: -; AVX512: # %bb.0: -; AVX512-NEXT: subq $168, %rsp -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[3,3,3,3] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $6, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; 
AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $5, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,0] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $4, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vpermilps $255, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[3,3,3,3] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512-NEXT: vpextrw $6, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512-NEXT: vpextrw $5, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,0] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512-NEXT: vpextrw $4, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $3, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,1,3,3] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $2, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $1, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512-NEXT: vpextrw $3, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,1,3,3] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512-NEXT: vpextrw $2, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512-NEXT: vpextrw $1, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 
-; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX512-NEXT: addq $168, %rsp -; AVX512-NEXT: retq +; AVX512F-LABEL: test_ldexp_16xhalf: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512F-NEXT: vcvtdq2ps %zmm1, %zmm1 +; AVX512F-NEXT: vscalefps %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512FP16-LABEL: test_ldexp_16xhalf: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512FP16-NEXT: vinsertf64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512FP16-NEXT: vmovaps %ymm1, %ymm1 +; AVX512FP16-NEXT: vscalefph %zmm1, %zmm0, %zmm0 +; AVX512FP16-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512FP16-NEXT: retq ; ; AVX512VL-LABEL: test_ldexp_16xhalf: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: subq $168, %rsp -; AVX512VL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm1 -; 
AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[1,0] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # 
xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; 
AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[1,0] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: 
vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq 
ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512VL-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX512VL-NEXT: addq $168, %rsp +; AVX512VL-NEXT: vcvtph2ps %ymm0, %zmm0 +; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512VL-NEXT: vcvtdq2ps %zmm1, %zmm1 +; AVX512VL-NEXT: vscalefps %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vcvtps2ph $4, %zmm0, %ymm0 ; AVX512VL-NEXT: retq +; +; AVX512VLFP16-LABEL: test_ldexp_16xhalf: +; AVX512VLFP16: # %bb.0: +; 
AVX512VLFP16-NEXT: vcvtw2ph %ymm1, %ymm1 +; AVX512VLFP16-NEXT: vscalefph %ymm1, %ymm0, %ymm0 +; AVX512VLFP16-NEXT: retq %r = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> %x, <16 x i16> %exp) ret <16 x half> %r } @@ -673,8 +196,8 @@ declare <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half>, <16 x i16>) define <8 x float> @test_ldexp_8xfloat(<8 x float> %x, <8 x i32> %exp) nounwind { ; AVX512-LABEL: test_ldexp_8xfloat: ; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX512-NEXT: vmovaps %ymm0, %ymm0 ; AVX512-NEXT: vscalefps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: retq @@ -698,8 +221,8 @@ declare <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float>, <8 x i32>) define <4 x double> @test_ldexp_4xdouble(<4 x double> %x, <4 x i32> %exp) nounwind { ; AVX512-LABEL: test_ldexp_4xdouble: ; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512-NEXT: vcvtdq2pd %xmm1, %ymm1 +; AVX512-NEXT: vmovapd %ymm0, %ymm0 ; AVX512-NEXT: vscalefpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: retq @@ -721,729 +244,51 @@ define <4 x double> @test_ldexp_4xdouble(<4 x double> %x, <4 x i32> %exp) nounwi declare <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double>, <4 x i32>) define <32 x half> @test_ldexp_32xhalf(<32 x half> %x, <32 x i16> %exp) nounwind { -; AVX512-LABEL: test_ldexp_32xhalf: -; AVX512: # %bb.0: -; AVX512-NEXT: subq $360, %rsp # imm = 0x168 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: 
vextracti32x4 $3, %zmm1, %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[3,3,3,3] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $6, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $5, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,0] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $4, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[3,3,3,3] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $6, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $5, %xmm1, %eax -; AVX512-NEXT: movswl 
%ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,0] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $4, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[3,3,3,3] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $6, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $5, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,0] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $4, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[3,3,3,3] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpextrw $6, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpextrw $5, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,0] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpextrw $4, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte 
Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $3, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,1,3,3] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $2, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $1, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; 
AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $3, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,1,3,3] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $2, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Reload -; AVX512-NEXT: vpextrw $1, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps (%rsp), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $3, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,1,3,3] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $2, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpextrw $1, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpextrw $3, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,1,3,3] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpextrw $2, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi 
-; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpextrw $1, %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovd %xmm1, %eax -; AVX512-NEXT: movswl %ax, %edi -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq ldexpf@PLT -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: # zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] -; AVX512-NEXT: addq $360, %rsp # imm = 0x168 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_ldexp_32xhalf: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm2 +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm3 +; 
AVX512F-NEXT: vcvtdq2ps %zmm3, %zmm3 +; AVX512F-NEXT: vscalefps %zmm3, %zmm2, %zmm2 +; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2 +; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512F-NEXT: vcvtdq2ps %zmm1, %zmm1 +; AVX512F-NEXT: vscalefps %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512FP16-LABEL: test_ldexp_32xhalf: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: vcvtw2ph %zmm1, %zmm1 +; AVX512FP16-NEXT: vscalefph %zmm1, %zmm0, %zmm0 +; AVX512FP16-NEXT: retq ; ; AVX512VL-LABEL: test_ldexp_32xhalf: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: subq $360, %rsp # imm = 0x168 -; AVX512VL-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512VL-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vextracti32x4 $3, %zmm1, %xmm1 -; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, 
%xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[1,0] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512VL-NEXT: 
vextracti32x4 $2, %zmm1, %xmm1 -; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[1,0] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; 
AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = 
xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[1,0] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[1,0] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte 
Folded Reload -; AVX512VL-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vcvtph2ps (%rsp), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vmovd %xmm1, %eax -; 
AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, 
%edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Folded Reload -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: movswl %ax, %edi -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq ldexpf@PLT -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX512VL-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512VL-NEXT: # zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] -; AVX512VL-NEXT: addq $360, %rsp # imm = 0x168 +; AVX512VL-NEXT: vcvtph2ps %ymm0, %zmm2 +; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm3 +; AVX512VL-NEXT: vcvtdq2ps %zmm3, %zmm3 +; AVX512VL-NEXT: vscalefps %zmm3, %zmm2, %zmm2 +; AVX512VL-NEXT: vcvtps2ph $4, %zmm2, %ymm2 +; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; AVX512VL-NEXT: vcvtph2ps %ymm0, %zmm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512VL-NEXT: vcvtdq2ps %zmm1, %zmm1 +; AVX512VL-NEXT: vscalefps %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vcvtps2ph $4, %zmm0, %ymm0 +; 
AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512VL-NEXT: retq +; +; AVX512VLFP16-LABEL: test_ldexp_32xhalf: +; AVX512VLFP16: # %bb.0: +; AVX512VLFP16-NEXT: vcvtw2ph %zmm1, %zmm1 +; AVX512VLFP16-NEXT: vscalefph %zmm1, %zmm0, %zmm0 +; AVX512VLFP16-NEXT: retq %r = call <32 x half> @llvm.ldexp.v32f16.v32i16(<32 x half> %x, <32 x i16> %exp) ret <32 x half> %r } From 1806c40936b75e95ae9471089f7f917de44964fb Mon Sep 17 00:00:00 2001 From: Kavin Gnanapandithan Date: Thu, 13 Nov 2025 23:33:51 -0500 Subject: [PATCH 4/8] Call SCALEF instead of SCALEFS for v4f32/v2f64 and refactored code. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 50 +++++-------------- .../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 2 +- llvm/test/CodeGen/X86/ldexp-avx512.ll | 4 +- 3 files changed, 15 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d39921552add7..f73f2515302f5 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -19161,6 +19161,7 @@ static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget, SDValue X = Op.getOperand(0); MVT XTy = X.getSimpleValueType(); SDValue Exp = Op.getOperand(1); + MVT ExtVT; switch (XTy.SimpleTy) { default: @@ -19175,31 +19176,17 @@ static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget, break; case MVT::v4f32: case MVT::v2f64: - if (Subtarget.hasVLX()) { - Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); - return DAG.getNode(X86ISD::SCALEFS, DL, XTy, X, Exp, X); - } - Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp); - break; - case MVT::v8f16: - if (Subtarget.hasFP16()) { - if (Subtarget.hasVLX()) { - Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); - return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X); - } - break; - } - X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::v8f32, X); - Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::v8i32, Exp); - break; case MVT::v8f32: case MVT::v4f64: - if 
(Subtarget.hasVLX()) { + case MVT::v16f32: + case MVT::v8f64: + if (XTy.getSizeInBits() == 512 || Subtarget.hasVLX()) { Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X); } Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp); break; + case MVT::v8f16: case MVT::v16f16: if (Subtarget.hasFP16()) { if (Subtarget.hasVLX()) { @@ -19208,29 +19195,16 @@ static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget, } break; } - X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::v16f32, X); - Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::v16i32, Exp); + ExtVT = XTy.changeVectorElementType(MVT::f32); + X = DAG.getNode(ISD::FP_EXTEND, DL, ExtVT, X); + Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT.changeTypeToInteger(), Exp); break; - case MVT::v16f32: - case MVT::v8f64: - Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); - return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X); case MVT::v32f16: if (Subtarget.hasFP16()) { Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X); } - SDValue Low = DAG.getExtractSubvector(DL, MVT::v16f16, X, 0); - SDValue High = DAG.getExtractSubvector(DL, MVT::v16f16, X, 16); - SDValue ExpLow = DAG.getExtractSubvector(DL, MVT::v16i16, Exp, 0); - SDValue ExpHigh = DAG.getExtractSubvector(DL, MVT::v16i16, Exp, 16); - - SDValue OpLow = DAG.getNode(ISD::FLDEXP, DL, MVT::v16f16, Low, ExpLow); - SDValue OpHigh = DAG.getNode(ISD::FLDEXP, DL, MVT::v16f16, High, ExpHigh); - SDValue ScaledLow = LowerFLDEXP(OpLow, Subtarget, DAG); - SDValue ScaledHigh = LowerFLDEXP(OpHigh, Subtarget, DAG); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v32f16, ScaledLow, - ScaledHigh); + return splitVectorOp(Op, DAG, DL); } if (XTy.isVector()) { @@ -19241,9 +19215,9 @@ static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget, DAG.getNode(ISD::SINT_TO_FP, DL, WideX.getSimpleValueType(), WideExp); SDValue Scalef = DAG.getNode(X86ISD::SCALEF, DL, 
WideX.getValueType(), WideX, WideExp, WideX); - MVT ExtractVT = XTy == MVT::v8f16 ? MVT::v8f32 : MVT::v16f32; - SDValue LowHalf = DAG.getExtractSubvector(DL, ExtractVT, Scalef, 0); - return DAG.getNode(ISD::FP_ROUND, DL, XTy, LowHalf, + SDValue Final = + DAG.getExtractSubvector(DL, X.getSimpleValueType(), Scalef, 0); + return DAG.getNode(ISD::FP_ROUND, DL, XTy, Final, DAG.getTargetConstant(0, DL, MVT::i32)); } SDValue Scalef = DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(), diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index a04efe9f02f2d..05159724717f8 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -125,7 +125,7 @@ define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) { ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-SKX-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0] -; CHECK-SKX-NEXT: vscalefss %xmm0, %xmm1, %xmm0 +; CHECK-SKX-NEXT: vscalefps %xmm0, %xmm1, %xmm0 ; CHECK-SKX-NEXT: retq %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> , <4 x i32> %i) ret <4 x float> %r diff --git a/llvm/test/CodeGen/X86/ldexp-avx512.ll b/llvm/test/CodeGen/X86/ldexp-avx512.ll index 4d086a56a2f3a..a853da2404b23 100644 --- a/llvm/test/CodeGen/X86/ldexp-avx512.ll +++ b/llvm/test/CodeGen/X86/ldexp-avx512.ll @@ -126,13 +126,13 @@ define <4 x float> @test_ldexp_4xfloat(<4 x float> %x, <4 x i32> %exp) nounwind ; AVX512VL-LABEL: test_ldexp_4xfloat: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvtdq2ps %xmm1, %xmm1 -; AVX512VL-NEXT: vscalefss %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vscalefps %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512VLFP16-LABEL: test_ldexp_4xfloat: ; AVX512VLFP16: # %bb.0: ; AVX512VLFP16-NEXT: vcvtdq2ps %xmm1, %xmm1 -; AVX512VLFP16-NEXT: vscalefss %xmm1, %xmm0, %xmm0 +; AVX512VLFP16-NEXT: vscalefps %xmm1, %xmm0, %xmm0 ; AVX512VLFP16-NEXT: retq 
%r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> %x, <4 x i32> %exp) ret <4 x float> %r From c30244b92b73ad3547caf553ac93554c44818bb5 Mon Sep 17 00:00:00 2001 From: Kavin Gnanapandithan Date: Mon, 17 Nov 2025 11:08:57 -0500 Subject: [PATCH 5/8] Minor refactoring. Switch getInsertVectorElts -> SCALAR_TO_VECTOR, and fixed styling issue. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 68 +++++++++++-------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f73f2515302f5..a4d258bcb259b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -19167,24 +19167,31 @@ static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget, default: return SDValue(); case MVT::f16: - if (!Subtarget.hasFP16()) { - X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X); - } - break; + if (!Subtarget.hasFP16()) + X = DAG.getFPExtendOrRound(X, DL, MVT::f32); + [[fallthrough]]; case MVT::f32: - case MVT::f64: - break; + case MVT::f64: { + MVT VT = MVT::getVectorVT(X.getSimpleValueType(), + 128 / X.getSimpleValueType().getSizeInBits()); + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp); + SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, X); + SDValue VExp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Exp); + SDValue Scalefs = DAG.getNode(X86ISD::SCALEFS, DL, VT, VX, VExp, VX); + SDValue Final = DAG.getExtractVectorElt(DL, X.getValueType(), Scalefs, 0); + if (X.getValueType() != XTy) + Final = DAG.getFPExtendOrRound(Final, DL, XTy); + return Final; + } case MVT::v4f32: case MVT::v2f64: case MVT::v8f32: case MVT::v4f64: case MVT::v16f32: case MVT::v8f64: - if (XTy.getSizeInBits() == 512 || Subtarget.hasVLX()) { - Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); + if (XTy.getSizeInBits() == 512 || Subtarget.hasVLX()) return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, 
Exp, X); - } - Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp); break; case MVT::v8f16: case MVT::v16f16: @@ -19196,8 +19203,8 @@ static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget, break; } ExtVT = XTy.changeVectorElementType(MVT::f32); - X = DAG.getNode(ISD::FP_EXTEND, DL, ExtVT, X); - Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT.changeTypeToInteger(), Exp); + X = DAG.getFPExtendOrRound(X, DL, ExtVT); + Exp = DAG.getSExtOrTrunc(Exp, DL, ExtVT.changeTypeToInteger()); break; case MVT::v32f16: if (Subtarget.hasFP16()) { @@ -19207,35 +19214,20 @@ static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget, return splitVectorOp(Op, DAG, DL); } - if (XTy.isVector()) { - SDValue WideX = widenSubVector(X, true, Subtarget, DAG, DL, 512); - SDValue WideExp = widenSubVector(Exp, true, Subtarget, DAG, DL, 512); - if (XTy.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) { - WideExp = - DAG.getNode(ISD::SINT_TO_FP, DL, WideX.getSimpleValueType(), WideExp); - SDValue Scalef = DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(), - WideX, WideExp, WideX); - SDValue Final = - DAG.getExtractSubvector(DL, X.getSimpleValueType(), Scalef, 0); - return DAG.getNode(ISD::FP_ROUND, DL, XTy, Final, - DAG.getTargetConstant(0, DL, MVT::i32)); - } + SDValue WideX = widenSubVector(X, true, Subtarget, DAG, DL, 512); + SDValue WideExp = widenSubVector(Exp, true, Subtarget, DAG, DL, 512); + if (X.getValueType() != XTy) { + WideExp = + DAG.getNode(ISD::SINT_TO_FP, DL, WideX.getSimpleValueType(), WideExp); SDValue Scalef = DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(), WideX, WideExp, WideX); - return DAG.getExtractSubvector(DL, XTy, Scalef, 0); - } else { - MVT VT = MVT::getVectorVT(X.getSimpleValueType(), - 128 / X.getSimpleValueType().getSizeInBits()); - Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp); - SDValue VX = DAG.getInsertVectorElt(DL, DAG.getUNDEF(VT), X, 0); - SDValue VExp = DAG.getInsertVectorElt(DL, DAG.getUNDEF(VT), 
Exp, 0); - SDValue Scalefs = DAG.getNode(X86ISD::SCALEFS, DL, VT, VX, VExp, VX); - SDValue Final = DAG.getExtractVectorElt(DL, X.getValueType(), Scalefs, 0); - if (X.getValueType() != XTy) - Final = DAG.getNode(ISD::FP_ROUND, DL, XTy, Final, - DAG.getTargetConstant(0, DL, MVT::i32)); - return Final; + SDValue Final = + DAG.getExtractSubvector(DL, X.getSimpleValueType(), Scalef, 0); + return DAG.getFPExtendOrRound(Final, DL, XTy); } + SDValue Scalef = DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(), WideX, + WideExp, WideX); + return DAG.getExtractSubvector(DL, XTy, Scalef, 0); } static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, From e1bcedc633349313ad58c6e05344f1447fbe7b2f Mon Sep 17 00:00:00 2001 From: Kavin Gnanapandithan Date: Wed, 19 Nov 2025 09:19:00 -0500 Subject: [PATCH 6/8] Ensure Exp is type integer after switch statement. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 44 +++++++------------ .../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 3 +- llvm/test/CodeGen/X86/ldexp-avx512.ll | 42 +++++++----------- 3 files changed, 35 insertions(+), 54 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a4d258bcb259b..80d4f67e18934 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1829,7 +1829,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FCOPYSIGN, VT, Custom); setOperationAction(ISD::FCANONICALIZE, VT, Custom); } - setOperationAction(ISD::LRINT, MVT::v16f32, Subtarget.hasDQI() ? 
Legal : Custom); setOperationAction(ISD::LRINT, MVT::v8f64, @@ -19161,7 +19160,6 @@ static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget, SDValue X = Op.getOperand(0); MVT XTy = X.getSimpleValueType(); SDValue Exp = Op.getOperand(1); - MVT ExtVT; switch (XTy.SimpleTy) { default: @@ -19177,11 +19175,9 @@ static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget, Exp = DAG.getNode(ISD::SINT_TO_FP, DL, X.getValueType(), Exp); SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, X); SDValue VExp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Exp); - SDValue Scalefs = DAG.getNode(X86ISD::SCALEFS, DL, VT, VX, VExp, VX); + SDValue Scalefs = DAG.getNode(X86ISD::SCALEFS, DL, VT, VX, VExp); SDValue Final = DAG.getExtractVectorElt(DL, X.getValueType(), Scalefs, 0); - if (X.getValueType() != XTy) - Final = DAG.getFPExtendOrRound(Final, DL, XTy); - return Final; + return DAG.getFPExtendOrRound(Final, DL, XTy); } case MVT::v4f32: case MVT::v2f64: @@ -19189,45 +19185,39 @@ static SDValue LowerFLDEXP(SDValue Op, const X86Subtarget &Subtarget, case MVT::v4f64: case MVT::v16f32: case MVT::v8f64: - Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); - if (XTy.getSizeInBits() == 512 || Subtarget.hasVLX()) - return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X); + if (XTy.getSizeInBits() == 512 || Subtarget.hasVLX()) { + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); + return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp); + } break; case MVT::v8f16: case MVT::v16f16: if (Subtarget.hasFP16()) { if (Subtarget.hasVLX()) { Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); - return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X); + return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp); } break; } - ExtVT = XTy.changeVectorElementType(MVT::f32); - X = DAG.getFPExtendOrRound(X, DL, ExtVT); - Exp = DAG.getSExtOrTrunc(Exp, DL, ExtVT.changeTypeToInteger()); + X = DAG.getFPExtendOrRound(X, DL, XTy.changeVectorElementType(MVT::f32)); + Exp = DAG.getSExtOrTrunc(Exp, DL, + 
X.getSimpleValueType().changeTypeToInteger()); break; case MVT::v32f16: if (Subtarget.hasFP16()) { Exp = DAG.getNode(ISD::SINT_TO_FP, DL, XTy, Exp); - return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp, X); + return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp); } return splitVectorOp(Op, DAG, DL); } - SDValue WideX = widenSubVector(X, true, Subtarget, DAG, DL, 512); SDValue WideExp = widenSubVector(Exp, true, Subtarget, DAG, DL, 512); - if (X.getValueType() != XTy) { - WideExp = - DAG.getNode(ISD::SINT_TO_FP, DL, WideX.getSimpleValueType(), WideExp); - SDValue Scalef = DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(), - WideX, WideExp, WideX); - SDValue Final = - DAG.getExtractSubvector(DL, X.getSimpleValueType(), Scalef, 0); - return DAG.getFPExtendOrRound(Final, DL, XTy); - } - SDValue Scalef = DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(), WideX, - WideExp, WideX); - return DAG.getExtractSubvector(DL, XTy, Scalef, 0); + Exp = DAG.getNode(ISD::SINT_TO_FP, DL, WideExp.getSimpleValueType(), Exp); + SDValue Scalef = + DAG.getNode(X86ISD::SCALEF, DL, WideX.getValueType(), WideX, WideExp); + SDValue Final = + DAG.getExtractSubvector(DL, X.getSimpleValueType(), Scalef, 0); + return DAG.getFPExtendOrRound(Final, DL, XTy); } static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 05159724717f8..3f63d2a483ae0 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -115,7 +115,7 @@ define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) { ; CHECK-ONLY-AVX512F-LABEL: fmul_pow2_ldexp_4xfloat: ; CHECK-ONLY-AVX512F: # %bb.0: ; CHECK-ONLY-AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0] -; CHECK-ONLY-AVX512F-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vmovaps %xmm0, %xmm0 ; 
CHECK-ONLY-AVX512F-NEXT: vscalefps %zmm0, %zmm1, %zmm0 ; CHECK-ONLY-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-ONLY-AVX512F-NEXT: vzeroupper @@ -580,7 +580,6 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-AVX512F: # %bb.0: ; CHECK-AVX512F-NEXT: vbroadcastss {{.*#+}} ymm1 = [8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3] ; CHECK-AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 -; CHECK-AVX512F-NEXT: vcvtdq2ps %zmm0, %zmm0 ; CHECK-AVX512F-NEXT: vscalefps %zmm0, %zmm1, %zmm0 ; CHECK-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0 ; CHECK-AVX512F-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/ldexp-avx512.ll b/llvm/test/CodeGen/X86/ldexp-avx512.ll index a853da2404b23..bb6dc3162eb1f 100644 --- a/llvm/test/CodeGen/X86/ldexp-avx512.ll +++ b/llvm/test/CodeGen/X86/ldexp-avx512.ll @@ -75,9 +75,8 @@ declare fp128 @ldexpl(fp128, i32) memory(none) define <8 x half> @test_ldexp_8xhalf(<8 x half> %x, <8 x i16> %exp) nounwind { ; AVX512F-LABEL: test_ldexp_8xhalf: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvtph2ps %xmm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %xmm1, %ymm1 -; AVX512F-NEXT: vcvtdq2ps %zmm1, %zmm1 +; AVX512F-NEXT: vcvtph2ps %xmm0, %ymm0 ; AVX512F-NEXT: vscalefps %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -95,9 +94,8 @@ define <8 x half> @test_ldexp_8xhalf(<8 x half> %x, <8 x i16> %exp) nounwind { ; ; AVX512VL-LABEL: test_ldexp_8xhalf: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm0 ; AVX512VL-NEXT: vpmovsxwd %xmm1, %ymm1 -; AVX512VL-NEXT: vcvtdq2ps %zmm1, %zmm1 +; AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm0 ; AVX512VL-NEXT: vscalefps %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vcvtps2ph $4, %ymm0, %xmm0 ; AVX512VL-NEXT: vzeroupper @@ -116,7 +114,7 @@ declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>) define <4 x float> @test_ldexp_4xfloat(<4 x float> %x, <4 x i32> %exp) nounwind { ; AVX512-LABEL: test_ldexp_4xfloat: ; AVX512: # %bb.0: -; AVX512-NEXT: 
vcvtdq2ps %xmm1, %xmm1 +; AVX512-NEXT: vmovaps %xmm1, %xmm1 ; AVX512-NEXT: vmovaps %xmm0, %xmm0 ; AVX512-NEXT: vscalefps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -158,9 +156,8 @@ declare <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double>, <2 x i32>) define <16 x half> @test_ldexp_16xhalf(<16 x half> %x, <16 x i16> %exp) nounwind { ; AVX512F-LABEL: test_ldexp_16xhalf: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 ; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 -; AVX512F-NEXT: vcvtdq2ps %zmm1, %zmm1 +; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 ; AVX512F-NEXT: vscalefps %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 ; AVX512F-NEXT: retq @@ -176,9 +173,8 @@ define <16 x half> @test_ldexp_16xhalf(<16 x half> %x, <16 x i16> %exp) nounwind ; ; AVX512VL-LABEL: test_ldexp_16xhalf: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtph2ps %ymm0, %zmm0 ; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm1 -; AVX512VL-NEXT: vcvtdq2ps %zmm1, %zmm1 +; AVX512VL-NEXT: vcvtph2ps %ymm0, %zmm0 ; AVX512VL-NEXT: vscalefps %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vcvtps2ph $4, %zmm0, %ymm0 ; AVX512VL-NEXT: retq @@ -196,7 +192,7 @@ declare <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half>, <16 x i16>) define <8 x float> @test_ldexp_8xfloat(<8 x float> %x, <8 x i32> %exp) nounwind { ; AVX512-LABEL: test_ldexp_8xfloat: ; AVX512: # %bb.0: -; AVX512-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX512-NEXT: vmovaps %ymm1, %ymm1 ; AVX512-NEXT: vmovaps %ymm0, %ymm0 ; AVX512-NEXT: vscalefps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -221,7 +217,7 @@ declare <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float>, <8 x i32>) define <4 x double> @test_ldexp_4xdouble(<4 x double> %x, <4 x i32> %exp) nounwind { ; AVX512-LABEL: test_ldexp_4xdouble: ; AVX512: # %bb.0: -; AVX512-NEXT: vcvtdq2pd %xmm1, %ymm1 +; AVX512-NEXT: vmovapd %xmm1, %xmm1 ; AVX512-NEXT: vmovapd %ymm0, %ymm0 ; AVX512-NEXT: vscalefpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 
killed $ymm0 killed $zmm0 @@ -246,16 +242,14 @@ declare <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double>, <4 x i32>) define <32 x half> @test_ldexp_32xhalf(<32 x half> %x, <32 x i16> %exp) nounwind { ; AVX512F-LABEL: test_ldexp_32xhalf: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm2 -; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm3 -; AVX512F-NEXT: vcvtdq2ps %zmm3, %zmm3 -; AVX512F-NEXT: vscalefps %zmm3, %zmm2, %zmm2 +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm2 +; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm3 +; AVX512F-NEXT: vscalefps %zmm2, %zmm3, %zmm2 ; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2 -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 -; AVX512F-NEXT: vcvtdq2ps %zmm1, %zmm1 +; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 ; AVX512F-NEXT: vscalefps %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0 @@ -269,16 +263,14 @@ define <32 x half> @test_ldexp_32xhalf(<32 x half> %x, <32 x i16> %exp) nounwind ; ; AVX512VL-LABEL: test_ldexp_32xhalf: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtph2ps %ymm0, %zmm2 -; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm3 -; AVX512VL-NEXT: vcvtdq2ps %zmm3, %zmm3 -; AVX512VL-NEXT: vscalefps %zmm3, %zmm2, %zmm2 +; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm2 +; AVX512VL-NEXT: vcvtph2ps %ymm0, %zmm3 +; AVX512VL-NEXT: vscalefps %zmm2, %zmm3, %zmm2 ; AVX512VL-NEXT: vcvtps2ph $4, %zmm2, %ymm2 -; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; AVX512VL-NEXT: vcvtph2ps %ymm0, %zmm0 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm1 -; AVX512VL-NEXT: vcvtdq2ps %zmm1, %zmm1 +; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; AVX512VL-NEXT: vcvtph2ps %ymm0, %zmm0 ; AVX512VL-NEXT: vscalefps %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vcvtps2ph $4, %zmm0, %ymm0 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, 
%zmm0 From e5bcb93220d8f9480f0c64f9919bafa644a31263 Mon Sep 17 00:00:00 2001 From: Kavin Gnanapandithan Date: Thu, 20 Nov 2025 11:03:46 -0500 Subject: [PATCH 7/8] Move FLDEXP setOperationAction above comment for clarity --- llvm/lib/Target/X86/X86ISelLowering.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 80d4f67e18934..69bbd85e417e8 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1785,6 +1785,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + } if (Subtarget.hasDQI() && Subtarget.hasVLX()) { for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) { @@ -2098,14 +2099,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // pre-AVX512 equivalents. Without VLX we use 512-bit operations for // narrower widths. if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { - // These operations are handled on non-VLX by artificially widening in - // isel patterns. - for (MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v8f16, MVT::v4f32, MVT::v2f64, MVT::v16f16, MVT::v8f32, MVT::v4f64, MVT::v32f16, MVT::v16f32, MVT::v8f64}) setOperationAction(ISD::FLDEXP, VT, Custom); + // These operations are handled on non-VLX by artificially widening in + // isel patterns. 
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom); From ea457ee3890665ffb8a12fead21b2cbc07c585a2 Mon Sep 17 00:00:00 2001 From: Kavin Gnanapandithan Date: Thu, 20 Nov 2025 11:09:47 -0500 Subject: [PATCH 8/8] Executed clang-format --- llvm/lib/Target/X86/X86ISelLowering.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 69bbd85e417e8..7d9a8a374fc6b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1785,7 +1785,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); - } if (Subtarget.hasDQI() && Subtarget.hasVLX()) { for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {