From ceffb43b1b7e6d5d8f666acaa51647c87834af09 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 15 Nov 2016 16:24:40 +0000
Subject: [PATCH] [X86][SSE] Improve SINT_TO_FP of boolean vector results (signum)

This patch helps avoid poor legalization of boolean vector results (e.g.
8f32 -> 8i1 -> 8i16) that feed into SINT_TO_FP by inserting an early
SIGN_EXTEND, and so helps improve the truncation logic.

This is not necessary for AVX512 targets, where boolean vectors are legal -
AVX512 manages to lower (sint_to_fp vXi1) into some form of
(select mask, 1.0f, 0.0f) in most cases.

Fix for PR13248

Differential Revision: https://reviews.llvm.org/D26583

llvm-svn: 286979
---
 llvm/lib/Target/X86/X86ISelLowering.cpp |   5 +-
 llvm/test/CodeGen/X86/avx512-cvt.ll     |  19 +----
 llvm/test/CodeGen/X86/sse-fsignum.ll    | 104 +++---------------------
 3 files changed, 19 insertions(+), 109 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cd1ce17811212..ca3e0896cf297 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31752,9 +31752,12 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
   EVT InVT = Op0.getValueType();
   EVT InSVT = InVT.getScalarType();
 
+  // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
   // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
   // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
-  if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
+  if (InVT.isVector() &&
+      (InSVT == MVT::i8 || InSVT == MVT::i16 ||
+       (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
     SDLoc dl(N);
     EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                  InVT.getVectorNumElements());
diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll
index c370b79008cd8..165ffb2a3e456 100644
--- a/llvm/test/CodeGen/X86/avx512-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512-cvt.ll
@@ -836,8 +836,6 @@ define <4 x double> @sitofp_4i1_double(<4 x double> %a) {
 ; KNL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
 ; KNL-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
 ; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: vpslld $31, %xmm0, %xmm0
-; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
 ; KNL-NEXT: vcvtdq2pd %xmm0, %ymm0
 ; KNL-NEXT: retq
 ;
@@ -860,21 +858,8 @@ define <2 x float> @sitofp_2i1_float(<2 x float> %a) {
 ; KNL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
 ; KNL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; KNL-NEXT: vpsllq $32, %xmm0, %xmm0
-; KNL-NEXT: vpsrad $31, %xmm0, %xmm1
-; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: xorl %ecx, %ecx
-; KNL-NEXT: testb $1, %al
-; KNL-NEXT: movl $-1, %eax
-; KNL-NEXT: movl $0, %edx
-; KNL-NEXT: cmovnel %eax, %edx
-; KNL-NEXT: vcvtsi2ssl %edx, %xmm2, %xmm1
-; KNL-NEXT: vmovq %xmm0, %rdx
-; KNL-NEXT: testb $1, %dl
-; KNL-NEXT: cmovnel %eax, %ecx
-; KNL-NEXT: vcvtsi2ssl %ecx, %xmm2, %xmm0
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; KNL-NEXT: vcvtdq2ps %xmm0, %xmm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: sitofp_2i1_float:
diff --git a/llvm/test/CodeGen/X86/sse-fsignum.ll b/llvm/test/CodeGen/X86/sse-fsignum.ll
index a9c0a11e0f7fa..3941e05ad6821 100644
--- a/llvm/test/CodeGen/X86/sse-fsignum.ll
+++ b/llvm/test/CodeGen/X86/sse-fsignum.ll
@@ -33,59 +33,19 @@ entry:
 }
 
 define void @signum64a(<2 x double>*) {
-; AVX1-LABEL: signum64a:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovapd (%rdi), %xmm0
-; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpextrq $1, %xmm2, %rax
-; AVX1-NEXT: vmovq %xmm2, %rcx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vcvtdq2pd %xmm2, %xmm2
-; AVX1-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX1-NEXT: vsubpd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vmovapd %xmm0, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: signum64a:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovapd (%rdi), %xmm0
-; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpextrq $1, %xmm2, %rax
-; AVX2-NEXT: vmovq %xmm2, %rcx
-; AVX2-NEXT: vmovd %ecx, %xmm2
-; AVX2-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vcvtdq2pd %xmm2, %xmm2
-; AVX2-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX2-NEXT: vsubpd %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vmovapd %xmm0, (%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: signum64a:
-; AVX512F: # BB#0: # %entry
-; AVX512F-NEXT: vmovapd (%rdi), %xmm0
-; AVX512F-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512F-NEXT: vcvtdq2pd %xmm2, %xmm2
-; AVX512F-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512F-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX512F-NEXT: vsubpd %xmm0, %xmm2, %xmm0
-; AVX512F-NEXT: vmovapd %xmm0, (%rdi)
-; AVX512F-NEXT: retq
+; AVX-LABEL: signum64a:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovapd (%rdi), %xmm0
+; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX-NEXT: vcvtdq2pd %xmm2, %xmm2
+; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT: vsubpd %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vmovapd %xmm0, (%rdi)
+; AVX-NEXT: retq
 entry:
   %1 = load <2 x double>, <2 x double>* %0
   %2 = fcmp olt <2 x double> %1, zeroinitializer
@@ -107,24 +67,8 @@ define void @signum32b(<8 x float>*) {
 ; AVX1-NEXT: vmovaps (%rdi), %ymm0
 ; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
 ; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsllw $15, %xmm2, %xmm2
-; AVX1-NEXT: vpsraw $15, %xmm2, %xmm2
-; AVX1-NEXT: vpmovsxwd %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
 ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
 ; AVX1-NEXT: vsubps %ymm0, %ymm2, %ymm0
 ; AVX1-NEXT: vmovaps %ymm0, (%rdi)
@@ -136,18 +80,8 @@ define void @signum32b(<8 x float>*) {
 ; AVX2-NEXT: vmovaps (%rdi), %ymm0
 ; AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT: vcmpltps %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsllw $15, %xmm2, %xmm2
-; AVX2-NEXT: vpsraw $15, %xmm2, %xmm2
-; AVX2-NEXT: vpmovsxwd %xmm2, %ymm2
 ; AVX2-NEXT: vcvtdq2ps %ymm2, %ymm2
 ; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
 ; AVX2-NEXT: vsubps %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT: vmovaps %ymm0, (%rdi)
@@ -189,14 +123,10 @@ define void @signum64b(<4 x double>*) {
 ; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
 ; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
 ; AVX1-NEXT: vcvtdq2pd %xmm2, %ymm2
 ; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
 ; AVX1-NEXT: vsubpd %ymm0, %ymm2, %ymm0
 ; AVX1-NEXT: vmovapd %ymm0, (%rdi)
@@ -210,14 +140,10 @@ define void @signum64b(<4 x double>*) {
 ; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
 ; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpslld $31, %xmm2, %xmm2
-; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
 ; AVX2-NEXT: vcvtdq2pd %xmm2, %ymm2
 ; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
 ; AVX2-NEXT: vsubpd %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT: vmovapd %ymm0, (%rdi)
@@ -230,13 +156,9 @@ define void @signum64b(<4 x double>*) {
 ; AVX512F-NEXT: vxorpd %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT: vpmovqd %zmm2, %ymm2
-; AVX512F-NEXT: vpslld $31, %xmm2, %xmm2
-; AVX512F-NEXT: vpsrad $31, %xmm2, %xmm2
 ; AVX512F-NEXT: vcvtdq2pd %xmm2, %ymm2
 ; AVX512F-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512F-NEXT: vpsrad $31, %xmm0, %xmm0
 ; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0
 ; AVX512F-NEXT: vsubpd %ymm0, %ymm2, %ymm0
 ; AVX512F-NEXT: vmovapd %ymm0, (%rdi)
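
For reference, a minimal LLVM IR sketch of the boolean-vector signum idiom
these tests exercise, modeled on @signum32b from sse-fsignum.ll (the
value-based signature and the name @signum32_sketch are illustrative; the
actual tests load and store through a <8 x float>* argument). Each fcmp
produces an <8 x i1> mask, and sitofp maps i1 true to -1.0:

define <8 x float> @signum32_sketch(<8 x float> %x) {
entry:
  ; <8 x i1> mask of lanes where x < 0; sitofp turns true into -1.0
  %lt = fcmp olt <8 x float> %x, zeroinitializer
  %lt.f = sitofp <8 x i1> %lt to <8 x float>
  ; <8 x i1> mask of lanes where x > 0, converted the same way
  %gt = fcmp ogt <8 x float> %x, zeroinitializer
  %gt.f = sitofp <8 x i1> %gt to <8 x float>
  ; -1.0 where x < 0, +1.0 where x > 0, 0.0 otherwise
  %sig = fsub <8 x float> %lt.f, %gt.f
  ret <8 x float> %sig
}

Previously the <8 x i1> operands took the poor 8f32 -> 8i1 -> 8i16
legalization path visible in the removed vpacksswb/vpsllw/vpsraw sequences
above; with the combine, each sitofp operand is first sign-extended to
<8 x i32>, so the compare mask feeds vcvtdq2ps directly.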