Skip to content

Commit

Permalink
[X86][SSE] Replace (V)CVTTPS2DQ and VCVTTPD2DQ truncating (round to z…
Browse files Browse the repository at this point in the history
…ero) f32/f64 to i32 with generic IR (llvm)

This patch removes the llvm intrinsics (V)CVTTPS2DQ and VCVTTPD2DQ truncation (round to zero) conversions and auto-upgrades to FP_TO_SINT calls instead.

Note: I looked at updating CVTTPD2DQ as well but this still requires a lot more work to correctly lower.

Differential Revision: http://reviews.llvm.org/D20860

llvm-svn: 271510
  • Loading branch information
RKSimon committed Jun 2, 2016
1 parent 7f74ded commit 0afd5a4
Show file tree
Hide file tree
Showing 9 changed files with 58 additions and 89 deletions.
6 changes: 0 additions & 6 deletions llvm/include/llvm/IR/IntrinsicsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -488,8 +488,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[llvm_v4f32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
def int_x86_sse2_cvtps2dq : GCCBuiltin<"__builtin_ia32_cvtps2dq">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_x86_sse2_cvttps2dq : GCCBuiltin<"__builtin_ia32_cvttps2dq">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_x86_sse2_cvtsd2si : GCCBuiltin<"__builtin_ia32_cvtsd2si">,
Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
def int_x86_sse2_cvtsd2si64 : GCCBuiltin<"__builtin_ia32_cvtsd2si64">,
Expand Down Expand Up @@ -1725,12 +1723,8 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[llvm_v4f32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
def int_x86_avx_cvt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvtps2dq256">,
Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
def int_x86_avx_cvtt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvttpd2dq256">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
def int_x86_avx_cvt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvtpd2dq256">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
def int_x86_avx_cvtt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvttps2dq256">,
Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
}

// Vector bit test
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/IR/AutoUpgrade.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
Name == "x86.sse2.cvtps2pd" ||
Name == "x86.avx.cvtdq2.pd.256" ||
Name == "x86.avx.cvt.ps2.pd.256" ||
Name == "x86.sse2.cvttps2dq" ||
Name.startswith("x86.avx.cvtt.") ||
Name.startswith("x86.avx.vinsertf128.") ||
Name == "x86.avx2.vinserti128" ||
Name.startswith("x86.avx.vextractf128.") ||
Expand Down Expand Up @@ -498,6 +500,12 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateSIToFP(Rep, DstTy, "cvtdq2pd");
else
Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd");
} else if (Name == "llvm.x86.sse2.cvttps2dq" ||
Name.startswith("llvm.x86.avx.cvtt.")) {
// Truncation (round to zero) float/double to i32 vector conversion.
Value *Src = CI->getArgOperand(0);
VectorType *DstTy = cast<VectorType>(CI->getType());
Rep = Builder.CreateFPToSI(Src, DstTy, "cvtt");
} else if (Name.startswith("llvm.x86.avx.movnt.")) {
Module *M = F->getParent();
SmallVector<Metadata *, 1> Elts;
Expand Down
31 changes: 8 additions & 23 deletions llvm/lib/Target/X86/X86InstrSSE.td
Original file line number Diff line number Diff line change
Expand Up @@ -2013,35 +2013,24 @@ def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
// SSE2 packed instructions with XS prefix
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_sse2_cvttps2dq VR128:$src))],
IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
[], IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttps2dq
(loadv4f32 addr:$src)))],
IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
[], IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
[], IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
(loadv8f32 addr:$src)))],
IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
[], IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
Sched<[WriteCvtF2ILd]>;

def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
[], IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
[], IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;

let Predicates = [HasAVX] in {
def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
Expand Down Expand Up @@ -2111,14 +2100,10 @@ def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
[], IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
[], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;

Expand Down
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
Original file line number Diff line number Diff line change
Expand Up @@ -675,11 +675,10 @@ define <2 x i64> @test_mm256_cvttpd_epi32(<4 x double> %a0) nounwind {
; X64-NEXT: vcvttpd2dqy %ymm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%cvt = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0)
%cvt = fptosi <4 x double> %a0 to <4 x i32>
%res = bitcast <4 x i32> %cvt to <2 x i64>
ret <2 x i64> %res
}
declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone

define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_cvttps_epi32:
Expand All @@ -691,11 +690,10 @@ define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
; X64: # BB#0:
; X64-NEXT: vcvttps2dq %ymm0, %ymm0
; X64-NEXT: retq
%cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0)
%cvt = fptosi <8 x float> %a0 to <8 x i32>
%res = bitcast <8 x i32> %cvt to <4 x i64>
ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone

define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_div_pd:
Expand Down
25 changes: 24 additions & 1 deletion llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
Original file line number Diff line number Diff line change
Expand Up @@ -357,12 +357,35 @@ define <4 x double> @test_x86_avx_cvt_ps2_pd_256(<4 x float> %a0) {
declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone


define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
; CHECK-LABEL: test_x86_avx_cvtt_pd2dq_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vcvttpd2dqy %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone


define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
; CHECK-LABEL: test_x86_avx_cvtt_ps2dq_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone


define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
; add operation forces the execution domain.
; CHECK-LABEL: test_x86_sse2_storeu_dq:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: vpaddb LCPI32_0, %xmm0, %xmm0
; CHECK-NEXT: vpaddb LCPI34_0, %xmm0, %xmm0
; CHECK-NEXT: vmovdqu %xmm0, (%eax)
; CHECK-NEXT: retl
%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
Expand Down
39 changes: 3 additions & 36 deletions llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3407,39 +3407,6 @@ define <8 x float> @test_x86_avx_cvtdq2_ps_256(<8 x i32> %a0) {
declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>) nounwind readnone


define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
; AVX-LABEL: test_x86_avx_cvtt_pd2dq_256:
; AVX: ## BB#0:
; AVX-NEXT: vcvttpd2dqy %ymm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retl
;
; AVX512VL-LABEL: test_x86_avx_cvtt_pd2dq_256:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: vcvttpd2dqy %ymm0, %xmm0
; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone


define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
; AVX-LABEL: test_x86_avx_cvtt_ps2dq_256:
; AVX: ## BB#0:
; AVX-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX-NEXT: retl
;
; AVX512VL-LABEL: test_x86_avx_cvtt_ps2dq_256:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone


define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
; AVX-LABEL: test_x86_avx_dp_ps_256:
; AVX: ## BB#0:
Expand Down Expand Up @@ -4133,7 +4100,7 @@ define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) {
;
; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd_256_2:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: vpermilpd LCPI233_0, %ymm0, %ymm0
; AVX512VL-NEXT: vpermilpd LCPI231_0, %ymm0, %ymm0
; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 0, i64 2>) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
Expand Down Expand Up @@ -4625,15 +4592,15 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
; AVX-LABEL: movnt_dq:
; AVX: ## BB#0:
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-NEXT: vpaddq LCPI260_0, %xmm0, %xmm0
; AVX-NEXT: vpaddq LCPI258_0, %xmm0, %xmm0
; AVX-NEXT: vmovntdq %ymm0, (%eax)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retl
;
; AVX512VL-LABEL: movnt_dq:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512VL-NEXT: vpaddq LCPI260_0, %xmm0, %xmm0
; AVX512VL-NEXT: vpaddq LCPI258_0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovntdq %ymm0, (%eax)
; AVX512VL-NEXT: retl
%a2 = add <2 x i64> %a1, <i64 1, i64 1>
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1280,11 +1280,10 @@ define <2 x i64> @test_mm_cvttps_epi32(<4 x float> %a0) nounwind {
; X64: # BB#0:
; X64-NEXT: cvttps2dq %xmm0, %xmm0
; X64-NEXT: retq
%res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0)
%res = fptosi <4 x float> %a0 to <4 x i32>
%bc = bitcast <4 x i32> %res to <2 x i64>
ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone

define i32 @test_mm_cvttsd_si32(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_cvttsd_si32:
Expand Down
13 changes: 12 additions & 1 deletion llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,17 @@ define <2 x double> @test_x86_sse2_cvtps2pd(<4 x float> %a0) {
declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone


define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
; CHECK-LABEL: test_x86_sse2_cvttps2dq:
; CHECK: ## BB#0:
; CHECK-NEXT: cvttps2dq %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone


define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_sse2_storel_dq:
; CHECK: ## BB#0:
Expand All @@ -101,7 +112,7 @@ define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_x86_sse2_storeu_dq:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: paddb LCPI7_0, %xmm0
; CHECK-NEXT: paddb LCPI8_0, %xmm0
; CHECK-NEXT: movdqu %xmm0, (%eax)
; CHECK-NEXT: retl
%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
Expand Down
16 changes: 0 additions & 16 deletions llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
Original file line number Diff line number Diff line change
Expand Up @@ -322,22 +322,6 @@ define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) {
declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone


define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
; SSE-LABEL: test_x86_sse2_cvttps2dq:
; SSE: ## BB#0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: retl
;
; KNL-LABEL: test_x86_sse2_cvttps2dq:
; KNL: ## BB#0:
; KNL-NEXT: vcvttps2dq %xmm0, %xmm0
; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone


define i32 @test_x86_sse2_cvttsd2si(<2 x double> %a0) {
; SSE-LABEL: test_x86_sse2_cvttsd2si:
; SSE: ## BB#0:
Expand Down

0 comments on commit 0afd5a4

Please sign in to comment.