[X86][SSE] Reimplement SSE fp2si conversion intrinsics instead of using generic IR

D20859 and D20860 attempted to replace the SSE (V)CVTTPS2DQ and VCVTTPD2DQ truncating conversions with generic IR instead.

It turns out that the behaviour of these intrinsics is different enough from generic IR to cause problems: INF/NaN/out-of-range values are guaranteed to produce the 0x80000000 value, which plays havoc with constant folding, since the generic IR folds them to either zero or UNDEF. This is also an issue with the scalar implementations (which were already generic IR, and which I was trying to match).

This patch changes both scalar and packed versions back to using x86-specific builtins.

It also deals with the other scalar conversion cases that depend on the runtime rounding mode and can have similar constant folding issues.
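
As a rough standalone illustration of the truncating behaviour described above (not part of this patch; it just demonstrates the documented CVTTPS2DQ semantics through the _mm_cvttps_epi32 intrinsic, and a plain C cast is what the generic fptosi used to model):

// Rough standalone illustration (not part of this patch): CVTTPS2DQ maps
// NaN, infinities and out-of-range inputs to the "integer indefinite"
// value 0x80000000, unlike a plain C cast.
#include <immintrin.h>
#include <cstdio>
#include <limits>

int main() {
  __m128 v = _mm_set_ps(1.0e10f,                                   // out of range for i32
                        -std::numeric_limits<float>::infinity(),   // -INF
                        std::numeric_limits<float>::quiet_NaN(),   // NaN
                        42.5f);                                    // element 0: in range
  __m128i r = _mm_cvttps_epi32(v);  // truncating conversion, CVTTPS2DQ

  alignas(16) int out[4];
  _mm_store_si128(reinterpret_cast<__m128i *>(out), r);
  // Expected output: 0x0000002a for the in-range lane, then 0x80000000
  // for the NaN, -INF and out-of-range lanes.
  for (int i = 0; i != 4; ++i)
    std::printf("0x%08x\n", static_cast<unsigned>(out[i]));
  return 0;
}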

A companion clang patch is at D22105.

Differential Revision: https://reviews.llvm.org/D22106

llvm-svn: 275981
RKSimon committed Jul 19, 2016
1 parent 9e4bd0c commit 0ea8d27
Showing 14 changed files with 138 additions and 95 deletions.
6 changes: 6 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsX86.td
@@ -479,6 +479,8 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[llvm_v4f32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
def int_x86_sse2_cvtps2dq : GCCBuiltin<"__builtin_ia32_cvtps2dq">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_x86_sse2_cvttps2dq : GCCBuiltin<"__builtin_ia32_cvttps2dq">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_x86_sse2_cvtsd2si : GCCBuiltin<"__builtin_ia32_cvtsd2si">,
Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
def int_x86_sse2_cvtsd2si64 : GCCBuiltin<"__builtin_ia32_cvtsd2si64">,
@@ -1512,8 +1514,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[llvm_v4f32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
def int_x86_avx_cvt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvtps2dq256">,
Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
def int_x86_avx_cvtt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvttpd2dq256">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
def int_x86_avx_cvt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvtpd2dq256">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
def int_x86_avx_cvtt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvttps2dq256">,
Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
}

// Vector bit test
19 changes: 10 additions & 9 deletions llvm/lib/Analysis/ConstantFolding.cpp
@@ -1424,8 +1424,8 @@ Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double), double V,
/// integer type Ty is used to select how many bits are available for the
/// result. Returns null if the conversion cannot be performed, otherwise
/// returns the Constant value resulting from the conversion.
Constant *ConstantFoldConvertToInt(const APFloat &Val, bool roundTowardZero,
Type *Ty) {
Constant *ConstantFoldSSEConvertToInt(const APFloat &Val, bool roundTowardZero,
Type *Ty) {
// All of these conversion intrinsics form an integer of at most 64bits.
unsigned ResultWidth = Ty->getIntegerBitWidth();
assert(ResultWidth <= 64 &&
@@ -1438,7 +1438,8 @@ Constant *ConstantFoldConvertToInt(const APFloat &Val, bool roundTowardZero,
APFloat::opStatus status = Val.convertToInteger(&UIntVal, ResultWidth,
/*isSigned=*/true, mode,
&isExact);
if (status != APFloat::opOK && status != APFloat::opInexact)
if (status != APFloat::opOK &&
(!roundTowardZero || status != APFloat::opInexact))
return nullptr;
return ConstantInt::get(Ty, UIntVal, /*isSigned=*/true);
}
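
For reference, a standalone sketch of the rule this change implements (it assumes the LLVM APFloat API as used in the code above; the canConstantFold helper name is only for illustration and is not part of the patch): an inexact truncation is still folded for the cvtt* intrinsics, whose rounding is fixed at round-toward-zero, while the non-truncating cvtsd2si/cvtss2si forms are now only folded when the conversion is exact, since an inexact result would depend on the runtime rounding mode (MXCSR).

// Standalone sketch of the folding rule above (assumes the LLVM APFloat API
// as used in ConstantFolding.cpp; not part of the patch).
#include "llvm/ADT/APFloat.h"
using namespace llvm;

static bool canConstantFold(const APFloat &Val, bool roundTowardZero) {
  uint64_t UIntVal;
  bool isExact = false;
  APFloat::roundingMode mode = roundTowardZero ? APFloat::rmTowardZero
                                               : APFloat::rmNearestTiesToEven;
  APFloat::opStatus status = Val.convertToInteger(&UIntVal, /*Width=*/32,
                                                  /*isSigned=*/true, mode,
                                                  &isExact);
  // NaN/overflow (opInvalidOp) is never folded here; an inexact result is
  // only acceptable for the truncating (cvtt*) intrinsics.
  return status == APFloat::opOK ||
         (roundTowardZero && status == APFloat::opInexact);
}
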
@@ -1676,17 +1677,17 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
case Intrinsic::x86_sse2_cvtsd2si:
case Intrinsic::x86_sse2_cvtsd2si64:
if (ConstantFP *FPOp =
dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
return ConstantFoldConvertToInt(FPOp->getValueAPF(),
/*roundTowardZero=*/false, Ty);
dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(),
/*roundTowardZero=*/false, Ty);
case Intrinsic::x86_sse_cvttss2si:
case Intrinsic::x86_sse_cvttss2si64:
case Intrinsic::x86_sse2_cvttsd2si:
case Intrinsic::x86_sse2_cvttsd2si64:
if (ConstantFP *FPOp =
dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
return ConstantFoldConvertToInt(FPOp->getValueAPF(),
/*roundTowardZero=*/true, Ty);
dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(),
/*roundTowardZero=*/true, Ty);
}
}

8 changes: 0 additions & 8 deletions llvm/lib/IR/AutoUpgrade.cpp
@@ -251,8 +251,6 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
Name == "sse2.cvtps2pd" ||
Name == "avx.cvtdq2.pd.256" ||
Name == "avx.cvt.ps2.pd.256" ||
Name == "sse2.cvttps2dq" ||
Name.startswith("avx.cvtt.") ||
Name.startswith("avx.vinsertf128.") ||
Name == "avx2.vinserti128" ||
Name.startswith("avx.vextractf128.") ||
@@ -712,12 +710,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateSIToFP(Rep, DstTy, "cvtdq2pd");
else
Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd");
} else if (IsX86 && (Name == "sse2.cvttps2dq" ||
Name.startswith("avx.cvtt."))) {
// Truncation (round to zero) float/double to i32 vector conversion.
Value *Src = CI->getArgOperand(0);
VectorType *DstTy = cast<VectorType>(CI->getType());
Rep = Builder.CreateFPToSI(Src, DstTy, "cvtt");
} else if (IsX86 && Name.startswith("sse4a.movnt.")) {
Module *M = F->getParent();
SmallVector<Metadata *, 1> Elts;
31 changes: 23 additions & 8 deletions llvm/lib/Target/X86/X86InstrSSE.td
@@ -2009,24 +2009,35 @@ def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
// SSE2 packed instructions with XS prefix
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[], IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
[(set VR128:$dst,
(int_x86_sse2_cvttps2dq VR128:$src))],
IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[], IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
[(set VR128:$dst, (int_x86_sse2_cvttps2dq
(loadv4f32 addr:$src)))],
IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[], IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
[(set VR256:$dst,
(int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[], IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
[(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
(loadv8f32 addr:$src)))],
IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
Sched<[WriteCvtF2ILd]>;

def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[], IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
[(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[], IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
[(set VR128:$dst,
(int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;

let Predicates = [HasAVX] in {
def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
@@ -2096,10 +2107,14 @@ def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[], IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
[(set VR128:$dst,
(int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
[(set VR128:$dst,
(int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;

6 changes: 4 additions & 2 deletions llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -681,10 +681,11 @@ define <2 x i64> @test_mm256_cvttpd_epi32(<4 x double> %a0) nounwind {
; X64-NEXT: vcvttpd2dqy %ymm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%cvt = fptosi <4 x double> %a0 to <4 x i32>
%cvt = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0)
%res = bitcast <4 x i32> %cvt to <2 x i64>
ret <2 x i64> %res
}
declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone

define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_cvttps_epi32:
@@ -696,10 +697,11 @@ define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
; X64: # BB#0:
; X64-NEXT: vcvttps2dq %ymm0, %ymm0
; X64-NEXT: retq
%cvt = fptosi <8 x float> %a0 to <8 x i32>
%cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0)
%res = bitcast <8 x i32> %cvt to <4 x i64>
ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone

define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_div_pd:
25 changes: 1 addition & 24 deletions llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -359,35 +359,12 @@ define <4 x double> @test_x86_avx_cvt_ps2_pd_256(<4 x float> %a0) {
declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone


define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
; CHECK-LABEL: test_x86_avx_cvtt_pd2dq_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vcvttpd2dqy %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone


define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
; CHECK-LABEL: test_x86_avx_cvtt_ps2dq_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone


define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
; add operation forces the execution domain.
; CHECK-LABEL: test_x86_sse2_storeu_dq:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: vpaddb LCPI34_0, %xmm0, %xmm0
; CHECK-NEXT: vpaddb LCPI32_0, %xmm0, %xmm0
; CHECK-NEXT: vmovdqu %xmm0, (%eax)
; CHECK-NEXT: retl
%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
37 changes: 35 additions & 2 deletions llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -3431,6 +3431,39 @@ define <8 x float> @test_x86_avx_cvtdq2_ps_256(<8 x i32> %a0) {
declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>) nounwind readnone


define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
; AVX-LABEL: test_x86_avx_cvtt_pd2dq_256:
; AVX: ## BB#0:
; AVX-NEXT: vcvttpd2dqy %ymm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retl
;
; AVX512VL-LABEL: test_x86_avx_cvtt_pd2dq_256:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: vcvttpd2dqy %ymm0, %xmm0
; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone


define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
; AVX-LABEL: test_x86_avx_cvtt_ps2dq_256:
; AVX: ## BB#0:
; AVX-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX-NEXT: retl
;
; AVX512VL-LABEL: test_x86_avx_cvtt_ps2dq_256:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone


define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
; AVX-LABEL: test_x86_avx_dp_ps_256:
; AVX: ## BB#0:
@@ -4552,15 +4585,15 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
; AVX-LABEL: movnt_dq:
; AVX: ## BB#0:
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0
; AVX-NEXT: vpaddq LCPI256_0, %xmm0, %xmm0
; AVX-NEXT: vmovntdq %ymm0, (%eax)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retl
;
; AVX512VL-LABEL: movnt_dq:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512VL-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0
; AVX512VL-NEXT: vpaddq LCPI256_0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovntdq %ymm0, (%eax)
; AVX512VL-NEXT: retl
%a2 = add <2 x i64> %a1, <i64 1, i64 1>
11 changes: 5 additions & 6 deletions llvm/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
@@ -6,13 +6,12 @@
define <4 x float> @test_mm_cvtsi64_ss(<4 x float> %a0, i64 %a1) nounwind {
; X64-LABEL: test_mm_cvtsi64_ss:
; X64: # BB#0:
; X64-NEXT: cvtsi2ssq %rdi, %xmm1
; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: cvtsi2ssq %rdi, %xmm0
; X64-NEXT: retq
%cvt = sitofp i64 %a1 to float
%res = insertelement <4 x float> %a0, float %cvt, i32 0
%res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone

define i64 @test_mm_cvtss_si64(<4 x float> %a0) nounwind {
; X64-LABEL: test_mm_cvtss_si64:
@@ -29,7 +28,7 @@ define i64 @test_mm_cvttss_si64(<4 x float> %a0) nounwind {
; X64: # BB#0:
; X64-NEXT: cvttss2si %xmm0, %rax
; X64-NEXT: retq
%cvt = extractelement <4 x float> %a0, i32 0
%res = fptosi float %cvt to i64
%res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
ret i64 %res
}
declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
18 changes: 7 additions & 11 deletions llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -707,20 +707,17 @@ declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
; X32-LABEL: test_mm_cvtsi32_ss:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: cvtsi2ssl %eax, %xmm1
; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsi32_ss:
; X64: # BB#0:
; X64-NEXT: cvtsi2ssl %edi, %xmm1
; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: cvtsi2ssl %edi, %xmm0
; X64-NEXT: retq
%cvt = sitofp i32 %a1 to float
%res = insertelement <4 x float> %a0, float %cvt, i32 0
%res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 %a1)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone

define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvtss_f32:
@@ -762,10 +759,10 @@ define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind {
; X64: # BB#0:
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: retq
%cvt = extractelement <4 x float> %a0, i32 0
%res = fptosi float %cvt to i32
%res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
ret i32 %res
}
declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone

define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvttss_si32:
@@ -777,8 +774,7 @@ define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
; X64: # BB#0:
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: retq
%cvt = extractelement <4 x float> %a0, i32 0
%res = fptosi float %cvt to i32
%res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
ret i32 %res
}

11 changes: 5 additions & 6 deletions llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
@@ -25,13 +25,12 @@ define i64 @test_mm_cvtsi128_si64(<2 x i64> %a0) nounwind {
define <2 x double> @test_mm_cvtsi64_sd(<2 x double> %a0, i64 %a1) nounwind {
; X64-LABEL: test_mm_cvtsi64_sd:
; X64: # BB#0:
; X64-NEXT: cvtsi2sdq %rdi, %xmm1
; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-NEXT: cvtsi2sdq %rdi, %xmm0
; X64-NEXT: retq
%cvt = sitofp i64 %a1 to double
%res = insertelement <2 x double> %a0, double %cvt, i32 0
%res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1)
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone

define <2 x i64> @test_mm_cvtsi64_si128(i64 %a0) nounwind {
; X64-LABEL: test_mm_cvtsi64_si128:
@@ -48,10 +47,10 @@ define i64 @test_mm_cvttsd_si64(<2 x double> %a0) nounwind {
; X64: # BB#0:
; X64-NEXT: cvttsd2si %xmm0, %rax
; X64-NEXT: retq
%ext = extractelement <2 x double> %a0, i32 0
%res = fptosi double %ext to i64
%res = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
ret i64 %res
}
declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone

define <2 x i64> @test_mm_loadu_si64(i64* %a0) nounwind {
; X64-LABEL: test_mm_loadu_si64:
