[DAG] visitINSERT_VECTOR_ELT - attempt to reconstruct BUILD_VECTOR before other folds interfere

Another issue unearthed by D127115

We take a long time to canonicalize an insert_vector_elt chain before being able to convert it into a build_vector - even if the insertions are already in ascending index order, we fold the nodes one at a time into the build_vector 'seed', leaving plenty of time for other folds to alter it (in particular, recognising when the inserted values come from extract_vector_elt, which results in a shuffle_vector that is much harder to fold with).
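
For illustration, the insert_i64_two_elts test in llvm/test/CodeGen/X86/sse-insertelt.ll (updated below) is a minimal instance of such a chain - two in-order insertions that should collapse into a single build_vector:

  define <2 x i64> @insert_i64_two_elts(<2 x i64> %x, i64 %s) {
    %i0 = insertelement <2 x i64> %x, i64 %s, i32 0
    %i1 = insertelement <2 x i64> %i0, i64 %s, i32 1
    ret <2 x i64> %i1
  }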

D127115 makes this particularly difficult as we're almost guaranteed to have lost the sequence before all possible insertions have been folded.

This patch proposes to begin at the last insertion, attempt to collect all the (one-use) insertions right away, and create the build_vector before it's too late.
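
Schematically (a sketch only - the node names are illustrative, not taken from an actual DAG dump), when visiting the final insertion the combine now rewrites

  t1 = insert_vector_elt t0, a, 0
  t2 = insert_vector_elt t1, b, 1    ; visited first, as the last insertion

directly into

  t2 = BUILD_VECTOR a, b

provided every insert_vector_elt in the chain is one-use with an in-order constant index; for integer types the collected scalars are any-extended or truncated to the widest element type seen.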

Differential Revision: https://reviews.llvm.org/D127595
RKSimon committed Jun 13, 2022
1 parent f97e15e commit 7d8fd4f
Showing 13 changed files with 100 additions and 113 deletions.
35 changes: 35 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -19426,6 +19426,41 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
      Ops.append(NumElts, DAG.getUNDEF(InVal.getValueType()));
      return UpdateBuildVector(Ops);
    }

    // If we're inserting into the end of a vector as part of a sequence, see
    // if we can create a BUILD_VECTOR by following the sequence back up the
    // chain.
    if (Elt == (NumElts - 1)) {
      SmallVector<SDValue> ReverseInsertions;
      ReverseInsertions.push_back(InVal);

      EVT MaxEltVT = InVal.getValueType();
      SDValue CurVec = InVec;
      for (unsigned I = 1; I != NumElts; ++I) {
        if (CurVec.getOpcode() != ISD::INSERT_VECTOR_ELT || !CurVec.hasOneUse())
          break;

        auto *CurIdx = dyn_cast<ConstantSDNode>(CurVec.getOperand(2));
        if (!CurIdx || CurIdx->getAPIntValue() != ((NumElts - 1) - I))
          break;
        SDValue CurVal = CurVec.getOperand(1);
        ReverseInsertions.push_back(CurVal);
        if (VT.isInteger()) {
          EVT CurValVT = CurVal.getValueType();
          MaxEltVT = MaxEltVT.bitsGE(CurValVT) ? MaxEltVT : CurValVT;
        }
        CurVec = CurVec.getOperand(0);
      }

      if (ReverseInsertions.size() == NumElts) {
        for (unsigned I = 0; I != NumElts; ++I) {
          SDValue Val = ReverseInsertions[(NumElts - 1) - I];
          Val = VT.isInteger() ? DAG.getAnyExtOrTrunc(Val, DL, MaxEltVT) : Val;
          Ops.push_back(Val);
        }
        return DAG.getBuildVector(VT, DL, Ops);
      }
    }
  }

  return SDValue();
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -258,9 +258,7 @@ define <2 x i32> @ins4s2(<4 x i32> %tmp1, <2 x i32> %tmp2) {
define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) {
; CHECK-LABEL: ins2d1:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mov v1.d[0], v0.d[0]
; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%tmp3 = extractelement <2 x i64> %tmp1, i32 0
%tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
@@ -282,7 +280,7 @@ define <2 x float> @ins4f2(<4 x float> %tmp1, <2 x float> %tmp2) {
define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) {
; CHECK-LABEL: ins2f1:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.2d, v0.d[1]
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%tmp3 = extractelement <2 x double> %tmp1, i32 1
@@ -228,7 +228,8 @@ define <64 x float> @insertelement_v64f32(<64 x float>* %a) #0 {
define <1 x double> @insertelement_v1f64(<1 x double> %op1) #0 {
; VBITS_GE_256-LABEL: insertelement_v1f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: fmov d0, #5.00000000
; VBITS_GE_256-NEXT: mov x8, #4617315517961601024
; VBITS_GE_256-NEXT: fmov d0, x8
; VBITS_GE_256-NEXT: ret
%r = insertelement <1 x double> %op1, double 5.0, i64 0
ret <1 x double> %r
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll
@@ -13,7 +13,7 @@
; CHECK: Legally typed node: [[VTWOA]]: v2f64 = BUILD_VECTOR
; CHECK: Legalizing node: [[VTWOB:t.*]]: v2f64 = BUILD_VECTOR
; CHECK: Legally typed node: [[VTWOB]]: v2f64 = BUILD_VECTOR
; CHECK: Legalizing node: t34: v2f64 = fmaxnum nnan reassoc [[VTWOB]], [[VTWOA]]
; CHECK: Legalizing node: t30: v2f64 = fmaxnum nnan reassoc [[VTWOB]], [[VTWOA]]

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"
5 changes: 1 addition & 4 deletions llvm/test/CodeGen/ARM/neon-copy.ll
@@ -257,10 +257,7 @@ define <2 x i32> @ins4s2(<4 x i32> %tmp1, <2 x i32> %tmp2) {
define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) {
; CHECK-LABEL: ins2d1:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov.32 d2[0], r0
; CHECK-NEXT: vmov.32 d2[1], r1
; CHECK-NEXT: vorr d0, d2, d2
; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: bx lr
%tmp3 = extractelement <2 x i64> %tmp1, i32 0
%tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
8 changes: 2 additions & 6 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
@@ -7,13 +7,9 @@
define <1 x i1> @insertelt_v1i1(<1 x i1> %x, i1 %elt) nounwind {
; CHECK-LABEL: insertelt_v1i1:
; CHECK: # %bb.0:
; CHECK-NEXT: andi a0, a0, 1
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: vsetvli zero, zero, e8, mf8, tu, mu
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu
; CHECK-NEXT: vand.vi v8, v8, 1
; CHECK-NEXT: vmv.v.x v8, a0
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
%y = insertelement <1 x i1> %x, i1 %elt, i64 0
25 changes: 10 additions & 15 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -41,9 +41,8 @@ define <1 x i8> @mgather_v1i8(<1 x i8*> %ptrs, <1 x i1> %m, <1 x i8> %passthru)
; RV64ZVE32F-NEXT: andi a1, a1, 1
; RV64ZVE32F-NEXT: beqz a1, .LBB0_2
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
; RV64ZVE32F-NEXT: lb a0, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, tu, mu
; RV64ZVE32F-NEXT: vmv.s.x v8, a0
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu
; RV64ZVE32F-NEXT: vlse8.v v8, (a0), zero
; RV64ZVE32F-NEXT: .LBB0_2: # %else
; RV64ZVE32F-NEXT: ret
%v = call <1 x i8> @llvm.masked.gather.v1i8.v1p0i8(<1 x i8*> %ptrs, i32 1, <1 x i1> %m, <1 x i8> %passthru)
@@ -1012,9 +1011,8 @@ define <1 x i16> @mgather_v1i16(<1 x i16*> %ptrs, <1 x i1> %m, <1 x i16> %passth
; RV64ZVE32F-NEXT: andi a1, a1, 1
; RV64ZVE32F-NEXT: beqz a1, .LBB13_2
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
; RV64ZVE32F-NEXT: lh a0, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, tu, mu
; RV64ZVE32F-NEXT: vmv.s.x v8, a0
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero
; RV64ZVE32F-NEXT: .LBB13_2: # %else
; RV64ZVE32F-NEXT: ret
%v = call <1 x i16> @llvm.masked.gather.v1i16.v1p0i16(<1 x i16*> %ptrs, i32 2, <1 x i1> %m, <1 x i16> %passthru)
@@ -2325,9 +2323,8 @@ define <1 x i32> @mgather_v1i32(<1 x i32*> %ptrs, <1 x i1> %m, <1 x i32> %passth
; RV64ZVE32F-NEXT: andi a1, a1, 1
; RV64ZVE32F-NEXT: beqz a1, .LBB27_2
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
; RV64ZVE32F-NEXT: lw a0, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, tu, mu
; RV64ZVE32F-NEXT: vmv.s.x v8, a0
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vlse32.v v8, (a0), zero
; RV64ZVE32F-NEXT: .LBB27_2: # %else
; RV64ZVE32F-NEXT: ret
%v = call <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*> %ptrs, i32 4, <1 x i1> %m, <1 x i32> %passthru)
@@ -7574,9 +7571,8 @@ define <1 x half> @mgather_v1f16(<1 x half*> %ptrs, <1 x i1> %m, <1 x half> %pas
; RV64ZVE32F-NEXT: andi a1, a1, 1
; RV64ZVE32F-NEXT: beqz a1, .LBB58_2
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
; RV64ZVE32F-NEXT: flh ft0, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, tu, mu
; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero
; RV64ZVE32F-NEXT: .LBB58_2: # %else
; RV64ZVE32F-NEXT: ret
%v = call <1 x half> @llvm.masked.gather.v1f16.v1p0f16(<1 x half*> %ptrs, i32 2, <1 x i1> %m, <1 x half> %passthru)
@@ -8594,9 +8590,8 @@ define <1 x float> @mgather_v1f32(<1 x float*> %ptrs, <1 x i1> %m, <1 x float> %
; RV64ZVE32F-NEXT: andi a1, a1, 1
; RV64ZVE32F-NEXT: beqz a1, .LBB68_2
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
; RV64ZVE32F-NEXT: flw ft0, 0(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, tu, mu
; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
; RV64ZVE32F-NEXT: vlse32.v v8, (a0), zero
; RV64ZVE32F-NEXT: .LBB68_2: # %else
; RV64ZVE32F-NEXT: ret
%v = call <1 x float> @llvm.masked.gather.v1f32.v1p0f32(<1 x float*> %ptrs, i32 4, <1 x i1> %m, <1 x float> %passthru)
5 changes: 3 additions & 2 deletions llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -593,8 +593,9 @@ define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) {
define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) {
; CHECK-LABEL: insert_v2i64:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0
; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vmovq %rdi, %xmm1
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; CHECK-NEXT: retq
%val = load i64, i64* %ptr
%r1 = insertelement <2 x i64> %x, i64 %val, i32 1
48 changes: 25 additions & 23 deletions llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -2284,50 +2284,52 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
; KNL_64: # %bb.0:
; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0
; KNL_64-NEXT: vpsllq $3, %xmm0, %xmm0
; KNL_64-NEXT: vmovq %rdi, %xmm2
; KNL_64-NEXT: vpbroadcastq %xmm2, %xmm2
; KNL_64-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; KNL_64-NEXT: vmovq %rdi, %xmm1
; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; KNL_64-NEXT: vmovq %xmm0, %rax
; KNL_64-NEXT: vpinsrq $0, (%rax), %xmm1, %xmm1
; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm1, %xmm0
; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
; KNL_64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; KNL_64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; KNL_64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test26:
; KNL_32: # %bb.0:
; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0
; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2
; KNL_32-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; KNL_32-NEXT: vmovd %xmm0, %eax
; KNL_32-NEXT: vpinsrd $0, (%eax), %xmm1, %xmm1
; KNL_32-NEXT: vpinsrd $1, 4(%eax), %xmm1, %xmm1
; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm1, %xmm0
; KNL_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0
; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; KNL_32-NEXT: vpinsrd $1, 4(%eax), %xmm0, %xmm0
; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm0, %xmm0
; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm0, %xmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test26:
; SKX: # %bb.0:
; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
; SKX-NEXT: vpbroadcastq %rdi, %xmm2
; SKX-NEXT: vpbroadcastq %rdi, %xmm1
; SKX-NEXT: vpsllq $3, %xmm0, %xmm0
; SKX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; SKX-NEXT: vmovq %xmm0, %rax
; SKX-NEXT: vpinsrq $0, (%rax), %xmm1, %xmm1
; SKX-NEXT: vpextrq $1, %xmm0, %rax
; SKX-NEXT: vpinsrq $1, (%rax), %xmm1, %xmm0
; SKX-NEXT: vpextrq $1, %xmm0, %rcx
; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; SKX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; SKX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; SKX-NEXT: retq
;
; SKX_32-LABEL: test26:
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0
; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
; SKX_32-NEXT: vmovd %xmm0, %eax
; SKX_32-NEXT: vpinsrd $0, (%eax), %xmm1, %xmm1
; SKX_32-NEXT: vpinsrd $1, 4(%eax), %xmm1, %xmm1
; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm1, %xmm0
; SKX_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0
; SKX_32-NEXT: vpextrd $1, %xmm0, %ecx
; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX_32-NEXT: vpinsrd $1, 4(%eax), %xmm0, %xmm0
; SKX_32-NEXT: vpinsrd $2, (%ecx), %xmm0, %xmm0
; SKX_32-NEXT: vpinsrd $3, 4(%ecx), %xmm0, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
27 changes: 6 additions & 21 deletions llvm/test/CodeGen/X86/shuffle-extract-subvector.ll
@@ -4,28 +4,13 @@
define void @f(<4 x half>* %a, <4 x half>* %b, <8 x half>* %c) {
; CHECK-LABEL: f:
; CHECK: # %bb.0:
; CHECK-NEXT: movzwl (%rdi), %eax
; CHECK-NEXT: movzwl 2(%rdi), %ecx
; CHECK-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movzwl 6(%rdi), %r8d
; CHECK-NEXT: movzwl 4(%rdi), %r11d
; CHECK-NEXT: movq (%rsi), %rsi
; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movq (%rdi), %rax
; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: pextrw $1, %xmm0, %r9d
; CHECK-NEXT: movd %xmm0, %r10d
; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi
; CHECK-NEXT: pextrw $3, %xmm0, %eax
; CHECK-NEXT: pextrw $2, %xmm0, %edi
; CHECK-NEXT: movw %r11w, 8(%rdx)
; CHECK-NEXT: movw %cx, 4(%rdx)
; CHECK-NEXT: movw %r8w, 12(%rdx)
; CHECK-NEXT: movw %si, (%rdx)
; CHECK-NEXT: movw %di, 10(%rdx)
; CHECK-NEXT: movw %ax, 14(%rdx)
; CHECK-NEXT: movw %r10w, 2(%rdx)
; CHECK-NEXT: movw %r9w, 6(%rdx)
; CHECK-NEXT: movq (%rsi), %rax
; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-NEXT: movdqa %xmm0, (%rdx)
; CHECK-NEXT: retq
%tmp4 = load <4 x half>, <4 x half>* %a
%tmp5 = load <4 x half>, <4 x half>* %b
23 changes: 6 additions & 17 deletions llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
@@ -379,26 +379,15 @@ define <4 x i32> @insert_i32_two_elts(<4 x i32> %x, i32* %s.addr) {
}

define <2 x i64> @insert_i64_two_elts(<2 x i64> %x, i64* %s.addr) {
; SSE2-LABEL: insert_i64_two_elts:
; SSE2: # %bb.0:
; SSE2-NEXT: movq (%rdi), %rax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: insert_i64_two_elts:
; SSE41: # %bb.0:
; SSE41-NEXT: movq (%rdi), %rax
; SSE41-NEXT: pinsrq $0, %rax, %xmm0
; SSE41-NEXT: pinsrq $1, %rax, %xmm0
; SSE41-NEXT: retq
; SSE-LABEL: insert_i64_two_elts:
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: insert_i64_two_elts:
; AVX: # %bb.0:
; AVX-NEXT: movq (%rdi), %rax
; AVX-NEXT: vpinsrq $0, %rax, %xmm0, %xmm0
; AVX-NEXT: vpinsrq $1, %rax, %xmm0, %xmm0
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX-NEXT: retq
%s = load i64, i64* %s.addr
%i0 = insertelement <2 x i64> %x, i64 %s, i32 0
23 changes: 5 additions & 18 deletions llvm/test/CodeGen/X86/sse-insertelt.ll
@@ -352,24 +352,11 @@ define <4 x i32> @insert_i32_two_elts(<4 x i32> %x, i32 %s) {
}

define <2 x i64> @insert_i64_two_elts(<2 x i64> %x, i64 %s) {
; SSE2-LABEL: insert_i64_two_elts:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %xmm0
; SSE2-NEXT: movq %rdi, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: insert_i64_two_elts:
; SSE41: # %bb.0:
; SSE41-NEXT: pinsrq $0, %rdi, %xmm0
; SSE41-NEXT: pinsrq $1, %rdi, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_i64_two_elts:
; AVX: # %bb.0:
; AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0
; AVX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
; AVX-NEXT: retq
; SSE-LABEL: insert_i64_two_elts:
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
%i0 = insertelement <2 x i64> %x, i64 %s, i32 0
%i1 = insertelement <2 x i64> %i0, i64 %s, i32 1
ret <2 x i64> %i1
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/X86/vec_insert-7.ll
@@ -8,7 +8,8 @@
define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
; X86-LABEL: mmx_movzl:
; X86: ## %bb.0:
; X86-NEXT: movq {{\.?LCPI[0-9]+_[0-9]+}}, %mm0
; X86-NEXT: movl $32, %eax
; X86-NEXT: movd %eax, %mm0
; X86-NEXT: retl
;
; X64-LABEL: mmx_movzl:
