[x86] favor vector constant load to avoid GPR to XMM transfer, part 2
This replaces the build_vector lowering code that was just added in D80013
and instead matches the pattern later, from the x86-specific "vzext_movl"
node. That seems to result in the same or better improvements and gets rid
of the 'TODO' items from that patch.

AFAICT, we always shrink wider constant vectors to 128-bit on these
patterns, so we still get the implicit zero-extension to ymm/zmm
without wasting space on larger vector constants. There's a trade-off
there because that means we miss potential load-folding.

Similarly, we could load scalar constants here with implicit
zero-extension even for the 128-bit case. That saves constant-pool space,
but it means we forgo load-folding, and so it increases register pressure.
This seems like a good middle ground between those two options.

Differential Revision: https://reviews.llvm.org/D80131
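
To illustrate the effect, here is the before/after codegen for mul_const9
in llvm/test/CodeGen/X86/avx2-arith.ll (quoted from the test diff below),
where the <8 x i32> multiplier constant has a single non-zero element.
Before, the immediate was moved through a GPR:

  ; X64-NEXT:    movl $2, %eax
  ; X64-NEXT:    vmovd %eax, %xmm1
  ; X64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0

After, the constant is loaded directly as a 128-bit vector from the
constant pool and implicitly zero-extended to ymm:

  ; X64-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,0,0,0]
  ; X64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0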
rotateright committed May 25, 2020
1 parent 8f48814 commit fa038e0
Showing 24 changed files with 147 additions and 227 deletions.
33 changes: 24 additions & 9 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10209,15 +10209,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (NumZero == 0)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

// Just load a vector integer constant. Loading is better for code size,
// avoids move GPR immediate --> XMM, and reduces register pressure.
if (IsAllConstants && VT.isInteger()) {
// TODO: Remove -1 restriction with demanded elements improvement?
// TODO: Insert 128-bit load into wider undef vector?
if (VT.is128BitVector() && !isAllOnesConstant(Item))
return SDValue();
}

if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
(EltVT == MVT::i64 && Subtarget.is64Bit())) {
assert((VT.is128BitVector() || VT.is256BitVector() ||
@@ -35858,6 +35849,30 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
}

// Load a scalar integer constant directly to XMM instead of transferring an
// immediate value from GPR.
// vzext_movl (scalar_to_vector C) --> load [C,0...]
if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
// Create a vector constant - scalar constant followed by zeros.
EVT ScalarVT = N0.getOperand(0).getValueType();
Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
unsigned NumElts = VT.getVectorNumElements();
Constant *Zero = ConstantInt::getNullValue(ScalarTy);
SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());

// Load the vector constant from constant pool.
MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
MachinePointerInfo MPI =
MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
MachineMemOperand::MOLoad);
}
}

return SDValue();
}
case X86ISD::BLENDI: {
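
For reference, a minimal IR sketch that exercises the new combine (it mirrors
the mul_const9 test in llvm/test/CodeGen/X86/avx2-arith.ll below; only the
low element of the constant is non-zero):

  define <8 x i32> @mul_const9(<8 x i32> %x) {
    %y = mul <8 x i32> %x, <i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
    ret <8 x i32> %y
  }

Roughly, as the comment in the new code notes, the constant operand reaches
this combine as 'vzext_movl (scalar_to_vector 2)', and the added code replaces
it with a load of the 128-bit constant-pool entry [2,0,0,0]; the upper ymm
lanes stay zero.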
10 changes: 4 additions & 6 deletions llvm/test/CodeGen/X86/avx-load-store.ll
@@ -220,8 +220,7 @@ define void @f_f() nounwind {
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB9_4
; CHECK-NEXT: # %bb.3: # %cif_mixed_test_all
; CHECK-NEXT: movl $-1, %eax
; CHECK-NEXT: vmovd %eax, %xmm0
; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,0,0]
; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax)
; CHECK-NEXT: .LBB9_4: # %cif_mixed_test_any_check
;
@@ -238,13 +237,12 @@ define void @f_f() nounwind {
; CHECK_O0-NEXT: jne .LBB9_3
; CHECK_O0-NEXT: jmp .LBB9_4
; CHECK_O0-NEXT: .LBB9_3: # %cif_mixed_test_all
; CHECK_O0-NEXT: movl $-1, %eax
; CHECK_O0-NEXT: vmovd %eax, %xmm0
; CHECK_O0-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967295,0,0,0]
; CHECK_O0-NEXT: vmovdqa %xmm0, %xmm0
; CHECK_O0-NEXT: vmovaps %xmm0, %xmm1
; CHECK_O0-NEXT: # implicit-def: $rcx
; CHECK_O0-NEXT: # implicit-def: $rax
; CHECK_O0-NEXT: # implicit-def: $ymm2
; CHECK_O0-NEXT: vmaskmovps %ymm2, %ymm1, (%rcx)
; CHECK_O0-NEXT: vmaskmovps %ymm2, %ymm1, (%rax)
; CHECK_O0-NEXT: .LBB9_4: # %cif_mixed_test_any_check
allocas:
br i1 undef, label %cif_mask_all, label %cif_mask_mixed
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/X86/avx2-arith.ll
@@ -347,15 +347,13 @@ define <8 x i16> @mul_const8(<8 x i16> %x) {
define <8 x i32> @mul_const9(<8 x i32> %x) {
; X32-LABEL: mul_const9:
; X32: # %bb.0:
; X32-NEXT: movl $2, %eax
; X32-NEXT: vmovd %eax, %xmm1
; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [2,0,0,0]
; X32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: mul_const9:
; X64: # %bb.0:
; X64-NEXT: movl $2, %eax
; X64-NEXT: vmovd %eax, %xmm1
; X64-NEXT: vmovdqa {{.*#+}} xmm1 = [2,0,0,0]
; X64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%y = mul <8 x i32> %x, <i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
40 changes: 12 additions & 28 deletions llvm/test/CodeGen/X86/combine-udiv.ll
@@ -590,9 +590,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
;
; XOP-LABEL: combine_vec_udiv_nonuniform2:
; XOP: # %bb.0:
; XOP-NEXT: movl $65535, %eax # imm = 0xFFFF
; XOP-NEXT: vmovd %eax, %xmm1
; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
@@ -664,31 +662,17 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_udiv_nonuniform4:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_udiv_nonuniform4:
; AVX2: # %bb.0:
; AVX2-NEXT: movl $171, %eax
; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpmullw %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
; AVX-LABEL: combine_vec_udiv_nonuniform4:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX-NEXT: vpackuswb %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: combine_vec_udiv_nonuniform4:
; XOP: # %bb.0:
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/X86/fcmp-constant.ll
@@ -92,8 +92,7 @@ define <2 x i64> @fcmp_ueq_v2f64_undef() {
define <2 x i64> @fcmp_ueq_v2f64_undef_elt() {
; CHECK-LABEL: fcmp_ueq_v2f64_undef_elt:
; CHECK: # %bb.0:
; CHECK-NEXT: movq $-1, %rax
; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,0]
; CHECK-NEXT: retq
%1 = fcmp ueq <2 x double> <double 0x3FF0000000000000, double 0xFFEFFFFFFFFFFFFF>, <double undef, double 0x3FF0000000000000>
%2 = sext <2 x i1> %1 to <2 x i64>
48 changes: 20 additions & 28 deletions llvm/test/CodeGen/X86/insert-into-constant-vector.ll
@@ -129,10 +129,8 @@ define <4 x i32> @elt3_v4i32(i32 %x) {
define <2 x i64> @elt0_v2i64(i64 %x) {
; X32SSE-LABEL: elt0_v2i64:
; X32SSE: # %bb.0:
; X32SSE-NEXT: movl $1, %eax
; X32SSE-NEXT: movd %eax, %xmm1
; X32SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X32SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X32SSE-NEXT: retl
;
; X64SSE2-LABEL: elt0_v2i64:
@@ -150,10 +148,8 @@ define <2 x i64> @elt0_v2i64(i64 %x) {
;
; X32AVX-LABEL: elt0_v2i64:
; X32AVX: # %bb.0:
; X32AVX-NEXT: movl $1, %eax
; X32AVX-NEXT: vmovd %eax, %xmm0
; X32AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; X32AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X32AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X32AVX-NEXT: retl
;
; X64AVX-LABEL: elt0_v2i64:
@@ -365,10 +361,9 @@ define <8 x float> @elt6_v8f32(float %x) {
define <8 x i64> @elt5_v8i64(i64 %x) {
; X32SSE-LABEL: elt5_v8i64:
; X32SSE: # %bb.0:
; X32SSE-NEXT: movl $4, %eax
; X32SSE-NEXT: movd %eax, %xmm2
; X32SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X32SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; X32SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32SSE-NEXT: movaps {{.*#+}} xmm2 = [4,0,0,0]
; X32SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; X32SSE-NEXT: movaps {{.*#+}} xmm0 = [42,0,1,0]
; X32SSE-NEXT: movaps {{.*#+}} xmm1 = [2,0,3,0]
; X32SSE-NEXT: movaps {{.*#+}} xmm3 = [6,0,7,0]
@@ -395,10 +390,9 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
;
; X32AVX1-LABEL: elt5_v8i64:
; X32AVX1: # %bb.0:
; X32AVX1-NEXT: movl $4, %eax
; X32AVX1-NEXT: vmovd %eax, %xmm0
; X32AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; X32AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,0,0]
; X32AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X32AVX1-NEXT: vinsertf128 $1, {{\.LCPI.*}}, %ymm0, %ymm1
; X32AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
; X32AVX1-NEXT: retl
@@ -413,11 +407,10 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
;
; X32AVX2-LABEL: elt5_v8i64:
; X32AVX2: # %bb.0:
; X32AVX2-NEXT: movl $4, %eax
; X32AVX2-NEXT: vmovd %eax, %xmm0
; X32AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; X32AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32AVX2-NEXT: vinserti128 $1, {{\.LCPI.*}}, %ymm0, %ymm1
; X32AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,0,0]
; X32AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X32AVX2-NEXT: vinsertf128 $1, {{\.LCPI.*}}, %ymm0, %ymm1
; X32AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
; X32AVX2-NEXT: retl
;
@@ -431,13 +424,12 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
;
; X32AVX512F-LABEL: elt5_v8i64:
; X32AVX512F: # %bb.0:
; X32AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
; X32AVX512F-NEXT: movl $4, %eax
; X32AVX512F-NEXT: vmovd %eax, %xmm1
; X32AVX512F-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; X32AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X32AVX512F-NEXT: vinserti128 $1, {{\.LCPI.*}}, %ymm1, %ymm1
; X32AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; X32AVX512F-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
; X32AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X32AVX512F-NEXT: vmovaps {{.*#+}} xmm2 = [4,0,0,0]
; X32AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X32AVX512F-NEXT: vinsertf128 $1, {{\.LCPI.*}}, %ymm1, %ymm1
; X32AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; X32AVX512F-NEXT: retl
;
; X64AVX512F-LABEL: elt5_v8i64:
7 changes: 3 additions & 4 deletions llvm/test/CodeGen/X86/packss.ll
@@ -159,13 +159,12 @@ define <8 x i16> @trunc_ashr_v4i64_demandedelts(<4 x i64> %a0) {
; X86-SSE-NEXT: psllq $63, %xmm1
; X86-SSE-NEXT: psllq $63, %xmm0
; X86-SSE-NEXT: psrlq $63, %xmm0
; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = <1,0,u,u>
; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,0,0,0]
; X86-SSE-NEXT: pxor %xmm2, %xmm0
; X86-SSE-NEXT: pcmpeqd %xmm3, %xmm3
; X86-SSE-NEXT: paddq %xmm3, %xmm0
; X86-SSE-NEXT: psubq %xmm2, %xmm0
; X86-SSE-NEXT: psrlq $63, %xmm1
; X86-SSE-NEXT: pxor %xmm2, %xmm1
; X86-SSE-NEXT: paddq %xmm3, %xmm1
; X86-SSE-NEXT: psubq %xmm2, %xmm1
; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-SSE-NEXT: packssdw %xmm1, %xmm0
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/pshufb-mask-comments.ll
@@ -54,7 +54,7 @@ define <16 x i8> @test4(<16 x i8> %V, <2 x i64>* %P) {
define <16 x i8> @test5(<16 x i8> %V) {
; CHECK-LABEL: test5:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1,0]
; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1,0,0,0]
; CHECK-NEXT: movaps %xmm1, (%rax)
; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1,1]
; CHECK-NEXT: movaps %xmm1, (%rax)
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/ret-mmx.ll
@@ -32,7 +32,7 @@ define <1 x i64> @t2() nounwind {
define <2 x i32> @t3() nounwind {
; CHECK-LABEL: t3:
; CHECK: ## %bb.0:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = <1,0,u,u>
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1,0,0,0]
; CHECK-NEXT: retq
ret <2 x i32> <i32 1, i32 0>
}
65 changes: 15 additions & 50 deletions llvm/test/CodeGen/X86/sad.ll
@@ -544,8 +544,7 @@ define i32 @sad_2i8() nounwind {
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: movl $65535, %ecx # imm = 0xFFFF
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0]
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB3_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
@@ -995,54 +994,20 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: sad_unroll_nonzero_initial:
; AVX1: # %bb.0: # %bb
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
; AVX1-NEXT: vmovdqu (%rdx), %xmm1
; AVX1-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: retq
;
; AVX2-LABEL: sad_unroll_nonzero_initial:
; AVX2: # %bb.0: # %bb
; AVX2-NEXT: vmovdqu (%rdi), %xmm0
; AVX2-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vmovdqu (%rdx), %xmm1
; AVX2-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
; AVX2-NEXT: movl $1, %eax
; AVX2-NEXT: vmovd %eax, %xmm2
; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: retq
;
; AVX512-LABEL: sad_unroll_nonzero_initial:
; AVX512: # %bb.0: # %bb
; AVX512-NEXT: vmovdqu (%rdi), %xmm0
; AVX512-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
; AVX512-NEXT: vmovdqu (%rdx), %xmm1
; AVX512-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
; AVX512-NEXT: movl $1, %eax
; AVX512-NEXT: vmovd %eax, %xmm2
; AVX512-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: retq
; AVX-LABEL: sad_unroll_nonzero_initial:
; AVX: # %bb.0: # %bb
; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu (%rdx), %xmm1
; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
bb:
%tmp = load <16 x i8>, <16 x i8>* %arg, align 1
%tmp4 = load <16 x i8>, <16 x i8>* %arg1, align 1
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -325,7 +325,7 @@ define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,0,u>
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,0,0,0]
; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -452,7 +452,7 @@ define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,0,u>
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,0,0,0]
; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -1314,7 +1314,7 @@ define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind {
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,0,u>
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,0,0,0]
; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/vec_set-A.ll
@@ -10,7 +10,7 @@ define <2 x i64> @test1() nounwind {
;
; X64-LABEL: test1:
; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = [1,0]
; X64-NEXT: movaps {{.*#+}} xmm0 = [1,0,0,0]
; X64-NEXT: retq
ret <2 x i64> < i64 1, i64 0 >
}
