[X86][SSE] getV4X86ShuffleImm8 - canonicalize broadcast masks
If the mask input to getV4X86ShuffleImm8 only refers to a single source element (plus undefs), canonicalize it to a full broadcast.

getV4X86ShuffleImm8 defaults to inline values for undefs (each undef lane keeps its own index), which can be useful for shuffle widening/narrowing, but it leaves SimplifyDemanded* calls thinking the shuffle depends on elements it doesn't actually need.

I'm still investigating what we should do more generally to avoid these undemanded elements, but the broadcast case was a simpler win.
RKSimon committed Jul 29, 2020
1 parent 2e7baf6 commit 0c005be
Showing 181 changed files with 2,865 additions and 2,833 deletions.
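
To illustrate the change, here is a minimal standalone sketch of the imm8 computation (plain C++; the helper name getV4ShuffleImm and the use of std::array are illustrative only, not the LLVM ArrayRef-based implementation). For a mask such as <3, undef, undef, undef> the old default fills the undef lanes with their own indices, giving 0xE7 (decoded as [3,1,2,3]); the canonicalized form gives 0xFF ([3,3,3,3]).

// Standalone sketch of the V4 shuffle imm8 encoding, with the new broadcast
// canonicalization. Mask elements are in [0,3]; -1 means undef.
#include <array>
#include <cassert>
#include <cstdio>

static unsigned getV4ShuffleImm(const std::array<int, 4> &Mask) {
  // If only one distinct non-undef element is referenced, splat it into all
  // four 2-bit lanes so the result reads as a full broadcast.
  int FirstElt = -1;
  bool IsBroadcast = true;
  for (int M : Mask) {
    if (M < 0)
      continue;
    if (FirstElt < 0)
      FirstElt = M;
    else if (M != FirstElt)
      IsBroadcast = false;
  }
  assert(FirstElt >= 0 && "all-undef shuffle mask");
  if (IsBroadcast)
    return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;

  // Old default: each undef lane keeps its own index.
  unsigned Imm = 0;
  for (int I = 0; I != 4; ++I)
    Imm |= static_cast<unsigned>(Mask[I] < 0 ? I : Mask[I]) << (2 * I);
  return Imm;
}

int main() {
  std::printf("0x%02X\n", getV4ShuffleImm({3, -1, -1, -1})); // 0xFF, was 0xE7
  std::printf("0x%02X\n", getV4ShuffleImm({0, 1, 2, 3}));    // 0xE4 (identity)
}

That is exactly the pattern visible in the test updates below: shuffle-mask comments such as [3,1,2,3] become [3,3,3,3] and [1,1,2,3] become [1,1,1,1], without changing the lanes that are actually demanded.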
9 changes: 9 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10938,6 +10938,15 @@ static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");

// If the mask only uses one non-undef element, then fully 'splat' it to
// improve later broadcast matching.
int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");

int FirstElt = Mask[FirstIndex];
if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;

unsigned Imm = 0;
Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/2011-05-09-loaduse.ll
@@ -8,7 +8,7 @@ define float @test(<4 x float>* %A) nounwind {
; X86-NEXT: pushl %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movaps (%eax), %xmm0
; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; X86-NEXT: xorps %xmm1, %xmm1
; X86-NEXT: movaps %xmm1, (%eax)
; X86-NEXT: movss %xmm0, (%esp)
@@ -19,7 +19,7 @@ define float @test(<4 x float>* %A) nounwind {
; X64-LABEL: test:
; X64: # %bb.0: # %entry
; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: movaps %xmm1, (%rdi)
; X64-NEXT: retq
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/X86/atomic-fp.ll
@@ -113,7 +113,7 @@ define void @fadd_64r(double* %loc, double %val) nounwind {
; X86-SSE1-NEXT: xorps %xmm1, %xmm1
; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; X86-SSE1-NEXT: movss %xmm1, (%esp)
; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: fldl (%esp)
; X86-SSE1-NEXT: faddl 12(%ebp)
@@ -278,7 +278,7 @@ define void @fadd_64g() nounwind {
; X86-SSE1-NEXT: xorps %xmm1, %xmm1
; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; X86-SSE1-NEXT: movss %xmm1, (%esp)
; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: fld1
; X86-SSE1-NEXT: faddl (%esp)
@@ -441,7 +441,7 @@ define void @fadd_64imm() nounwind {
; X86-SSE1-NEXT: xorps %xmm1, %xmm1
; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; X86-SSE1-NEXT: movss %xmm1, (%esp)
; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: fld1
; X86-SSE1-NEXT: faddl (%esp)
@@ -610,7 +610,7 @@ define void @fadd_64stack() nounwind {
; X86-SSE1-NEXT: xorps %xmm1, %xmm1
; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; X86-SSE1-NEXT: movss %xmm1, (%esp)
; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: fld1
; X86-SSE1-NEXT: faddl (%esp)
@@ -716,7 +716,7 @@ define void @fadd_array(i64* %arg, double %arg1, i64 %arg2) nounwind {
; X86-SSE1-NEXT: xorps %xmm1, %xmm1
; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; X86-SSE1-NEXT: movss %xmm1, (%esp)
; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: fldl (%esp)
; X86-SSE1-NEXT: faddl 12(%ebp)
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/atomic-non-integer.ll
@@ -278,7 +278,7 @@ define double @load_double(double* %fptr) {
; X86-SSE1-NEXT: xorps %xmm0, %xmm0
; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE1-NEXT: movss %xmm0, (%esp)
; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: fldl (%esp)
; X86-SSE1-NEXT: addl $12, %esp
@@ -665,7 +665,7 @@ define double @load_double_seq_cst(double* %fptr) {
; X86-SSE1-NEXT: xorps %xmm0, %xmm0
; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE1-NEXT: movss %xmm0, (%esp)
; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: fldl (%esp)
; X86-SSE1-NEXT: addl $12, %esp
44 changes: 22 additions & 22 deletions llvm/test/CodeGen/X86/avg.ll
@@ -162,28 +162,28 @@ define void @avg_v24i8(<24 x i8>* %a, <24 x i8>* %b) nounwind {
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT: vmovdqa (%rsi), %xmm6
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm7
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,3,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,1,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,1,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
@@ -456,25 +456,25 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,3,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
@@ -485,32 +485,32 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm12
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,3,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm10
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm9
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm8
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm3, %xmm15, %xmm15
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm7, %xmm11, %xmm7
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm14
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm13
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,3,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
@@ -2386,12 +2386,12 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pandn %xmm4, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,2,0]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
@@ -2402,7 +2402,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm5, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1]
; SSE2-NEXT: movupd %xmm1, (%rax)
@@ -2558,7 +2558,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -1895,15 +1895,15 @@ define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; X86-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_set1_epi16:
; X64: # %bb.0:
; X64-NEXT: vmovd %edi, %xmm0
; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/avx-splat.ll
@@ -1,4 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64

@@ -16,7 +16,7 @@ entry:
define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcB:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
@@ -134,7 +134,7 @@ define <8 x float> @funcF(i32 %val) nounwind {
; X64-LABEL: funcF:
; X64: # %bb.0:
; X64-NEXT: vmovd %edi, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,0]
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
%ret6 = insertelement <8 x i32> undef, i32 %val, i32 6
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/avx-vbroadcast.ll
@@ -624,12 +624,12 @@ entry:
define <4 x i32> @H(<4 x i32> %a) {
; X32-LABEL: H:
; X32: ## %bb.0: ## %entry
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X32-NEXT: retl
;
; X64-LABEL: H:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X64-NEXT: retq
entry:
%x = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/avx-vinsertf128.ll
@@ -29,7 +29,7 @@ define void @insert_crash() nounwind {
; CHECK-NEXT: vminpd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vminsd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; CHECK-NEXT: vmovups %xmm0, (%rax)
; CHECK-NEXT: retq
allocas:
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/avx-vperm2x128.ll
@@ -265,7 +265,7 @@ define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounw
;
; AVX2-LABEL: shuffle_v8f32_uu67uu67:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
; AVX2-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/avx512-any_extend_load.ll
@@ -8,7 +8,7 @@ define void @any_extend_load_v8i64(<8 x i8> * %ptr) {
; KNL: # %bb.0:
; KNL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; KNL-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; KNL-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
