141 changes: 63 additions & 78 deletions llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -32,36 +32,24 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; SSSE3-FAST-NEXT: haddps %xmm2, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: pair_sum_v4f32_v4f32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1]
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT: retq
; AVX-SLOW-LABEL: pair_sum_v4f32_v4f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
; AVX-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: pair_sum_v4f32_v4f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: pair_sum_v4f32_v4f32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,3]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT: retq
%5 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
%6 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
%7 = fadd <2 x float> %5, %6
@@ -126,34 +114,28 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: pair_sum_v4i32_v4i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm1
; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
; AVX-FAST-LABEL: pair_sum_v4i32_v4i32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm1
; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm2
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: pair_sum_v4i32_v4i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: retq
%5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
%6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
%7 = add <2 x i32> %5, %6
@@ -191,15 +173,14 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm2
; SSSE3-SLOW-NEXT: movaps %xmm5, %xmm1
; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm1
; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm2
; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm5
; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm2
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6
; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm4
; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm4
; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm1
; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,1]
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32:
@@ -266,13 +247,13 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4
; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm8
; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm5, %xmm3
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
Expand All @@ -287,13 +268,13 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm4
; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm8
; AVX2-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm5, %xmm3
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
; AVX2-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
Expand Down Expand Up @@ -440,9 +421,11 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX2-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1
; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %xmm5
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -461,9 +444,11 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX2-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm1
; AVX2-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4
; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm5
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -752,15 +737,15 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm5
; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm4, %xmm4
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm2
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: sequential_sum_v4i32_v4i32:
@@ -776,14 +761,14 @@ define <4 x i32> @sequential_sum_v4i32_v4i3
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm4
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,2,2,2]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3]
; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm4
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-FAST-NEXT: retq
%5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 0, i32 4>
%6 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 1, i32 5>
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/icmp-shift-opt.ll
@@ -223,11 +223,11 @@ define i1 @opt_setcc_expanded_shl_correct_shifts(i64 %a, i64 %b) nounwind {
; X86-LABEL: opt_setcc_expanded_shl_correct_shifts:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $17, %ecx
; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: shldl $17, %eax, %ecx
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
;
77 changes: 53 additions & 24 deletions llvm/test/CodeGen/X86/insertelement-var-index.ll
@@ -2288,31 +2288,58 @@ define i32 @PR44139(ptr %p) {
; SSE-NEXT: divl %ecx
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: PR44139:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX1OR2-NEXT: movl (%rdi), %eax
; AVX1OR2-NEXT: vmovaps %ymm0, 64(%rdi)
; AVX1OR2-NEXT: vmovaps %ymm0, 96(%rdi)
; AVX1OR2-NEXT: vmovaps %ymm0, (%rdi)
; AVX1OR2-NEXT: vmovaps %ymm0, 32(%rdi)
; AVX1OR2-NEXT: leal 2147483647(%rax), %ecx
; AVX1OR2-NEXT: testl %eax, %eax
; AVX1OR2-NEXT: cmovnsl %eax, %ecx
; AVX1OR2-NEXT: andl $-2147483648, %ecx # imm = 0x80000000
; AVX1OR2-NEXT: addl %eax, %ecx
; AVX1OR2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX1OR2-NEXT: xorl %edx, %edx
; AVX1OR2-NEXT: divl %ecx
; AVX1OR2-NEXT: vzeroupper
; AVX1OR2-NEXT: retq
; AVX1-LABEL: PR44139:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX1-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vmovaps %ymm0, 64(%rdi)
; AVX1-NEXT: vmovaps %ymm0, 96(%rdi)
; AVX1-NEXT: vmovaps %ymm0, 32(%rdi)
; AVX1-NEXT: movl (%rdi), %eax
; AVX1-NEXT: vmovaps %ymm1, (%rdi)
; AVX1-NEXT: leal 2147483647(%rax), %ecx
; AVX1-NEXT: testl %eax, %eax
; AVX1-NEXT: cmovnsl %eax, %ecx
; AVX1-NEXT: andl $-2147483648, %ecx # imm = 0x80000000
; AVX1-NEXT: addl %eax, %ecx
; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
; AVX1-NEXT: xorl %edx, %edx
; AVX1-NEXT: divl %ecx
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR44139:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0
; AVX2-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqa %ymm0, 64(%rdi)
; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi)
; AVX2-NEXT: movl (%rdi), %eax
; AVX2-NEXT: vmovdqa %ymm1, (%rdi)
; AVX2-NEXT: leal 2147483647(%rax), %ecx
; AVX2-NEXT: testl %eax, %eax
; AVX2-NEXT: cmovnsl %eax, %ecx
; AVX2-NEXT: andl $-2147483648, %ecx # imm = 0x80000000
; AVX2-NEXT: addl %eax, %ecx
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: xorl %edx, %edx
; AVX2-NEXT: divl %ecx
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: PR44139:
; AVX512: # %bb.0:
; AVX512-NEXT: vbroadcastsd (%rdi), %zmm0
; AVX512-NEXT: movl (%rdi), %eax
; AVX512-NEXT: vmovaps %zmm0, (%rdi)
; AVX512-NEXT: vmovaps %zmm0, 64(%rdi)
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpbroadcastq (%rdi), %zmm1
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpinsrq $1, (%rdi), %xmm1, %xmm2
; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm2
; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rdi)
; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi)
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: leal 2147483647(%rax), %ecx
; AVX512-NEXT: testl %eax, %eax
; AVX512-NEXT: cmovnsl %eax, %ecx
@@ -2327,12 +2354,14 @@ define i32 @PR44139(ptr %p) {
; X86AVX2-LABEL: PR44139:
; X86AVX2: # %bb.0:
; X86AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86AVX2-NEXT: movl (%ecx), %eax
; X86AVX2-NEXT: vbroadcastsd (%ecx), %ymm0
; X86AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
; X86AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X86AVX2-NEXT: vmovaps %ymm0, 64(%ecx)
; X86AVX2-NEXT: vmovaps %ymm0, 96(%ecx)
; X86AVX2-NEXT: vmovaps %ymm0, (%ecx)
; X86AVX2-NEXT: vmovaps %ymm0, 32(%ecx)
; X86AVX2-NEXT: movl (%ecx), %eax
; X86AVX2-NEXT: vmovaps %ymm1, (%ecx)
; X86AVX2-NEXT: leal 2147483647(%eax), %ecx
; X86AVX2-NEXT: testl %eax, %eax
; X86AVX2-NEXT: cmovnsl %eax, %ecx
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/is_fpclass-fp80.ll
@@ -319,10 +319,10 @@ define i1 @is_neginf_f80(x86_fp80 %x) {
; CHECK-64-LABEL: is_neginf_f80:
; CHECK-64: # %bb.0: # %entry
; CHECK-64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; CHECK-64-NEXT: xorl $65535, %eax # imm = 0xFFFF
; CHECK-64-NEXT: xorq $65535, %rax # imm = 0xFFFF
; CHECK-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; CHECK-64-NEXT: xorq {{[0-9]+}}(%rsp), %rcx
; CHECK-64-NEXT: orq %rcx, %rax
; CHECK-64-NEXT: orq %rax, %rcx
; CHECK-64-NEXT: sete %al
; CHECK-64-NEXT: retq
entry:
7 changes: 3 additions & 4 deletions llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll
@@ -5,10 +5,9 @@ define void @csrot_(ptr %0) {
; CHECK-LABEL: csrot_:
; CHECK: # %bb.0:
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: xorps %xmm0, %xmm1
; CHECK-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],mem[1,2,3]
; CHECK-NEXT: movlps %xmm1, (%rax)
; CHECK-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
; CHECK-NEXT: movlps %xmm0, (%rax)
; CHECK-NEXT: retq
1:
%2 = load float, ptr %0, align 4
242 changes: 162 additions & 80 deletions llvm/test/CodeGen/X86/masked_store.ll
@@ -6185,19 +6185,20 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge
; AVX2-NEXT: vpcmpgtd 64(%rdi), %ymm3, %ymm3
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
; AVX2-NEXT: vpackssdw %xmm5, %xmm3, %xmm3
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX2-NEXT: vpslld $31, %ymm3, %ymm3
; AVX2-NEXT: vpmaskmovd %ymm2, %ymm3, 64(%rdx)
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpslld $31, %ymm2, %ymm2
; AVX2-NEXT: vpmaskmovd %ymm1, %ymm2, 32(%rdx)
; AVX2-NEXT: vpacksswb %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; AVX2-NEXT: vpslld $31, %ymm5, %ymm5
; AVX2-NEXT: vpmaskmovd %ymm1, %ymm5, 32(%rdx)
; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpslld $31, %ymm1, %ymm1
; AVX2-NEXT: vpmaskmovd %ymm0, %ymm1, (%rdx)
; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vpmaskmovd %ymm2, %ymm0, 64(%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -6277,77 +6278,158 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge

; From https://reviews.llvm.org/rGf8d9097168b7#1165311
define void @undefshuffle(<8 x i1> %i0, ptr %src, ptr %dst) #0 {
; SSE-LABEL: undefshuffle:
; SSE: ## %bb.0:
; SSE-NEXT: movb $1, %al
; SSE-NEXT: testb %al, %al
; SSE-NEXT: testb %al, %al
; SSE-NEXT: testb %al, %al
; SSE-NEXT: testb %al, %al
; SSE-NEXT: testb %al, %al
; SSE-NEXT: testb %al, %al
; SSE-NEXT: testb %al, %al
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT: testb %al, %al
; SSE-NEXT: psllw $15, %xmm0
; SSE-NEXT: packsswb %xmm0, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: testb $1, %al
; SSE-NEXT: jne LBB32_1
; SSE-NEXT: ## %bb.2: ## %else23
; SSE-NEXT: testb $2, %al
; SSE-NEXT: jne LBB32_3
; SSE-NEXT: LBB32_4: ## %else25
; SSE-NEXT: testb $4, %al
; SSE-NEXT: jne LBB32_5
; SSE-NEXT: LBB32_6: ## %else27
; SSE-NEXT: testb $8, %al
; SSE-NEXT: jne LBB32_7
; SSE-NEXT: LBB32_8: ## %else29
; SSE-NEXT: testb $16, %al
; SSE-NEXT: jne LBB32_9
; SSE-NEXT: LBB32_10: ## %else31
; SSE-NEXT: testb $32, %al
; SSE-NEXT: jne LBB32_11
; SSE-NEXT: LBB32_12: ## %else33
; SSE-NEXT: testb $64, %al
; SSE-NEXT: jne LBB32_13
; SSE-NEXT: LBB32_14: ## %else35
; SSE-NEXT: testb $-128, %al
; SSE-NEXT: jne LBB32_15
; SSE-NEXT: LBB32_16: ## %else37
; SSE-NEXT: retq
; SSE-NEXT: LBB32_1: ## %cond.store
; SSE-NEXT: movl $0, (%rsi)
; SSE-NEXT: testb $2, %al
; SSE-NEXT: je LBB32_4
; SSE-NEXT: LBB32_3: ## %cond.store24
; SSE-NEXT: movl $0, 4(%rsi)
; SSE-NEXT: testb $4, %al
; SSE-NEXT: je LBB32_6
; SSE-NEXT: LBB32_5: ## %cond.store26
; SSE-NEXT: movl $0, 8(%rsi)
; SSE-NEXT: testb $8, %al
; SSE-NEXT: je LBB32_8
; SSE-NEXT: LBB32_7: ## %cond.store28
; SSE-NEXT: movl $0, 12(%rsi)
; SSE-NEXT: testb $16, %al
; SSE-NEXT: je LBB32_10
; SSE-NEXT: LBB32_9: ## %cond.store30
; SSE-NEXT: movl $0, 16(%rsi)
; SSE-NEXT: testb $32, %al
; SSE-NEXT: je LBB32_12
; SSE-NEXT: LBB32_11: ## %cond.store32
; SSE-NEXT: movl $0, 20(%rsi)
; SSE-NEXT: testb $64, %al
; SSE-NEXT: je LBB32_14
; SSE-NEXT: LBB32_13: ## %cond.store34
; SSE-NEXT: movl $0, 24(%rsi)
; SSE-NEXT: testb $-128, %al
; SSE-NEXT: je LBB32_16
; SSE-NEXT: LBB32_15: ## %cond.store36
; SSE-NEXT: movl $0, 28(%rsi)
; SSE-NEXT: retq
; SSE2-LABEL: undefshuffle:
; SSE2: ## %bb.0:
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movb $1, %al
; SSE2-NEXT: testb %al, %al
; SSE2-NEXT: testb %al, %al
; SSE2-NEXT: testb %al, %al
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
; SSE2-NEXT: testb %al, %al
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
; SSE2-NEXT: testb %al, %al
; SSE2-NEXT: pinsrw $1, %ecx, %xmm0
; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
; SSE2-NEXT: testb %al, %al
; SSE2-NEXT: pinsrw $2, %ecx, %xmm0
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: movb $1, %cl
; SSE2-NEXT: testb %cl, %cl
; SSE2-NEXT: pinsrw $3, %eax, %xmm0
; SSE2-NEXT: testb %cl, %cl
; SSE2-NEXT: psllw $15, %xmm0
; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB32_1
; SSE2-NEXT: ## %bb.2: ## %else23
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: jne LBB32_3
; SSE2-NEXT: LBB32_4: ## %else25
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: jne LBB32_5
; SSE2-NEXT: LBB32_6: ## %else27
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: jne LBB32_7
; SSE2-NEXT: LBB32_8: ## %else29
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: jne LBB32_9
; SSE2-NEXT: LBB32_10: ## %else31
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: jne LBB32_11
; SSE2-NEXT: LBB32_12: ## %else33
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: jne LBB32_13
; SSE2-NEXT: LBB32_14: ## %else35
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne LBB32_15
; SSE2-NEXT: LBB32_16: ## %else37
; SSE2-NEXT: retq
; SSE2-NEXT: LBB32_1: ## %cond.store
; SSE2-NEXT: movl $0, (%rsi)
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB32_4
; SSE2-NEXT: LBB32_3: ## %cond.store24
; SSE2-NEXT: movl $0, 4(%rsi)
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB32_6
; SSE2-NEXT: LBB32_5: ## %cond.store26
; SSE2-NEXT: movl $0, 8(%rsi)
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB32_8
; SSE2-NEXT: LBB32_7: ## %cond.store28
; SSE2-NEXT: movl $0, 12(%rsi)
; SSE2-NEXT: testb $16, %al
; SSE2-NEXT: je LBB32_10
; SSE2-NEXT: LBB32_9: ## %cond.store30
; SSE2-NEXT: movl $0, 16(%rsi)
; SSE2-NEXT: testb $32, %al
; SSE2-NEXT: je LBB32_12
; SSE2-NEXT: LBB32_11: ## %cond.store32
; SSE2-NEXT: movl $0, 20(%rsi)
; SSE2-NEXT: testb $64, %al
; SSE2-NEXT: je LBB32_14
; SSE2-NEXT: LBB32_13: ## %cond.store34
; SSE2-NEXT: movl $0, 24(%rsi)
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je LBB32_16
; SSE2-NEXT: LBB32_15: ## %cond.store36
; SSE2-NEXT: movl $0, 28(%rsi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: undefshuffle:
; SSE4: ## %bb.0:
; SSE4-NEXT: movb $1, %al
; SSE4-NEXT: testb %al, %al
; SSE4-NEXT: testb %al, %al
; SSE4-NEXT: testb %al, %al
; SSE4-NEXT: testb %al, %al
; SSE4-NEXT: testb %al, %al
; SSE4-NEXT: testb %al, %al
; SSE4-NEXT: testb %al, %al
; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE4-NEXT: testb %al, %al
; SSE4-NEXT: psllw $15, %xmm0
; SSE4-NEXT: packsswb %xmm0, %xmm0
; SSE4-NEXT: pmovmskb %xmm0, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: jne LBB32_1
; SSE4-NEXT: ## %bb.2: ## %else23
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: jne LBB32_3
; SSE4-NEXT: LBB32_4: ## %else25
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: jne LBB32_5
; SSE4-NEXT: LBB32_6: ## %else27
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: jne LBB32_7
; SSE4-NEXT: LBB32_8: ## %else29
; SSE4-NEXT: testb $16, %al
; SSE4-NEXT: jne LBB32_9
; SSE4-NEXT: LBB32_10: ## %else31
; SSE4-NEXT: testb $32, %al
; SSE4-NEXT: jne LBB32_11
; SSE4-NEXT: LBB32_12: ## %else33
; SSE4-NEXT: testb $64, %al
; SSE4-NEXT: jne LBB32_13
; SSE4-NEXT: LBB32_14: ## %else35
; SSE4-NEXT: testb $-128, %al
; SSE4-NEXT: jne LBB32_15
; SSE4-NEXT: LBB32_16: ## %else37
; SSE4-NEXT: retq
; SSE4-NEXT: LBB32_1: ## %cond.store
; SSE4-NEXT: movl $0, (%rsi)
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je LBB32_4
; SSE4-NEXT: LBB32_3: ## %cond.store24
; SSE4-NEXT: movl $0, 4(%rsi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je LBB32_6
; SSE4-NEXT: LBB32_5: ## %cond.store26
; SSE4-NEXT: movl $0, 8(%rsi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je LBB32_8
; SSE4-NEXT: LBB32_7: ## %cond.store28
; SSE4-NEXT: movl $0, 12(%rsi)
; SSE4-NEXT: testb $16, %al
; SSE4-NEXT: je LBB32_10
; SSE4-NEXT: LBB32_9: ## %cond.store30
; SSE4-NEXT: movl $0, 16(%rsi)
; SSE4-NEXT: testb $32, %al
; SSE4-NEXT: je LBB32_12
; SSE4-NEXT: LBB32_11: ## %cond.store32
; SSE4-NEXT: movl $0, 20(%rsi)
; SSE4-NEXT: testb $64, %al
; SSE4-NEXT: je LBB32_14
; SSE4-NEXT: LBB32_13: ## %cond.store34
; SSE4-NEXT: movl $0, 24(%rsi)
; SSE4-NEXT: testb $-128, %al
; SSE4-NEXT: je LBB32_16
; SSE4-NEXT: LBB32_15: ## %cond.store36
; SSE4-NEXT: movl $0, 28(%rsi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: undefshuffle:
; AVX1: ## %bb.0:
220 changes: 120 additions & 100 deletions llvm/test/CodeGen/X86/midpoint-int-vec-512.ll

(Large diff not rendered.)

294 changes: 206 additions & 88 deletions llvm/test/CodeGen/X86/movmsk-cmp.ll

(Large diff not rendered.)

30 changes: 15 additions & 15 deletions llvm/test/CodeGen/X86/mulvi32.ll
@@ -134,31 +134,31 @@ define <4 x i32> @_mul4xi32b(<4 x i32>, <4 x i32>) {
define <4 x i64> @_mul4xi32toi64a(<4 x i32>, <4 x i32>) {
; SSE2-LABEL: _mul4xi32toi64a:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE42-LABEL: _mul4xi32toi64a:
; SSE42: # %bb.0:
; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,3,3]
; SSE42-NEXT: pmuludq %xmm3, %xmm2
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,3,3]
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,3,3]
; SSE42-NEXT: pmuludq %xmm3, %xmm1
; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE42-NEXT: pmuludq %xmm1, %xmm0
; SSE42-NEXT: movdqa %xmm2, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: _mul4xi32toi64a:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,3,3]
; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
208 changes: 104 additions & 104 deletions llvm/test/CodeGen/X86/nontemporal-3.ll

(Large diff not rendered.)

72 changes: 36 additions & 36 deletions llvm/test/CodeGen/X86/pmulh.ll
@@ -319,41 +319,41 @@ define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: and_mulhuw_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767]
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: pand %xmm8, %xmm1
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm8, %xmm7
; SSE2-NEXT: pand %xmm8, %xmm6
; SSE2-NEXT: packssdw %xmm7, %xmm6
; SSE2-NEXT: pmulhw %xmm2, %xmm6
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: pand %xmm4, %xmm8
; SSE2-NEXT: packssdw %xmm5, %xmm8
; SSE2-NEXT: pmulhw %xmm8, %xmm0
; SSE2-NEXT: movdqa %xmm6, %xmm1
; SSE2-NEXT: pand %xmm8, %xmm4
; SSE2-NEXT: packssdw %xmm5, %xmm4
; SSE2-NEXT: pmulhw %xmm4, %xmm0
; SSE2-NEXT: pand %xmm8, %xmm7
; SSE2-NEXT: pand %xmm6, %xmm8
; SSE2-NEXT: packssdw %xmm7, %xmm8
; SSE2-NEXT: pmulhw %xmm2, %xmm8
; SSE2-NEXT: movdqa %xmm8, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: and_mulhuw_v16i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767]
; SSE41-NEXT: pand %xmm8, %xmm3
; SSE41-NEXT: pand %xmm8, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: pand %xmm8, %xmm1
; SSE41-NEXT: pand %xmm8, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: pand %xmm8, %xmm7
; SSE41-NEXT: pand %xmm8, %xmm6
; SSE41-NEXT: packusdw %xmm7, %xmm6
; SSE41-NEXT: pmulhw %xmm2, %xmm6
; SSE41-NEXT: pand %xmm8, %xmm3
; SSE41-NEXT: pand %xmm8, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: pand %xmm8, %xmm5
; SSE41-NEXT: pand %xmm4, %xmm8
; SSE41-NEXT: packusdw %xmm5, %xmm8
; SSE41-NEXT: pmulhw %xmm8, %xmm0
; SSE41-NEXT: movdqa %xmm6, %xmm1
; SSE41-NEXT: pand %xmm8, %xmm4
; SSE41-NEXT: packusdw %xmm5, %xmm4
; SSE41-NEXT: pmulhw %xmm4, %xmm0
; SSE41-NEXT: pand %xmm8, %xmm7
; SSE41-NEXT: pand %xmm6, %xmm8
; SSE41-NEXT: packusdw %xmm7, %xmm8
; SSE41-NEXT: pmulhw %xmm2, %xmm8
; SSE41-NEXT: movdqa %xmm8, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: and_mulhuw_v16i16:
@@ -417,39 +417,39 @@ define <16 x i16> @sext_mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
define <16 x i16> @ashr_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: ashr_mulhuw_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: psrad $16, %xmm5
; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: packssdw %xmm5, %xmm4
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: pmulhw %xmm4, %xmm0
; SSE2-NEXT: psrad $16, %xmm7
; SSE2-NEXT: psrad $16, %xmm6
; SSE2-NEXT: packssdw %xmm7, %xmm6
; SSE2-NEXT: psrad $16, %xmm3
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: pmulhw %xmm6, %xmm2
; SSE2-NEXT: psrad $16, %xmm5
; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: packssdw %xmm5, %xmm4
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: pmulhw %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: ashr_mulhuw_v16i16:
; SSE41: # %bb.0:
; SSE41-NEXT: psrld $16, %xmm3
; SSE41-NEXT: psrld $16, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: psrld $16, %xmm7
; SSE41-NEXT: psrld $16, %xmm6
; SSE41-NEXT: packusdw %xmm7, %xmm6
; SSE41-NEXT: pmulhw %xmm2, %xmm6
; SSE41-NEXT: psrld $16, %xmm3
; SSE41-NEXT: psrld $16, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: psrld $16, %xmm5
; SSE41-NEXT: psrld $16, %xmm4
; SSE41-NEXT: packusdw %xmm5, %xmm4
; SSE41-NEXT: pmulhw %xmm4, %xmm0
; SSE41-NEXT: psrld $16, %xmm7
; SSE41-NEXT: psrld $16, %xmm6
; SSE41-NEXT: packusdw %xmm7, %xmm6
; SSE41-NEXT: pmulhw %xmm2, %xmm6
; SSE41-NEXT: movdqa %xmm6, %xmm1
; SSE41-NEXT: retq
;
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/popcnt.ll
@@ -62,7 +62,7 @@ define i8 @cnt8(i8 %x) nounwind readnone {
define i16 @cnt16(i16 %x) nounwind readnone {
; X86-LABEL: cnt16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: shrl %ecx
; X86-NEXT: andl $21845, %ecx # imm = 0x5555
@@ -1523,7 +1523,7 @@ define i32 @popcount_zext_i32(i16 zeroext %x) {
define i32 @popcount_i16_zext(i16 zeroext %x) {
; X86-LABEL: popcount_i16_zext:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: shrl %ecx
; X86-NEXT: andl $21845, %ecx # imm = 0x5555
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/promote-vec3.ll
@@ -42,13 +42,13 @@ define <3 x i16> @zext_i8(<3 x i8>) {
;
; AVX-64-LABEL: zext_i8:
; AVX-64: # %bb.0:
; AVX-64-NEXT: movzbl %sil, %esi
; AVX-64-NEXT: movzbl %dl, %ecx
; AVX-64-NEXT: movzbl %sil, %edx
; AVX-64-NEXT: vmovd %edi, %xmm0
; AVX-64-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-64-NEXT: movzbl %dl, %ecx
; AVX-64-NEXT: vmovd %xmm0, %eax
; AVX-64-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-64-NEXT: movl %esi, %edx
; AVX-64-NEXT: # kill: def $dx killed $dx killed $edx
; AVX-64-NEXT: # kill: def $cx killed $cx killed $ecx
; AVX-64-NEXT: retq
%2 = zext <3 x i8> %0 to <3 x i16>
56 changes: 28 additions & 28 deletions llvm/test/CodeGen/X86/psubus.ll
@@ -1803,58 +1803,58 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
; SSE2OR3-LABEL: psubus_16i32_max:
; SSE2OR3: # %bb.0: # %vector.ph
; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
; SSE2OR3-NEXT: movdqa %xmm3, %xmm8
; SSE2OR3-NEXT: movdqa %xmm5, %xmm8
; SSE2OR3-NEXT: pxor %xmm7, %xmm8
; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
; SSE2OR3-NEXT: movdqa %xmm6, %xmm9
; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9
; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8
; SSE2OR3-NEXT: pand %xmm9, %xmm5
; SSE2OR3-NEXT: pxor %xmm8, %xmm9
; SSE2OR3-NEXT: por %xmm5, %xmm9
; SSE2OR3-NEXT: pslld $16, %xmm9
; SSE2OR3-NEXT: psrad $16, %xmm9
; SSE2OR3-NEXT: movdqa %xmm4, %xmm10
; SSE2OR3-NEXT: pxor %xmm7, %xmm10
; SSE2OR3-NEXT: movdqa %xmm6, %xmm5
; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm5
; SSE2OR3-NEXT: pand %xmm5, %xmm4
; SSE2OR3-NEXT: pxor %xmm8, %xmm5
; SSE2OR3-NEXT: por %xmm4, %xmm5
; SSE2OR3-NEXT: pslld $16, %xmm5
; SSE2OR3-NEXT: psrad $16, %xmm5
; SSE2OR3-NEXT: packssdw %xmm9, %xmm5
; SSE2OR3-NEXT: movdqa %xmm3, %xmm4
; SSE2OR3-NEXT: pxor %xmm7, %xmm4
; SSE2OR3-NEXT: movdqa %xmm6, %xmm9
; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm9
; SSE2OR3-NEXT: pand %xmm9, %xmm3
; SSE2OR3-NEXT: pxor %xmm8, %xmm9
; SSE2OR3-NEXT: por %xmm3, %xmm9
; SSE2OR3-NEXT: pslld $16, %xmm9
; SSE2OR3-NEXT: psrad $16, %xmm9
; SSE2OR3-NEXT: movdqa %xmm2, %xmm3
; SSE2OR3-NEXT: pxor %xmm7, %xmm3
; SSE2OR3-NEXT: movdqa %xmm6, %xmm10
; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm10
; SSE2OR3-NEXT: pand %xmm10, %xmm2
; SSE2OR3-NEXT: pxor %xmm8, %xmm10
; SSE2OR3-NEXT: por %xmm2, %xmm10
; SSE2OR3-NEXT: pslld $16, %xmm10
; SSE2OR3-NEXT: psrad $16, %xmm10
; SSE2OR3-NEXT: packssdw %xmm9, %xmm10
; SSE2OR3-NEXT: psubusw %xmm10, %xmm0
; SSE2OR3-NEXT: movdqa %xmm5, %xmm2
; SSE2OR3-NEXT: pxor %xmm7, %xmm2
; SSE2OR3-NEXT: movdqa %xmm6, %xmm3
; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm3
; SSE2OR3-NEXT: pand %xmm3, %xmm5
; SSE2OR3-NEXT: pxor %xmm8, %xmm3
; SSE2OR3-NEXT: por %xmm5, %xmm3
; SSE2OR3-NEXT: pslld $16, %xmm3
; SSE2OR3-NEXT: psrad $16, %xmm3
; SSE2OR3-NEXT: pxor %xmm4, %xmm7
; SSE2OR3-NEXT: pxor %xmm2, %xmm7
; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm6
; SSE2OR3-NEXT: pxor %xmm6, %xmm8
; SSE2OR3-NEXT: pand %xmm4, %xmm6
; SSE2OR3-NEXT: pand %xmm2, %xmm6
; SSE2OR3-NEXT: por %xmm8, %xmm6
; SSE2OR3-NEXT: pslld $16, %xmm6
; SSE2OR3-NEXT: psrad $16, %xmm6
; SSE2OR3-NEXT: packssdw %xmm3, %xmm6
; SSE2OR3-NEXT: psubusw %xmm6, %xmm1
; SSE2OR3-NEXT: packssdw %xmm9, %xmm6
; SSE2OR3-NEXT: psubusw %xmm6, %xmm0
; SSE2OR3-NEXT: psubusw %xmm5, %xmm1
; SSE2OR3-NEXT: retq
;
; SSE41-LABEL: psubus_16i32_max:
; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535]
; SSE41-NEXT: pminud %xmm6, %xmm5
; SSE41-NEXT: pminud %xmm6, %xmm4
; SSE41-NEXT: packusdw %xmm5, %xmm4
; SSE41-NEXT: pminud %xmm6, %xmm3
; SSE41-NEXT: pminud %xmm6, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: psubusw %xmm2, %xmm0
; SSE41-NEXT: pminud %xmm6, %xmm5
; SSE41-NEXT: pminud %xmm6, %xmm4
; SSE41-NEXT: packusdw %xmm5, %xmm4
; SSE41-NEXT: psubusw %xmm4, %xmm1
; SSE41-NEXT: retq
;
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/shift-mask.ll
@@ -142,9 +142,9 @@ define i16 @test_i16_shl_lshr_1(i16 %a0) {
define i16 @test_i16_shl_lshr_2(i16 %a0) {
; X86-LABEL: test_i16_shl_lshr_2:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shrl $2, %eax
; X86-NEXT: andl $16376, %eax # imm = 0x3FF8
; X86-NEXT: andl $-8, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
@@ -411,7 +411,7 @@ define i16 @test_i16_lshr_lshr_0(i16 %a0) {
define i16 @test_i16_lshr_lshr_1(i16 %a0) {
; X86-LABEL: test_i16_lshr_lshr_1:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shrl $2, %eax
; X86-NEXT: andl $2047, %eax # imm = 0x7FF
; X86-NEXT: # kill: def $ax killed $ax killed $eax
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll
@@ -463,8 +463,8 @@ define void @shuffle_v16i8_to_v2i8_2(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_2:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rsi)
@@ -541,8 +541,8 @@ define void @shuffle_v16i8_to_v2i8_4(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_4:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rsi)
@@ -619,8 +619,8 @@ define void @shuffle_v16i8_to_v2i8_6(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_6:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rsi)
11 changes: 4 additions & 7 deletions llvm/test/CodeGen/X86/single_elt_vector_memory_operation.ll
@@ -56,23 +56,20 @@ define void @store_single_128bit_elt_vector(ptr %in, ptr %off, ptr %out) nounwin
;
; AVX-LABEL: store_single_128bit_elt_vector:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %ymm0
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, (%rdx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: store_single_128bit_elt_vector:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovaps (%rdi), %xmm0
; AVX2-NEXT: vmovaps %xmm0, (%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: store_single_128bit_elt_vector:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %ymm0
; AVX512F-NEXT: vmovaps (%rdi), %xmm0
; AVX512F-NEXT: vmovaps %xmm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
%i0 = load <32 x i8>, ptr %in, align 64
%i1 = bitcast <32 x i8> %i0 to <2 x i128>
@@ -152,7 +149,7 @@ define void @store_single_256bit_elt_vector(ptr %in, ptr %off, ptr %out) nounwin
;
; AVX512F-LABEL: store_single_256bit_elt_vector:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %zmm0
; AVX512F-NEXT: vmovaps (%rdi), %ymm0
; AVX512F-NEXT: vmovaps %ymm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/smax.ll
@@ -660,8 +660,8 @@ define i16 @test_signbits_i16(i16 %a, i16 %b) nounwind {
; X64: # %bb.0:
; X64-NEXT: movswl %si, %eax
; X64-NEXT: movswl %di, %ecx
; X64-NEXT: sarl $15, %ecx
; X64-NEXT: sarl $8, %eax
; X64-NEXT: shrl $15, %ecx
; X64-NEXT: shrl $8, %eax
; X64-NEXT: cmpw %ax, %cx
; X64-NEXT: cmovgl %ecx, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
@@ -671,7 +671,7 @@ define i16 @test_signbits_i16(i16 %a, i16 %b) nounwind {
; X86: # %bb.0:
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax
; X86-NEXT: sarl $15, %eax
; X86-NEXT: shrl $15, %eax
; X86-NEXT: cmpw %cx, %ax
; X86-NEXT: cmovlel %ecx, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/smin.ll
@@ -659,8 +659,8 @@ define i16 @test_signbits_i16(i16 %a, i16 %b) nounwind {
; X64: # %bb.0:
; X64-NEXT: movswl %si, %eax
; X64-NEXT: movswl %di, %ecx
; X64-NEXT: sarl $15, %ecx
; X64-NEXT: sarl $8, %eax
; X64-NEXT: shrl $15, %ecx
; X64-NEXT: shrl $8, %eax
; X64-NEXT: cmpw %ax, %cx
; X64-NEXT: cmovll %ecx, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
@@ -670,7 +670,7 @@ define i16 @test_signbits_i16(i16 %a, i16 %b) nounwind {
; X86: # %bb.0:
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax
; X86-NEXT: sarl $15, %eax
; X86-NEXT: shrl $15, %eax
; X86-NEXT: cmpw %cx, %ax
; X86-NEXT: cmovgel %ecx, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -89,8 +89,8 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X86-NEXT: .cfi_offset %edi, -16
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ebx
@@ -251,10 +251,10 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X86-NEXT: addl %eax, %esi
; X86-NEXT: adcl %edx, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: sarl $31, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ecx
; X86-NEXT: sarl $31, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movl %eax, %ebp
@@ -585,8 +585,8 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: .cfi_offset %edi, -16
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ebx
@@ -1295,8 +1295,8 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: addl %edx, %ecx
; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: addl %eax, %ecx
@@ -1315,9 +1315,9 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: adcl $0, %esi
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl %edi, %ecx
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, %ebx
@@ -1379,9 +1379,9 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/umax.ll
@@ -1249,8 +1249,8 @@ define i16 @test_signbits_i16(i16 %a, i16 %b) nounwind {
; X64: # %bb.0:
; X64-NEXT: movswl %si, %eax
; X64-NEXT: movswl %di, %ecx
; X64-NEXT: sarl $15, %ecx
; X64-NEXT: sarl $8, %eax
; X64-NEXT: shrl $15, %ecx
; X64-NEXT: shrl $8, %eax
; X64-NEXT: cmpw %ax, %cx
; X64-NEXT: cmoval %ecx, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
@@ -1260,7 +1260,7 @@ define i16 @test_signbits_i16(i16 %a, i16 %b) nounwind {
; X86: # %bb.0:
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax
; X86-NEXT: sarl $15, %eax
; X86-NEXT: shrl $15, %eax
; X86-NEXT: cmpw %cx, %ax
; X86-NEXT: cmovbel %ecx, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/umin.ll
@@ -668,8 +668,8 @@ define i16 @test_signbits_i16(i16 %a, i16 %b) nounwind {
; X64: # %bb.0:
; X64-NEXT: movswl %si, %eax
; X64-NEXT: movswl %di, %ecx
; X64-NEXT: sarl $15, %ecx
; X64-NEXT: sarl $8, %eax
; X64-NEXT: shrl $15, %ecx
; X64-NEXT: shrl $8, %eax
; X64-NEXT: cmpw %ax, %cx
; X64-NEXT: cmovbl %ecx, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
@@ -679,7 +679,7 @@ define i16 @test_signbits_i16(i16 %a, i16 %b) nounwind {
; X86: # %bb.0:
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax
; X86-NEXT: sarl $15, %eax
; X86-NEXT: shrl $15, %eax
; X86-NEXT: cmpw %cx, %ax
; X86-NEXT: cmovael %ecx, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -999,7 +999,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
;
; AVX2-LABEL: splatvar_funnnel_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
@@ -1010,7 +1010,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
;
; AVX512F-LABEL: splatvar_funnnel_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
@@ -1021,7 +1021,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
;
; AVX512VL-LABEL: splatvar_funnnel_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
@@ -1032,7 +1032,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
;
; AVX512BW-LABEL: splatvar_funnnel_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
@@ -1052,7 +1052,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
;
; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
@@ -1087,7 +1087,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
;
; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/vector-fshl-512.ll
@@ -580,7 +580,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
@@ -597,7 +597,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
;
; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -773,7 +773,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; AVX2-LABEL: splatvar_funnnel_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm4
; AVX2-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
@@ -784,7 +784,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; AVX512F-LABEL: splatvar_funnnel_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4
; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
@@ -795,7 +795,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; AVX512VL-LABEL: splatvar_funnnel_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4
; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
@@ -806,7 +806,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; AVX512BW-LABEL: splatvar_funnnel_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm4
; AVX512BW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
@@ -817,7 +817,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm4
; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
@@ -334,7 +334,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm4
; AVX512BW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3
@@ -345,7 +345,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw
;
; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm4
; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -1032,7 +1032,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
;
; AVX2-LABEL: splatvar_funnnel_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
Expand All @@ -1043,7 +1043,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
;
; AVX512F-LABEL: splatvar_funnnel_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
Expand All @@ -1054,7 +1054,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
;
; AVX512VL-LABEL: splatvar_funnnel_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
Expand All @@ -1065,7 +1065,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
;
; AVX512BW-LABEL: splatvar_funnnel_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1085,7 +1085,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
;
; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1121,7 +1121,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
;
; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/vector-fshr-512.ll
@@ -582,7 +582,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -600,7 +600,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
;
; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -812,7 +812,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; AVX2-LABEL: splatvar_funnnel_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm1
@@ -823,7 +823,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; AVX512F-LABEL: splatvar_funnnel_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512F-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1
@@ -834,7 +834,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; AVX512VL-LABEL: splatvar_funnnel_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1
@@ -845,7 +845,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; AVX512BW-LABEL: splatvar_funnnel_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1
@@ -856,7 +856,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
;
; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
@@ -332,7 +332,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1
@@ -343,7 +343,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw
;
; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1
62 changes: 31 additions & 31 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
@@ -262,62 +262,62 @@ define void @load_i16_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
; SSE-LABEL: load_i16_stride2_vf32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm5
; SSE-NEXT: movdqa 32(%rdi), %xmm2
; SSE-NEXT: movdqa 48(%rdi), %xmm4
; SSE-NEXT: movdqa 64(%rdi), %xmm0
; SSE-NEXT: movdqa 80(%rdi), %xmm4
; SSE-NEXT: movdqa 96(%rdi), %xmm1
; SSE-NEXT: movdqa 112(%rdi), %xmm7
; SSE-NEXT: movdqa 64(%rdi), %xmm3
; SSE-NEXT: movdqa 80(%rdi), %xmm9
; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm8[0]
; SSE-NEXT: movdqa 112(%rdi), %xmm6
; SSE-NEXT: movdqa (%rdi), %xmm2
; SSE-NEXT: movdqa 16(%rdi), %xmm7
; SSE-NEXT: movdqa 32(%rdi), %xmm3
; SSE-NEXT: movdqa 48(%rdi), %xmm9
; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm8[0]
; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0]
; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm6[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm11[0]
; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm4[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm2[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,2,3]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm11[0]
; SSE-NEXT: psrad $16, %xmm9
; SSE-NEXT: psrad $16, %xmm3
; SSE-NEXT: packssdw %xmm9, %xmm3
; SSE-NEXT: psrad $16, %xmm7
; SSE-NEXT: psrad $16, %xmm2
; SSE-NEXT: packssdw %xmm7, %xmm2
; SSE-NEXT: psrad $16, %xmm6
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: packssdw %xmm7, %xmm1
; SSE-NEXT: psrad $16, %xmm5
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm5, %xmm0
; SSE-NEXT: packssdw %xmm6, %xmm1
; SSE-NEXT: psrad $16, %xmm4
; SSE-NEXT: psrad $16, %xmm2
; SSE-NEXT: packssdw %xmm4, %xmm2
; SSE-NEXT: movdqa %xmm12, 16(%rsi)
; SSE-NEXT: movdqa %xmm10, (%rsi)
; SSE-NEXT: movdqa %xmm8, 48(%rsi)
; SSE-NEXT: movdqa %xmm6, 32(%rsi)
; SSE-NEXT: movdqa %xmm2, 16(%rdx)
; SSE-NEXT: movdqa %xmm0, (%rdx)
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm12, 32(%rsi)
; SSE-NEXT: movdqa %xmm10, 48(%rsi)
; SSE-NEXT: movdqa %xmm8, (%rsi)
; SSE-NEXT: movdqa %xmm5, 16(%rsi)
; SSE-NEXT: movdqa %xmm0, 32(%rdx)
; SSE-NEXT: movdqa %xmm1, 48(%rdx)
; SSE-NEXT: movdqa %xmm3, 32(%rdx)
; SSE-NEXT: movdqa %xmm2, (%rdx)
; SSE-NEXT: movdqa %xmm3, 16(%rdx)
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: load_i16_stride2_vf32:
626 changes: 313 additions & 313 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll

Large diffs are not rendered by default.

244 changes: 122 additions & 122 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll

Large diffs are not rendered by default.

3,521 changes: 1,764 additions & 1,757 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll

Large diffs are not rendered by default.

3,527 changes: 1,765 additions & 1,762 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll

Large diffs are not rendered by default.

8,021 changes: 4,007 additions & 4,014 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll

Large diffs are not rendered by default.

244 changes: 121 additions & 123 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll

Large diffs are not rendered by default.

32 changes: 16 additions & 16 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll
@@ -182,30 +182,30 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps 32(%rdi), %xmm2
; SSE-NEXT: movaps 48(%rdi), %xmm3
; SSE-NEXT: movaps 112(%rdi), %xmm4
; SSE-NEXT: movaps 96(%rdi), %xmm5
; SSE-NEXT: movaps 80(%rdi), %xmm6
; SSE-NEXT: movaps 64(%rdi), %xmm7
; SSE-NEXT: movaps 80(%rdi), %xmm4
; SSE-NEXT: movaps 64(%rdi), %xmm5
; SSE-NEXT: movaps 112(%rdi), %xmm6
; SSE-NEXT: movaps 96(%rdi), %xmm7
; SSE-NEXT: movaps %xmm7, %xmm8
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm6[0,2]
; SSE-NEXT: movaps %xmm5, %xmm9
; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm4[0,2]
; SSE-NEXT: movaps %xmm0, %xmm10
; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm1[0,2]
; SSE-NEXT: movaps %xmm2, %xmm11
; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm3[0,2]
; SSE-NEXT: movaps %xmm2, %xmm10
; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm3[0,2]
; SSE-NEXT: movaps %xmm0, %xmm11
; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm1[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm6[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm4[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE-NEXT: movaps %xmm11, 16(%rsi)
; SSE-NEXT: movaps %xmm10, (%rsi)
; SSE-NEXT: movaps %xmm9, 48(%rsi)
; SSE-NEXT: movaps %xmm8, 32(%rsi)
; SSE-NEXT: movaps %xmm2, 16(%rdx)
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT: movaps %xmm9, 32(%rsi)
; SSE-NEXT: movaps %xmm8, 48(%rsi)
; SSE-NEXT: movaps %xmm11, (%rsi)
; SSE-NEXT: movaps %xmm10, 16(%rsi)
; SSE-NEXT: movaps %xmm5, 32(%rdx)
; SSE-NEXT: movaps %xmm7, 48(%rdx)
; SSE-NEXT: movaps %xmm0, (%rdx)
; SSE-NEXT: movaps %xmm5, 48(%rdx)
; SSE-NEXT: movaps %xmm7, 32(%rdx)
; SSE-NEXT: movaps %xmm2, 16(%rdx)
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: load_i32_stride2_vf16:
362 changes: 183 additions & 179 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll

Large diffs are not rendered by default.

354 changes: 178 additions & 176 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll

Large diffs are not rendered by default.

3,130 changes: 1,573 additions & 1,557 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll

Large diffs are not rendered by default.

4,362 changes: 2,181 additions & 2,181 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll

Large diffs are not rendered by default.

6,578 changes: 3,288 additions & 3,290 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll

Large diffs are not rendered by default.

150 changes: 75 additions & 75 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
@@ -901,69 +901,69 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 480(%rdi), %xmm11
; SSE-NEXT: movaps 448(%rdi), %xmm3
; SSE-NEXT: movaps 160(%rdi), %xmm6
; SSE-NEXT: movaps 128(%rdi), %xmm13
; SSE-NEXT: movaps 160(%rdi), %xmm8
; SSE-NEXT: movaps 128(%rdi), %xmm15
; SSE-NEXT: movaps 224(%rdi), %xmm5
; SSE-NEXT: movaps 192(%rdi), %xmm10
; SSE-NEXT: movaps %xmm10, %xmm8
; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
; SSE-NEXT: movaps %xmm13, %xmm9
; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
; SSE-NEXT: movaps %xmm9, %xmm14
; SSE-NEXT: movaps %xmm10, %xmm6
; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; SSE-NEXT: movaps %xmm15, %xmm9
; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
; SSE-NEXT: movaps %xmm9, %xmm13
; SSE-NEXT: movaps %xmm9, %xmm12
; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm8[0]
; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm6[0]
; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm3, %xmm9
; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
; SSE-NEXT: movaps %xmm4, %xmm14
; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm8[1]
; SSE-NEXT: movaps %xmm4, %xmm13
; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1]
; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm14, %xmm0
; SSE-NEXT: movaps %xmm13, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm9[1]
; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm7, %xmm8
; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm9[1]
; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm7, %xmm6
; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
; SSE-NEXT: movaps 256(%rdi), %xmm9
; SSE-NEXT: movaps %xmm9, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0]
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 96(%rdi), %xmm8
; SSE-NEXT: movaps 96(%rdi), %xmm6
; SSE-NEXT: movaps 64(%rdi), %xmm12
; SSE-NEXT: movaps %xmm12, %xmm14
; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1]
; SSE-NEXT: movaps %xmm12, %xmm13
; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1]
; SSE-NEXT: movaps (%rdi), %xmm2
; SSE-NEXT: movaps 32(%rdi), %xmm15
; SSE-NEXT: movaps 32(%rdi), %xmm14
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm14[0]
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm14[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3]
; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm6[2],xmm13[3],xmm6[3]
; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3]
; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm11[2],xmm3[3],xmm11[3]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3]
; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3]
; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm8[2],xmm12[3],xmm8[3]
; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3]
; SSE-NEXT: movaps %xmm13, %xmm0
; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm6[2],xmm12[3],xmm6[3]
; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE-NEXT: movaps %xmm15, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1]
; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm10[1]
; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm11, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -981,8 +981,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 240(%rdi), %xmm1
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 208(%rdi), %xmm12
; SSE-NEXT: movaps %xmm12, %xmm0
; SSE-NEXT: movaps 208(%rdi), %xmm15
; SSE-NEXT: movaps %xmm15, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movaps 176(%rdi), %xmm2
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -994,36 +994,36 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 368(%rdi), %xmm0
; SSE-NEXT: movaps 496(%rdi), %xmm0
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 336(%rdi), %xmm3
; SSE-NEXT: movaps 464(%rdi), %xmm3
; SSE-NEXT: movaps %xmm3, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movaps 304(%rdi), %xmm0
; SSE-NEXT: movaps 432(%rdi), %xmm0
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 272(%rdi), %xmm10
; SSE-NEXT: movaps 400(%rdi), %xmm10
; SSE-NEXT: movaps %xmm10, %xmm8
; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
; SSE-NEXT: movaps %xmm8, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1]
; SSE-NEXT: movaps 496(%rdi), %xmm14
; SSE-NEXT: movaps 464(%rdi), %xmm2
; SSE-NEXT: movaps 368(%rdi), %xmm12
; SSE-NEXT: movaps 336(%rdi), %xmm2
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1]
; SSE-NEXT: movaps 432(%rdi), %xmm13
; SSE-NEXT: movaps 400(%rdi), %xmm15
; SSE-NEXT: movaps %xmm15, %xmm7
; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
; SSE-NEXT: movaps 304(%rdi), %xmm11
; SSE-NEXT: movaps 272(%rdi), %xmm13
; SSE-NEXT: movaps %xmm13, %xmm7
; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1]
; SSE-NEXT: movaps %xmm7, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1]
; SSE-NEXT: movaps 112(%rdi), %xmm11
; SSE-NEXT: movaps 112(%rdi), %xmm14
; SSE-NEXT: movaps 80(%rdi), %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
; SSE-NEXT: movaps 16(%rdi), %xmm6
; SSE-NEXT: movaps 48(%rdi), %xmm9
; SSE-NEXT: movaps %xmm6, %xmm5
Expand All @@ -1032,31 +1032,31 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3]
; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm11[2],xmm13[3],xmm11[3]
; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3]
; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm13[2],xmm15[3],xmm13[3]
; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3]
; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3]
; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3]
; SSE-NEXT: movaps %xmm0, %xmm4
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1]
; SSE-NEXT: movaps %xmm0, %xmm9
; SSE-NEXT: movaps %xmm10, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE-NEXT: movaps %xmm4, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm15[1]
; SSE-NEXT: movaps %xmm4, %xmm9
; SSE-NEXT: movaps %xmm13, %xmm4
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1]
; SSE-NEXT: movaps %xmm10, %xmm2
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm3[1]
; SSE-NEXT: movaps %xmm15, %xmm3
; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm2[1]
; SSE-NEXT: movaps %xmm6, %xmm2
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: movaps %xmm6, %xmm3
; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 32(%rsi)
@@ -1099,19 +1099,19 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 16(%r9)
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movaps %xmm7, 48(%rax)
; SSE-NEXT: movaps %xmm8, 32(%rax)
; SSE-NEXT: movaps %xmm7, 32(%rax)
; SSE-NEXT: movaps %xmm8, 48(%rax)
; SSE-NEXT: movaps %xmm5, (%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 16(%rax)
; SSE-NEXT: movaps %xmm5, (%rax)
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movaps %xmm3, 48(%rax)
; SSE-NEXT: movaps %xmm0, 32(%rax)
; SSE-NEXT: movaps %xmm4, 16(%rax)
; SSE-NEXT: movaps %xmm2, (%rax)
; SSE-NEXT: movaps %xmm2, 48(%rax)
; SSE-NEXT: movaps %xmm4, 32(%rax)
; SSE-NEXT: movaps %xmm0, 16(%rax)
; SSE-NEXT: movaps %xmm3, (%rax)
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movaps %xmm15, 48(%rax)
; SSE-NEXT: movaps %xmm10, 32(%rax)
; SSE-NEXT: movaps %xmm10, 48(%rax)
; SSE-NEXT: movaps %xmm13, 32(%rax)
; SSE-NEXT: movaps %xmm9, 16(%rax)
; SSE-NEXT: movaps %xmm6, (%rax)
; SSE-NEXT: addq $296, %rsp # imm = 0x128
32 changes: 16 additions & 16 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll
@@ -154,30 +154,30 @@ define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps 32(%rdi), %xmm2
; SSE-NEXT: movaps 48(%rdi), %xmm3
; SSE-NEXT: movaps 112(%rdi), %xmm4
; SSE-NEXT: movaps 96(%rdi), %xmm5
; SSE-NEXT: movaps 80(%rdi), %xmm6
; SSE-NEXT: movaps 64(%rdi), %xmm7
; SSE-NEXT: movaps 80(%rdi), %xmm4
; SSE-NEXT: movaps 64(%rdi), %xmm5
; SSE-NEXT: movaps 112(%rdi), %xmm6
; SSE-NEXT: movaps 96(%rdi), %xmm7
; SSE-NEXT: movaps %xmm7, %xmm8
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0]
; SSE-NEXT: movaps %xmm5, %xmm9
; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0]
; SSE-NEXT: movaps %xmm0, %xmm10
; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0]
; SSE-NEXT: movaps %xmm2, %xmm11
; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm3[0]
; SSE-NEXT: movaps %xmm2, %xmm10
; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm3[0]
; SSE-NEXT: movaps %xmm0, %xmm11
; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm1[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm4[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSE-NEXT: movaps %xmm11, 16(%rsi)
; SSE-NEXT: movaps %xmm10, (%rsi)
; SSE-NEXT: movaps %xmm9, 48(%rsi)
; SSE-NEXT: movaps %xmm8, 32(%rsi)
; SSE-NEXT: movaps %xmm2, 16(%rdx)
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: movaps %xmm9, 32(%rsi)
; SSE-NEXT: movaps %xmm8, 48(%rsi)
; SSE-NEXT: movaps %xmm11, (%rsi)
; SSE-NEXT: movaps %xmm10, 16(%rsi)
; SSE-NEXT: movaps %xmm5, 32(%rdx)
; SSE-NEXT: movaps %xmm7, 48(%rdx)
; SSE-NEXT: movaps %xmm0, (%rdx)
; SSE-NEXT: movaps %xmm5, 48(%rdx)
; SSE-NEXT: movaps %xmm7, 32(%rdx)
; SSE-NEXT: movaps %xmm2, 16(%rdx)
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: load_i64_stride2_vf8:
162 changes: 81 additions & 81 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
@@ -358,104 +358,104 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-LABEL: load_i64_stride3_vf16:
; SSE: # %bb.0:
; SSE-NEXT: subq $24, %rsp
; SSE-NEXT: movapd 272(%rdi), %xmm0
; SSE-NEXT: movapd 224(%rdi), %xmm2
; SSE-NEXT: movapd 368(%rdi), %xmm1
; SSE-NEXT: movapd 320(%rdi), %xmm3
; SSE-NEXT: movapd 128(%rdi), %xmm4
; SSE-NEXT: movapd 240(%rdi), %xmm5
; SSE-NEXT: movapd 256(%rdi), %xmm10
; SSE-NEXT: movapd 192(%rdi), %xmm6
; SSE-NEXT: movapd 128(%rdi), %xmm0
; SSE-NEXT: movapd 176(%rdi), %xmm1
; SSE-NEXT: movapd 224(%rdi), %xmm4
; SSE-NEXT: movapd 272(%rdi), %xmm3
; SSE-NEXT: movapd 80(%rdi), %xmm2
; SSE-NEXT: movapd 96(%rdi), %xmm5
; SSE-NEXT: movapd 112(%rdi), %xmm11
; SSE-NEXT: movapd 144(%rdi), %xmm6
; SSE-NEXT: movapd 160(%rdi), %xmm14
; SSE-NEXT: movapd 192(%rdi), %xmm7
; SSE-NEXT: movapd 208(%rdi), %xmm12
; SSE-NEXT: movapd 336(%rdi), %xmm7
; SSE-NEXT: movapd 352(%rdi), %xmm14
; SSE-NEXT: movapd 288(%rdi), %xmm11
; SSE-NEXT: movapd 304(%rdi), %xmm15
; SSE-NEXT: movapd 96(%rdi), %xmm9
; SSE-NEXT: movapd 112(%rdi), %xmm13
; SSE-NEXT: movapd 240(%rdi), %xmm10
; SSE-NEXT: movapd 256(%rdi), %xmm13
; SSE-NEXT: movapd 48(%rdi), %xmm9
; SSE-NEXT: movapd 64(%rdi), %xmm15
; SSE-NEXT: movapd %xmm15, %xmm8
; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm11[0],xmm8[1]
; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1]
; SSE-NEXT: movapd %xmm8, (%rsp) # 16-byte Spill
; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm3[0]
; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm15[0],xmm3[1]
; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm2[0]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm15[0],xmm2[1]
; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movapd %xmm14, %xmm15
; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm7[0],xmm15[1]
; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm1[0]
; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm6[0],xmm15[1]
; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm1[0]
; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movapd %xmm12, %xmm14
; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm6[0],xmm14[1]
; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm2[0]
; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm12[0],xmm2[1]
; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movapd %xmm10, %xmm12
; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm5[0],xmm12[1]
; SSE-NEXT: movapd %xmm11, %xmm14
; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm5[0],xmm14[1]
; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm0[0]
; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1]
; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movapd %xmm13, %xmm10
; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm9[0],xmm10[1]
; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm4[0]
; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm13[0],xmm4[1]
; SSE-NEXT: movapd %xmm13, %xmm11
; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm10[0],xmm11[1]
; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm3[0]
; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm13[0],xmm3[1]
; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movapd %xmm12, %xmm13
; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm7[0],xmm13[1]
; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm4[0]
; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm12[0],xmm4[1]
; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movapd 144(%rdi), %xmm13
; SSE-NEXT: movapd 160(%rdi), %xmm1
; SSE-NEXT: movapd %xmm1, %xmm8
; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm13[0],xmm8[1]
; SSE-NEXT: movapd 176(%rdi), %xmm6
; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm6[0]
; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1]
; SSE-NEXT: movapd 48(%rdi), %xmm1
; SSE-NEXT: movapd 64(%rdi), %xmm4
; SSE-NEXT: movapd %xmm4, %xmm3
; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE-NEXT: movapd 80(%rdi), %xmm2
; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1]
; SSE-NEXT: movapd (%rdi), %xmm4
; SSE-NEXT: movapd 16(%rdi), %xmm7
; SSE-NEXT: movapd %xmm7, %xmm5
; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
; SSE-NEXT: movapd 32(%rdi), %xmm0
; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm0[0]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
; SSE-NEXT: movapd %xmm12, 80(%rsi)
; SSE-NEXT: movapd %xmm3, 16(%rsi)
; SSE-NEXT: movapd %xmm14, 64(%rsi)
; SSE-NEXT: movapd %xmm5, (%rsi)
; SSE-NEXT: movapd %xmm15, 112(%rsi)
; SSE-NEXT: movapd %xmm8, 48(%rsi)
; SSE-NEXT: movapd 336(%rdi), %xmm12
; SSE-NEXT: movapd 352(%rdi), %xmm2
; SSE-NEXT: movapd %xmm2, %xmm7
; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm12[0],xmm7[1]
; SSE-NEXT: movapd 368(%rdi), %xmm4
; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm4[0]
; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1]
; SSE-NEXT: movapd 288(%rdi), %xmm2
; SSE-NEXT: movapd 304(%rdi), %xmm5
; SSE-NEXT: movapd %xmm5, %xmm3
; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1]
; SSE-NEXT: movapd 320(%rdi), %xmm0
; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
; SSE-NEXT: movapd (%rdi), %xmm5
; SSE-NEXT: movapd 16(%rdi), %xmm8
; SSE-NEXT: movapd %xmm8, %xmm6
; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1]
; SSE-NEXT: movapd 32(%rdi), %xmm1
; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm1[0]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1]
; SSE-NEXT: movapd %xmm3, 96(%rsi)
; SSE-NEXT: movapd %xmm14, 32(%rsi)
; SSE-NEXT: movapd %xmm7, 112(%rsi)
; SSE-NEXT: movapd %xmm15, 48(%rsi)
; SSE-NEXT: movapd %xmm13, 64(%rsi)
; SSE-NEXT: movapd %xmm6, (%rsi)
; SSE-NEXT: movapd %xmm11, 80(%rsi)
; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload
; SSE-NEXT: movaps %xmm3, 96(%rsi)
; SSE-NEXT: movapd %xmm10, 32(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT: movaps %xmm3, 80(%rdx)
; SSE-NEXT: movapd %xmm1, 16(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 64(%rdx)
; SSE-NEXT: movapd %xmm4, (%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 112(%rdx)
; SSE-NEXT: movapd %xmm13, 48(%rdx)
; SSE-NEXT: movapd %xmm11, 96(%rdx)
; SSE-NEXT: movapd %xmm9, 32(%rdx)
; SSE-NEXT: movapd %xmm2, 16(%rcx)
; SSE-NEXT: movapd %xmm0, (%rcx)
; SSE-NEXT: movapd %xmm6, 48(%rcx)
; SSE-NEXT: movaps %xmm3, 16(%rsi)
; SSE-NEXT: movapd %xmm2, 96(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movaps %xmm2, 32(%rdx)
; SSE-NEXT: movapd %xmm12, 112(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movaps %xmm2, 48(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE-NEXT: movaps %xmm2, 64(%rdx)
; SSE-NEXT: movapd %xmm5, (%rdx)
; SSE-NEXT: movapd %xmm10, 80(%rdx)
; SSE-NEXT: movapd %xmm9, 16(%rdx)
; SSE-NEXT: movapd %xmm0, 96(%rcx)
; SSE-NEXT: movapd %xmm4, 112(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 32(%rcx)
; SSE-NEXT: movaps %xmm0, 64(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 80(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 64(%rcx)
; SSE-NEXT: movaps %xmm0, 32(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 112(%rcx)
; SSE-NEXT: movaps %xmm0, 48(%rcx)
; SSE-NEXT: movapd %xmm1, (%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 96(%rcx)
; SSE-NEXT: movaps %xmm0, 16(%rcx)
; SSE-NEXT: addq $24, %rsp
; SSE-NEXT: retq
;