[X86] EltsFromConsecutiveLoads - recognise reverse load patterns. #168706
Conversation
See if we can create a vector load from the src elements in reverse and then shuffle these back into place. SLP will (usually) catch this in the middle-end, but there are a few BUILD_VECTOR scalarizations etc. that appear during DAG legalization. I did start looking at a more general permute fold, but I haven't found any good test examples for this yet - happy to take another look if somebody has examples.
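For illustration only (this function is an assumption, not one of the patch's tests), the fold targets code shaped like a reversed element-by-element gather from consecutive memory, when it survives to instruction selection as a scalarized BUILD_VECTOR:

// Hypothetical C++ sketch of a reversed consecutive-load pattern. SLP usually
// vectorizes this in the middle-end, so treat it as a sketch of the shape of
// code this fold targets, not as a test taken from the patch.
#include <array>

std::array<float, 4> load_reversed(const float *p) {
  // Elements are consecutive in memory but consumed back-to-front:
  // result[i] = p[3 - i].
  return {p[3], p[2], p[1], p[0]};
}

With this change, such a reversed pattern can be lowered to a single vector load plus a reverse shuffle instead of four scalar loads and inserts.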
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes:
See if we can create a vector load from the src elements in reverse and then shuffle these back into place. SLP will (usually) catch this in the middle-end, but there are a few BUILD_VECTOR scalarizations etc. that appear during DAG legalization. I did start looking at a more general permute fold, but I haven't found any good test examples for this yet - happy to take another look if somebody has examples.

Patch is 45.19 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168706.diff

7 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index aa9ba6b0e197c..661eace037de9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7557,6 +7557,19 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
}
}
+ // REVERSE - attempt to match the loads in reverse and then shuffle back.
+ // TODO: Do this for any permute or mismatching element counts.
+ if (Depth == 0 && !ZeroMask && TLI.isTypeLegal(VT) && VT.isVector() &&
+ NumElems == VT.getVectorNumElements()) {
+ SmallVector<SDValue, 4> ReverseElts(Elts.rbegin(), Elts.rend());
+ if (SDValue RevLd = EltsFromConsecutiveLoads(
+ VT, ReverseElts, DL, DAG, Subtarget, IsAfterLegalize, Depth + 1)) {
+ SmallVector<int, 16> ReverseMask(NumElems);
+ std::iota(ReverseMask.rbegin(), ReverseMask.rend(), 0);
+ return DAG.getVectorShuffle(VT, DL, RevLd, DAG.getUNDEF(VT), ReverseMask);
+ }
+ }
+
return SDValue();
}
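A side note on the mask construction in the hunk above: filling the mask through reverse iterators with std::iota produces the descending sequence {NumElems-1, ..., 1, 0}, which is exactly the shuffle that undoes the reversal. A minimal standalone sketch (plain C++, not LLVM code):

#include <iostream>
#include <numeric>
#include <vector>

int main() {
  constexpr int NumElems = 4;
  std::vector<int> ReverseMask(NumElems);
  // iota over reverse iterators writes 0 into the last slot, 1 into the
  // second-to-last, and so on, yielding {3, 2, 1, 0}.
  std::iota(ReverseMask.rbegin(), ReverseMask.rend(), 0);
  for (int M : ReverseMask)
    std::cout << M << ' ';   // prints: 3 2 1 0
  std::cout << '\n';
}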
diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
index 330c978d2a9f7..22c4ad28059e4 100644
--- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
@@ -844,13 +844,11 @@ define i32 @test_ctlz_i512(i512 %a0) nounwind {
; AVX512-NEXT: vmovq %rcx, %xmm2
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vmovq %r8, %xmm1
-; AVX512-NEXT: vmovq %r9, %xmm2
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
+; AVX512-NEXT: vmovq %r8, %xmm2
+; AVX512-NEXT: vmovq %r9, %xmm3
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT: vplzcntq %zmm0, %zmm1
; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
@@ -2071,13 +2069,11 @@ define i32 @test_ctlz_undef_i512(i512 %a0) nounwind {
; AVX512-NEXT: vmovq %rcx, %xmm2
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vmovq %r8, %xmm1
-; AVX512-NEXT: vmovq %r9, %xmm2
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
+; AVX512-NEXT: vmovq %r8, %xmm2
+; AVX512-NEXT: vmovq %r9, %xmm3
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512-NEXT: vplzcntq %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/build-vector-256.ll b/llvm/test/CodeGen/X86/build-vector-256.ll
index 3edb712e53c8d..773eb8f6742e5 100644
--- a/llvm/test/CodeGen/X86/build-vector-256.ll
+++ b/llvm/test/CodeGen/X86/build-vector-256.ll
@@ -417,9 +417,8 @@ define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
define <4 x double> @test_buildvector_4f64_2_var(double %a0, double %a1) {
; AVX1-32-LABEL: test_buildvector_4f64_2_var:
; AVX1-32: # %bb.0:
-; AVX1-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
-; AVX1-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX1-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX1-32-NEXT: vmovupd {{[0-9]+}}(%esp), %xmm0
+; AVX1-32-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-32-NEXT: retl
;
diff --git a/llvm/test/CodeGen/X86/chain_order.ll b/llvm/test/CodeGen/X86/chain_order.ll
index 3ced27f12c72a..18faec5747abe 100644
--- a/llvm/test/CodeGen/X86/chain_order.ll
+++ b/llvm/test/CodeGen/X86/chain_order.ll
@@ -6,9 +6,8 @@ define void @cftx020(ptr nocapture %a) {
; CHECK-LABEL: cftx020:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0]
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovupd (%rdi), %xmm1
; CHECK-NEXT: vmovupd %xmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
index 26f076d450c15..b6aae486dc315 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -354,53 +354,23 @@ define <4 x float> @merge_4f32_f32_019u(ptr %ptr) nounwind uwtable noinline ssp
}
define <4 x float> @merge_v4f32_f32_3210(ptr %ptr) nounwind uwtable noinline ssp {
-; SSE2-LABEL: merge_v4f32_f32_3210:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: merge_v4f32_f32_3210:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; SSE41-NEXT: retq
+; SSE-LABEL: merge_v4f32_f32_3210:
+; SSE: # %bb.0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT: retq
;
; AVX-LABEL: merge_v4f32_f32_3210:
; AVX: # %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
; AVX-NEXT: retq
;
-; X86-SSE1-LABEL: merge_v4f32_f32_3210:
-; X86-SSE1: # %bb.0:
-; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; X86-SSE1-NEXT: retl
-;
-; X86-SSE41-LABEL: merge_v4f32_f32_3210:
-; X86-SSE41: # %bb.0:
-; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X86-SSE41-NEXT: retl
+; X86-SSE-LABEL: merge_v4f32_f32_3210:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movups (%eax), %xmm0
+; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X86-SSE-NEXT: retl
%ptr0 = getelementptr inbounds float, ptr %ptr, i64 3
%ptr1 = getelementptr inbounds float, ptr %ptr, i64 2
%ptr2 = getelementptr inbounds float, ptr %ptr, i64 1
@@ -788,31 +758,15 @@ define <4 x i32> @merge_4i32_i32_45zz_inc5(ptr %ptr) nounwind uwtable noinline s
}
define <4 x i32> @merge_v4i32_i32_3210(ptr %ptr) nounwind uwtable noinline ssp {
-; SSE2-LABEL: merge_v4i32_i32_3210:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: merge_v4i32_i32_3210:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT: pinsrd $1, 8(%rdi), %xmm0
-; SSE41-NEXT: pinsrd $2, 4(%rdi), %xmm0
-; SSE41-NEXT: pinsrd $3, (%rdi), %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: merge_v4i32_i32_3210:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqu (%rdi), %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT: retq
;
; AVX-LABEL: merge_v4i32_i32_3210:
; AVX: # %bb.0:
-; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vpinsrd $1, 8(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrd $2, 4(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrd $3, (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
; AVX-NEXT: retq
;
; X86-SSE1-LABEL: merge_v4i32_i32_3210:
@@ -842,10 +796,8 @@ define <4 x i32> @merge_v4i32_i32_3210(ptr %ptr) nounwind uwtable noinline ssp {
; X86-SSE41-LABEL: merge_v4i32_i32_3210:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE41-NEXT: pinsrd $1, 8(%eax), %xmm0
-; X86-SSE41-NEXT: pinsrd $2, 4(%eax), %xmm0
-; X86-SSE41-NEXT: pinsrd $3, (%eax), %xmm0
+; X86-SSE41-NEXT: movdqu (%eax), %xmm0
+; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X86-SSE41-NEXT: retl
%ptr0 = getelementptr inbounds i32, ptr %ptr, i64 3
%ptr1 = getelementptr inbounds i32, ptr %ptr, i64 2
@@ -1003,55 +955,22 @@ define <8 x i16> @merge_8i16_i16_45u7zzzz(ptr %ptr) nounwind uwtable noinline ss
define <8 x i16> @merge_8i16_i16_76543210(ptr %ptr) nounwind uwtable noinline ssp {
; SSE2-LABEL: merge_8i16_i16_76543210:
; SSE2: # %bb.0:
-; SSE2-NEXT: movzwl (%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzwl 2(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movzwl 4(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzwl 6(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: movzwl 8(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzwl 10(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movzwl 12(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: movzwl 14(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: movdqu (%rdi), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: retq
;
; SSE41-LABEL: merge_8i16_i16_76543210:
; SSE41: # %bb.0:
-; SSE41-NEXT: movzwl 14(%rdi), %eax
-; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: pinsrw $1, 12(%rdi), %xmm0
-; SSE41-NEXT: pinsrw $2, 10(%rdi), %xmm0
-; SSE41-NEXT: pinsrw $3, 8(%rdi), %xmm0
-; SSE41-NEXT: pinsrw $4, 6(%rdi), %xmm0
-; SSE41-NEXT: pinsrw $5, 4(%rdi), %xmm0
-; SSE41-NEXT: pinsrw $6, 2(%rdi), %xmm0
-; SSE41-NEXT: pinsrw $7, (%rdi), %xmm0
+; SSE41-NEXT: movdqu (%rdi), %xmm0
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; SSE41-NEXT: retq
;
; AVX-LABEL: merge_8i16_i16_76543210:
; AVX: # %bb.0:
-; AVX-NEXT: movzwl 14(%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpinsrw $1, 12(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $2, 10(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $3, 8(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $4, 6(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $5, 4(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $6, 2(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $7, (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqu (%rdi), %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; AVX-NEXT: retq
;
; X86-SSE1-LABEL: merge_8i16_i16_76543210:
@@ -1107,15 +1026,8 @@ define <8 x i16> @merge_8i16_i16_76543210(ptr %ptr) nounwind uwtable noinline ss
; X86-SSE41-LABEL: merge_8i16_i16_76543210:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT: movzwl 14(%eax), %ecx
-; X86-SSE41-NEXT: movd %ecx, %xmm0
-; X86-SSE41-NEXT: pinsrw $1, 12(%eax), %xmm0
-; X86-SSE41-NEXT: pinsrw $2, 10(%eax), %xmm0
-; X86-SSE41-NEXT: pinsrw $3, 8(%eax), %xmm0
-; X86-SSE41-NEXT: pinsrw $4, 6(%eax), %xmm0
-; X86-SSE41-NEXT: pinsrw $5, 4(%eax), %xmm0
-; X86-SSE41-NEXT: pinsrw $6, 2(%eax), %xmm0
-; X86-SSE41-NEXT: pinsrw $7, (%eax), %xmm0
+; X86-SSE41-NEXT: movdqu (%eax), %xmm0
+; X86-SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; X86-SSE41-NEXT: retl
%ptr0 = getelementptr inbounds i16, ptr %ptr, i64 7
%ptr1 = getelementptr inbounds i16, ptr %ptr, i64 6
@@ -1341,95 +1253,30 @@ define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(ptr %ptr) nounwind uwtable noin
define <16 x i8> @merge_16i8_i8_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp {
; SSE2-LABEL: merge_16i8_i8_FEDCBA9876543210:
; SSE2: # %bb.0:
-; SSE2-NEXT: movzbl (%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzbl 1(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movzbl 2(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzbl 3(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT: movzbl 4(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzbl 5(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: movzbl 6(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzbl 7(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: movzbl 8(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzbl 9(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movzbl 10(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzbl 11(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-NEXT: movzbl 12(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzbl 13(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movzbl 14(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm4
-; SSE2-NEXT: movzbl 15(%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movdqu (%rdi), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: merge_16i8_i8_FEDCBA9876543210:
; SSE41: # %bb.0:
-; SSE41-NEXT: movzbl 15(%rdi), %eax
-; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: pinsrb $1, 14(%rdi), %xmm0
-; SSE41-NEXT: pinsrb $2, 13(%rdi), %xmm0
-; SSE41-NEXT: pinsrb $3, 12(%rdi), %xmm0
-; SSE41-NEXT: pinsrb $4, 11(%rdi), %xmm0
-; SSE41-NEXT: pinsrb $5, 10(%rdi), %xmm0
-; SSE41-NEXT: pinsrb $6, 9(%rdi), %xmm0
-; SSE41-NEXT: pinsrb $7, 8(%rdi), %xmm0
-; SSE41-NEXT: pinsrb $8, 7(%rdi), %xmm0
-; SSE41-NEXT: pinsrb $9, 6(%rdi), %xmm0
-; SSE41-NEXT: pinsrb $10, 5(%rdi), %xmm0
-; SSE41-NEXT: pinsrb $11, 4(%rdi), %xmm0
-; SSE41-NEXT: pinsrb $12, 3(%rdi), %xmm0
-; SSE41-NEXT: pinsrb $13, 2(%rdi), %xmm0
-; SSE41-NEXT: pinsrb $14, 1(%rdi), %xmm0
-; SSE41-NEXT: pinsrb $15, (%rdi), %xmm0
+; SSE41-NEXT: movdqu (%rdi), %xmm0
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: merge_16i8_i8_FEDCBA9876543210:
; AVX: # %bb.0:
-; AVX-NEXT: movzbl 15(%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpinsrb $1, 14(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $2, 13(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $3, 12(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $4, 11(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $5, 10(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $6, 9(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $7, 8(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $8, 7(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $9, 6(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $10, 5(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $11, 4(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $12, 3(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $13, 2(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $14, 1(%rdi), %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $15, (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqu (%rdi), %xmm0
+; AVX-NEXT: vpsh...
[truncated]
phoebewang left a comment:
LGTM.