Skip to content

Conversation

@RKSimon
Copy link
Collaborator

@RKSimon RKSimon commented Nov 19, 2025

See if we can create a vector load from the src elements in reverse and then shuffle these back into place.

SLP will (usually) catch this in the middle-end, but there are a few BUILD_VECTOR scalarizations etc. that appear during DAG legalization.

I did start looking at a more general permute fold, but I haven't found any good test examples for this yet — happy to take another look if somebody has examples.

See if we can create a vector load from the src elements in reverse and then shuffle these back into place.

SLP will (usually) catch this in the middle-end, but there are a few BUILD_VECTOR scalarizations etc. that appear during DAG legalization.

I did start looking at a more general permute fold, but I haven't found any good test examples for this yet — happy to take another look if somebody has examples.
@llvmbot
Copy link
Member

llvmbot commented Nov 19, 2025

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

See if we can create a vector load from the src elements in reverse and then shuffle these back into place.

SLP will (usually) catch this in the middle-end, but there are a few BUILD_VECTOR scalarizations etc. that appear during DAG legalization.

I did start looking at a more general permute fold, but I haven't found any good test examples for this yet — happy to take another look if somebody has examples.


Patch is 45.19 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168706.diff

7 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+13)
  • (modified) llvm/test/CodeGen/X86/bitcnt-big-integer.ll (+8-12)
  • (modified) llvm/test/CodeGen/X86/build-vector-256.ll (+2-3)
  • (modified) llvm/test/CodeGen/X86/chain_order.ll (+1-2)
  • (modified) llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll (+48-216)
  • (modified) llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll (+75-154)
  • (modified) llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll (+14-113)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index aa9ba6b0e197c..661eace037de9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7557,6 +7557,19 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
     }
   }
 
+  // REVERSE - attempt to match the loads in reverse and then shuffle back.
+  // TODO: Do this for any permute or mismatching element counts.
+  if (Depth == 0 && !ZeroMask && TLI.isTypeLegal(VT) && VT.isVector() &&
+      NumElems == VT.getVectorNumElements()) {
+    SmallVector<SDValue, 4> ReverseElts(Elts.rbegin(), Elts.rend());
+    if (SDValue RevLd = EltsFromConsecutiveLoads(
+            VT, ReverseElts, DL, DAG, Subtarget, IsAfterLegalize, Depth + 1)) {
+      SmallVector<int, 16> ReverseMask(NumElems);
+      std::iota(ReverseMask.rbegin(), ReverseMask.rend(), 0);
+      return DAG.getVectorShuffle(VT, DL, RevLd, DAG.getUNDEF(VT), ReverseMask);
+    }
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
index 330c978d2a9f7..22c4ad28059e4 100644
--- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
@@ -844,13 +844,11 @@ define i32 @test_ctlz_i512(i512 %a0) nounwind {
 ; AVX512-NEXT:    vmovq %rcx, %xmm2
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT:    vmovq %r8, %xmm1
-; AVX512-NEXT:    vmovq %r9, %xmm2
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
+; AVX512-NEXT:    vmovq %r8, %xmm2
+; AVX512-NEXT:    vmovq %r9, %xmm3
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vplzcntq %zmm0, %zmm1
 ; AVX512-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
@@ -2071,13 +2069,11 @@ define i32 @test_ctlz_undef_i512(i512 %a0) nounwind {
 ; AVX512-NEXT:    vmovq %rcx, %xmm2
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT:    vmovq %r8, %xmm1
-; AVX512-NEXT:    vmovq %r9, %xmm2
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
+; AVX512-NEXT:    vmovq %r8, %xmm2
+; AVX512-NEXT:    vmovq %r9, %xmm3
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; AVX512-NEXT:    vplzcntq %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/build-vector-256.ll b/llvm/test/CodeGen/X86/build-vector-256.ll
index 3edb712e53c8d..773eb8f6742e5 100644
--- a/llvm/test/CodeGen/X86/build-vector-256.ll
+++ b/llvm/test/CodeGen/X86/build-vector-256.ll
@@ -417,9 +417,8 @@ define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
 define <4 x double> @test_buildvector_4f64_2_var(double %a0, double %a1) {
 ; AVX1-32-LABEL: test_buildvector_4f64_2_var:
 ; AVX1-32:       # %bb.0:
-; AVX1-32-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
-; AVX1-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX1-32-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX1-32-NEXT:    vmovupd {{[0-9]+}}(%esp), %xmm0
+; AVX1-32-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX1-32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-32-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/chain_order.ll b/llvm/test/CodeGen/X86/chain_order.ll
index 3ced27f12c72a..18faec5747abe 100644
--- a/llvm/test/CodeGen/X86/chain_order.ll
+++ b/llvm/test/CodeGen/X86/chain_order.ll
@@ -6,9 +6,8 @@ define void @cftx020(ptr nocapture %a) {
 ; CHECK-LABEL: cftx020:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; CHECK-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm1 = mem[1,0]
 ; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovupd (%rdi), %xmm1
 ; CHECK-NEXT:    vmovupd %xmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
index 26f076d450c15..b6aae486dc315 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -354,53 +354,23 @@ define <4 x float> @merge_4f32_f32_019u(ptr %ptr) nounwind uwtable noinline ssp
 }
 
 define <4 x float> @merge_v4f32_f32_3210(ptr %ptr) nounwind uwtable noinline ssp {
-; SSE2-LABEL: merge_v4f32_f32_3210:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: merge_v4f32_f32_3210:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; SSE41-NEXT:    retq
+; SSE-LABEL: merge_v4f32_f32_3210:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movups (%rdi), %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: merge_v4f32_f32_3210:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
 ; AVX-NEXT:    retq
 ;
-; X86-SSE1-LABEL: merge_v4f32_f32_3210:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE1-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-SSE1-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE41-LABEL: merge_v4f32_f32_3210:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X86-SSE41-NEXT:    retl
+; X86-SSE-LABEL: merge_v4f32_f32_3210:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movups (%eax), %xmm0
+; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X86-SSE-NEXT:    retl
   %ptr0 = getelementptr inbounds float, ptr %ptr, i64 3
   %ptr1 = getelementptr inbounds float, ptr %ptr, i64 2
   %ptr2 = getelementptr inbounds float, ptr %ptr, i64 1
@@ -788,31 +758,15 @@ define <4 x i32> @merge_4i32_i32_45zz_inc5(ptr %ptr) nounwind uwtable noinline s
 }
 
 define <4 x i32> @merge_v4i32_i32_3210(ptr %ptr) nounwind uwtable noinline ssp {
-; SSE2-LABEL: merge_v4i32_i32_3210:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: merge_v4i32_i32_3210:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT:    pinsrd $1, 8(%rdi), %xmm0
-; SSE41-NEXT:    pinsrd $2, 4(%rdi), %xmm0
-; SSE41-NEXT:    pinsrd $3, (%rdi), %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: merge_v4i32_i32_3210:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqu (%rdi), %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: merge_v4i32_i32_3210:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vpinsrd $1, 8(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrd $2, 4(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrd $3, (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
 ; AVX-NEXT:    retq
 ;
 ; X86-SSE1-LABEL: merge_v4i32_i32_3210:
@@ -842,10 +796,8 @@ define <4 x i32> @merge_v4i32_i32_3210(ptr %ptr) nounwind uwtable noinline ssp {
 ; X86-SSE41-LABEL: merge_v4i32_i32_3210:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE41-NEXT:    pinsrd $1, 8(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrd $2, 4(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrd $3, (%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
 ; X86-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 3
   %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 2
@@ -1003,55 +955,22 @@ define <8 x i16> @merge_8i16_i16_45u7zzzz(ptr %ptr) nounwind uwtable noinline ss
 define <8 x i16> @merge_8i16_i16_76543210(ptr %ptr) nounwind uwtable noinline ssp {
 ; SSE2-LABEL: merge_8i16_i16_76543210:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movzwl (%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzwl 2(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    movzwl 4(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzwl 6(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    movzwl 8(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzwl 10(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    movzwl 12(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    movzwl 14(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    movdqu (%rdi), %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: merge_8i16_i16_76543210:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movzwl 14(%rdi), %eax
-; SSE41-NEXT:    movd %eax, %xmm0
-; SSE41-NEXT:    pinsrw $1, 12(%rdi), %xmm0
-; SSE41-NEXT:    pinsrw $2, 10(%rdi), %xmm0
-; SSE41-NEXT:    pinsrw $3, 8(%rdi), %xmm0
-; SSE41-NEXT:    pinsrw $4, 6(%rdi), %xmm0
-; SSE41-NEXT:    pinsrw $5, 4(%rdi), %xmm0
-; SSE41-NEXT:    pinsrw $6, 2(%rdi), %xmm0
-; SSE41-NEXT:    pinsrw $7, (%rdi), %xmm0
+; SSE41-NEXT:    movdqu (%rdi), %xmm0
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: merge_8i16_i16_76543210:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    movzwl 14(%rdi), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vpinsrw $1, 12(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $2, 10(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $3, 8(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $4, 6(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $5, 4(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $6, 2(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $7, (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
 ; AVX-NEXT:    retq
 ;
 ; X86-SSE1-LABEL: merge_8i16_i16_76543210:
@@ -1107,15 +1026,8 @@ define <8 x i16> @merge_8i16_i16_76543210(ptr %ptr) nounwind uwtable noinline ss
 ; X86-SSE41-LABEL: merge_8i16_i16_76543210:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movzwl 14(%eax), %ecx
-; X86-SSE41-NEXT:    movd %ecx, %xmm0
-; X86-SSE41-NEXT:    pinsrw $1, 12(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrw $2, 10(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrw $3, 8(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrw $4, 6(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrw $5, 4(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrw $6, 2(%eax), %xmm0
-; X86-SSE41-NEXT:    pinsrw $7, (%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
 ; X86-SSE41-NEXT:    retl
   %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 7
   %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 6
@@ -1341,95 +1253,30 @@ define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(ptr %ptr) nounwind uwtable noin
 define <16 x i8> @merge_16i8_i8_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp {
 ; SSE2-LABEL: merge_16i8_i8_FEDCBA9876543210:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movzbl (%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl 1(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    movzbl 2(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl 3(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    movzbl 4(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl 5(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    movzbl 6(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl 7(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    movzbl 8(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl 9(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movzbl 10(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl 11(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-NEXT:    movzbl 12(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl 13(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movzbl 14(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    movzbl 15(%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movdqu (%rdi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: merge_16i8_i8_FEDCBA9876543210:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movzbl 15(%rdi), %eax
-; SSE41-NEXT:    movd %eax, %xmm0
-; SSE41-NEXT:    pinsrb $1, 14(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $2, 13(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $3, 12(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $4, 11(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $5, 10(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $6, 9(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $7, 8(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $8, 7(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $9, 6(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $10, 5(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $11, 4(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $12, 3(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $13, 2(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $14, 1(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $15, (%rdi), %xmm0
+; SSE41-NEXT:    movdqu (%rdi), %xmm0
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: merge_16i8_i8_FEDCBA9876543210:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    movzbl 15(%rdi), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vpinsrb $1, 14(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $2, 13(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $3, 12(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $4, 11(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $5, 10(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $6, 9(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $7, 8(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $8, 7(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $9, 6(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $10, 5(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $11, 4(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $12, 3(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $13, 2(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $14, 1(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $15, (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX-NEXT:    vpsh...
[truncated]

@github-actions
Copy link

github-actions bot commented Nov 19, 2025

🐧 Linux x64 Test Results

  • 186403 tests passed
  • 4864 tests skipped

Copy link
Contributor

@phoebewang phoebewang left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM.

@RKSimon RKSimon enabled auto-merge (squash) November 20, 2025 08:35
@RKSimon RKSimon merged commit 07a31ad into llvm:main Nov 20, 2025
9 of 10 checks passed
@RKSimon RKSimon deleted the x86-reverse-load branch November 20, 2025 09:16
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants