Skip to content

Conversation

@ayank227
Copy link
Contributor

@ayank227 ayank227 commented Dec 2, 2025

This optimization splits i128 G_STORE(G_MERGE_VALUES(x, y)) into two i64 G_STOREs.

This optimization splits i128 G_STORE(G_MERGE_VALUES(x, y)) into two i64 G_STOREs.
@github-actions
Copy link

github-actions bot commented Dec 2, 2025

Thank you for submitting a Pull Request (PR) to the LLVM Project!

This PR will be automatically labeled and the relevant teams will be notified.

If you wish to, you can add reviewers by using the "Reviewers" section on this page.

If this is not working for you, it is probably because you do not have write permissions for the repository. In which case you can instead tag reviewers by name in a comment by using @ followed by their GitHub username.

If you have received no comments on your PR for a week, you can request a review by "ping"ing the PR by adding a comment “Ping”. The common courtesy "ping" rate is once a week. Please remember that you are asking for valuable time from other developers.

If you have further questions, they may be answered by the LLVM GitHub User Guide.

You can also ask questions in a comment on this PR, on the LLVM Discord or on the forums.

@llvmbot
Copy link
Member

llvmbot commented Dec 2, 2025

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-aarch64

Author: None (ayank227)

Changes

This optimization splits i128 G_STORE(G_MERGE_VALUES(x, y)) into two i64 G_STOREs.


Patch is 36.02 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/170276.diff

11 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64Combine.td (+10-1)
  • (modified) llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp (+63)
  • (modified) llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll (+15-45)
  • (modified) llvm/test/CodeGen/AArch64/GlobalISel/v8.4-atomic-128.ll (+11-31)
  • (modified) llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll (+11-15)
  • (modified) llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll (+5-13)
  • (modified) llvm/test/CodeGen/AArch64/dup.ll (+30-33)
  • (modified) llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll (+39-63)
  • (modified) llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll (+39-63)
  • (modified) llvm/test/CodeGen/AArch64/insertextract.ll (+19-23)
  • (modified) llvm/test/CodeGen/AArch64/store.ll (+2-5)
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 278314792bfb9..32955f3a6f952 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -280,6 +280,14 @@ def form_truncstore : GICombineRule<
   (apply [{ applyFormTruncstore(*${root}, MRI, B, Observer, ${matchinfo}); }])
 >;
 
+def split_store_128_matchdata : GIDefMatchData<"std::pair<Register, Register>">;
+def split_store_128 : GICombineRule<
+  (defs root:$root, split_store_128_matchdata:$matchinfo),
+  (match (G_STORE $src, $addr):$root,
+          [{ return matchSplitStore128(*${root}, MRI, ${matchinfo}); }]),
+  (apply [{ applySplitStore128(*${root}, MRI, B, Observer, ${matchinfo}); }])
+>;
+
 def fold_merge_to_zext : GICombineRule<
   (defs root:$d),
   (match (wip_match_opcode G_MERGE_VALUES):$d,
@@ -339,7 +347,8 @@ def AArch64PostLegalizerLowering
     : GICombiner<"AArch64PostLegalizerLoweringImpl",
                        [shuffle_vector_lowering, vashr_vlshr_imm,
                         icmp_lowering, build_vector_lowering,
-                        lower_vector_fcmp, form_truncstore, fconstant_to_constant,
+                        lower_vector_fcmp, form_truncstore, split_store_128,
+                        fconstant_to_constant,
                         vector_sext_inreg_to_shift,
                         unmerge_ext_to_unmerge, lower_mulv2s64,
                         vector_unmerge_lowering, insertelt_nonconst,
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 4fba593b3d0fb..7152558580763 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -1119,6 +1119,69 @@ void applyFormTruncstore(MachineInstr &MI, MachineRegisterInfo &MRI,
   Observer.changedInstr(MI);
 }
 
+/// Optimize i128 stores by splitting into two i64 stores for STP pairing
+bool matchSplitStore128(MachineInstr &MI, MachineRegisterInfo &MRI,
+                        std::pair<Register, Register> &Parts) {
+  assert(MI.getOpcode() == TargetOpcode::G_STORE);
+  GStore &Store = cast<GStore>(MI);
+
+  Register ValueReg = Store.getValueReg();
+  LLT ValueTy = MRI.getType(ValueReg);
+
+  // Only handle scalar types
+  if (!ValueTy.isScalar())
+    return false;
+
+  if (ValueTy.getSizeInBits() != 128)
+    return false;
+
+  // Check if the value comes from G_MERGE_VALUES
+  MachineInstr *DefMI = MRI.getVRegDef(ValueReg);
+  if (!DefMI || DefMI->getOpcode() != TargetOpcode::G_MERGE_VALUES)
+    return false;
+
+  // Get the two i64 parts
+  if (DefMI->getNumOperands() != 3) // Dst + 2 sources
+    return false;
+
+  Register Part0 = DefMI->getOperand(1).getReg();
+  Register Part1 = DefMI->getOperand(2).getReg();
+
+  if (MRI.getType(Part0) != LLT::scalar(64) ||
+      MRI.getType(Part1) != LLT::scalar(64))
+    return false;
+
+  Parts = {Part0, Part1};
+  return true;
+}
+
+void applySplitStore128(MachineInstr &MI, MachineRegisterInfo &MRI,
+                        MachineIRBuilder &B, GISelChangeObserver &Observer,
+                        std::pair<Register, Register> &Parts) {
+  assert(MI.getOpcode() == TargetOpcode::G_STORE);
+  GStore &Store = cast<GStore>(MI);
+
+  B.setInstrAndDebugLoc(MI);
+
+  Register PtrReg = Store.getPointerReg();
+  MachineMemOperand &MMO = Store.getMMO();
+
+  // Create two i64 stores
+  // Store low part at [ptr]
+  B.buildStore(Parts.first, PtrReg, MMO.getPointerInfo(), MMO.getAlign(),
+               MMO.getFlags());
+
+  // Calculate offset for high part: ptr + 8
+  auto Offset = B.buildConstant(LLT::scalar(64), 8);
+  auto PtrHi = B.buildPtrAdd(MRI.getType(PtrReg), PtrReg, Offset);
+
+  // Store high part at [ptr + 8]
+  B.buildStore(Parts.second, PtrHi, MMO.getPointerInfo().getWithOffset(8),
+               commonAlignment(MMO.getAlign(), 8), MMO.getFlags());
+
+  MI.eraseFromParent();
+}
+
 // Lower vector G_SEXT_INREG back to shifts for selection. We allowed them to
 // form in the first place for combine opportunities, so any remaining ones
 // at this stage need be lowered back.
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
index be51210882eaa..ac18249950e9e 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
@@ -27,9 +27,7 @@ define void @val_compare_and_swap(ptr %p, i128 %oldval, i128 %newval) {
 ; CHECK-LLSC-O1-NEXT:    stxp w10, x4, x5, [x0]
 ; CHECK-LLSC-O1-NEXT:    cbnz w10, .LBB0_1
 ; CHECK-LLSC-O1-NEXT:  .LBB0_4:
-; CHECK-LLSC-O1-NEXT:    mov v0.d[0], x8
-; CHECK-LLSC-O1-NEXT:    mov v0.d[1], x9
-; CHECK-LLSC-O1-NEXT:    str q0, [x0]
+; CHECK-LLSC-O1-NEXT:    stp x8, x9, [x0]
 ; CHECK-LLSC-O1-NEXT:    ret
 ;
 ; CHECK-OUTLINE-LLSC-O1-LABEL: val_compare_and_swap:
@@ -45,9 +43,7 @@ define void @val_compare_and_swap(ptr %p, i128 %oldval, i128 %newval) {
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x3, x5
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x4, x19
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    bl __aarch64_cas16_acq
-; CHECK-OUTLINE-LLSC-O1-NEXT:    mov v0.d[0], x0
-; CHECK-OUTLINE-LLSC-O1-NEXT:    mov v0.d[1], x1
-; CHECK-OUTLINE-LLSC-O1-NEXT:    str q0, [x19]
+; CHECK-OUTLINE-LLSC-O1-NEXT:    stp x0, x1, [x19]
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    ret
 ;
@@ -58,9 +54,7 @@ define void @val_compare_and_swap(ptr %p, i128 %oldval, i128 %newval) {
 ; CHECK-CAS-O1-NEXT:    // kill: def $x3 killed $x3 killed $x2_x3 def $x2_x3
 ; CHECK-CAS-O1-NEXT:    // kill: def $x5 killed $x5 killed $x4_x5 def $x4_x5
 ; CHECK-CAS-O1-NEXT:    caspa x2, x3, x4, x5, [x0]
-; CHECK-CAS-O1-NEXT:    mov v0.d[0], x2
-; CHECK-CAS-O1-NEXT:    mov v0.d[1], x3
-; CHECK-CAS-O1-NEXT:    str q0, [x0]
+; CHECK-CAS-O1-NEXT:    stp x2, x3, [x0]
 ; CHECK-CAS-O1-NEXT:    ret
 ;
 ; CHECK-LLSC-O0-LABEL: val_compare_and_swap:
@@ -154,9 +148,7 @@ define void @val_compare_and_swap_monotonic_seqcst(ptr %p, i128 %oldval, i128 %n
 ; CHECK-LLSC-O1-NEXT:    stlxp w10, x4, x5, [x0]
 ; CHECK-LLSC-O1-NEXT:    cbnz w10, .LBB1_1
 ; CHECK-LLSC-O1-NEXT:  .LBB1_4:
-; CHECK-LLSC-O1-NEXT:    mov v0.d[0], x8
-; CHECK-LLSC-O1-NEXT:    mov v0.d[1], x9
-; CHECK-LLSC-O1-NEXT:    str q0, [x0]
+; CHECK-LLSC-O1-NEXT:    stp x8, x9, [x0]
 ; CHECK-LLSC-O1-NEXT:    ret
 ;
 ; CHECK-OUTLINE-LLSC-O1-LABEL: val_compare_and_swap_monotonic_seqcst:
@@ -172,9 +164,7 @@ define void @val_compare_and_swap_monotonic_seqcst(ptr %p, i128 %oldval, i128 %n
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x3, x5
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x4, x19
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    bl __aarch64_cas16_acq_rel
-; CHECK-OUTLINE-LLSC-O1-NEXT:    mov v0.d[0], x0
-; CHECK-OUTLINE-LLSC-O1-NEXT:    mov v0.d[1], x1
-; CHECK-OUTLINE-LLSC-O1-NEXT:    str q0, [x19]
+; CHECK-OUTLINE-LLSC-O1-NEXT:    stp x0, x1, [x19]
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    ret
 ;
@@ -185,9 +175,7 @@ define void @val_compare_and_swap_monotonic_seqcst(ptr %p, i128 %oldval, i128 %n
 ; CHECK-CAS-O1-NEXT:    // kill: def $x3 killed $x3 killed $x2_x3 def $x2_x3
 ; CHECK-CAS-O1-NEXT:    // kill: def $x5 killed $x5 killed $x4_x5 def $x4_x5
 ; CHECK-CAS-O1-NEXT:    caspal x2, x3, x4, x5, [x0]
-; CHECK-CAS-O1-NEXT:    mov v0.d[0], x2
-; CHECK-CAS-O1-NEXT:    mov v0.d[1], x3
-; CHECK-CAS-O1-NEXT:    str q0, [x0]
+; CHECK-CAS-O1-NEXT:    stp x2, x3, [x0]
 ; CHECK-CAS-O1-NEXT:    ret
 ;
 ; CHECK-LLSC-O0-LABEL: val_compare_and_swap_monotonic_seqcst:
@@ -281,9 +269,7 @@ define void @val_compare_and_swap_release_acquire(ptr %p, i128 %oldval, i128 %ne
 ; CHECK-LLSC-O1-NEXT:    stlxp w10, x4, x5, [x0]
 ; CHECK-LLSC-O1-NEXT:    cbnz w10, .LBB2_1
 ; CHECK-LLSC-O1-NEXT:  .LBB2_4:
-; CHECK-LLSC-O1-NEXT:    mov v0.d[0], x8
-; CHECK-LLSC-O1-NEXT:    mov v0.d[1], x9
-; CHECK-LLSC-O1-NEXT:    str q0, [x0]
+; CHECK-LLSC-O1-NEXT:    stp x8, x9, [x0]
 ; CHECK-LLSC-O1-NEXT:    ret
 ;
 ; CHECK-OUTLINE-LLSC-O1-LABEL: val_compare_and_swap_release_acquire:
@@ -299,9 +285,7 @@ define void @val_compare_and_swap_release_acquire(ptr %p, i128 %oldval, i128 %ne
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x3, x5
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x4, x19
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    bl __aarch64_cas16_acq_rel
-; CHECK-OUTLINE-LLSC-O1-NEXT:    mov v0.d[0], x0
-; CHECK-OUTLINE-LLSC-O1-NEXT:    mov v0.d[1], x1
-; CHECK-OUTLINE-LLSC-O1-NEXT:    str q0, [x19]
+; CHECK-OUTLINE-LLSC-O1-NEXT:    stp x0, x1, [x19]
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    ret
 ;
@@ -312,9 +296,7 @@ define void @val_compare_and_swap_release_acquire(ptr %p, i128 %oldval, i128 %ne
 ; CHECK-CAS-O1-NEXT:    // kill: def $x3 killed $x3 killed $x2_x3 def $x2_x3
 ; CHECK-CAS-O1-NEXT:    // kill: def $x5 killed $x5 killed $x4_x5 def $x4_x5
 ; CHECK-CAS-O1-NEXT:    caspal x2, x3, x4, x5, [x0]
-; CHECK-CAS-O1-NEXT:    mov v0.d[0], x2
-; CHECK-CAS-O1-NEXT:    mov v0.d[1], x3
-; CHECK-CAS-O1-NEXT:    str q0, [x0]
+; CHECK-CAS-O1-NEXT:    stp x2, x3, [x0]
 ; CHECK-CAS-O1-NEXT:    ret
 ;
 ; CHECK-LLSC-O0-LABEL: val_compare_and_swap_release_acquire:
@@ -408,9 +390,7 @@ define void @val_compare_and_swap_monotonic(ptr %p, i128 %oldval, i128 %newval)
 ; CHECK-LLSC-O1-NEXT:    stlxp w10, x4, x5, [x0]
 ; CHECK-LLSC-O1-NEXT:    cbnz w10, .LBB3_1
 ; CHECK-LLSC-O1-NEXT:  .LBB3_4:
-; CHECK-LLSC-O1-NEXT:    mov v0.d[0], x8
-; CHECK-LLSC-O1-NEXT:    mov v0.d[1], x9
-; CHECK-LLSC-O1-NEXT:    str q0, [x0]
+; CHECK-LLSC-O1-NEXT:    stp x8, x9, [x0]
 ; CHECK-LLSC-O1-NEXT:    ret
 ;
 ; CHECK-OUTLINE-LLSC-O1-LABEL: val_compare_and_swap_monotonic:
@@ -426,9 +406,7 @@ define void @val_compare_and_swap_monotonic(ptr %p, i128 %oldval, i128 %newval)
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x3, x5
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    mov x4, x19
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    bl __aarch64_cas16_acq_rel
-; CHECK-OUTLINE-LLSC-O1-NEXT:    mov v0.d[0], x0
-; CHECK-OUTLINE-LLSC-O1-NEXT:    mov v0.d[1], x1
-; CHECK-OUTLINE-LLSC-O1-NEXT:    str q0, [x19]
+; CHECK-OUTLINE-LLSC-O1-NEXT:    stp x0, x1, [x19]
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    ret
 ;
@@ -439,9 +417,7 @@ define void @val_compare_and_swap_monotonic(ptr %p, i128 %oldval, i128 %newval)
 ; CHECK-CAS-O1-NEXT:    // kill: def $x3 killed $x3 killed $x2_x3 def $x2_x3
 ; CHECK-CAS-O1-NEXT:    // kill: def $x5 killed $x5 killed $x4_x5 def $x4_x5
 ; CHECK-CAS-O1-NEXT:    caspal x2, x3, x4, x5, [x0]
-; CHECK-CAS-O1-NEXT:    mov v0.d[0], x2
-; CHECK-CAS-O1-NEXT:    mov v0.d[1], x3
-; CHECK-CAS-O1-NEXT:    str q0, [x0]
+; CHECK-CAS-O1-NEXT:    stp x2, x3, [x0]
 ; CHECK-CAS-O1-NEXT:    ret
 ;
 ; CHECK-LLSC-O0-LABEL: val_compare_and_swap_monotonic:
@@ -525,9 +501,7 @@ define void @atomic_load_relaxed(i64, i64, ptr %p, ptr %p2) {
 ; CHECK-LLSC-O1-NEXT:    stxp w10, x9, x8, [x2]
 ; CHECK-LLSC-O1-NEXT:    cbnz w10, .LBB4_1
 ; CHECK-LLSC-O1-NEXT:  // %bb.2: // %atomicrmw.end
-; CHECK-LLSC-O1-NEXT:    mov v0.d[0], x9
-; CHECK-LLSC-O1-NEXT:    mov v0.d[1], x8
-; CHECK-LLSC-O1-NEXT:    str q0, [x3]
+; CHECK-LLSC-O1-NEXT:    stp x9, x8, [x3]
 ; CHECK-LLSC-O1-NEXT:    ret
 ;
 ; CHECK-OUTLINE-LLSC-O1-LABEL: atomic_load_relaxed:
@@ -538,9 +512,7 @@ define void @atomic_load_relaxed(i64, i64, ptr %p, ptr %p2) {
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    stxp w10, x9, x8, [x2]
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    cbnz w10, .LBB4_1
 ; CHECK-OUTLINE-LLSC-O1-NEXT:  // %bb.2: // %atomicrmw.end
-; CHECK-OUTLINE-LLSC-O1-NEXT:    mov v0.d[0], x9
-; CHECK-OUTLINE-LLSC-O1-NEXT:    mov v0.d[1], x8
-; CHECK-OUTLINE-LLSC-O1-NEXT:    str q0, [x3]
+; CHECK-OUTLINE-LLSC-O1-NEXT:    stp x9, x8, [x3]
 ; CHECK-OUTLINE-LLSC-O1-NEXT:    ret
 ;
 ; CHECK-CAS-O1-LABEL: atomic_load_relaxed:
@@ -548,9 +520,7 @@ define void @atomic_load_relaxed(i64, i64, ptr %p, ptr %p2) {
 ; CHECK-CAS-O1-NEXT:    mov x0, xzr
 ; CHECK-CAS-O1-NEXT:    mov x1, xzr
 ; CHECK-CAS-O1-NEXT:    casp x0, x1, x0, x1, [x2]
-; CHECK-CAS-O1-NEXT:    mov v0.d[0], x0
-; CHECK-CAS-O1-NEXT:    mov v0.d[1], x1
-; CHECK-CAS-O1-NEXT:    str q0, [x3]
+; CHECK-CAS-O1-NEXT:    stp x0, x1, [x3]
 ; CHECK-CAS-O1-NEXT:    ret
 ;
 ; CHECK-LLSC-O0-LABEL: atomic_load_relaxed:
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/v8.4-atomic-128.ll b/llvm/test/CodeGen/AArch64/GlobalISel/v8.4-atomic-128.ll
index fe7e24c2d8ba5..a4dcd0155a449 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/v8.4-atomic-128.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/v8.4-atomic-128.ll
@@ -5,56 +5,42 @@ define void @test_atomic_load(ptr %addr) {
 ; CHECK-LABEL: test_atomic_load:
 
 ; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
-; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
-; CHECK: mov v[[Q]].d[1], [[HI]]
-; CHECK: str q[[Q]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
   %res.0 = load atomic i128, ptr %addr monotonic, align 16
   store i128 %res.0, ptr %addr
 
 ; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
-; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
-; CHECK: mov v[[Q]].d[1], [[HI]]
-; CHECK: str q[[Q]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
   %res.1 = load atomic i128, ptr %addr unordered, align 16
   store i128 %res.1, ptr %addr
 
 ; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
-; CHECK: dmb ish
-; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
-; CHECK: mov v[[Q]].d[1], [[HI]]
-; CHECK: str q[[Q]], [x0]
+; CHECK: dmb ishld
+; CHECK: stp [[LO]], [[HI]], [x0]
   %res.2 = load atomic i128, ptr %addr acquire, align 16
   store i128 %res.2, ptr %addr
 
 ; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
 ; CHECK: dmb ish
-; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
-; CHECK: mov v[[Q]].d[1], [[HI]]
-; CHECK: str q[[Q]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
   %res.3 = load atomic i128, ptr %addr seq_cst, align 16
   store i128 %res.3, ptr %addr
 
 
 ; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #8]
-; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
-; CHECK: mov v[[Q]].d[1], [[HI]]
-; CHECK: str q[[Q]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
   %addr8.1 = getelementptr i8,  ptr %addr, i32 8
   %res.5 = load atomic i128, ptr %addr8.1 monotonic, align 16
   store i128 %res.5, ptr %addr
 
 ; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #504]
-; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
-; CHECK: mov v[[Q]].d[1], [[HI]]
-; CHECK: str q[[Q]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
   %addr8.2 = getelementptr i8,  ptr %addr, i32 504
   %res.6 = load atomic i128, ptr %addr8.2 monotonic, align 16
   store i128 %res.6, ptr %addr
 
 ; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #-512]
-; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
-; CHECK: mov v[[Q]].d[1], [[HI]]
-; CHECK: str q[[Q]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
   %addr8.3 = getelementptr i8,  ptr %addr, i32 -512
   %res.7 = load atomic i128, ptr %addr8.3 monotonic, align 16
   store i128 %res.7, ptr %addr
@@ -76,9 +62,7 @@ define void @test_nonfolded_load1(ptr %addr) {
 
 ; CHECK: add x[[ADDR:[0-9]+]], x0, #4
 ; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
-; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
-; CHECK: mov v[[Q]].d[1], [[HI]]
-; CHECK: str q[[Q]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
   %addr8.1 = getelementptr i8,  ptr %addr, i32 4
   %res.1 = load atomic i128, ptr %addr8.1 monotonic, align 16
   store i128 %res.1, ptr %addr
@@ -91,9 +75,7 @@ define void @test_nonfolded_load2(ptr %addr) {
 
 ; CHECK: add x[[ADDR:[0-9]+]], x0, #512
 ; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
-; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
-; CHECK: mov v[[Q]].d[1], [[HI]]
-; CHECK: str q[[Q]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
   %addr8.1 = getelementptr i8,  ptr %addr, i32 512
   %res.1 = load atomic i128, ptr %addr8.1 monotonic, align 16
   store i128 %res.1, ptr %addr
@@ -106,9 +88,7 @@ define void @test_nonfolded_load3(ptr %addr) {
 
 ; CHECK: sub x[[ADDR:[0-9]+]], x0, #520
 ; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
-; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
-; CHECK: mov v[[Q]].d[1], [[HI]]
-; CHECK: str q[[Q]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
   %addr8.1 = getelementptr i8,  ptr %addr, i32 -520
   %res.1 = load atomic i128, ptr %addr8.1 monotonic, align 16
   store i128 %res.1, ptr %addr
diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
index 63dcafed2320a..a188a1cfa7502 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
@@ -383,13 +383,11 @@ define i128 @gep4(ptr %p, i128 %a, i64 %b) {
 ; CHECK0-GISEL-LABEL: gep4:
 ; CHECK0-GISEL:       // %bb.0:
 ; CHECK0-GISEL-NEXT:    add x8, x0, x4, lsl #4
-; CHECK0-GISEL-NEXT:    mov v0.d[0], x2
-; CHECK0-GISEL-NEXT:    ldr q1, [x8]
-; CHECK0-GISEL-NEXT:    mov d2, v1.d[1]
-; CHECK0-GISEL-NEXT:    mov v0.d[1], x3
-; CHECK0-GISEL-NEXT:    fmov x0, d1
-; CHECK0-GISEL-NEXT:    fmov x1, d2
-; CHECK0-GISEL-NEXT:    str q0, [x8]
+; CHECK0-GISEL-NEXT:    ldr q0, [x8]
+; CHECK0-GISEL-NEXT:    stp x2, x3, [x8]
+; CHECK0-GISEL-NEXT:    mov d1, v0.d[1]
+; CHECK0-GISEL-NEXT:    fmov x0, d0
+; CHECK0-GISEL-NEXT:    fmov x1, d1
 ; CHECK0-GISEL-NEXT:    ret
 ;
 ; CHECK3-SDAG-LABEL: gep4:
@@ -401,14 +399,12 @@ define i128 @gep4(ptr %p, i128 %a, i64 %b) {
 ;
 ; CHECK3-GISEL-LABEL: gep4:
 ; CHECK3-GISEL:       // %bb.0:
-; CHECK3-GISEL-NEXT:    ldr q1, [x0, x4, lsl #4]
-; CHECK3-GISEL-NEXT:    mov v0.d[0], x2
-; CHECK3-GISEL-NEXT:    mov x8, x0
-; CHECK3-GISEL-NEXT:    mov d2, v1.d[1]
-; CHECK3-GISEL-NEXT:    fmov x0, d1
-; CHECK3-GISEL-NEXT:    mov v0.d[1], x3
-; CHECK3-GISEL-NEXT:    fmov x1, d2
-; CHECK3-GISEL-NEXT:    str q0, [x8, x4, lsl #4]
+; CHECK3-GISEL-NEXT:    ldr q0, [x0, x4, lsl #4]
+; CHECK3-GISEL-NEXT:    add x8, x0, x4, lsl #4
+; CHECK3-GISEL-NEXT:    mov d1, v0.d[1]
+; CHECK3-GISEL-NEXT:    fmov x0, d0
+; CHECK3-GISEL-NEXT:    stp x2, x3, [x8]
+; CHECK3-GISEL-NEXT:    fmov x1, d1
 ; CHECK3-GISEL-NEXT:    ret
   %g = getelementptr inbounds i128, ptr %p, i64 %b
   %l = load i128, ptr %g
diff --git a/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll b/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
index 0e1e15f9b6b91..cbcc6184182ae 100644
--- a/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
@@ -663,19 +663,11 @@ define void @testRightBad2x64(<2 x i64> %src1, <2 x i64> %src2, ptr %dest) nounw
 }
 
 define void @testLeftShouldNotCreateSLI1x128(<1 x i128> %src1, <1 x i128> %src2, ptr %dest) nounwind {
-; CHECK-SD-LABEL: testLeftShouldNotCreateSLI1x128:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    bfi x1, x2, #6, #58
-; CHECK-SD-NEXT:    stp x0, x1, [x4]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: testLeftShouldNotCreateSLI1x128:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov.d v0[0], x0
-; CHECK-GI-NEXT:    bfi x1, x2, #6, #58
-; CHECK-GI-NEXT:    mov.d v0[1], x1
-; CHECK-GI-NEXT:    str q0, [x4]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: testLeftShouldNotCreateSLI1x128:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bfi x1, x2, #6, #58
+; CHECK-NEXT:    stp x0, x1, [x4]
+; CHECK-NEXT:    ret
   %and.i = and <1 x i128> %src1, <i128 1180591620717411303423>
   %vshl_n = shl <1 x i128> %src2, <i128 70>
   %result = or <1 x i128> %and.i, %vshl_n
diff --git a/llvm/test/CodeGen/AArch64/dup.ll b/llvm/test/CodeGen/AArch64/dup.ll
index 6df6d76fb0592..6b0c62490a5db 100644
--- a/llvm/test/CodeGen/AArch64/dup.ll
+++ b/llvm/test/CodeGen/AArch64/dup.ll
@@ -1252,16 +1252,15 @@ define <2 x i128> @loaddup_str_v2i128(ptr %p) {
 ;
 ; CHECK-GI-LABEL: loaddup_str_v2i128:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr q1, [x0]
-; CHECK-GI-NEXT:    mov v0.d[0], xzr
+; CHECK-GI-NEXT:    ldr q0, [x0]
 ; CHECK-GI-NEXT:    mov x8, x0
-; CHECK-GI-NEXT:    mov d2, v1.d[1]
-; CHECK-GI-NEXT:    fmov x0, d1
-; CHECK-GI-NEXT:    fmov x2, d1
-; CHECK-GI-NEXT:    mov v0.d[1], xzr
-; CHECK-GI-NEXT:    fmov x1, d2
-; CHECK-GI-NEXT:    fmov x3, d2
-; CHECK-GI-NEXT:    str q0, [x8]
+; CHECK-GI-NEXT:    str xzr, [x0]
+; CHECK-GI-NEXT:    str xzr, [x8, #8]
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    fmov x2, d0
+; CHECK-GI-NEXT:    fmov x1, d1
+; CHECK-GI-NEXT:    fmov x3, d1
 ; CHECK-GI-NEXT:    ret
 entry:
   %a = load i128, ptr %p
@@ -1340,18 +1339,17 ...
[truncated]

@dc03-work dc03-work requested review from aemerson and arsenm December 5, 2025 10:53
@ayank227
Copy link
Contributor Author

ping @arsenm @davemgreen @aemerson

Copy link
Contributor

@cofibrant cofibrant left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks very sensible to me! Let's wait and see what the other reviewers have to say, though.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants