
Conversation

@GrumpyPigSkin
Contributor

Added a check to MatchLoadCombine to combine two i32 loads into an i64 load + rotate.

Closes #167314

llvmbot added the backend:X86 and llvm:SelectionDAG SelectionDAGISel as well labels on Nov 10, 2025
@llvmbot
Member

llvmbot commented Nov 10, 2025

@llvm/pr-subscribers-llvm-selectiondag

@llvm/pr-subscribers-backend-x86

Author: None (GrumpyPigSkin)

Changes

Added a check to MatchLoadCombine to combine two i32 loads into an i64 load + rotate.

Closes #167314
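
To illustrate the transform (a minimal sketch, not taken from the patch; the function names are made up and a little-endian target is assumed): building a 64-bit value from two adjacent 32-bit loads with the words swapped yields the same result as a single 64-bit load followed by a rotate by 32 bits.

```cpp
#include <cstdint>
#include <cstring>

// The pattern before the combine: two i32 loads, with the value from the
// lower address placed in the upper half of the result.
uint64_t swappedWordsBefore(const uint32_t *P) {
  uint64_t Lo = P[0];
  uint64_t Hi = P[1];
  return (Lo << 32) | Hi;
}

// What the combine produces: one i64 load plus a rotate right by 32. On a
// little-endian target this returns the same value as the function above.
uint64_t swappedWordsAfter(const uint32_t *P) {
  uint64_t V;
  std::memcpy(&V, P, sizeof(V)); // single 64-bit load
  return (V >> 32) | (V << 32);  // rotr(V, 32)
}
```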


Full diff: https://github.com/llvm/llvm-project/pull/167416.diff

2 Files Affected:

  • (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+64-13)
  • (added) llvm/test/CodeGen/X86/dagcombine-bswap-to-rotate.ll (+22)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4f2eb1e64dbe0..4ae239bc58ef4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9772,12 +9772,40 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
                           MemVT))
     return SDValue();
 
+  auto IsRotateLoaded = [](ArrayRef<int64_t> ByteOffsets, int64_t FirstOffset,
+                           unsigned BitWidth) {
+    // Ensure that we have the correct width type; we want to combine two
+    // 32-bit loads into a 64-bit load.
+    if (BitWidth != 64 || ByteOffsets.size() != 8)
+      return false;
+
+    constexpr unsigned FourBytes = 4;
+
+    for (unsigned i = 0; i < FourBytes; ++i) {
+      // Check the lower 4 bytes come from the higher memory address.
+      if (ByteOffsets[i] != FirstOffset + i + FourBytes)
+        return false;
+      // Check the higher 4 bytes come from the lower memory address.
+      if (ByteOffsets[i + FourBytes] != FirstOffset + i)
+        return false;
+    }
+    return true;
+  };
+
   // Check if the bytes of the OR we are looking at match with either big or
   // little endian value load
   std::optional<bool> IsBigEndian = isBigEndian(
       ArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset);
-  if (!IsBigEndian)
-    return SDValue();
+
+  bool IsRotated = false;
+  if (!IsBigEndian) {
+    IsRotated =
+        IsRotateLoaded(ArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes),
+                       FirstOffset, VT.getSizeInBits());
+
+    if (!IsRotated)
+      return SDValue();
+  }
 
   assert(FirstByteProvider && "must be set");
 
@@ -9791,8 +9819,9 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
   // replace it with a single (possibly zero-extended) load and bswap + shift if
   // needed.
 
-  // If the load needs byte swap check if the target supports it
-  bool NeedsBswap = IsBigEndianTarget != *IsBigEndian;
+  // If the load needs a byte swap, check if the target supports it; make sure
+  // that we are not rotating.
+  bool NeedsBswap = !IsRotated && (IsBigEndianTarget != *IsBigEndian);
 
   // Before legalize we can introduce illegal bswaps which will be later
   // converted to an explicit bswap sequence. This way we end up with a single
@@ -9803,8 +9832,12 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
       !TLI.isOperationLegal(ISD::BSWAP, VT))
     return SDValue();
 
-  // If we need to bswap and zero extend, we have to insert a shift. Check that
-  // it is legal.
+  // If we need to rotate, make sure that is legal.
+  if (IsRotated && LegalOperations && !TLI.isOperationLegal(ISD::ROTR, VT))
+    return SDValue();
+
+  // If we need to bswap and zero extend, we have to insert a shift. Check
+  // that it is legal.
   if (NeedsBswap && NeedsZext && LegalOperations &&
       !TLI.isOperationLegal(ISD::SHL, VT))
     return SDValue();
@@ -9826,15 +9859,33 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
   for (LoadSDNode *L : Loads)
     DAG.makeEquivalentMemoryOrdering(L, NewLoad);
 
-  if (!NeedsBswap)
+  // If no transform is needed then return the new load.
+  if (!NeedsBswap && !IsRotated)
     return NewLoad;
 
-  SDValue ShiftedLoad =
-      NeedsZext ? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad,
-                              DAG.getShiftAmountConstant(ZeroExtendedBytes * 8,
-                                                         VT, SDLoc(N)))
-                : NewLoad;
-  return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad);
+  // If we detect the need to BSWAP, build the new node and return it.
+  if (NeedsBswap) {
+    SDValue ShiftedLoad =
+        NeedsZext ? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad,
+                                DAG.getShiftAmountConstant(
+                                    ZeroExtendedBytes * 8, VT, SDLoc(N)))
+                  : NewLoad;
+    return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad);
+  }
+
+  // If we detect we need to rotate, build the new ROTR node.
+  if (IsRotated) {
+    // The rotate amount is half the type width, i.e. 32 bits for an i64.
+    unsigned RotateAmount = VT.getSizeInBits() / 2;
+
+    EVT ShiftAmountTy =
+        TLI.getShiftAmountTy(NewLoad.getValueType(), DAG.getDataLayout());
+
+    return DAG.getNode(ISD::ROTR, SDLoc(N), VT, NewLoad,
+                       DAG.getConstant(RotateAmount, SDLoc(N), ShiftAmountTy));
+  }
+
+  llvm_unreachable("Should have returned a transformed load value");
 }
 
 // If the target has andn, bsl, or a similar bit-select instruction,
diff --git a/llvm/test/CodeGen/X86/dagcombine-bswap-to-rotate.ll b/llvm/test/CodeGen/X86/dagcombine-bswap-to-rotate.ll
new file mode 100644
index 0000000000000..b08f2fbbc56e4
--- /dev/null
+++ b/llvm/test/CodeGen/X86/dagcombine-bswap-to-rotate.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+; This test checks that a pattern of two 32-bit loads, which are combined
+; to form a 64-bit value with swapped words, is optimized into a single
+; 64-bit load followed by a 32-bit rotate.
+
+define i64 @test_load_bswap_to_rotate(ptr %p) {
+; CHECK-LABEL: test_load_bswap_to_rotate:
+; CHECK:  # %bb.0:
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    rorq $32, %rax
+; CHECK-NEXT:    retq
+
+  %p.hi = getelementptr inbounds nuw i8, ptr %p, i64 4
+  %lo = load i32, ptr %p
+  %hi = load i32, ptr %p.hi
+  %conv = zext i32 %lo to i64
+  %shl = shl nuw i64 %conv, 32
+  %conv2 = zext i32 %hi to i64
+  %or = or disjoint i64 %shl, %conv2
+  ret i64 %or
+}
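
As a side note on the byte-offset check: for an 8-byte result with FirstOffset = 0, the new IsRotateLoaded predicate in the diff above accepts exactly the layout {4, 5, 6, 7, 0, 1, 2, 3}, i.e. the low half of the result is provided by the higher memory address. A standalone mirror of that check, illustrative only and not part of the patch:

```cpp
#include <array>
#include <cassert>
#include <cstdint>

// Standalone mirror of the IsRotateLoaded lambda: ByteOffsets[I] is the memory
// offset providing byte I of the combined value (byte 0 = least significant).
static bool isRotateLoaded(const std::array<int64_t, 8> &ByteOffsets,
                           int64_t FirstOffset) {
  for (unsigned I = 0; I < 4; ++I) {
    // The low 4 result bytes must come from the higher memory address ...
    if (ByteOffsets[I] != FirstOffset + I + 4)
      return false;
    // ... and the high 4 result bytes from the lower memory address.
    if (ByteOffsets[I + 4] != FirstOffset + I)
      return false;
  }
  return true;
}

int main() {
  // The layout produced by (zext(load p) << 32) | zext(load (p + 4)).
  std::array<int64_t, 8> Rotated = {4, 5, 6, 7, 0, 1, 2, 3};
  assert(isRotateLoaded(Rotated, /*FirstOffset=*/0));
  // A straight little-endian load is not a rotate candidate.
  std::array<int64_t, 8> Straight = {0, 1, 2, 3, 4, 5, 6, 7};
  assert(!isRotateLoaded(Straight, /*FirstOffset=*/0));
  return 0;
}
```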

@GrumpyPigSkin
Contributor Author

@RKSimon please review :)

RKSimon requested review from RKSimon and topperc on November 11, 2025 08:21
@github-actions

github-actions bot commented Nov 11, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

GrumpyPigSkin force-pushed the dag-matchloadcombine-match-swapped-loads branch from bb99659 to 55f06ae on November 11, 2025 22:23

Development

Successfully merging this pull request may close these issues.

[DAG] MatchLoadCombine - match swapped loads
