diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4f2eb1e64dbe0..7d324c031528d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9398,6 +9398,35 @@ static std::optional<bool> isBigEndian(ArrayRef<int64_t> ByteOffsets,
   return BigEndian;
 }
 
+// Determines if multiple bytes loaded into a register correspond to loading
+// a single, contiguous block of bytes from memory and then performing a
+// bitwise right rotation. Returns the rotation amount in bits or std::nullopt
+// if we can't match the pattern.
+static std::optional<int64_t> getRotationAmount(ArrayRef<int64_t> ByteOffsets,
+                                                int64_t FirstOffset) {
+  unsigned ByteWidth = ByteOffsets.size();
+  if (ByteWidth == 0)
+    return std::nullopt;
+
+  int64_t FirstByteActualOffset = ByteOffsets[0];
+  int64_t RotateAmtInBytes = FirstByteActualOffset - FirstOffset;
+
+  // Check that the rotation amount is valid.
+  if (RotateAmtInBytes < 0 || RotateAmtInBytes >= ByteWidth)
+    return std::nullopt;
+
+  // Make sure each of the loaded bytes follows the same rotational pattern.
+  for (unsigned I = 0; I < ByteWidth; ++I) {
+    int64_t ExpectedOffset = FirstOffset + ((I + RotateAmtInBytes) % ByteWidth);
+    if (ByteOffsets[I] != ExpectedOffset) {
+      return std::nullopt;
+    }
+  }
+
+  // Return the rotation amount in bits.
+  return RotateAmtInBytes * 8;
+}
+
 // Look through one layer of truncate or extend.
 static SDValue stripTruncAndExt(SDValue Value) {
   switch (Value.getOpcode()) {
@@ -9776,65 +9805,99 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
   // little endian value load
   std::optional<bool> IsBigEndian = isBigEndian(
       ArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset);
-  if (!IsBigEndian)
-    return SDValue();
-
-  assert(FirstByteProvider && "must be set");
+  // Handle the standard load combine.
+  if (IsBigEndian) {
+    bool NeedsBswap = IsBigEndianTarget != *IsBigEndian;
+
+    // Before legalize we can introduce illegal bswaps which will be later
+    // converted to an explicit bswap sequence. This way we end up with a single
+    // load and byte shuffling instead of several loads and byte shuffling.
+    // We do not introduce illegal bswaps when zero-extending as this tends to
+    // introduce too many arithmetic instructions.
+    if (NeedsBswap && (LegalOperations || NeedsZext) &&
+        !TLI.isOperationLegal(ISD::BSWAP, VT))
+      return SDValue();
 
-  // Ensure that the first byte is loaded from zero offset of the first load.
-  // So the combined value can be loaded from the first load address.
-  if (MemoryByteOffset(*FirstByteProvider) != 0)
-    return SDValue();
-
-  auto *FirstLoad = cast<LoadSDNode>(FirstByteProvider->Src.value());
+    // If we need to bswap and zero extend, we have to insert a shift. Check
+    // that it is legal.
+    if (NeedsBswap && NeedsZext && LegalOperations &&
+        !TLI.isOperationLegal(ISD::SHL, VT))
+      return SDValue();
 
-  // The node we are looking at matches with the pattern, check if we can
-  // replace it with a single (possibly zero-extended) load and bswap + shift if
-  // needed.
+    auto *FirstLoad = cast<LoadSDNode>(FirstByteProvider->Src.value());
+    if (MemoryByteOffset(*FirstByteProvider) != 0)
+      return SDValue();
 
-  // If the load needs byte swap check if the target supports it
-  bool NeedsBswap = IsBigEndianTarget != *IsBigEndian;
+    // Check that a load of the wide type is both allowed and fast on the target
+    unsigned Fast = 0;
+    if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
+                                *FirstLoad->getMemOperand(), &Fast) ||
+        !Fast)
+      return SDValue();
 
-  // Before legalize we can introduce illegal bswaps which will be later
-  // converted to an explicit bswap sequence. This way we end up with a single
-  // load and byte shuffling instead of several loads and byte shuffling.
-  // We do not introduce illegal bswaps when zero-extending as this tends to
-  // introduce too many arithmetic instructions.
-  if (NeedsBswap && (LegalOperations || NeedsZext) &&
-      !TLI.isOperationLegal(ISD::BSWAP, VT))
-    return SDValue();
+    SDValue NewLoad = DAG.getExtLoad(
+        NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, SDLoc(N), VT, Chain,
+        FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), MemVT,
+        FirstLoad->getAlign());
 
-  // If we need to bswap and zero extend, we have to insert a shift. Check that
-  // it is legal.
-  if (NeedsBswap && NeedsZext && LegalOperations &&
-      !TLI.isOperationLegal(ISD::SHL, VT))
-    return SDValue();
+    for (LoadSDNode *L : Loads)
+      DAG.makeEquivalentMemoryOrdering(L, NewLoad);
 
-  // Check that a load of the wide type is both allowed and fast on the target
-  unsigned Fast = 0;
-  bool Allowed =
-      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
-                             *FirstLoad->getMemOperand(), &Fast);
-  if (!Allowed || !Fast)
-    return SDValue();
+    // It is a simple combine.
+    if (!NeedsBswap)
+      return NewLoad;
 
-  SDValue NewLoad =
-      DAG.getExtLoad(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, SDLoc(N), VT,
-                     Chain, FirstLoad->getBasePtr(),
-                     FirstLoad->getPointerInfo(), MemVT, FirstLoad->getAlign());
+    // It is a BSWAP combine.
+    SDValue ShiftedLoad =
+        NeedsZext ? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad,
+                                DAG.getShiftAmountConstant(
+                                    ZeroExtendedBytes * 8, VT, SDLoc(N)))
+                  : NewLoad;
+    return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad);
+  }
 
-  // Transfer chain users from old loads to the new load.
-  for (LoadSDNode *L : Loads)
-    DAG.makeEquivalentMemoryOrdering(L, NewLoad);
+  // Handle the rotated load combine.
+  if (auto RotateAmt = getRotationAmount(
+          ArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset)) {
 
-  if (!NeedsBswap)
-    return NewLoad;
+    // Make sure we can rotate.
+    if (LegalOperations && !TLI.isOperationLegal(ISD::ROTR, VT))
+      return SDValue();
 
-  SDValue ShiftedLoad =
-      NeedsZext ? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad,
-                              DAG.getShiftAmountConstant(ZeroExtendedBytes * 8,
-                                                         VT, SDLoc(N)))
-                : NewLoad;
-  return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad);
+    auto *FirstLoad = cast<LoadSDNode>(FirstByteProvider->Src.value());
+    if (MemoryByteOffset(*FirstByteProvider) != 0)
+      return SDValue();
+
+    // Make sure the operation is legal and fast.
+    unsigned Fast = 0;
+    if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
+                                *FirstLoad->getMemOperand(), &Fast) ||
+        !Fast)
+      return SDValue();
+
+    // Create the new load, rotate and then zero extend after if we need to.
+    SDValue NewLoad =
+        DAG.getLoad(MemVT, SDLoc(N), Chain, FirstLoad->getBasePtr(),
+                    FirstLoad->getPointerInfo());
+
+    for (LoadSDNode *L : Loads)
+      DAG.makeEquivalentMemoryOrdering(L, NewLoad);
+
+    EVT ShiftAmountTy =
+        TLI.getShiftAmountTy(NewLoad.getValueType(), DAG.getDataLayout());
+    SDValue Rotated =
+        DAG.getNode(ISD::ROTR, SDLoc(N), MemVT, NewLoad,
+                    DAG.getConstant(*RotateAmt, SDLoc(N), ShiftAmountTy));
+
+    if (NeedsZext)
+      return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Rotated);
+
+    return Rotated;
+  }
+
+  // No pattern matched.
+  return SDValue();
 }
 
 // If the target has andn, bsl, or a similar bit-select instruction,
diff --git a/llvm/test/CodeGen/X86/load-combine.ll b/llvm/test/CodeGen/X86/load-combine.ll
index f21c07599d6f1..3fb8cfe3c81da 100644
--- a/llvm/test/CodeGen/X86/load-combine.ll
+++ b/llvm/test/CodeGen/X86/load-combine.ll
@@ -1314,3 +1314,66 @@ define i32 @pr80911_vector_load_multiuse(ptr %ptr, ptr %clobber) nounwind {
   %res = or i32 %e1.ext.shift, %e0.ext
   ret i32 %res
 }
+
+define i64 @test_load_bswap_to_rotate(ptr %p) {
+; CHECK-LABEL: test_load_bswap_to_rotate:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %edx
+; CHECK-NEXT:    movl 4(%eax), %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: test_load_bswap_to_rotate:
+; CHECK64:       # %bb.0:
+; CHECK64-NEXT:    movq (%rdi), %rax
+; CHECK64-NEXT:    rorq $32, %rax
+; CHECK64-NEXT:    retq
+
+  %p.hi = getelementptr inbounds nuw i8, ptr %p, i64 4
+  %lo = load i32, ptr %p
+  %hi = load i32, ptr %p.hi
+  %conv = zext i32 %lo to i64
+  %shl = shl nuw i64 %conv, 32
+  %conv2 = zext i32 %hi to i64
+  %or = or disjoint i64 %shl, %conv2
+  ret i64 %or
+}
+
+define i64 @test_load_rotate_zext(ptr %p) {
+; CHECK-LABEL: test_load_rotate_zext:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    rorl $8, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: test_load_rotate_zext:
+; CHECK64:       # %bb.0:
+; CHECK64-NEXT:    movl (%rdi), %eax
+; CHECK64-NEXT:    rorl $8, %eax
+; CHECK64-NEXT:    retq
+  %p1 = getelementptr inbounds i8, ptr %p, i64 1
+  %l1 = load i8, ptr %p1, align 1
+  %e1 = zext i8 %l1 to i64
+
+  %p2 = getelementptr inbounds i8, ptr %p, i64 2
+  %l2 = load i8, ptr %p2, align 1
+  %e2 = zext i8 %l2 to i64
+  %s2 = shl i64 %e2, 8
+
+  %p3 = getelementptr inbounds i8, ptr %p, i64 3
+  %l3 = load i8, ptr %p3, align 1
+  %e3 = zext i8 %l3 to i64
+  %s3 = shl i64 %e3, 16
+
+  %p0 = getelementptr inbounds i8, ptr %p, i64 0
+  %l0 = load i8, ptr %p0, align 1
+  %e0 = zext i8 %l0 to i64
+  %s0 = shl i64 %e0, 24
+
+  %or1 = or i64 %e1, %s2
+  %or2 = or i64 %or1, %s3
+  %or3 = or i64 %or2, %s0
+  ret i64 %or3
+}
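
Note for reviewers: below is a standalone sketch, outside of LLVM and not part
of the patch, of the byte-offset check that getRotationAmount performs.
rotationAmountBits and the driver are hypothetical names, and std::vector
stands in for ArrayRef; it assumes the combiner's little-endian numbering,
where ByteOffsets[I] is the memory offset that byte I (byte 0 = least
significant) of the combined value is loaded from. The worked example mirrors
test_load_bswap_to_rotate above.

// rotate_check.cpp -- build with: c++ -std=c++17 rotate_check.cpp
#include <cstdint>
#include <cstdio>
#include <optional>
#include <vector>

static std::optional<int64_t>
rotationAmountBits(const std::vector<int64_t> &ByteOffsets,
                   int64_t FirstOffset) {
  size_t ByteWidth = ByteOffsets.size();
  if (ByteWidth == 0)
    return std::nullopt;

  // Byte 0 of the value tells us how far the block was rotated right.
  int64_t RotateAmtInBytes = ByteOffsets[0] - FirstOffset;
  if (RotateAmtInBytes < 0 || RotateAmtInBytes >= (int64_t)ByteWidth)
    return std::nullopt;

  // Every byte must continue the same cyclic pattern.
  for (size_t I = 0; I < ByteWidth; ++I)
    if (ByteOffsets[I] !=
        FirstOffset + (int64_t)((I + RotateAmtInBytes) % ByteWidth))
      return std::nullopt;

  return RotateAmtInBytes * 8;
}

int main() {
  // Mirrors test_load_bswap_to_rotate: (zext(lo32) << 32) | zext(hi32) puts
  // memory bytes 4..7 into value bytes 0..3 and memory bytes 0..3 into value
  // bytes 4..7, i.e. the 8-byte block rotated right by 4 bytes.
  std::vector<int64_t> Offsets = {4, 5, 6, 7, 0, 1, 2, 3};
  if (auto Amt = rotationAmountBits(Offsets, 0))
    std::printf("rotr amount: %lld bits\n", (long long)*Amt); // prints 32
  return 0;
}

The first byte pins the candidate amount because rotr by 8*r makes value byte
I come from memory offset FirstOffset + ((I + r) % ByteWidth); with I = 0 that
is FirstOffset + r, so r = ByteOffsets[0] - FirstOffset, and the loop then
only has to verify the remaining bytes against that single candidate.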