From fc95ffbd5cda9326366da42f5ed7e21036a09272 Mon Sep 17 00:00:00 2001 From: AZero13 Date: Sat, 25 Oct 2025 12:27:59 -0400 Subject: [PATCH 1/4] [ARM] Only change mask if demanded bits says we can optimize Also enable a switch to turn off enable-logical-imm. --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 99 +++++++++--- llvm/test/CodeGen/ARM/and-cmpz.ll | 2 +- llvm/test/CodeGen/ARM/fpenv.ll | 4 +- llvm/test/CodeGen/ARM/funnel-shift-rot.ll | 5 +- ...st-and-by-const-from-lshr-in-eqcmp-zero.ll | 56 ++++--- .../CodeGen/ARM/illegal-bitfield-loadstore.ll | 10 +- llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll | 84 ++++++---- llvm/test/CodeGen/ARM/sdiv-pow2-thumb-size.ll | 153 +++++++++++++----- .../CodeGen/ARM/simplifysetcc_narrow_load.ll | 4 +- .../CodeGen/ARM/urem-seteq-illegal-types.ll | 6 +- llvm/test/CodeGen/Thumb/bic_imm.ll | 2 +- llvm/test/CodeGen/Thumb2/active_lane_mask.ll | 2 +- 12 files changed, 286 insertions(+), 141 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 313ae3d68fb83..6c994f36c9833 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -118,6 +118,7 @@ using namespace llvm; #define DEBUG_TYPE "arm-isel" STATISTIC(NumTailCalls, "Number of tail calls"); +STATISTIC(NumOptimizedImms, "Number of times immediates were optimized"); STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); STATISTIC(NumConstpoolPromoted, @@ -142,6 +143,12 @@ static cl::opt ConstpoolPromotionMaxTotal( cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128)); +static cl::opt + EnableOptimizeLogicalImm("arm-enable-logical-imm", cl::Hidden, + cl::desc("Enable ARM logical imm instruction " + "optimization"), + cl::init(true)); + cl::opt MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), @@ -20138,6 +20145,16 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, } } +static bool isLegalLogicalImmediate(unsigned Imm, + const ARMSubtarget *Subtarget) { + if (!Subtarget->isThumb()) + return ARM_AM::getSOImmVal(Imm) != -1; + if (Subtarget->isThumb2()) + return ARM_AM::getT2SOImmVal(Imm) != -1; + // Thumb1 only has 8-bit unsigned immediate. + return Imm <= 255; +} + bool ARMTargetLowering::targetShrinkDemandedConstant( SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const { @@ -20146,8 +20163,7 @@ bool ARMTargetLowering::targetShrinkDemandedConstant( if (!TLO.LegalOps) return false; - // Only optimize AND for now. - if (Op.getOpcode() != ISD::AND) + if (!EnableOptimizeLogicalImm) return false; EVT VT = Op.getValueType(); @@ -20158,6 +20174,14 @@ bool ARMTargetLowering::targetShrinkDemandedConstant( assert(VT == MVT::i32 && "Unexpected integer type"); + // Exit early if we demand all bits. + if (DemandedBits.popcount() == 32) + return false; + + // Only optimize AND for now. + if (Op.getOpcode() != ISD::AND) + return false; + // Make sure the RHS really is a constant. ConstantSDNode *C = dyn_cast(Op.getOperand(1)); if (!C) @@ -20165,21 +20189,13 @@ bool ARMTargetLowering::targetShrinkDemandedConstant( unsigned Mask = C->getZExtValue(); + if (Mask == 0 || Mask == ~0U) + return false; + unsigned Demanded = DemandedBits.getZExtValue(); unsigned ShrunkMask = Mask & Demanded; unsigned ExpandedMask = Mask | ~Demanded; - // If the mask is all zeros, let the target-independent code replace the - // result with zero. - if (ShrunkMask == 0) - return false; - - // If the mask is all ones, erase the AND. (Currently, the target-independent - // code won't do this, so we have to do it explicitly to avoid an infinite - // loop in obscure cases.) - if (ExpandedMask == ~0U) - return TLO.CombineTo(Op, Op.getOperand(0)); - auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool { return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0; }; @@ -20192,30 +20208,61 @@ bool ARMTargetLowering::targetShrinkDemandedConstant( return TLO.CombineTo(Op, NewOp); }; - // Prefer uxtb mask. - if (IsLegalMask(0xFF)) - return UseMask(0xFF); + // If the mask is all zeros, let the target-independent code replace the + // result with zero. + if (ShrunkMask == 0) { + ++NumOptimizedImms; + return UseMask(ShrunkMask); + } - // Prefer uxth mask. - if (IsLegalMask(0xFFFF)) - return UseMask(0xFFFF); + // If the mask is all ones, erase the AND. (Currently, the target-independent + // code won't do this, so we have to do it explicitly to avoid an infinite + // loop in obscure cases.) + if (ExpandedMask == ~0U) { + ++NumOptimizedImms; + return UseMask(ExpandedMask); + } - // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2. - // FIXME: Prefer a contiguous sequence of bits for other optimizations. - if (ShrunkMask < 256) + // If thumb, check for uxth and uxtb masks first and foremost. + if (Subtarget->isThumb1Only() && Subtarget->hasV6Ops()) { + if (IsLegalMask(0xFF)) { + ++NumOptimizedImms; + return UseMask(0xFF); + } + + if (IsLegalMask(0xFFFF)) { + ++NumOptimizedImms; + return UseMask(0xFFFF); + } + } + + // Don't optimize if it is legal already. + if (isLegalLogicalImmediate(Mask, Subtarget)) + return false; + + if (isLegalLogicalImmediate(ShrunkMask, Subtarget)) { + ++NumOptimizedImms; return UseMask(ShrunkMask); + } - // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2. - // FIXME: Prefer a contiguous sequence of bits for other optimizations. - if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256) + // FIXME: The check for v6 is because this interferes with some ubfx + // optimizations + if (!Subtarget->hasV6Ops() && + isLegalLogicalImmediate(~ExpandedMask, Subtarget)) { + ++NumOptimizedImms; return UseMask(ExpandedMask); + } + + if ((~ExpandedMask) < 256) { + ++NumOptimizedImms; + return UseMask(ExpandedMask); + } // Potential improvements: // // We could try to recognize lsls+lsrs or lsrs+lsls pairs here. // We could try to prefer Thumb1 immediates which can be lowered to a // two-instruction sequence. - // We could try to recognize more legal ARM/Thumb2 immediates here. return false; } diff --git a/llvm/test/CodeGen/ARM/and-cmpz.ll b/llvm/test/CodeGen/ARM/and-cmpz.ll index 1f72307f12a68..30d8079d4e4ba 100644 --- a/llvm/test/CodeGen/ARM/and-cmpz.ll +++ b/llvm/test/CodeGen/ARM/and-cmpz.ll @@ -92,7 +92,7 @@ false: ; T1: uxth r0, r0 ; T1-NEXT: lsrs r0, r0, #9 ; T1-NEXT: bne -; T2: uxth r0, r0 +; T2: and r0, r0, #65024 ; T2-NEXT: movs r2, #0 ; T2-NEXT: cmp.w r2, r0, lsr #9 define void @i16_cmpz(i16 %x, ptr %foo) { diff --git a/llvm/test/CodeGen/ARM/fpenv.ll b/llvm/test/CodeGen/ARM/fpenv.ll index f5d87170d9153..57e264d97bc44 100644 --- a/llvm/test/CodeGen/ARM/fpenv.ll +++ b/llvm/test/CodeGen/ARM/fpenv.ll @@ -41,8 +41,8 @@ define void @func_05() { ; CHECK-LABEL: func_05: ; CHECK: @ %bb.0: ; CHECK-NEXT: vmrs r0, fpscr -; CHECK-NEXT: bic r0, r0, #12582912 ; CHECK-NEXT: orr r0, r0, #4194304 +; CHECK-NEXT: bic r0, r0, #8388608 ; CHECK-NEXT: vmsr fpscr, r0 ; CHECK-NEXT: mov pc, lr call void @llvm.set.rounding(i32 2) @@ -53,8 +53,8 @@ define void @func_06() { ; CHECK-LABEL: func_06: ; CHECK: @ %bb.0: ; CHECK-NEXT: vmrs r0, fpscr -; CHECK-NEXT: bic r0, r0, #12582912 ; CHECK-NEXT: orr r0, r0, #8388608 +; CHECK-NEXT: bic r0, r0, #4194304 ; CHECK-NEXT: vmsr fpscr, r0 ; CHECK-NEXT: mov pc, lr call void @llvm.set.rounding(i32 3) diff --git a/llvm/test/CodeGen/ARM/funnel-shift-rot.ll b/llvm/test/CodeGen/ARM/funnel-shift-rot.ll index a1b6847d623d0..6f34a5fd00314 100644 --- a/llvm/test/CodeGen/ARM/funnel-shift-rot.ll +++ b/llvm/test/CodeGen/ARM/funnel-shift-rot.ll @@ -19,7 +19,7 @@ declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) define i8 @rotl_i8_const_shift(i8 %x) { ; CHECK-LABEL: rotl_i8_const_shift: ; CHECK: @ %bb.0: -; CHECK-NEXT: uxtb r1, r0 +; CHECK-NEXT: and r1, r0, #224 ; CHECK-NEXT: lsl r0, r0, #3 ; CHECK-NEXT: orr r0, r0, r1, lsr #5 ; CHECK-NEXT: bx lr @@ -161,8 +161,7 @@ define <4 x i32> @rotl_v4i32_rotl_const_shift(<4 x i32> %x) { define i8 @rotr_i8_const_shift(i8 %x) { ; CHECK-LABEL: rotr_i8_const_shift: ; CHECK: @ %bb.0: -; CHECK-NEXT: uxtb r1, r0 -; CHECK-NEXT: lsr r1, r1, #3 +; CHECK-NEXT: ubfx r1, r0, #3, #5 ; CHECK-NEXT: orr r0, r1, r0, lsl #5 ; CHECK-NEXT: bx lr %f = call i8 @llvm.fshr.i8(i8 %x, i8 %x, i8 3) diff --git a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll index 7cc623fb0a616..a21ac8944d7ad 100644 --- a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll @@ -21,9 +21,9 @@ define i1 @scalar_i8_signbit_eq(i8 %x, i8 %y) nounwind { ; ARM-LABEL: scalar_i8_signbit_eq: ; ARM: @ %bb.0: ; ARM-NEXT: uxtb r1, r1 -; ARM-NEXT: lsl r0, r0, r1 +; ARM-NEXT: mov r2, #128 +; ARM-NEXT: and r0, r2, r0, lsl r1 ; ARM-NEXT: mov r1, #1 -; ARM-NEXT: uxtb r0, r0 ; ARM-NEXT: eor r0, r1, r0, lsr #7 ; ARM-NEXT: bx lr ; @@ -42,7 +42,7 @@ define i1 @scalar_i8_signbit_eq(i8 %x, i8 %y) nounwind { ; THUMB78-NEXT: uxtb r1, r1 ; THUMB78-NEXT: lsls r0, r1 ; THUMB78-NEXT: movs r1, #1 -; THUMB78-NEXT: uxtb r0, r0 +; THUMB78-NEXT: and r0, r0, #128 ; THUMB78-NEXT: eor.w r0, r1, r0, lsr #7 ; THUMB78-NEXT: bx lr %t0 = lshr i8 128, %y @@ -122,9 +122,9 @@ define i1 @scalar_i16_signbit_eq(i16 %x, i16 %y) nounwind { ; ARM-LABEL: scalar_i16_signbit_eq: ; ARM: @ %bb.0: ; ARM-NEXT: uxth r1, r1 -; ARM-NEXT: lsl r0, r0, r1 +; ARM-NEXT: mov r2, #32768 +; ARM-NEXT: and r0, r2, r0, lsl r1 ; ARM-NEXT: mov r1, #1 -; ARM-NEXT: uxth r0, r0 ; ARM-NEXT: eor r0, r1, r0, lsr #15 ; ARM-NEXT: bx lr ; @@ -144,7 +144,7 @@ define i1 @scalar_i16_signbit_eq(i16 %x, i16 %y) nounwind { ; THUMB78-NEXT: uxth r1, r1 ; THUMB78-NEXT: lsls r0, r1 ; THUMB78-NEXT: movs r1, #1 -; THUMB78-NEXT: uxth r0, r0 +; THUMB78-NEXT: and r0, r0, #32768 ; THUMB78-NEXT: eor.w r0, r1, r0, lsr #15 ; THUMB78-NEXT: bx lr %t0 = lshr i16 32768, %y @@ -862,21 +862,35 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi ;------------------------------------------------------------------------------; define i1 @scalar_i8_signbit_ne(i8 %x, i8 %y) nounwind { -; ARM-LABEL: scalar_i8_signbit_ne: -; ARM: @ %bb.0: -; ARM-NEXT: uxtb r1, r1 -; ARM-NEXT: lsl r0, r0, r1 -; ARM-NEXT: uxtb r0, r0 -; ARM-NEXT: lsr r0, r0, #7 -; ARM-NEXT: bx lr +; ARM6-LABEL: scalar_i8_signbit_ne: +; ARM6: @ %bb.0: +; ARM6-NEXT: uxtb r1, r1 +; ARM6-NEXT: mov r2, #128 +; ARM6-NEXT: and r0, r2, r0, lsl r1 +; ARM6-NEXT: lsr r0, r0, #7 +; ARM6-NEXT: bx lr ; -; THUMB-LABEL: scalar_i8_signbit_ne: -; THUMB: @ %bb.0: -; THUMB-NEXT: uxtb r1, r1 -; THUMB-NEXT: lsls r0, r1 -; THUMB-NEXT: uxtb r0, r0 -; THUMB-NEXT: lsrs r0, r0, #7 -; THUMB-NEXT: bx lr +; ARM78-LABEL: scalar_i8_signbit_ne: +; ARM78: @ %bb.0: +; ARM78-NEXT: uxtb r1, r1 +; ARM78-NEXT: lsl r0, r0, r1 +; ARM78-NEXT: ubfx r0, r0, #7, #1 +; ARM78-NEXT: bx lr +; +; THUMB6-LABEL: scalar_i8_signbit_ne: +; THUMB6: @ %bb.0: +; THUMB6-NEXT: uxtb r1, r1 +; THUMB6-NEXT: lsls r0, r1 +; THUMB6-NEXT: uxtb r0, r0 +; THUMB6-NEXT: lsrs r0, r0, #7 +; THUMB6-NEXT: bx lr +; +; THUMB78-LABEL: scalar_i8_signbit_ne: +; THUMB78: @ %bb.0: +; THUMB78-NEXT: uxtb r1, r1 +; THUMB78-NEXT: lsls r0, r1 +; THUMB78-NEXT: ubfx r0, r0, #7, #1 +; THUMB78-NEXT: bx lr %t0 = lshr i8 128, %y %t1 = and i8 %t0, %x %res = icmp ne i8 %t1, 0 ; we are perfectly happy with 'ne' predicate @@ -1051,3 +1065,5 @@ define i1 @scalar_i8_signbit_eq_with_nonzero(i8 %x, i8 %y) nounwind { %res = icmp eq i8 %t1, 1 ; should be comparing with 0 ret i1 %res } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; THUMB: {{.*}} diff --git a/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll index 5dbf8dd86b891..822bb89ecf22a 100644 --- a/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll +++ b/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll @@ -53,10 +53,8 @@ define void @i24_and_or(ptr %a) { define void @i24_insert_bit(ptr %a, i1 zeroext %bit) { ; LE-LABEL: i24_insert_bit: ; LE: @ %bb.0: -; LE-NEXT: mov r3, #255 ; LE-NEXT: ldrh r2, [r0] -; LE-NEXT: orr r3, r3, #57088 -; LE-NEXT: and r2, r2, r3 +; LE-NEXT: bic r2, r2, #8192 ; LE-NEXT: orr r1, r2, r1, lsl #13 ; LE-NEXT: strh r1, [r0] ; LE-NEXT: mov pc, lr @@ -64,8 +62,7 @@ define void @i24_insert_bit(ptr %a, i1 zeroext %bit) { ; BE-LABEL: i24_insert_bit: ; BE: @ %bb.0: ; BE-NEXT: ldrh r2, [r0] -; BE-NEXT: mov r3, #57088 -; BE-NEXT: orr r3, r3, #16711680 +; BE-NEXT: mvn r3, #8192 ; BE-NEXT: and r2, r3, r2, lsl #8 ; BE-NEXT: orr r1, r2, r1, lsl #13 ; BE-NEXT: lsr r1, r1, #8 @@ -144,8 +141,7 @@ define void @i56_insert_bit(ptr %a, i1 zeroext %bit) { ; BE-LABEL: i56_insert_bit: ; BE: @ %bb.0: ; BE-NEXT: ldrh r2, [r0, #4]! -; BE-NEXT: mov r3, #57088 -; BE-NEXT: orr r3, r3, #16711680 +; BE-NEXT: mvn r3, #8192 ; BE-NEXT: and r2, r3, r2, lsl #8 ; BE-NEXT: orr r1, r2, r1, lsl #13 ; BE-NEXT: lsr r1, r1, #8 diff --git a/llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll b/llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll index a9eda31e729e2..4aa8f1a1ae923 100644 --- a/llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll +++ b/llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll @@ -1,13 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -mtriple=armv7a -mattr=+hwdiv-arm %s -o - | FileCheck %s --check-prefixes=CHECK,DIV ; RUN: llc -mtriple=armv7a -mattr=-hwdiv-arm %s -o - | FileCheck %s --check-prefixes=CHECK,NODIV ; Check SREM define dso_local i32 @test_rem(i32 %F) local_unnamed_addr #0 { -; CHECK-LABEL: test_rem -; CHECK: asr r1, r0, #31 -; CHECK-NEXT: add r1, r0, r1, lsr #30 -; CHECK-NEXT: bic r1, r1, #3 -; CHECK-NEXT: sub r0, r0, r1 +; CHECK-LABEL: test_rem: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: asr r1, r0, #31 +; CHECK-NEXT: add r1, r0, r1, lsr #30 +; CHECK-NEXT: bic r1, r1, #3 +; CHECK-NEXT: sub r0, r0, r1 +; CHECK-NEXT: bx lr entry: %div = srem i32 %F, 4 @@ -16,18 +19,22 @@ entry: ; Try an i16 sdiv, with a small immediate. define dso_local signext i16 @f0(i16 signext %F) local_unnamed_addr #0 { -; CHECK-LABEL: f0 +; DIV-LABEL: f0: +; DIV: @ %bb.0: @ %entry +; DIV-NEXT: mov r1, #2 +; DIV-NEXT: sdiv r0, r0, r1 +; DIV-NEXT: sxth r0, r0 +; DIV-NEXT: bx lr +; +; NODIV-LABEL: f0: +; NODIV: @ %bb.0: @ %entry +; NODIV-NEXT: and r1, r0, #32768 +; NODIV-NEXT: add r0, r0, r1, lsr #15 +; NODIV-NEXT: sxth r0, r0 +; NODIV-NEXT: asr r0, r0, #1 +; NODIV-NEXT: bx lr -; DIV: mov r1, #2 -; DIV-NEXT: sdiv r0, r0, r1 -; DIV-NEXT: sxth r0, r0 -; DIV-NEXT: bx lr -; NODIV: uxth r1, r0 -; NODIV-NEXT: add r0, r0, r1, lsr #15 -; NODIV-NEXT: sxth r0, r0 -; NODIV-NEXT: asr r0, r0, #1 -; NODIV-NEXT: bx lr entry: %0 = sdiv i16 %F, 2 @@ -36,16 +43,20 @@ entry: ; Try an i32 sdiv, with a small immediate. define dso_local i32 @f1(i32 %F) local_unnamed_addr #0 { -; CHECK-LABEL: f1 +; DIV-LABEL: f1: +; DIV: @ %bb.0: @ %entry +; DIV-NEXT: mov r1, #4 +; DIV-NEXT: sdiv r0, r0, r1 +; DIV-NEXT: bx lr +; +; NODIV-LABEL: f1: +; NODIV: @ %bb.0: @ %entry +; NODIV-NEXT: asr r1, r0, #31 +; NODIV-NEXT: add r0, r0, r1, lsr #30 +; NODIV-NEXT: asr r0, r0, #2 +; NODIV-NEXT: bx lr -; DIV: mov r1, #4 -; DIV-NEXT: sdiv r0, r0, r1 -; DIV-NEXT: bx lr -; NODIV: asr r1, r0, #31 -; NODIV-NEXT: add r0, r0, r1, lsr #30 -; NODIV-NEXT: asr r0, r0, #2 -; NODIV-NEXT: bx lr entry: %div = sdiv i32 %F, 4 @@ -55,10 +66,18 @@ entry: ; Try a large power of 2 immediate, which should also be materialised with 1 ; move immediate instruction. define dso_local i32 @f2(i32 %F) local_unnamed_addr #0 { -; CHECK-LABEL: f2 -; DIV: mov r1, #131072 -; DIV-NEXT: sdiv r0, r0, r1 -; DIV-NEXT: bx lr +; DIV-LABEL: f2: +; DIV: @ %bb.0: @ %entry +; DIV-NEXT: mov r1, #131072 +; DIV-NEXT: sdiv r0, r0, r1 +; DIV-NEXT: bx lr +; +; NODIV-LABEL: f2: +; NODIV: @ %bb.0: @ %entry +; NODIV-NEXT: asr r1, r0, #31 +; NODIV-NEXT: add r0, r0, r1, lsr #15 +; NODIV-NEXT: asr r0, r0, #17 +; NODIV-NEXT: bx lr entry: %div = sdiv i32 %F, 131072 ret i32 %div @@ -66,11 +85,12 @@ entry: ; MinSize not set, so should expand to the faster but longer sequence. define dso_local i32 @f3(i32 %F) { -; CHECK-LABEL: f3 -; CHECK: asr r1, r0, #31 -; CHECK-NEXT: add r0, r0, r1, lsr #30 -; CHECK-NEXT: asr r0, r0, #2 -; CHECK-NEXT: bx lr +; CHECK-LABEL: f3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: asr r1, r0, #31 +; CHECK-NEXT: add r0, r0, r1, lsr #30 +; CHECK-NEXT: asr r0, r0, #2 +; CHECK-NEXT: bx lr entry: %div = sdiv i32 %F, 4 ret i32 %div diff --git a/llvm/test/CodeGen/ARM/sdiv-pow2-thumb-size.ll b/llvm/test/CodeGen/ARM/sdiv-pow2-thumb-size.ll index 4b0419577cdf0..7a93267fcc390 100644 --- a/llvm/test/CodeGen/ARM/sdiv-pow2-thumb-size.ll +++ b/llvm/test/CodeGen/ARM/sdiv-pow2-thumb-size.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefixes=CHECK,T2 ; RUN: llc -mtriple=thumbv8m.main %s -o - | FileCheck %s --check-prefixes=CHECK,T2 ; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefixes=CHECK,T1 @@ -13,11 +14,21 @@ ; Test sdiv i16 define dso_local signext i16 @f0(i16 signext %F) local_unnamed_addr #0 { -; CHECK-LABEL: f0 -; CHECK: movs r1, #2 -; CHECK-NEXT: sdiv r0, r0, r1 -; CHECK-NEXT: sxth r0, r0 -; CHECK-NEXT: bx lr +; CHECK-LABEL: f0: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movs r1, #2 +; CHECK-NEXT: sdiv r0, r0, r1 +; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: bx lr +; +; V6M-LABEL: f0: +; V6M: @ %bb.0: @ %entry +; V6M-NEXT: uxth r1, r0 +; V6M-NEXT: lsrs r1, r1, #15 +; V6M-NEXT: adds r0, r0, r1 +; V6M-NEXT: sxth r0, r0 +; V6M-NEXT: asrs r0, r0, #1 +; V6M-NEXT: bx lr entry: %0 = sdiv i16 %F, 2 @@ -26,10 +37,19 @@ entry: ; Same as above, but now with i32 define dso_local i32 @f1(i32 %F) local_unnamed_addr #0 { -; CHECK-LABEL: f1 -; CHECK: movs r1, #4 -; CHECK-NEXT: sdiv r0, r0, r1 -; CHECK-NEXT: bx lr +; CHECK-LABEL: f1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movs r1, #4 +; CHECK-NEXT: sdiv r0, r0, r1 +; CHECK-NEXT: bx lr +; +; V6M-LABEL: f1: +; V6M: @ %bb.0: @ %entry +; V6M-NEXT: asrs r1, r0, #31 +; V6M-NEXT: lsrs r1, r1, #30 +; V6M-NEXT: adds r0, r0, r1 +; V6M-NEXT: asrs r0, r0, #2 +; V6M-NEXT: bx lr entry: %div = sdiv i32 %F, 4 @@ -38,10 +58,18 @@ entry: ; The immediate is not a power of 2, so we expect a sdiv. define dso_local i32 @f2(i32 %F) local_unnamed_addr #0 { -; CHECK-LABEL: f2 -; CHECK: movs r1, #5 -; CHECK-NEXT: sdiv r0, r0, r1 -; CHECK-NEXT: bx lr +; CHECK-LABEL: f2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movs r1, #5 +; CHECK-NEXT: sdiv r0, r0, r1 +; CHECK-NEXT: bx lr +; +; V6M-LABEL: f2: +; V6M: @ %bb.0: @ %entry +; V6M-NEXT: push {r7, lr} +; V6M-NEXT: movs r1, #5 +; V6M-NEXT: bl __divsi3 +; V6M-NEXT: pop {r7, pc} entry: %div = sdiv i32 %F, 5 @@ -51,8 +79,28 @@ entry: ; Try a larger power of 2 immediate: immediates larger than ; 128 don't give any code size savings. define dso_local i32 @f3(i32 %F) local_unnamed_addr #0 { -; CHECK-LABEL: f3 -; CHECK-NOT: sdiv +; T2-LABEL: f3: +; T2: @ %bb.0: @ %entry +; T2-NEXT: asrs r1, r0, #31 +; T2-NEXT: add.w r0, r0, r1, lsr #24 +; T2-NEXT: asrs r0, r0, #8 +; T2-NEXT: bx lr +; +; T1-LABEL: f3: +; T1: @ %bb.0: @ %entry +; T1-NEXT: asrs r1, r0, #31 +; T1-NEXT: lsrs r1, r1, #24 +; T1-NEXT: adds r0, r0, r1 +; T1-NEXT: asrs r0, r0, #8 +; T1-NEXT: bx lr +; +; V6M-LABEL: f3: +; V6M: @ %bb.0: @ %entry +; V6M-NEXT: asrs r1, r0, #31 +; V6M-NEXT: lsrs r1, r1, #24 +; V6M-NEXT: adds r0, r0, r1 +; V6M-NEXT: asrs r0, r0, #8 +; V6M-NEXT: bx lr entry: %div = sdiv i32 %F, 256 ret i32 %div @@ -65,20 +113,32 @@ attributes #0 = { minsize norecurse nounwind optsize readnone } ; the sdiv to sdiv, but to the faster instruction sequence. define dso_local signext i16 @f4(i16 signext %F) { -; T2-LABEL: f4 -; T2: uxth r1, r0 -; T2-NEXT: add.w r0, r0, r1, lsr #15 -; T2-NEXT: sxth r0, r0 -; T2-NEXT: asrs r0, r0, #1 -; T2-NEXT: bx lr - -; T1-LABEL: f4 -; T1: uxth r1, r0 -; T1-NEXT: lsrs r1, r1, #15 -; T1-NEXT: adds r0, r0, r1 -; T1-NEXT: sxth r0, r0 -; T1-NEXT: asrs r0, r0, #1 -; T1-NEXT: bx lr +; T2-LABEL: f4: +; T2: @ %bb.0: @ %entry +; T2-NEXT: and r1, r0, #32768 +; T2-NEXT: add.w r0, r0, r1, lsr #15 +; T2-NEXT: sxth r0, r0 +; T2-NEXT: asrs r0, r0, #1 +; T2-NEXT: bx lr +; +; T1-LABEL: f4: +; T1: @ %bb.0: @ %entry +; T1-NEXT: uxth r1, r0 +; T1-NEXT: lsrs r1, r1, #15 +; T1-NEXT: adds r0, r0, r1 +; T1-NEXT: sxth r0, r0 +; T1-NEXT: asrs r0, r0, #1 +; T1-NEXT: bx lr +; +; V6M-LABEL: f4: +; V6M: @ %bb.0: @ %entry +; V6M-NEXT: uxth r1, r0 +; V6M-NEXT: lsrs r1, r1, #15 +; V6M-NEXT: adds r0, r0, r1 +; V6M-NEXT: sxth r0, r0 +; V6M-NEXT: asrs r0, r0, #1 +; V6M-NEXT: bx lr + entry: %0 = sdiv i16 %F, 2 @@ -86,18 +146,29 @@ entry: } define dso_local i32 @f5(i32 %F) { -; T2-LABEL: f5 -; T2: asrs r1, r0, #31 -; T2-NEXT: add.w r0, r0, r1, lsr #30 -; T2-NEXT: asrs r0, r0, #2 -; T2-NEXT: bx lr - -; T1-LABEL: f5 -; T1: asrs r1, r0, #31 -; T1-NEXT: lsrs r1, r1, #30 -; T1-NEXT: adds r0, r0, r1 -; T1-NEXT: asrs r0, r0, #2 -; T1-NEXT: bx lr +; T2-LABEL: f5: +; T2: @ %bb.0: @ %entry +; T2-NEXT: asrs r1, r0, #31 +; T2-NEXT: add.w r0, r0, r1, lsr #30 +; T2-NEXT: asrs r0, r0, #2 +; T2-NEXT: bx lr +; +; T1-LABEL: f5: +; T1: @ %bb.0: @ %entry +; T1-NEXT: asrs r1, r0, #31 +; T1-NEXT: lsrs r1, r1, #30 +; T1-NEXT: adds r0, r0, r1 +; T1-NEXT: asrs r0, r0, #2 +; T1-NEXT: bx lr +; +; V6M-LABEL: f5: +; V6M: @ %bb.0: @ %entry +; V6M-NEXT: asrs r1, r0, #31 +; V6M-NEXT: lsrs r1, r1, #30 +; V6M-NEXT: adds r0, r0, r1 +; V6M-NEXT: asrs r0, r0, #2 +; V6M-NEXT: bx lr + entry: %div = sdiv i32 %F, 4 diff --git a/llvm/test/CodeGen/ARM/simplifysetcc_narrow_load.ll b/llvm/test/CodeGen/ARM/simplifysetcc_narrow_load.ll index 838da59f9e412..a5f3822bfa1ae 100644 --- a/llvm/test/CodeGen/ARM/simplifysetcc_narrow_load.ll +++ b/llvm/test/CodeGen/ARM/simplifysetcc_narrow_load.ll @@ -24,9 +24,7 @@ define i1 @test_129_15_0(ptr %y) { ; CHECK-LE-LABEL: test_129_15_0: ; CHECK-LE: @ %bb.0: ; CHECK-LE-NEXT: ldrh r0, [r0] -; CHECK-LE-NEXT: mov r1, #255 -; CHECK-LE-NEXT: orr r1, r1, #32512 -; CHECK-LE-NEXT: ands r0, r0, r1 +; CHECK-LE-NEXT: bics r0, r0, #32768 ; CHECK-LE-NEXT: movne r0, #1 ; CHECK-LE-NEXT: mov pc, lr ; diff --git a/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll index b85cb3a4f191c..dae52a27d37f0 100644 --- a/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll @@ -90,7 +90,7 @@ define i1 @test_urem_even(i27 %X) nounwind { ; ARM5: @ %bb.0: ; ARM5-NEXT: ldr r1, .LCPI1_0 ; ARM5-NEXT: mul r2, r0, r1 -; ARM5-NEXT: bic r0, r2, #-134217727 +; ARM5-NEXT: bic r0, r2, #134217728 ; ARM5-NEXT: lsr r0, r0, #1 ; ARM5-NEXT: orr r0, r0, r2, lsl #26 ; ARM5-NEXT: ldr r2, .LCPI1_1 @@ -333,11 +333,9 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; ARM5-NEXT: mov r3, #171 ; ARM5-NEXT: orr r3, r3, #512 ; ARM5-NEXT: mul r12, r0, r3 -; ARM5-NEXT: mov r0, #1020 -; ARM5-NEXT: orr r0, r0, #1024 ; ARM5-NEXT: mov r3, #254 ; ARM5-NEXT: orr r3, r3, #1792 -; ARM5-NEXT: and r0, r12, r0 +; ARM5-NEXT: bic r0, r12, #2048 ; ARM5-NEXT: lsr r0, r0, #1 ; ARM5-NEXT: orr r0, r0, r12, lsl #10 ; ARM5-NEXT: sub r12, r1, #1 diff --git a/llvm/test/CodeGen/Thumb/bic_imm.ll b/llvm/test/CodeGen/Thumb/bic_imm.ll index 741b2cf8db2e3..a2fc448670f0c 100644 --- a/llvm/test/CodeGen/Thumb/bic_imm.ll +++ b/llvm/test/CodeGen/Thumb/bic_imm.ll @@ -82,7 +82,7 @@ define void @truncated_neg256(i16 %a, ptr %p) { ; ; CHECK-T2-LABEL: truncated_neg256: ; CHECK-T2: @ %bb.0: -; CHECK-T2-NEXT: bic r0, r0, #255 +; CHECK-T2-NEXT: and r0, r0, #65280 ; CHECK-T2-NEXT: strh r0, [r1] ; CHECK-T2-NEXT: bx lr %and = and i16 %a, -256 diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll index bcd92f81911b2..b75f1ff742bee 100644 --- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll +++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -283,7 +283,7 @@ define void @test_width2(ptr nocapture readnone %x, ptr nocapture %y, i8 zeroext ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: adds r0, r2, #1 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: bic r0, r0, #1 +; CHECK-NEXT: and r0, r0, #510 ; CHECK-NEXT: subs r0, #2 ; CHECK-NEXT: add.w r0, r3, r0, lsr #1 ; CHECK-NEXT: dls lr, r0 From 8e01b9b0864d8931edc6eb99d31255200d5db918 Mon Sep 17 00:00:00 2001 From: AZero13 Date: Mon, 27 Oct 2025 14:24:55 -0400 Subject: [PATCH 2/4] [SelectionDAG] Optimize BSWAP yet again once more --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 32 ++++++--- llvm/lib/Target/ARM/README.txt | 26 ------- .../CodeGen/ARM/load-combine-big-endian.ll | 71 +++++++++---------- llvm/test/CodeGen/ARM/load-combine.ll | 53 +++++++------- 4 files changed, 78 insertions(+), 104 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index da4e40953b39a..eccf9e1b45097 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9899,17 +9899,29 @@ SDValue TargetLowering::expandBSWAP(SDNode *N, SelectionDAG &DAG) const { // Use a rotate by 8. This can be further expanded if necessary. return DAG.getNode(ISD::ROTL, dl, VT, Op, DAG.getConstant(8, dl, SHVT)); case MVT::i32: - // This is meant for ARM speficially, which has ROTR but no ROTL. + // This is meant for ARM specifically, which has ROTR but no ROTL. if (isOperationLegalOrCustom(ISD::ROTR, VT)) { - SDValue Mask = DAG.getConstant(0x00FF00FF, dl, VT); - // (x & 0x00FF00FF) rotr 8 | (x rotl 8) & 0x00FF00FF - SDValue And = DAG.getNode(ISD::AND, dl, VT, Op, Mask); - SDValue Rotr = - DAG.getNode(ISD::ROTR, dl, VT, And, DAG.getConstant(8, dl, SHVT)); - SDValue Rotl = - DAG.getNode(ISD::ROTR, dl, VT, Op, DAG.getConstant(24, dl, SHVT)); - SDValue And2 = DAG.getNode(ISD::AND, dl, VT, Rotl, Mask); - return DAG.getNode(ISD::OR, dl, VT, Rotr, And2); + // ror rtmp, r0, #16 + SDValue Ror16 = + DAG.getNode(ISD::ROTR, dl, VT, Op, DAG.getConstant(16, dl, SHVT)); + // eor r1, r0, rtmp ; r1 = r0 ^ (r0 ror 16) + SDValue Xor1 = DAG.getNode(ISD::XOR, dl, VT, Op, Ror16); + + // bic r1, r1, #0xff0000 (clear bits 16-23) + // So we need the negated value: ~0x00FF0000 = 0xFF00FFFF + SDValue Mask = DAG.getConstant(0xFF00FFFFu, dl, VT); + SDValue BicResult = DAG.getNode(ISD::AND, dl, VT, Xor1, Mask); + + // mov r1, r1, lsr #8 + SDValue Lsr8 = DAG.getNode(ISD::SRL, dl, VT, BicResult, + DAG.getConstant(8, dl, SHVT)); + + // ror r0, r0, #8 + SDValue Ror8 = + DAG.getNode(ISD::ROTR, dl, VT, Op, DAG.getConstant(8, dl, SHVT)); + + // eor r0, Lsr8, Ror8 + return DAG.getNode(ISD::XOR, dl, VT, Lsr8, Ror8); } Tmp4 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, dl, SHVT)); Tmp3 = DAG.getNode(ISD::AND, dl, VT, Op, diff --git a/llvm/lib/Target/ARM/README.txt b/llvm/lib/Target/ARM/README.txt index ff84e07fa084a..0170cc9e4a17f 100644 --- a/llvm/lib/Target/ARM/README.txt +++ b/llvm/lib/Target/ARM/README.txt @@ -606,32 +606,6 @@ constant which was already loaded). Not sure what's necessary to do that. //===---------------------------------------------------------------------===// -The code generated for bswap on armv4/5 (CPUs without rev) is less than ideal: - -int a(int x) { return __builtin_bswap32(x); } - -a: - mov r1, #255, 24 - mov r2, #255, 16 - and r1, r1, r0, lsr #8 - and r2, r2, r0, lsl #8 - orr r1, r1, r0, lsr #24 - orr r0, r2, r0, lsl #24 - orr r0, r0, r1 - bx lr - -Something like the following would be better (fewer instructions/registers): - eor r1, r0, r0, ror #16 - bic r1, r1, #0xff0000 - mov r1, r1, lsr #8 - eor r0, r1, r0, ror #8 - bx lr - -A custom Thumb version would also be a slight improvement over the generic -version. - -//===---------------------------------------------------------------------===// - Consider the following simple C code: void foo(unsigned char *a, unsigned char *b, int *c) { diff --git a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll index 1d5c8589429a4..e12bf031b01ae 100644 --- a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll +++ b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll @@ -50,15 +50,13 @@ define i32 @load_i32_by_i8_big_endian(ptr %arg) { ; ptr p; // p is 4 byte aligned ; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24) define i32 @load_i32_by_i8_bswap(ptr %arg) { -; BSWAP is not supported by 32 bit target ; CHECK-LABEL: load_i32_by_i8_bswap: ; CHECK: @ %bb.0: -; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: orr r1, r1, #16711680 -; CHECK-NEXT: and r2, r0, r1 -; CHECK-NEXT: and r0, r1, r0, ror #24 -; CHECK-NEXT: orr r0, r0, r2, ror #8 +; CHECK-NEXT: eor r1, r0, r0, ror #16 +; CHECK-NEXT: bic r1, r1, #16711680 +; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap: @@ -221,16 +219,16 @@ define i32 @load_i32_by_i16_i8(ptr %arg) { define i64 @load_i64_by_i8_bswap(ptr %arg) { ; CHECK-LABEL: load_i64_by_i8_bswap: ; CHECK: @ %bb.0: -; CHECK-NEXT: mov r2, #255 ; CHECK-NEXT: ldr r1, [r0] ; CHECK-NEXT: ldr r0, [r0, #4] -; CHECK-NEXT: orr r2, r2, #16711680 -; CHECK-NEXT: and r3, r0, r2 -; CHECK-NEXT: and r0, r2, r0, ror #24 -; CHECK-NEXT: orr r0, r0, r3, ror #8 -; CHECK-NEXT: and r3, r1, r2 -; CHECK-NEXT: and r1, r2, r1, ror #24 -; CHECK-NEXT: orr r1, r1, r3, ror #8 +; CHECK-NEXT: eor r2, r0, r0, ror #16 +; CHECK-NEXT: bic r2, r2, #16711680 +; CHECK-NEXT: lsr r2, r2, #8 +; CHECK-NEXT: eor r0, r2, r0, ror #8 +; CHECK-NEXT: eor r2, r1, r1, ror #16 +; CHECK-NEXT: bic r2, r2, #16711680 +; CHECK-NEXT: lsr r2, r2, #8 +; CHECK-NEXT: eor r1, r2, r1, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap: @@ -370,12 +368,11 @@ define i64 @load_i64_by_i8(ptr %arg) { define i32 @load_i32_by_i8_nonzero_offset(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset: ; CHECK: @ %bb.0: -; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0, #1] -; CHECK-NEXT: orr r1, r1, #16711680 -; CHECK-NEXT: and r2, r0, r1 -; CHECK-NEXT: and r0, r1, r0, ror #24 -; CHECK-NEXT: orr r0, r0, r2, ror #8 +; CHECK-NEXT: eor r1, r0, r0, ror #16 +; CHECK-NEXT: bic r1, r1, #16711680 +; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset: @@ -425,12 +422,11 @@ define i32 @load_i32_by_i8_nonzero_offset(ptr %arg) { define i32 @load_i32_by_i8_neg_offset(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset: ; CHECK: @ %bb.0: -; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0, #-4] -; CHECK-NEXT: orr r1, r1, #16711680 -; CHECK-NEXT: and r2, r0, r1 -; CHECK-NEXT: and r0, r1, r0, ror #24 -; CHECK-NEXT: orr r0, r0, r2, ror #8 +; CHECK-NEXT: eor r1, r0, r0, ror #16 +; CHECK-NEXT: bic r1, r1, #16711680 +; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset: @@ -576,12 +572,11 @@ declare i16 @llvm.bswap.i16(i16) define i32 @load_i32_by_bswap_i16(ptr %arg) { ; CHECK-LABEL: load_i32_by_bswap_i16: ; CHECK: @ %bb.0: -; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: orr r1, r1, #16711680 -; CHECK-NEXT: and r2, r0, r1 -; CHECK-NEXT: and r0, r1, r0, ror #24 -; CHECK-NEXT: orr r0, r0, r2, ror #8 +; CHECK-NEXT: eor r1, r0, r0, ror #16 +; CHECK-NEXT: bic r1, r1, #16711680 +; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16: @@ -654,12 +649,11 @@ define i32 @load_i32_by_i8_base_offset_index(ptr %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index: ; CHECK: @ %bb.0: ; CHECK-NEXT: add r0, r0, r1 -; CHECK-NEXT: mov r1, #255 -; CHECK-NEXT: orr r1, r1, #16711680 ; CHECK-NEXT: ldr r0, [r0, #12] -; CHECK-NEXT: and r2, r0, r1 -; CHECK-NEXT: and r0, r1, r0, ror #24 -; CHECK-NEXT: orr r0, r0, r2, ror #8 +; CHECK-NEXT: eor r1, r0, r0, ror #16 +; CHECK-NEXT: bic r1, r1, #16711680 +; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index: @@ -718,12 +712,11 @@ define i32 @load_i32_by_i8_base_offset_index_2(ptr %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index_2: ; CHECK: @ %bb.0: ; CHECK-NEXT: add r0, r1, r0 -; CHECK-NEXT: mov r1, #255 -; CHECK-NEXT: orr r1, r1, #16711680 ; CHECK-NEXT: ldr r0, [r0, #13] -; CHECK-NEXT: and r2, r0, r1 -; CHECK-NEXT: and r0, r1, r0, ror #24 -; CHECK-NEXT: orr r0, r0, r2, ror #8 +; CHECK-NEXT: eor r1, r0, r0, ror #16 +; CHECK-NEXT: bic r1, r1, #16711680 +; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2: diff --git a/llvm/test/CodeGen/ARM/load-combine.ll b/llvm/test/CodeGen/ARM/load-combine.ll index 70873672d6523..59197cd63b474 100644 --- a/llvm/test/CodeGen/ARM/load-combine.ll +++ b/llvm/test/CodeGen/ARM/load-combine.ll @@ -114,15 +114,13 @@ define i32 @load_i32_by_i8_aligned(ptr %arg) { ; ptr p; // p is 4 byte aligned ; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3] define i32 @load_i32_by_i8_bswap(ptr %arg) { -; BSWAP is not supported by 32 bit target ; CHECK-LABEL: load_i32_by_i8_bswap: ; CHECK: @ %bb.0: -; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: orr r1, r1, #16711680 -; CHECK-NEXT: and r2, r0, r1 -; CHECK-NEXT: and r0, r1, r0, ror #24 -; CHECK-NEXT: orr r0, r0, r2, ror #8 +; CHECK-NEXT: eor r1, r0, r0, ror #16 +; CHECK-NEXT: bic r1, r1, #16711680 +; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap: @@ -235,16 +233,16 @@ define i64 @load_i64_by_i8(ptr %arg) { define i64 @load_i64_by_i8_bswap(ptr %arg) { ; CHECK-LABEL: load_i64_by_i8_bswap: ; CHECK: @ %bb.0: -; CHECK-NEXT: mov r2, #255 ; CHECK-NEXT: ldr r1, [r0] ; CHECK-NEXT: ldr r0, [r0, #4] -; CHECK-NEXT: orr r2, r2, #16711680 -; CHECK-NEXT: and r3, r0, r2 -; CHECK-NEXT: and r0, r2, r0, ror #24 -; CHECK-NEXT: orr r0, r0, r3, ror #8 -; CHECK-NEXT: and r3, r1, r2 -; CHECK-NEXT: and r1, r2, r1, ror #24 -; CHECK-NEXT: orr r1, r1, r3, ror #8 +; CHECK-NEXT: eor r2, r0, r0, ror #16 +; CHECK-NEXT: bic r2, r2, #16711680 +; CHECK-NEXT: lsr r2, r2, #8 +; CHECK-NEXT: eor r0, r2, r0, ror #8 +; CHECK-NEXT: eor r2, r1, r1, ror #16 +; CHECK-NEXT: bic r2, r2, #16711680 +; CHECK-NEXT: lsr r2, r2, #8 +; CHECK-NEXT: eor r1, r2, r1, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap: @@ -406,12 +404,11 @@ define i32 @load_i32_by_i8_neg_offset(ptr %arg) { define i32 @load_i32_by_i8_nonzero_offset_bswap(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: ; CHECK: @ %bb.0: -; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0, #1] -; CHECK-NEXT: orr r1, r1, #16711680 -; CHECK-NEXT: and r2, r0, r1 -; CHECK-NEXT: and r0, r1, r0, ror #24 -; CHECK-NEXT: orr r0, r0, r2, ror #8 +; CHECK-NEXT: eor r1, r0, r0, ror #16 +; CHECK-NEXT: bic r1, r1, #16711680 +; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap: @@ -460,12 +457,11 @@ define i32 @load_i32_by_i8_nonzero_offset_bswap(ptr %arg) { define i32 @load_i32_by_i8_neg_offset_bswap(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: ; CHECK: @ %bb.0: -; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0, #-4] -; CHECK-NEXT: orr r1, r1, #16711680 -; CHECK-NEXT: and r2, r0, r1 -; CHECK-NEXT: and r0, r1, r0, ror #24 -; CHECK-NEXT: orr r0, r0, r2, ror #8 +; CHECK-NEXT: eor r1, r0, r0, ror #16 +; CHECK-NEXT: bic r1, r1, #16711680 +; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap: @@ -516,12 +512,11 @@ declare i16 @llvm.bswap.i16(i16) define i32 @load_i32_by_bswap_i16(ptr %arg) { ; CHECK-LABEL: load_i32_by_bswap_i16: ; CHECK: @ %bb.0: -; CHECK-NEXT: mov r1, #255 ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: orr r1, r1, #16711680 -; CHECK-NEXT: and r2, r0, r1 -; CHECK-NEXT: and r0, r1, r0, ror #24 -; CHECK-NEXT: orr r0, r0, r2, ror #8 +; CHECK-NEXT: eor r1, r0, r0, ror #16 +; CHECK-NEXT: bic r1, r1, #16711680 +; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16: From 48b99bc8dcbc3d182c47da3efad9b558f4538ed4 Mon Sep 17 00:00:00 2001 From: AZero13 Date: Sat, 1 Nov 2025 13:12:41 -0400 Subject: [PATCH 3/4] Thumb --- llvm/test/CodeGen/ARM/load-combine.ll | 218 ++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) diff --git a/llvm/test/CodeGen/ARM/load-combine.ll b/llvm/test/CodeGen/ARM/load-combine.ll index 59197cd63b474..ef6ac30b87531 100644 --- a/llvm/test/CodeGen/ARM/load-combine.ll +++ b/llvm/test/CodeGen/ARM/load-combine.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm-unknown | FileCheck %s +; RUN: llc < %s -mtriple=thumbv5-unknown | FileCheck %s --check-prefix=CHECK-THUMBv5 ; RUN: llc < %s -mtriple=armv6-unknown | FileCheck %s --check-prefix=CHECK-ARMv6 ; RUN: llc < %s -mtriple=thumbv6m-none-eabi | FileCheck %s --check-prefix=CHECK-THUMBv6 ; RUN: llc < %s -mtriple=thumbv7m-none-eabi | FileCheck %s --check-prefix=CHECK-THUMBv7 @@ -18,6 +19,20 @@ define i32 @load_i32_by_i8_unaligned(ptr %arg) { ; CHECK-NEXT: orr r0, r1, r0, lsl #24 ; CHECK-NEXT: mov pc, lr ; +; CHECK-THUMBv5-LABEL: load_i32_by_i8_unaligned: +; CHECK-THUMBv5: @ %bb.0: +; CHECK-THUMBv5-NEXT: ldrb r1, [r0] +; CHECK-THUMBv5-NEXT: ldrb r2, [r0, #1] +; CHECK-THUMBv5-NEXT: lsls r2, r2, #8 +; CHECK-THUMBv5-NEXT: adds r1, r2, r1 +; CHECK-THUMBv5-NEXT: ldrb r2, [r0, #2] +; CHECK-THUMBv5-NEXT: lsls r2, r2, #16 +; CHECK-THUMBv5-NEXT: adds r1, r1, r2 +; CHECK-THUMBv5-NEXT: ldrb r0, [r0, #3] +; CHECK-THUMBv5-NEXT: lsls r0, r0, #24 +; CHECK-THUMBv5-NEXT: adds r0, r1, r0 +; CHECK-THUMBv5-NEXT: bx lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_unaligned: ; CHECK-ARMv6: @ %bb.0: ; CHECK-ARMv6-NEXT: ldrb r2, [r0, #1] @@ -76,6 +91,11 @@ define i32 @load_i32_by_i8_aligned(ptr %arg) { ; CHECK-NEXT: ldr r0, [r0] ; CHECK-NEXT: mov pc, lr ; +; CHECK-THUMBv5-LABEL: load_i32_by_i8_aligned: +; CHECK-THUMBv5: @ %bb.0: +; CHECK-THUMBv5-NEXT: ldr r0, [r0] +; CHECK-THUMBv5-NEXT: bx lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_aligned: ; CHECK-ARMv6: @ %bb.0: ; CHECK-ARMv6-NEXT: ldr r0, [r0] @@ -123,6 +143,26 @@ define i32 @load_i32_by_i8_bswap(ptr %arg) { ; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; +; CHECK-THUMBv5-LABEL: load_i32_by_i8_bswap: +; CHECK-THUMBv5: @ %bb.0: +; CHECK-THUMBv5-NEXT: ldr r0, [r0] +; CHECK-THUMBv5-NEXT: movs r1, #8 +; CHECK-THUMBv5-NEXT: movs r2, r0 +; CHECK-THUMBv5-NEXT: rors r2, r1 +; CHECK-THUMBv5-NEXT: movs r1, #16 +; CHECK-THUMBv5-NEXT: movs r3, r0 +; CHECK-THUMBv5-NEXT: rors r3, r1 +; CHECK-THUMBv5-NEXT: eors r3, r0 +; CHECK-THUMBv5-NEXT: ldr r0, .LCPI2_0 +; CHECK-THUMBv5-NEXT: ands r0, r3 +; CHECK-THUMBv5-NEXT: lsrs r0, r0, #8 +; CHECK-THUMBv5-NEXT: eors r0, r2 +; CHECK-THUMBv5-NEXT: bx lr +; CHECK-THUMBv5-NEXT: .p2align 2 +; CHECK-THUMBv5-NEXT: @ %bb.1: +; CHECK-THUMBv5-NEXT: .LCPI2_0: +; CHECK-THUMBv5-NEXT: .long 4278255360 @ 0xff00ff00 +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap: ; CHECK-ARMv6: @ %bb.0: ; CHECK-ARMv6-NEXT: ldr r0, [r0] @@ -171,6 +211,13 @@ define i64 @load_i64_by_i8(ptr %arg) { ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov pc, lr ; +; CHECK-THUMBv5-LABEL: load_i64_by_i8: +; CHECK-THUMBv5: @ %bb.0: +; CHECK-THUMBv5-NEXT: ldr r2, [r0] +; CHECK-THUMBv5-NEXT: ldr r1, [r0, #4] +; CHECK-THUMBv5-NEXT: movs r0, r2 +; CHECK-THUMBv5-NEXT: bx lr +; ; CHECK-ARMv6-LABEL: load_i64_by_i8: ; CHECK-ARMv6: @ %bb.0: ; CHECK-ARMv6-NEXT: ldrd r0, r1, [r0] @@ -245,6 +292,37 @@ define i64 @load_i64_by_i8_bswap(ptr %arg) { ; CHECK-NEXT: eor r1, r2, r1, ror #8 ; CHECK-NEXT: mov pc, lr ; +; CHECK-THUMBv5-LABEL: load_i64_by_i8_bswap: +; CHECK-THUMBv5: @ %bb.0: +; CHECK-THUMBv5-NEXT: push {r4, r5, r7, lr} +; CHECK-THUMBv5-NEXT: ldr r1, [r0, #4] +; CHECK-THUMBv5-NEXT: movs r3, #8 +; CHECK-THUMBv5-NEXT: movs r4, r1 +; CHECK-THUMBv5-NEXT: rors r4, r3 +; CHECK-THUMBv5-NEXT: movs r5, #16 +; CHECK-THUMBv5-NEXT: movs r2, r1 +; CHECK-THUMBv5-NEXT: rors r2, r5 +; CHECK-THUMBv5-NEXT: eors r2, r1 +; CHECK-THUMBv5-NEXT: ldr r1, .LCPI4_0 +; CHECK-THUMBv5-NEXT: ands r2, r1 +; CHECK-THUMBv5-NEXT: lsrs r2, r2, #8 +; CHECK-THUMBv5-NEXT: eors r2, r4 +; CHECK-THUMBv5-NEXT: ldr r0, [r0] +; CHECK-THUMBv5-NEXT: movs r4, r0 +; CHECK-THUMBv5-NEXT: rors r4, r3 +; CHECK-THUMBv5-NEXT: movs r3, r0 +; CHECK-THUMBv5-NEXT: rors r3, r5 +; CHECK-THUMBv5-NEXT: eors r3, r0 +; CHECK-THUMBv5-NEXT: ands r3, r1 +; CHECK-THUMBv5-NEXT: lsrs r1, r3, #8 +; CHECK-THUMBv5-NEXT: eors r1, r4 +; CHECK-THUMBv5-NEXT: movs r0, r2 +; CHECK-THUMBv5-NEXT: pop {r4, r5, r7, pc} +; CHECK-THUMBv5-NEXT: .p2align 2 +; CHECK-THUMBv5-NEXT: @ %bb.1: +; CHECK-THUMBv5-NEXT: .LCPI4_0: +; CHECK-THUMBv5-NEXT: .long 4278255360 @ 0xff00ff00 +; ; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap: ; CHECK-ARMv6: @ %bb.0: ; CHECK-ARMv6-NEXT: ldrd r2, r3, [r0] @@ -315,6 +393,12 @@ define i32 @load_i32_by_i8_nonzero_offset(ptr %arg) { ; CHECK-NEXT: ldr r0, [r0, #1] ; CHECK-NEXT: mov pc, lr ; +; CHECK-THUMBv5-LABEL: load_i32_by_i8_nonzero_offset: +; CHECK-THUMBv5: @ %bb.0: +; CHECK-THUMBv5-NEXT: movs r1, #1 +; CHECK-THUMBv5-NEXT: ldr r0, [r0, r1] +; CHECK-THUMBv5-NEXT: bx lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset: ; CHECK-ARMv6: @ %bb.0: ; CHECK-ARMv6-NEXT: ldr r0, [r0, #1] @@ -361,6 +445,12 @@ define i32 @load_i32_by_i8_neg_offset(ptr %arg) { ; CHECK-NEXT: ldr r0, [r0, #-4] ; CHECK-NEXT: mov pc, lr ; +; CHECK-THUMBv5-LABEL: load_i32_by_i8_neg_offset: +; CHECK-THUMBv5: @ %bb.0: +; CHECK-THUMBv5-NEXT: subs r0, r0, #4 +; CHECK-THUMBv5-NEXT: ldr r0, [r0] +; CHECK-THUMBv5-NEXT: bx lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset: ; CHECK-ARMv6: @ %bb.0: ; CHECK-ARMv6-NEXT: ldr r0, [r0, #-4] @@ -411,6 +501,26 @@ define i32 @load_i32_by_i8_nonzero_offset_bswap(ptr %arg) { ; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; +; CHECK-THUMBv5-LABEL: load_i32_by_i8_nonzero_offset_bswap: +; CHECK-THUMBv5: @ %bb.0: +; CHECK-THUMBv5-NEXT: movs r1, #1 +; CHECK-THUMBv5-NEXT: ldr r0, [r0, r1] +; CHECK-THUMBv5-NEXT: movs r1, #16 +; CHECK-THUMBv5-NEXT: movs r2, r0 +; CHECK-THUMBv5-NEXT: rors r2, r1 +; CHECK-THUMBv5-NEXT: eors r2, r0 +; CHECK-THUMBv5-NEXT: ldr r1, .LCPI7_0 +; CHECK-THUMBv5-NEXT: ands r1, r2 +; CHECK-THUMBv5-NEXT: lsrs r1, r1, #8 +; CHECK-THUMBv5-NEXT: movs r2, #8 +; CHECK-THUMBv5-NEXT: rors r0, r2 +; CHECK-THUMBv5-NEXT: eors r0, r1 +; CHECK-THUMBv5-NEXT: bx lr +; CHECK-THUMBv5-NEXT: .p2align 2 +; CHECK-THUMBv5-NEXT: @ %bb.1: +; CHECK-THUMBv5-NEXT: .LCPI7_0: +; CHECK-THUMBv5-NEXT: .long 4278255360 @ 0xff00ff00 +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap: ; CHECK-ARMv6: @ %bb.0: ; CHECK-ARMv6-NEXT: ldr r0, [r0, #1] @@ -464,6 +574,27 @@ define i32 @load_i32_by_i8_neg_offset_bswap(ptr %arg) { ; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; +; CHECK-THUMBv5-LABEL: load_i32_by_i8_neg_offset_bswap: +; CHECK-THUMBv5: @ %bb.0: +; CHECK-THUMBv5-NEXT: subs r0, r0, #4 +; CHECK-THUMBv5-NEXT: ldr r0, [r0] +; CHECK-THUMBv5-NEXT: movs r1, #8 +; CHECK-THUMBv5-NEXT: movs r2, r0 +; CHECK-THUMBv5-NEXT: rors r2, r1 +; CHECK-THUMBv5-NEXT: movs r1, #16 +; CHECK-THUMBv5-NEXT: movs r3, r0 +; CHECK-THUMBv5-NEXT: rors r3, r1 +; CHECK-THUMBv5-NEXT: eors r3, r0 +; CHECK-THUMBv5-NEXT: ldr r0, .LCPI8_0 +; CHECK-THUMBv5-NEXT: ands r0, r3 +; CHECK-THUMBv5-NEXT: lsrs r0, r0, #8 +; CHECK-THUMBv5-NEXT: eors r0, r2 +; CHECK-THUMBv5-NEXT: bx lr +; CHECK-THUMBv5-NEXT: .p2align 2 +; CHECK-THUMBv5-NEXT: @ %bb.1: +; CHECK-THUMBv5-NEXT: .LCPI8_0: +; CHECK-THUMBv5-NEXT: .long 4278255360 @ 0xff00ff00 +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap: ; CHECK-ARMv6: @ %bb.0: ; CHECK-ARMv6-NEXT: ldr r0, [r0, #-4] @@ -519,6 +650,26 @@ define i32 @load_i32_by_bswap_i16(ptr %arg) { ; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; +; CHECK-THUMBv5-LABEL: load_i32_by_bswap_i16: +; CHECK-THUMBv5: @ %bb.0: +; CHECK-THUMBv5-NEXT: ldr r0, [r0] +; CHECK-THUMBv5-NEXT: movs r1, #8 +; CHECK-THUMBv5-NEXT: movs r2, r0 +; CHECK-THUMBv5-NEXT: rors r2, r1 +; CHECK-THUMBv5-NEXT: movs r1, #16 +; CHECK-THUMBv5-NEXT: movs r3, r0 +; CHECK-THUMBv5-NEXT: rors r3, r1 +; CHECK-THUMBv5-NEXT: eors r3, r0 +; CHECK-THUMBv5-NEXT: ldr r0, .LCPI9_0 +; CHECK-THUMBv5-NEXT: ands r0, r3 +; CHECK-THUMBv5-NEXT: lsrs r0, r0, #8 +; CHECK-THUMBv5-NEXT: eors r0, r2 +; CHECK-THUMBv5-NEXT: bx lr +; CHECK-THUMBv5-NEXT: .p2align 2 +; CHECK-THUMBv5-NEXT: @ %bb.1: +; CHECK-THUMBv5-NEXT: .LCPI9_0: +; CHECK-THUMBv5-NEXT: .long 4278255360 @ 0xff00ff00 +; ; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16: ; CHECK-ARMv6: @ %bb.0: ; CHECK-ARMv6-NEXT: ldr r0, [r0] @@ -558,6 +709,11 @@ define i32 @load_i32_by_sext_i16(ptr %arg) { ; CHECK-NEXT: ldr r0, [r0] ; CHECK-NEXT: mov pc, lr ; +; CHECK-THUMBv5-LABEL: load_i32_by_sext_i16: +; CHECK-THUMBv5: @ %bb.0: +; CHECK-THUMBv5-NEXT: ldr r0, [r0] +; CHECK-THUMBv5-NEXT: bx lr +; ; CHECK-ARMv6-LABEL: load_i32_by_sext_i16: ; CHECK-ARMv6: @ %bb.0: ; CHECK-ARMv6-NEXT: ldr r0, [r0] @@ -592,6 +748,12 @@ define i32 @load_i32_by_i8_base_offset_index(ptr %arg, i32 %i) { ; CHECK-NEXT: ldr r0, [r0, #12] ; CHECK-NEXT: mov pc, lr ; +; CHECK-THUMBv5-LABEL: load_i32_by_i8_base_offset_index: +; CHECK-THUMBv5: @ %bb.0: +; CHECK-THUMBv5-NEXT: adds r0, r0, r1 +; CHECK-THUMBv5-NEXT: ldr r0, [r0, #12] +; CHECK-THUMBv5-NEXT: bx lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index: ; CHECK-ARMv6: @ %bb.0: ; CHECK-ARMv6-NEXT: add r0, r0, r1 @@ -649,6 +811,13 @@ define i32 @load_i32_by_i8_base_offset_index_2(ptr %arg, i32 %i) { ; CHECK-NEXT: ldr r0, [r0, #13] ; CHECK-NEXT: mov pc, lr ; +; CHECK-THUMBv5-LABEL: load_i32_by_i8_base_offset_index_2: +; CHECK-THUMBv5: @ %bb.0: +; CHECK-THUMBv5-NEXT: adds r0, r1, r0 +; CHECK-THUMBv5-NEXT: movs r1, #13 +; CHECK-THUMBv5-NEXT: ldr r0, [r0, r1] +; CHECK-THUMBv5-NEXT: bx lr +; ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2: ; CHECK-ARMv6: @ %bb.0: ; CHECK-ARMv6-NEXT: add r0, r1, r0 @@ -705,6 +874,11 @@ define i32 @zext_load_i32_by_i8(ptr %arg) { ; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: mov pc, lr ; +; CHECK-THUMBv5-LABEL: zext_load_i32_by_i8: +; CHECK-THUMBv5: @ %bb.0: +; CHECK-THUMBv5-NEXT: ldrh r0, [r0] +; CHECK-THUMBv5-NEXT: bx lr +; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8: ; CHECK-ARMv6: @ %bb.0: ; CHECK-ARMv6-NEXT: ldrh r0, [r0] @@ -741,6 +915,15 @@ define i32 @zext_load_i32_by_i8_shl_8(ptr %arg) { ; CHECK-NEXT: orr r0, r0, r1, lsl #8 ; CHECK-NEXT: mov pc, lr ; +; CHECK-THUMBv5-LABEL: zext_load_i32_by_i8_shl_8: +; CHECK-THUMBv5: @ %bb.0: +; CHECK-THUMBv5-NEXT: ldrb r1, [r0] +; CHECK-THUMBv5-NEXT: lsls r1, r1, #8 +; CHECK-THUMBv5-NEXT: ldrb r0, [r0, #1] +; CHECK-THUMBv5-NEXT: lsls r0, r0, #16 +; CHECK-THUMBv5-NEXT: adds r0, r0, r1 +; CHECK-THUMBv5-NEXT: bx lr +; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_shl_8: ; CHECK-ARMv6: @ %bb.0: ; CHECK-ARMv6-NEXT: ldrb r1, [r0] @@ -788,6 +971,15 @@ define i32 @zext_load_i32_by_i8_shl_16(ptr %arg) { ; CHECK-NEXT: orr r0, r0, r1, lsl #16 ; CHECK-NEXT: mov pc, lr ; +; CHECK-THUMBv5-LABEL: zext_load_i32_by_i8_shl_16: +; CHECK-THUMBv5: @ %bb.0: +; CHECK-THUMBv5-NEXT: ldrb r1, [r0] +; CHECK-THUMBv5-NEXT: lsls r1, r1, #16 +; CHECK-THUMBv5-NEXT: ldrb r0, [r0, #1] +; CHECK-THUMBv5-NEXT: lsls r0, r0, #24 +; CHECK-THUMBv5-NEXT: adds r0, r0, r1 +; CHECK-THUMBv5-NEXT: bx lr +; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_shl_16: ; CHECK-ARMv6: @ %bb.0: ; CHECK-ARMv6-NEXT: ldrb r1, [r0] @@ -834,6 +1026,14 @@ define i32 @zext_load_i32_by_i8_bswap(ptr %arg) { ; CHECK-NEXT: orr r0, r0, r1, lsl #8 ; CHECK-NEXT: mov pc, lr ; +; CHECK-THUMBv5-LABEL: zext_load_i32_by_i8_bswap: +; CHECK-THUMBv5: @ %bb.0: +; CHECK-THUMBv5-NEXT: ldrb r1, [r0, #1] +; CHECK-THUMBv5-NEXT: ldrb r0, [r0] +; CHECK-THUMBv5-NEXT: lsls r0, r0, #8 +; CHECK-THUMBv5-NEXT: adds r0, r0, r1 +; CHECK-THUMBv5-NEXT: bx lr +; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap: ; CHECK-ARMv6: @ %bb.0: ; CHECK-ARMv6-NEXT: ldrh r0, [r0] @@ -873,6 +1073,15 @@ define i32 @zext_load_i32_by_i8_bswap_shl_8(ptr %arg) { ; CHECK-NEXT: orr r0, r1, r0, lsl #8 ; CHECK-NEXT: mov pc, lr ; +; CHECK-THUMBv5-LABEL: zext_load_i32_by_i8_bswap_shl_8: +; CHECK-THUMBv5: @ %bb.0: +; CHECK-THUMBv5-NEXT: ldrb r1, [r0, #1] +; CHECK-THUMBv5-NEXT: lsls r1, r1, #8 +; CHECK-THUMBv5-NEXT: ldrb r0, [r0] +; CHECK-THUMBv5-NEXT: lsls r0, r0, #16 +; CHECK-THUMBv5-NEXT: adds r0, r0, r1 +; CHECK-THUMBv5-NEXT: bx lr +; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap_shl_8: ; CHECK-ARMv6: @ %bb.0: ; CHECK-ARMv6-NEXT: ldrb r1, [r0] @@ -920,6 +1129,15 @@ define i32 @zext_load_i32_by_i8_bswap_shl_16(ptr %arg) { ; CHECK-NEXT: orr r0, r1, r0, lsl #16 ; CHECK-NEXT: mov pc, lr ; +; CHECK-THUMBv5-LABEL: zext_load_i32_by_i8_bswap_shl_16: +; CHECK-THUMBv5: @ %bb.0: +; CHECK-THUMBv5-NEXT: ldrb r1, [r0, #1] +; CHECK-THUMBv5-NEXT: lsls r1, r1, #16 +; CHECK-THUMBv5-NEXT: ldrb r0, [r0] +; CHECK-THUMBv5-NEXT: lsls r0, r0, #24 +; CHECK-THUMBv5-NEXT: adds r0, r0, r1 +; CHECK-THUMBv5-NEXT: bx lr +; ; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap_shl_16: ; CHECK-ARMv6: @ %bb.0: ; CHECK-ARMv6-NEXT: ldrb r1, [r0] From 8ef41b7b99f4b5fc0e193983e53ec6caf84f958a Mon Sep 17 00:00:00 2001 From: AZero13 Date: Sat, 1 Nov 2025 13:17:58 -0400 Subject: [PATCH 4/4] Optimize once more --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 21 ++-- .../CodeGen/ARM/load-combine-big-endian.ll | 31 +++--- llvm/test/CodeGen/ARM/load-combine.ll | 97 +++++++++---------- 3 files changed, 73 insertions(+), 76 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index eccf9e1b45097..9f98c053eb392 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9901,27 +9901,26 @@ SDValue TargetLowering::expandBSWAP(SDNode *N, SelectionDAG &DAG) const { case MVT::i32: // This is meant for ARM specifically, which has ROTR but no ROTL. if (isOperationLegalOrCustom(ISD::ROTR, VT)) { - // ror rtmp, r0, #16 + // eor r3, r0, r0, ror #16 SDValue Ror16 = DAG.getNode(ISD::ROTR, dl, VT, Op, DAG.getConstant(16, dl, SHVT)); - // eor r1, r0, rtmp ; r1 = r0 ^ (r0 ror 16) SDValue Xor1 = DAG.getNode(ISD::XOR, dl, VT, Op, Ror16); - // bic r1, r1, #0xff0000 (clear bits 16-23) - // So we need the negated value: ~0x00FF0000 = 0xFF00FFFF - SDValue Mask = DAG.getConstant(0xFF00FFFFu, dl, VT); - SDValue BicResult = DAG.getNode(ISD::AND, dl, VT, Xor1, Mask); - - // mov r1, r1, lsr #8 - SDValue Lsr8 = DAG.getNode(ISD::SRL, dl, VT, BicResult, + // lsr r3, r3, #8 + SDValue Lsr8 = DAG.getNode(ISD::SRL, dl, VT, Xor1, DAG.getConstant(8, dl, SHVT)); + // bic r3, r3, #65280 (0xFF00) + // So we need the negated value: ~0x0000FF00 = 0xFFFF00FF + SDValue Mask = DAG.getConstant(0xFFFF00FFu, dl, VT); + SDValue BicResult = DAG.getNode(ISD::AND, dl, VT, Lsr8, Mask); + // ror r0, r0, #8 SDValue Ror8 = DAG.getNode(ISD::ROTR, dl, VT, Op, DAG.getConstant(8, dl, SHVT)); - // eor r0, Lsr8, Ror8 - return DAG.getNode(ISD::XOR, dl, VT, Lsr8, Ror8); + // eor r0, r3, r0, ror #8 + return DAG.getNode(ISD::XOR, dl, VT, BicResult, Ror8); } Tmp4 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, dl, SHVT)); Tmp3 = DAG.getNode(ISD::AND, dl, VT, Op, diff --git a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll index e12bf031b01ae..6f08039c36ea7 100644 --- a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll +++ b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll @@ -53,9 +53,9 @@ define i32 @load_i32_by_i8_bswap(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_bswap: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: mvn r2, #65280 ; CHECK-NEXT: eor r1, r0, r0, ror #16 -; CHECK-NEXT: bic r1, r1, #16711680 -; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: and r1, r2, r1, lsr #8 ; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; @@ -220,14 +220,13 @@ define i64 @load_i64_by_i8_bswap(ptr %arg) { ; CHECK-LABEL: load_i64_by_i8_bswap: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r1, [r0] +; CHECK-NEXT: mvn r3, #65280 ; CHECK-NEXT: ldr r0, [r0, #4] ; CHECK-NEXT: eor r2, r0, r0, ror #16 -; CHECK-NEXT: bic r2, r2, #16711680 -; CHECK-NEXT: lsr r2, r2, #8 +; CHECK-NEXT: and r2, r3, r2, lsr #8 ; CHECK-NEXT: eor r0, r2, r0, ror #8 ; CHECK-NEXT: eor r2, r1, r1, ror #16 -; CHECK-NEXT: bic r2, r2, #16711680 -; CHECK-NEXT: lsr r2, r2, #8 +; CHECK-NEXT: and r2, r3, r2, lsr #8 ; CHECK-NEXT: eor r1, r2, r1, ror #8 ; CHECK-NEXT: mov pc, lr ; @@ -369,9 +368,9 @@ define i32 @load_i32_by_i8_nonzero_offset(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r0, [r0, #1] +; CHECK-NEXT: mvn r2, #65280 ; CHECK-NEXT: eor r1, r0, r0, ror #16 -; CHECK-NEXT: bic r1, r1, #16711680 -; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: and r1, r2, r1, lsr #8 ; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; @@ -423,9 +422,9 @@ define i32 @load_i32_by_i8_neg_offset(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r0, [r0, #-4] +; CHECK-NEXT: mvn r2, #65280 ; CHECK-NEXT: eor r1, r0, r0, ror #16 -; CHECK-NEXT: bic r1, r1, #16711680 -; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: and r1, r2, r1, lsr #8 ; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; @@ -573,9 +572,9 @@ define i32 @load_i32_by_bswap_i16(ptr %arg) { ; CHECK-LABEL: load_i32_by_bswap_i16: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: mvn r2, #65280 ; CHECK-NEXT: eor r1, r0, r0, ror #16 -; CHECK-NEXT: bic r1, r1, #16711680 -; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: and r1, r2, r1, lsr #8 ; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; @@ -649,10 +648,10 @@ define i32 @load_i32_by_i8_base_offset_index(ptr %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index: ; CHECK: @ %bb.0: ; CHECK-NEXT: add r0, r0, r1 +; CHECK-NEXT: mvn r2, #65280 ; CHECK-NEXT: ldr r0, [r0, #12] ; CHECK-NEXT: eor r1, r0, r0, ror #16 -; CHECK-NEXT: bic r1, r1, #16711680 -; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: and r1, r2, r1, lsr #8 ; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; @@ -712,10 +711,10 @@ define i32 @load_i32_by_i8_base_offset_index_2(ptr %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index_2: ; CHECK: @ %bb.0: ; CHECK-NEXT: add r0, r1, r0 +; CHECK-NEXT: mvn r2, #65280 ; CHECK-NEXT: ldr r0, [r0, #13] ; CHECK-NEXT: eor r1, r0, r0, ror #16 -; CHECK-NEXT: bic r1, r1, #16711680 -; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: and r1, r2, r1, lsr #8 ; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; diff --git a/llvm/test/CodeGen/ARM/load-combine.ll b/llvm/test/CodeGen/ARM/load-combine.ll index ef6ac30b87531..4431993f71bdf 100644 --- a/llvm/test/CodeGen/ARM/load-combine.ll +++ b/llvm/test/CodeGen/ARM/load-combine.ll @@ -137,9 +137,9 @@ define i32 @load_i32_by_i8_bswap(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_bswap: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: mvn r2, #65280 ; CHECK-NEXT: eor r1, r0, r0, ror #16 -; CHECK-NEXT: bic r1, r1, #16711680 -; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: and r1, r2, r1, lsr #8 ; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; @@ -153,15 +153,15 @@ define i32 @load_i32_by_i8_bswap(ptr %arg) { ; CHECK-THUMBv5-NEXT: movs r3, r0 ; CHECK-THUMBv5-NEXT: rors r3, r1 ; CHECK-THUMBv5-NEXT: eors r3, r0 +; CHECK-THUMBv5-NEXT: lsrs r1, r3, #8 ; CHECK-THUMBv5-NEXT: ldr r0, .LCPI2_0 -; CHECK-THUMBv5-NEXT: ands r0, r3 -; CHECK-THUMBv5-NEXT: lsrs r0, r0, #8 +; CHECK-THUMBv5-NEXT: ands r0, r1 ; CHECK-THUMBv5-NEXT: eors r0, r2 ; CHECK-THUMBv5-NEXT: bx lr ; CHECK-THUMBv5-NEXT: .p2align 2 ; CHECK-THUMBv5-NEXT: @ %bb.1: ; CHECK-THUMBv5-NEXT: .LCPI2_0: -; CHECK-THUMBv5-NEXT: .long 4278255360 @ 0xff00ff00 +; CHECK-THUMBv5-NEXT: .long 16711935 @ 0xff00ff ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap: ; CHECK-ARMv6: @ %bb.0: @@ -281,47 +281,46 @@ define i64 @load_i64_by_i8_bswap(ptr %arg) { ; CHECK-LABEL: load_i64_by_i8_bswap: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r1, [r0] +; CHECK-NEXT: mvn r3, #65280 ; CHECK-NEXT: ldr r0, [r0, #4] ; CHECK-NEXT: eor r2, r0, r0, ror #16 -; CHECK-NEXT: bic r2, r2, #16711680 -; CHECK-NEXT: lsr r2, r2, #8 +; CHECK-NEXT: and r2, r3, r2, lsr #8 ; CHECK-NEXT: eor r0, r2, r0, ror #8 ; CHECK-NEXT: eor r2, r1, r1, ror #16 -; CHECK-NEXT: bic r2, r2, #16711680 -; CHECK-NEXT: lsr r2, r2, #8 +; CHECK-NEXT: and r2, r3, r2, lsr #8 ; CHECK-NEXT: eor r1, r2, r1, ror #8 ; CHECK-NEXT: mov pc, lr ; ; CHECK-THUMBv5-LABEL: load_i64_by_i8_bswap: ; CHECK-THUMBv5: @ %bb.0: ; CHECK-THUMBv5-NEXT: push {r4, r5, r7, lr} -; CHECK-THUMBv5-NEXT: ldr r1, [r0, #4] -; CHECK-THUMBv5-NEXT: movs r3, #8 -; CHECK-THUMBv5-NEXT: movs r4, r1 -; CHECK-THUMBv5-NEXT: rors r4, r3 -; CHECK-THUMBv5-NEXT: movs r5, #16 +; CHECK-THUMBv5-NEXT: movs r1, r0 +; CHECK-THUMBv5-NEXT: ldr r0, [r0, #4] +; CHECK-THUMBv5-NEXT: movs r2, #8 +; CHECK-THUMBv5-NEXT: movs r3, r0 +; CHECK-THUMBv5-NEXT: rors r3, r2 +; CHECK-THUMBv5-NEXT: movs r4, #16 +; CHECK-THUMBv5-NEXT: movs r5, r0 +; CHECK-THUMBv5-NEXT: rors r5, r4 +; CHECK-THUMBv5-NEXT: eors r5, r0 +; CHECK-THUMBv5-NEXT: lsrs r0, r5, #8 +; CHECK-THUMBv5-NEXT: ldr r5, .LCPI4_0 +; CHECK-THUMBv5-NEXT: ands r0, r5 +; CHECK-THUMBv5-NEXT: eors r0, r3 +; CHECK-THUMBv5-NEXT: ldr r1, [r1] +; CHECK-THUMBv5-NEXT: movs r3, r1 +; CHECK-THUMBv5-NEXT: rors r3, r2 ; CHECK-THUMBv5-NEXT: movs r2, r1 -; CHECK-THUMBv5-NEXT: rors r2, r5 +; CHECK-THUMBv5-NEXT: rors r2, r4 ; CHECK-THUMBv5-NEXT: eors r2, r1 -; CHECK-THUMBv5-NEXT: ldr r1, .LCPI4_0 -; CHECK-THUMBv5-NEXT: ands r2, r1 -; CHECK-THUMBv5-NEXT: lsrs r2, r2, #8 -; CHECK-THUMBv5-NEXT: eors r2, r4 -; CHECK-THUMBv5-NEXT: ldr r0, [r0] -; CHECK-THUMBv5-NEXT: movs r4, r0 -; CHECK-THUMBv5-NEXT: rors r4, r3 -; CHECK-THUMBv5-NEXT: movs r3, r0 -; CHECK-THUMBv5-NEXT: rors r3, r5 -; CHECK-THUMBv5-NEXT: eors r3, r0 -; CHECK-THUMBv5-NEXT: ands r3, r1 -; CHECK-THUMBv5-NEXT: lsrs r1, r3, #8 -; CHECK-THUMBv5-NEXT: eors r1, r4 -; CHECK-THUMBv5-NEXT: movs r0, r2 +; CHECK-THUMBv5-NEXT: lsrs r1, r2, #8 +; CHECK-THUMBv5-NEXT: ands r1, r5 +; CHECK-THUMBv5-NEXT: eors r1, r3 ; CHECK-THUMBv5-NEXT: pop {r4, r5, r7, pc} ; CHECK-THUMBv5-NEXT: .p2align 2 ; CHECK-THUMBv5-NEXT: @ %bb.1: ; CHECK-THUMBv5-NEXT: .LCPI4_0: -; CHECK-THUMBv5-NEXT: .long 4278255360 @ 0xff00ff00 +; CHECK-THUMBv5-NEXT: .long 16711935 @ 0xff00ff ; ; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap: ; CHECK-ARMv6: @ %bb.0: @@ -495,9 +494,9 @@ define i32 @load_i32_by_i8_nonzero_offset_bswap(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r0, [r0, #1] +; CHECK-NEXT: mvn r2, #65280 ; CHECK-NEXT: eor r1, r0, r0, ror #16 -; CHECK-NEXT: bic r1, r1, #16711680 -; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: and r1, r2, r1, lsr #8 ; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; @@ -509,17 +508,17 @@ define i32 @load_i32_by_i8_nonzero_offset_bswap(ptr %arg) { ; CHECK-THUMBv5-NEXT: movs r2, r0 ; CHECK-THUMBv5-NEXT: rors r2, r1 ; CHECK-THUMBv5-NEXT: eors r2, r0 -; CHECK-THUMBv5-NEXT: ldr r1, .LCPI7_0 -; CHECK-THUMBv5-NEXT: ands r1, r2 -; CHECK-THUMBv5-NEXT: lsrs r1, r1, #8 -; CHECK-THUMBv5-NEXT: movs r2, #8 -; CHECK-THUMBv5-NEXT: rors r0, r2 -; CHECK-THUMBv5-NEXT: eors r0, r1 +; CHECK-THUMBv5-NEXT: lsrs r1, r2, #8 +; CHECK-THUMBv5-NEXT: ldr r2, .LCPI7_0 +; CHECK-THUMBv5-NEXT: ands r2, r1 +; CHECK-THUMBv5-NEXT: movs r1, #8 +; CHECK-THUMBv5-NEXT: rors r0, r1 +; CHECK-THUMBv5-NEXT: eors r0, r2 ; CHECK-THUMBv5-NEXT: bx lr ; CHECK-THUMBv5-NEXT: .p2align 2 ; CHECK-THUMBv5-NEXT: @ %bb.1: ; CHECK-THUMBv5-NEXT: .LCPI7_0: -; CHECK-THUMBv5-NEXT: .long 4278255360 @ 0xff00ff00 +; CHECK-THUMBv5-NEXT: .long 16711935 @ 0xff00ff ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap: ; CHECK-ARMv6: @ %bb.0: @@ -568,9 +567,9 @@ define i32 @load_i32_by_i8_neg_offset_bswap(ptr %arg) { ; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r0, [r0, #-4] +; CHECK-NEXT: mvn r2, #65280 ; CHECK-NEXT: eor r1, r0, r0, ror #16 -; CHECK-NEXT: bic r1, r1, #16711680 -; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: and r1, r2, r1, lsr #8 ; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; @@ -585,15 +584,15 @@ define i32 @load_i32_by_i8_neg_offset_bswap(ptr %arg) { ; CHECK-THUMBv5-NEXT: movs r3, r0 ; CHECK-THUMBv5-NEXT: rors r3, r1 ; CHECK-THUMBv5-NEXT: eors r3, r0 +; CHECK-THUMBv5-NEXT: lsrs r1, r3, #8 ; CHECK-THUMBv5-NEXT: ldr r0, .LCPI8_0 -; CHECK-THUMBv5-NEXT: ands r0, r3 -; CHECK-THUMBv5-NEXT: lsrs r0, r0, #8 +; CHECK-THUMBv5-NEXT: ands r0, r1 ; CHECK-THUMBv5-NEXT: eors r0, r2 ; CHECK-THUMBv5-NEXT: bx lr ; CHECK-THUMBv5-NEXT: .p2align 2 ; CHECK-THUMBv5-NEXT: @ %bb.1: ; CHECK-THUMBv5-NEXT: .LCPI8_0: -; CHECK-THUMBv5-NEXT: .long 4278255360 @ 0xff00ff00 +; CHECK-THUMBv5-NEXT: .long 16711935 @ 0xff00ff ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap: ; CHECK-ARMv6: @ %bb.0: @@ -644,9 +643,9 @@ define i32 @load_i32_by_bswap_i16(ptr %arg) { ; CHECK-LABEL: load_i32_by_bswap_i16: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: mvn r2, #65280 ; CHECK-NEXT: eor r1, r0, r0, ror #16 -; CHECK-NEXT: bic r1, r1, #16711680 -; CHECK-NEXT: lsr r1, r1, #8 +; CHECK-NEXT: and r1, r2, r1, lsr #8 ; CHECK-NEXT: eor r0, r1, r0, ror #8 ; CHECK-NEXT: mov pc, lr ; @@ -660,15 +659,15 @@ define i32 @load_i32_by_bswap_i16(ptr %arg) { ; CHECK-THUMBv5-NEXT: movs r3, r0 ; CHECK-THUMBv5-NEXT: rors r3, r1 ; CHECK-THUMBv5-NEXT: eors r3, r0 +; CHECK-THUMBv5-NEXT: lsrs r1, r3, #8 ; CHECK-THUMBv5-NEXT: ldr r0, .LCPI9_0 -; CHECK-THUMBv5-NEXT: ands r0, r3 -; CHECK-THUMBv5-NEXT: lsrs r0, r0, #8 +; CHECK-THUMBv5-NEXT: ands r0, r1 ; CHECK-THUMBv5-NEXT: eors r0, r2 ; CHECK-THUMBv5-NEXT: bx lr ; CHECK-THUMBv5-NEXT: .p2align 2 ; CHECK-THUMBv5-NEXT: @ %bb.1: ; CHECK-THUMBv5-NEXT: .LCPI9_0: -; CHECK-THUMBv5-NEXT: .long 4278255360 @ 0xff00ff00 +; CHECK-THUMBv5-NEXT: .long 16711935 @ 0xff00ff ; ; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16: ; CHECK-ARMv6: @ %bb.0: