diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 313ae3d68fb83..1c85b076404ba 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -118,6 +118,7 @@ using namespace llvm;
 #define DEBUG_TYPE "arm-isel"
 
 STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
 STATISTIC(NumConstpoolPromoted,
@@ -128,6 +129,12 @@ ARMInterworking("arm-interworking", cl::Hidden,
   cl::desc("Enable / disable ARM interworking (for debugging only)"),
   cl::init(true));
 
+static cl::opt<bool>
+    EnableOptimizeLogicalImm("arm-enable-logical-imm", cl::Hidden,
+                             cl::desc("Enable ARM logical imm instruction "
+                                      "optimization"),
+                             cl::init(true));
+
 static cl::opt<bool> EnableConstpoolPromotion(
     "arm-promote-constant", cl::Hidden,
     cl::desc("Enable / disable promotion of unnamed_addr constants into "
@@ -20138,6 +20145,112 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
   }
 }
 
+static bool isLegalLogicalImmediate(unsigned Imm,
+                                    const ARMSubtarget *Subtarget) {
+  // Handle special cases first
+  if (!Subtarget->isThumb())
+    return ARM_AM::getSOImmVal(Imm) != -1;
+  if (Subtarget->isThumb2())
+    return ARM_AM::getT2SOImmVal(Imm) != -1;
+  // Thumb1 only has 8-bit unsigned immediate.
+  return Imm <= 255;
+}
+
+static bool optimizeLogicalImm(SDValue Op, unsigned Imm, const APInt &Demanded,
+                               TargetLowering::TargetLoweringOpt &TLO,
+                               unsigned NewOpc, const ARMSubtarget *Subtarget) {
+  unsigned OldImm = Imm, NewImm;
+
+  // Return if the immediate is already all zeros, all ones, a bimm32.
+  if (Imm == 0 || Imm == ~0U || isLegalLogicalImmediate(Imm, Subtarget))
+    return false;
+
+  // bic/orn/eon
+  if ((Op.getOpcode() == ISD::AND ||
+       (Subtarget->isThumb2() && Op.getOpcode() == ISD::OR)) &&
+      isLegalLogicalImmediate(~Imm, Subtarget))
+    return false;
+
+  unsigned DemandedBits = Demanded.getZExtValue();
+
+  // Clear bits that are not demanded.
+  Imm &= DemandedBits;
+
+  // Try to extend the immediate to a legal ARM rotating immediate
+  // by filling in non-demanded bits. ARM supports:
+  // - An 8-bit value rotated by an even number of bits (0, 2, 4, 6, ..., 30)
+  // - Any 8-bit immediate (Thumb2 also supports 16-bit splat patterns)
+  unsigned NonDemandedBits = ~DemandedBits;
+
+  // Try filling with 0
+  NewImm = Imm & DemandedBits;
+  if (isLegalLogicalImmediate(NewImm, Subtarget) ||
+      ((Op.getOpcode() == ISD::AND ||
+        (Subtarget->isThumb2() && Op.getOpcode() == ISD::OR)) &&
+       isLegalLogicalImmediate(~NewImm, Subtarget))) {
+    ++NumOptimizedImms;
+  } else {
+    // Try filling with 1
+    NewImm = Imm | NonDemandedBits;
+    if (isLegalLogicalImmediate(NewImm, Subtarget) ||
+        ((Op.getOpcode() == ISD::AND ||
+          (Subtarget->isThumb2() && Op.getOpcode() == ISD::OR)) &&
+         isLegalLogicalImmediate(~NewImm, Subtarget))) {
+      ++NumOptimizedImms;
+    } else {
+      return false;
+    }
+  }
+
+  (void)OldImm;
+  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
+         "demanded bits should never be altered");
+  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
+
+  // Create the new constant immediate node.
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  SDValue New;
+
+  // If the new constant immediate is all-zeros or all-ones, let the target
+  // independent DAG combine optimize this node.
+  if (NewImm == 0 || NewImm == ~0U) {
+    New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
+                          TLO.DAG.getConstant(NewImm, DL, VT));
+    // Otherwise, create a machine node so that target independent DAG combine
+    // doesn't undo this optimization.
+  } else {
+    // bic/orn/eon
+    if (isLegalLogicalImmediate(NewImm, Subtarget)) {
+      SDValue EncConst = TLO.DAG.getTargetConstant(NewImm, DL, VT);
+      New = SDValue(
+          TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst),
+          0);
+    } else if ((Op.getOpcode() == ISD::AND ||
+                (Subtarget->isThumb2() && Op.getOpcode() == ISD::OR)) &&
+               isLegalLogicalImmediate(~NewImm, Subtarget)) {
+
+      if (Op.getOpcode() == ISD::OR) {
+        // ORN
+        NewOpc = ARM::t2ORNri;
+      } else {
+        // AND -> BIC
+        NewOpc = Subtarget->isThumb()
+                     ? Subtarget->isThumb2() ? ARM::t2BICri : ARM::tBIC
+                     : ARM::BICri;
+      }
+      SDValue EncConst = TLO.DAG.getTargetConstant(~NewImm, DL, VT);
+      New = SDValue(
+          TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst),
+          0);
+    } else {
+      return false;
+    }
+  }
+
+  return TLO.CombineTo(Op, New);
+}
+
 bool ARMTargetLowering::targetShrinkDemandedConstant(
     SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
     TargetLoweringOpt &TLO) const {
@@ -20146,18 +20259,19 @@ bool ARMTargetLowering::targetShrinkDemandedConstant(
   if (!TLO.LegalOps)
     return false;
 
-  // Only optimize AND for now.
-  if (Op.getOpcode() != ISD::AND)
+  if (!EnableOptimizeLogicalImm)
     return false;
 
   EVT VT = Op.getValueType();
-
-  // Ignore vectors.
   if (VT.isVector())
     return false;
 
   assert(VT == MVT::i32 && "Unexpected integer type");
 
+  // Exit early if we demand all bits.
+  if (DemandedBits.popcount() == 32)
+    return false;
+
   // Make sure the RHS really is a constant.
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
   if (!C)
@@ -20165,59 +20279,62 @@ bool ARMTargetLowering::targetShrinkDemandedConstant(
 
   unsigned Mask = C->getZExtValue();
 
-  unsigned Demanded = DemandedBits.getZExtValue();
-  unsigned ShrunkMask = Mask & Demanded;
-  unsigned ExpandedMask = Mask | ~Demanded;
-
-  // If the mask is all zeros, let the target-independent code replace the
-  // result with zero.
-  if (ShrunkMask == 0)
-    return false;
-
-  // If the mask is all ones, erase the AND. (Currently, the target-independent
-  // code won't do this, so we have to do it explicitly to avoid an infinite
-  // loop in obscure cases.)
-  if (ExpandedMask == ~0U)
-    return TLO.CombineTo(Op, Op.getOperand(0));
-
-  auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
-    return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
-  };
-  auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
-    if (NewMask == Mask)
-      return true;
-    SDLoc DL(Op);
-    SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
-    SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
-    return TLO.CombineTo(Op, NewOp);
-  };
-
-  // Prefer uxtb mask.
-  if (IsLegalMask(0xFF))
-    return UseMask(0xFF);
+  // If thumb, check for uxth and uxtb masks.
+  if (Subtarget->isThumb1Only() && Op.getOpcode() == ISD::AND) {
+    unsigned Demanded = DemandedBits.getZExtValue();
+    unsigned ShrunkMask = Mask & Demanded;
+    unsigned ExpandedMask = Mask | ~Demanded;
 
-  // Prefer uxth mask.
-  if (IsLegalMask(0xFFFF))
-    return UseMask(0xFFFF);
+    // If the mask is all zeros, let the target-independent code replace the
+    // result with zero.
+    if (ShrunkMask == 0)
+      return false;
 
-  // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
-  // FIXME: Prefer a contiguous sequence of bits for other optimizations.
-  if (ShrunkMask < 256)
-    return UseMask(ShrunkMask);
+    // If the mask is all ones, erase the AND. (Currently, the
+    // target-independent code won't do this, so we have to do it explicitly to
+    // avoid an infinite loop in obscure cases.)
+    if (ExpandedMask == ~0U)
+      return TLO.CombineTo(Op, Op.getOperand(0));
+    auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
+      return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
+    };
+    auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
+      if (NewMask == Mask)
+        return true;
+      SDLoc DL(Op);
+      SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
+      SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
+      return TLO.CombineTo(Op, NewOp);
+    };
 
-  // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
-  // FIXME: Prefer a contiguous sequence of bits for other optimizations.
-  if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
-    return UseMask(ExpandedMask);
+    if (IsLegalMask(0xFF))
+      return UseMask(0xFF);
+    if (IsLegalMask(0xFFFF))
+      return UseMask(0xFFFF);
+  }
 
-  // Potential improvements:
-  //
-  // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
-  // We could try to prefer Thumb1 immediates which can be lowered to a
-  // two-instruction sequence.
-  // We could try to recognize more legal ARM/Thumb2 immediates here.
+  unsigned NewOpc;
+  switch (Op.getOpcode()) {
+  default:
+    return false;
+  case ISD::AND:
+    NewOpc = Subtarget->isThumb()
+                 ? Subtarget->isThumb2() ? ARM::t2ANDri : ARM::tAND
+                 : ARM::ANDri;
+    break;
+  case ISD::OR:
+    NewOpc = Subtarget->isThumb()
+                 ? Subtarget->isThumb2() ? ARM::t2ORRri : ARM::tORR
+                 : ARM::ORRri;
+    break;
+  case ISD::XOR:
+    NewOpc = Subtarget->isThumb()
+                 ? Subtarget->isThumb2() ? ARM::t2EORri : ARM::tEOR
+                 : ARM::EORri;
+    break;
+  }
 
-  return false;
+  return optimizeLogicalImm(Op, Mask, DemandedBits, TLO, NewOpc, Subtarget);
 }
 
 bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
diff --git a/llvm/test/CodeGen/ARM/funnel-shift-rot.ll b/llvm/test/CodeGen/ARM/funnel-shift-rot.ll
index a1b6847d623d0..6f34a5fd00314 100644
--- a/llvm/test/CodeGen/ARM/funnel-shift-rot.ll
+++ b/llvm/test/CodeGen/ARM/funnel-shift-rot.ll
@@ -19,7 +19,7 @@ declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
 define i8 @rotl_i8_const_shift(i8 %x) {
 ; CHECK-LABEL: rotl_i8_const_shift:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    uxtb r1, r0
+; CHECK-NEXT:    and r1, r0, #224
 ; CHECK-NEXT:    lsl r0, r0, #3
 ; CHECK-NEXT:    orr r0, r0, r1, lsr #5
 ; CHECK-NEXT:    bx lr
@@ -161,8 +161,7 @@ define <4 x i32> @rotl_v4i32_rotl_const_shift(<4 x i32> %x) {
 define i8 @rotr_i8_const_shift(i8 %x) {
 ; CHECK-LABEL: rotr_i8_const_shift:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    uxtb r1, r0
-; CHECK-NEXT:    lsr r1, r1, #3
+; CHECK-NEXT:    ubfx r1, r0, #3, #5
 ; CHECK-NEXT:    orr r0, r1, r0, lsl #5
 ; CHECK-NEXT:    bx lr
   %f = call i8 @llvm.fshr.i8(i8 %x, i8 %x, i8 3)
diff --git a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
index 7cc623fb0a616..a21ac8944d7ad 100644
--- a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
@@ -21,9 +21,9 @@ define i1 @scalar_i8_signbit_eq(i8 %x, i8 %y) nounwind {
 ; ARM-LABEL: scalar_i8_signbit_eq:
 ; ARM:       @ %bb.0:
 ; ARM-NEXT:    uxtb r1, r1
-; ARM-NEXT:    lsl r0, r0, r1
+; ARM-NEXT:    mov r2, #128
+; ARM-NEXT:    and r0, r2, r0, lsl r1
 ; ARM-NEXT:    mov r1, #1
-; ARM-NEXT:    uxtb r0, r0
 ; ARM-NEXT:    eor r0, r1, r0, lsr #7
 ; ARM-NEXT:    bx lr
 ;
@@ -42,7 +42,7 @@ define i1 @scalar_i8_signbit_eq(i8 %x, i8 %y) nounwind {
 ; THUMB78-NEXT:    uxtb r1, r1
 ; THUMB78-NEXT:    lsls r0, r1
 ; THUMB78-NEXT:    movs r1, #1
-; THUMB78-NEXT:    uxtb r0, r0
+; THUMB78-NEXT:    and r0, r0, #128
 ; THUMB78-NEXT:    eor.w r0, r1, r0, lsr #7
 ; THUMB78-NEXT:    bx lr
   %t0 = lshr i8 128, %y
@@ -122,9 +122,9 @@ define i1 @scalar_i16_signbit_eq(i16 %x, i16 %y) nounwind {
 ; ARM-LABEL: scalar_i16_signbit_eq:
 ; ARM:       @ %bb.0:
 ; ARM-NEXT:    uxth r1, r1
-; ARM-NEXT:    lsl r0, r0, r1
+; ARM-NEXT:    mov r2, #32768
+; ARM-NEXT:    and r0, r2, r0, lsl r1
 ; ARM-NEXT:    mov r1, #1
-; ARM-NEXT:    uxth r0, r0
 ; ARM-NEXT:    eor r0, r1, r0, lsr #15
 ; ARM-NEXT:    bx lr
 ;
@@ -144,7 +144,7 @@ define i1 @scalar_i16_signbit_eq(i16 %x, i16 %y) nounwind {
 ; THUMB78-NEXT:    uxth r1, r1
 ; THUMB78-NEXT:    lsls r0, r1
 ; THUMB78-NEXT:    movs r1, #1
-; THUMB78-NEXT:    uxth r0, r0
+; THUMB78-NEXT:    and r0, r0, #32768
 ; THUMB78-NEXT:    eor.w r0, r1, r0, lsr #15
 ; THUMB78-NEXT:    bx lr
   %t0 = lshr i16 32768, %y
@@ -862,21 +862,35 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi
 ;------------------------------------------------------------------------------;
 
 define i1 @scalar_i8_signbit_ne(i8 %x, i8 %y) nounwind {
-; ARM-LABEL: scalar_i8_signbit_ne:
-; ARM:       @ %bb.0:
-; ARM-NEXT:    uxtb r1, r1
-; ARM-NEXT:    lsl r0, r0, r1
-; ARM-NEXT:    uxtb r0, r0
-; ARM-NEXT:    lsr r0, r0, #7
-; ARM-NEXT:    bx lr
+; ARM6-LABEL: scalar_i8_signbit_ne:
+; ARM6:       @ %bb.0:
+; ARM6-NEXT:    uxtb r1, r1
+; ARM6-NEXT:    mov r2, #128
+; ARM6-NEXT:    and r0, r2, r0, lsl r1
+; ARM6-NEXT:    lsr r0, r0, #7
+; ARM6-NEXT:    bx lr
 ;
-; THUMB-LABEL: scalar_i8_signbit_ne:
-; THUMB:       @ %bb.0:
-; THUMB-NEXT:    uxtb r1, r1
-; THUMB-NEXT:    lsls r0, r1
-; THUMB-NEXT:    uxtb r0, r0
-; THUMB-NEXT:    lsrs r0, r0, #7
-; THUMB-NEXT:    bx lr
+; ARM78-LABEL: scalar_i8_signbit_ne:
+; ARM78:       @ %bb.0:
+; ARM78-NEXT:    uxtb r1, r1
+; ARM78-NEXT:    lsl r0, r0, r1
+; ARM78-NEXT:    ubfx r0, r0, #7, #1
+; ARM78-NEXT:    bx lr
+;
+; THUMB6-LABEL: scalar_i8_signbit_ne:
+; THUMB6:       @ %bb.0:
+; THUMB6-NEXT:    uxtb r1, r1
+; THUMB6-NEXT:    lsls r0, r1
+; THUMB6-NEXT:    uxtb r0, r0
+; THUMB6-NEXT:    lsrs r0, r0, #7
+; THUMB6-NEXT:    bx lr
+;
+; THUMB78-LABEL: scalar_i8_signbit_ne:
+; THUMB78:       @ %bb.0:
+; THUMB78-NEXT:    uxtb r1, r1
+; THUMB78-NEXT:    lsls r0, r1
+; THUMB78-NEXT:    ubfx r0, r0, #7, #1
+; THUMB78-NEXT:    bx lr
   %t0 = lshr i8 128, %y
   %t1 = and i8 %t0, %x
   %res = icmp ne i8 %t1, 0 ; we are perfectly happy with 'ne' predicate
@@ -1051,3 +1065,5 @@ define i1 @scalar_i8_signbit_eq_with_nonzero(i8 %x, i8 %y) nounwind {
   %res = icmp eq i8 %t1, 1 ; should be comparing with 0
   ret i1 %res
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; THUMB: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/sdiv-pow2-thumb-size.ll b/llvm/test/CodeGen/ARM/sdiv-pow2-thumb-size.ll
index 4b0419577cdf0..7a93267fcc390 100644
--- a/llvm/test/CodeGen/ARM/sdiv-pow2-thumb-size.ll
+++ b/llvm/test/CodeGen/ARM/sdiv-pow2-thumb-size.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefixes=CHECK,T2
 ; RUN: llc -mtriple=thumbv8m.main %s -o - | FileCheck %s --check-prefixes=CHECK,T2
 ; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefixes=CHECK,T1
@@ -13,11 +14,21 @@
 
 ; Test sdiv i16
 define dso_local signext i16 @f0(i16 signext %F) local_unnamed_addr #0 {
-; CHECK-LABEL: f0
-; CHECK: movs r1, #2
-; CHECK-NEXT: sdiv r0, r0, r1
-; CHECK-NEXT: sxth r0, r0
-; CHECK-NEXT: bx lr
+; CHECK-LABEL: f0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    movs r1, #2
+; CHECK-NEXT:    sdiv r0, r0, r1
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    bx lr
+;
+; V6M-LABEL: f0:
+; V6M:       @ %bb.0: @ %entry
+; V6M-NEXT:    uxth r1, r0
+; V6M-NEXT:    lsrs r1, r1, #15
+; V6M-NEXT:    adds r0, r0, r1
+; V6M-NEXT:    sxth r0, r0
+; V6M-NEXT:    asrs r0, r0, #1
+; V6M-NEXT:    bx lr
 
 entry:
   %0 = sdiv i16 %F, 2
@@ -26,10 +37,19 @@ entry:
 
 ; Same as above, but now with i32
 define dso_local i32 @f1(i32 %F) local_unnamed_addr #0 {
-; CHECK-LABEL: f1
-; CHECK: movs r1, #4
-; CHECK-NEXT: sdiv r0, r0, r1
-; CHECK-NEXT: bx lr
+; CHECK-LABEL: f1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    movs r1, #4
+; CHECK-NEXT:    sdiv r0, r0, r1
+; CHECK-NEXT:    bx lr
+;
+; V6M-LABEL: f1:
+; V6M:       @ %bb.0: @ %entry
+; V6M-NEXT:    asrs r1, r0, #31
+; V6M-NEXT:    lsrs r1, r1, #30
+; V6M-NEXT:    adds r0, r0, r1
+; V6M-NEXT:    asrs r0, r0, #2
+; V6M-NEXT:    bx lr
 
 entry:
   %div = sdiv i32 %F, 4
@@ -38,10 +58,18 @@ entry:
 
 ; The immediate is not a power of 2, so we expect a sdiv.
 define dso_local i32 @f2(i32 %F) local_unnamed_addr #0 {
-; CHECK-LABEL: f2
-; CHECK: movs r1, #5
-; CHECK-NEXT: sdiv r0, r0, r1
-; CHECK-NEXT: bx lr
+; CHECK-LABEL: f2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    movs r1, #5
+; CHECK-NEXT:    sdiv r0, r0, r1
+; CHECK-NEXT:    bx lr
+;
+; V6M-LABEL: f2:
+; V6M:       @ %bb.0: @ %entry
+; V6M-NEXT:    push {r7, lr}
+; V6M-NEXT:    movs r1, #5
+; V6M-NEXT:    bl __divsi3
+; V6M-NEXT:    pop {r7, pc}
 
 entry:
   %div = sdiv i32 %F, 5
@@ -51,8 +79,28 @@ entry:
 ; Try a larger power of 2 immediate: immediates larger than
 ; 128 don't give any code size savings.
 define dso_local i32 @f3(i32 %F) local_unnamed_addr #0 {
-; CHECK-LABEL: f3
-; CHECK-NOT: sdiv
+; T2-LABEL: f3:
+; T2:       @ %bb.0: @ %entry
+; T2-NEXT:    asrs r1, r0, #31
+; T2-NEXT:    add.w r0, r0, r1, lsr #24
+; T2-NEXT:    asrs r0, r0, #8
+; T2-NEXT:    bx lr
+;
+; T1-LABEL: f3:
+; T1:       @ %bb.0: @ %entry
+; T1-NEXT:    asrs r1, r0, #31
+; T1-NEXT:    lsrs r1, r1, #24
+; T1-NEXT:    adds r0, r0, r1
+; T1-NEXT:    asrs r0, r0, #8
+; T1-NEXT:    bx lr
+;
+; V6M-LABEL: f3:
+; V6M:       @ %bb.0: @ %entry
+; V6M-NEXT:    asrs r1, r0, #31
+; V6M-NEXT:    lsrs r1, r1, #24
+; V6M-NEXT:    adds r0, r0, r1
+; V6M-NEXT:    asrs r0, r0, #8
+; V6M-NEXT:    bx lr
 entry:
   %div = sdiv i32 %F, 256
   ret i32 %div
@@ -65,20 +113,32 @@
 attributes #0 = { minsize norecurse nounwind optsize readnone }
 
 ; the sdiv to sdiv, but to the faster instruction sequence.
 define dso_local signext i16 @f4(i16 signext %F) {
-; T2-LABEL: f4
-; T2: uxth r1, r0
-; T2-NEXT: add.w r0, r0, r1, lsr #15
-; T2-NEXT: sxth r0, r0
-; T2-NEXT: asrs r0, r0, #1
-; T2-NEXT: bx lr
-
-; T1-LABEL: f4
-; T1: uxth r1, r0
-; T1-NEXT: lsrs r1, r1, #15
-; T1-NEXT: adds r0, r0, r1
-; T1-NEXT: sxth r0, r0
-; T1-NEXT: asrs r0, r0, #1
-; T1-NEXT: bx lr
+; T2-LABEL: f4:
+; T2:       @ %bb.0: @ %entry
+; T2-NEXT:    and r1, r0, #32768
+; T2-NEXT:    add.w r0, r0, r1, lsr #15
+; T2-NEXT:    sxth r0, r0
+; T2-NEXT:    asrs r0, r0, #1
+; T2-NEXT:    bx lr
+;
+; T1-LABEL: f4:
+; T1:       @ %bb.0: @ %entry
+; T1-NEXT:    uxth r1, r0
+; T1-NEXT:    lsrs r1, r1, #15
+; T1-NEXT:    adds r0, r0, r1
+; T1-NEXT:    sxth r0, r0
+; T1-NEXT:    asrs r0, r0, #1
+; T1-NEXT:    bx lr
+;
+; V6M-LABEL: f4:
+; V6M:       @ %bb.0: @ %entry
+; V6M-NEXT:    uxth r1, r0
+; V6M-NEXT:    lsrs r1, r1, #15
+; V6M-NEXT:    adds r0, r0, r1
+; V6M-NEXT:    sxth r0, r0
+; V6M-NEXT:    asrs r0, r0, #1
+; V6M-NEXT:    bx lr
+
 entry:
   %0 = sdiv i16 %F, 2
@@ -86,18 +146,29 @@ entry:
 }
 
 define dso_local i32 @f5(i32 %F) {
-; T2-LABEL: f5
-; T2: asrs r1, r0, #31
-; T2-NEXT: add.w r0, r0, r1, lsr #30
-; T2-NEXT: asrs r0, r0, #2
-; T2-NEXT: bx lr
-
-; T1-LABEL: f5
-; T1: asrs r1, r0, #31
-; T1-NEXT: lsrs r1, r1, #30
-; T1-NEXT: adds r0, r0, r1
-; T1-NEXT: asrs r0, r0, #2
-; T1-NEXT: bx lr
+; T2-LABEL: f5:
+; T2:       @ %bb.0: @ %entry
+; T2-NEXT:    asrs r1, r0, #31
+; T2-NEXT:    add.w r0, r0, r1, lsr #30
+; T2-NEXT:    asrs r0, r0, #2
+; T2-NEXT:    bx lr
+;
+; T1-LABEL: f5:
+; T1:       @ %bb.0: @ %entry
+; T1-NEXT:    asrs r1, r0, #31
+; T1-NEXT:    lsrs r1, r1, #30
+; T1-NEXT:    adds r0, r0, r1
+; T1-NEXT:    asrs r0, r0, #2
+; T1-NEXT:    bx lr
+;
+; V6M-LABEL: f5:
+; V6M:       @ %bb.0: @ %entry
+; V6M-NEXT:    asrs r1, r0, #31
+; V6M-NEXT:    lsrs r1, r1, #30
+; V6M-NEXT:    adds r0, r0, r1
+; V6M-NEXT:    asrs r0, r0, #2
+; V6M-NEXT:    bx lr
+
 entry:
   %div = sdiv i32 %F, 4
diff --git a/llvm/test/CodeGen/Thumb/bic_imm.ll b/llvm/test/CodeGen/Thumb/bic_imm.ll
index 741b2cf8db2e3..7257891c7a116 100644
--- a/llvm/test/CodeGen/Thumb/bic_imm.ll
+++ b/llvm/test/CodeGen/Thumb/bic_imm.ll
@@ -39,14 +39,19 @@ entry:
 define void @truncated(i16 %a, ptr %p) {
 ; CHECK-T1-LABEL: truncated:
 ; CHECK-T1:       @ %bb.0:
-; CHECK-T1-NEXT:    movs r2, #128
-; CHECK-T1-NEXT:    bics r0, r2
-; CHECK-T1-NEXT:    strh r0, [r1]
+; CHECK-T1-NEXT:    ldr r2, .LCPI2_0
+; CHECK-T1-NEXT:    ands r2, r0
+; CHECK-T1-NEXT:    strh r2, [r1]
 ; CHECK-T1-NEXT:    bx lr
+; CHECK-T1-NEXT:    .p2align 2
+; CHECK-T1-NEXT:  @ %bb.1:
+; CHECK-T1-NEXT:  .LCPI2_0:
+; CHECK-T1-NEXT:    .long 65407 @ 0xff7f
 ;
 ; CHECK-T2-LABEL: truncated:
 ; CHECK-T2:       @ %bb.0:
-; CHECK-T2-NEXT:    bic r0, r0, #128
+; CHECK-T2-NEXT:    movw r2, #65407
+; CHECK-T2-NEXT:    ands r0, r2
 ; CHECK-T2-NEXT:    strh r0, [r1]
 ; CHECK-T2-NEXT:    bx lr
   %and = and i16 %a, -129
@@ -57,14 +62,19 @@ define void @truncated(i16 %a, ptr %p) {
 define void @truncated_neg2(i16 %a, ptr %p) {
 ; CHECK-T1-LABEL: truncated_neg2:
 ; CHECK-T1:       @ %bb.0:
-; CHECK-T1-NEXT:    movs r2, #1
-; CHECK-T1-NEXT:    bics r0, r2
-; CHECK-T1-NEXT:    strh r0, [r1]
+; CHECK-T1-NEXT:    ldr r2, .LCPI3_0
+; CHECK-T1-NEXT:    ands r2, r0
+; CHECK-T1-NEXT:    strh r2, [r1]
 ; CHECK-T1-NEXT:    bx lr
+; CHECK-T1-NEXT:    .p2align 2
+; CHECK-T1-NEXT:  @ %bb.1:
+; CHECK-T1-NEXT:  .LCPI3_0:
+; CHECK-T1-NEXT:    .long 65534 @ 0xfffe
 ;
 ; CHECK-T2-LABEL: truncated_neg2:
 ; CHECK-T2:       @ %bb.0:
-; CHECK-T2-NEXT:    bic r0, r0, #1
+; CHECK-T2-NEXT:    movw r2, #65534
+; CHECK-T2-NEXT:    ands r0, r2
 ; CHECK-T2-NEXT:    strh r0, [r1]
 ; CHECK-T2-NEXT:    bx lr
   %and = and i16 %a, -2
@@ -76,13 +86,14 @@ define void @truncated_neg256(i16 %a, ptr %p) {
 ; CHECK-T1-LABEL: truncated_neg256:
 ; CHECK-T1:       @ %bb.0:
 ; CHECK-T1-NEXT:    movs r2, #255
-; CHECK-T1-NEXT:    bics r0, r2
-; CHECK-T1-NEXT:    strh r0, [r1]
+; CHECK-T1-NEXT:    lsls r2, r2, #8
+; CHECK-T1-NEXT:    ands r2, r0
+; CHECK-T1-NEXT:    strh r2, [r1]
 ; CHECK-T1-NEXT:    bx lr
 ;
 ; CHECK-T2-LABEL: truncated_neg256:
 ; CHECK-T2:       @ %bb.0:
-; CHECK-T2-NEXT:    bic r0, r0, #255
+; CHECK-T2-NEXT:    and r0, r0, #65280
 ; CHECK-T2-NEXT:    strh r0, [r1]
 ; CHECK-T2-NEXT:    bx lr
   %and = and i16 %a, -256