diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 2a40fb9b476f8..2ed2915567d44 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -4832,7 +4832,7 @@ SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, } // This function returns three things: the arithmetic computation itself -// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The +// (Value), a comparison (Overflow), and a condition code (ARMcc). The // comparison and the condition code define the case in which the arithmetic // computation *does not* overflow. std::pair @@ -4840,42 +4840,30 @@ ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue &ARMcc) const { assert(Op.getValueType() == MVT::i32 && "Unsupported value type"); - SDValue Value, OverflowCmp; + SDValue Value, Overflow; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDLoc dl(Op); - - // FIXME: We are currently always generating CMPs because we don't support - // generating CMN through the backend. This is not as good as the natural - // CMP case because it causes a register dependency and cannot be folded - // later. + unsigned Opc = 0; switch (Op.getOpcode()) { default: llvm_unreachable("Unknown overflow instruction!"); case ISD::SADDO: + Opc = ARMISD::ADDC; ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); - Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); - OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS); break; case ISD::UADDO: - ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); - // We use ADDC here to correspond to its use in LowerUnsignedALUO. - // We do not use it in the USUBO case as Value may not be used. - Value = DAG.getNode(ARMISD::ADDC, dl, - DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS) - .getValue(0); - OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS); + Opc = ARMISD::ADDC; + ARMcc = DAG.getConstant(ARMCC::LO, dl, MVT::i32); break; case ISD::SSUBO: + Opc = ARMISD::SUBC; ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); - Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); - OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS); break; case ISD::USUBO: + Opc = ARMISD::SUBC; ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); - Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); - OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS); break; case ISD::UMULO: // We generate a UMUL_LOHI and then check if the high word is 0. @@ -4883,8 +4871,8 @@ ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, Value = DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(Op.getValueType(), Op.getValueType()), LHS, RHS); - OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1), - DAG.getConstant(0, dl, MVT::i32)); + Overflow = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1), + DAG.getConstant(0, dl, MVT::i32)); Value = Value.getValue(0); // We only want the low 32 bits for the result. break; case ISD::SMULO: @@ -4894,15 +4882,34 @@ ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, Value = DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(Op.getValueType(), Op.getValueType()), LHS, RHS); - OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1), - DAG.getNode(ISD::SRA, dl, Op.getValueType(), - Value.getValue(0), - DAG.getConstant(31, dl, MVT::i32))); + Overflow = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1), + DAG.getNode(ISD::SRA, dl, Op.getValueType(), + Value.getValue(0), + DAG.getConstant(31, dl, MVT::i32))); Value = Value.getValue(0); // We only want the low 32 bits for the result. break; } // switch (...) + if (Opc) { + if (Subtarget->isThumb1Only() && + (Op.getOpcode() == ISD::SADDO || Op.getOpcode() == ISD::SSUBO)) { + // FIXME: Thumb1 has to split between the cmp and the add/sub. + // Remove when the peephole optimizer handles this or we no longer need to + // split. + if (Opc == ARMISD::ADDC) { + Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); + Overflow = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS); + } else { + Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); + Overflow = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS); + } + } else { + SDVTList VTs = DAG.getVTList(Op.getValueType(), FlagsVT); + Value = DAG.getNode(Opc, dl, VTs, LHS, RHS); + Overflow = Value.getValue(1); + } + } - return std::make_pair(Value, OverflowCmp); + return std::make_pair(Value, Overflow); } SDValue @@ -4911,20 +4918,18 @@ ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const { if (!isTypeLegal(Op.getValueType())) return SDValue(); - SDValue Value, OverflowCmp; - SDValue ARMcc; - std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc); SDLoc dl(Op); + SDValue Value, Overflow; + SDValue ARMcc; + std::tie(Value, Overflow) = getARMXALUOOp(Op, DAG, ARMcc); // We use 0 and 1 as false and true values. SDValue TVal = DAG.getConstant(1, dl, MVT::i32); SDValue FVal = DAG.getConstant(0, dl, MVT::i32); - EVT VT = Op.getValueType(); - SDValue Overflow = - DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, ARMcc, OverflowCmp); + Overflow = + DAG.getNode(ARMISD::CMOV, dl, MVT::i32, TVal, FVal, ARMcc, Overflow); - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); + return DAG.getMergeValues({Value, Overflow}, dl); } static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, @@ -5055,12 +5060,12 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (!isTypeLegal(Cond->getValueType(0))) return SDValue(); - SDValue Value, OverflowCmp; + SDValue Value, Overflow; SDValue ARMcc; - std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); + std::tie(Value, Overflow) = getARMXALUOOp(Cond, DAG, ARMcc); EVT VT = Op.getValueType(); - return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, OverflowCmp, DAG); + return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, Overflow, DAG); } // Convert: @@ -5657,9 +5662,9 @@ SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { return SDValue(); // The actual operation with overflow check. - SDValue Value, OverflowCmp; + SDValue Value, Overflow; SDValue ARMcc; - std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); + std::tie(Value, Overflow) = getARMXALUOOp(Cond, DAG, ARMcc); // Reverse the condition code. ARMCC::CondCodes CondCode = @@ -5668,7 +5673,7 @@ SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, - OverflowCmp); + Overflow); } return SDValue(); @@ -5707,9 +5712,9 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { return SDValue(); // The actual operation with overflow check. - SDValue Value, OverflowCmp; + SDValue Value, Overflow; SDValue ARMcc; - std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc); + std::tie(Value, Overflow) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc); if ((CC == ISD::SETNE) != isOneConstant(RHS)) { // Reverse the condition code. @@ -5720,7 +5725,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { } return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, - OverflowCmp); + Overflow); } if (LHS.getValueType() == MVT::i32) { diff --git a/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll b/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll index b92f03d43bb4c..b070d17be227f 100644 --- a/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll +++ b/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll @@ -34,14 +34,12 @@ define fastcc ptr @wrongUseOfPostDominate(ptr readonly %s, i32 %off, ptr readnon ; ENABLE-NEXT: .LBB0_4: @ %while.body ; ENABLE-NEXT: @ =>This Inner Loop Header: Depth=1 ; ENABLE-NEXT: ldrb r3, [r0] +; ENABLE-NEXT: subs r1, r1, #1 ; ENABLE-NEXT: ldrb r3, [r12, r3] ; ENABLE-NEXT: add r0, r0, r3 -; ENABLE-NEXT: sub r3, r1, #1 -; ENABLE-NEXT: cmp r3, r1 -; ENABLE-NEXT: bhs .LBB0_6 +; ENABLE-NEXT: blo .LBB0_6 ; ENABLE-NEXT: @ %bb.5: @ %while.body ; ENABLE-NEXT: @ in Loop: Header=BB0_4 Depth=1 -; ENABLE-NEXT: mov r1, r3 ; ENABLE-NEXT: cmp r0, r2 ; ENABLE-NEXT: blo .LBB0_4 ; ENABLE-NEXT: .LBB0_6: @ %if.end29 @@ -124,14 +122,12 @@ define fastcc ptr @wrongUseOfPostDominate(ptr readonly %s, i32 %off, ptr readnon ; DISABLE-NEXT: .LBB0_4: @ %while.body ; DISABLE-NEXT: @ =>This Inner Loop Header: Depth=1 ; DISABLE-NEXT: ldrb r3, [r0] +; DISABLE-NEXT: subs r1, r1, #1 ; DISABLE-NEXT: ldrb r3, [r12, r3] ; DISABLE-NEXT: add r0, r0, r3 -; DISABLE-NEXT: sub r3, r1, #1 -; DISABLE-NEXT: cmp r3, r1 -; DISABLE-NEXT: bhs .LBB0_6 +; DISABLE-NEXT: blo .LBB0_6 ; DISABLE-NEXT: @ %bb.5: @ %while.body ; DISABLE-NEXT: @ in Loop: Header=BB0_4 Depth=1 -; DISABLE-NEXT: mov r1, r3 ; DISABLE-NEXT: cmp r0, r2 ; DISABLE-NEXT: blo .LBB0_4 ; DISABLE-NEXT: .LBB0_6: @ %if.end29 diff --git a/llvm/test/CodeGen/ARM/intrinsics-overflow.ll b/llvm/test/CodeGen/ARM/intrinsics-overflow.ll index 8bd78dd0f6ab5..796e9304a6486 100644 --- a/llvm/test/CodeGen/ARM/intrinsics-overflow.ll +++ b/llvm/test/CodeGen/ARM/intrinsics-overflow.ll @@ -1,104 +1,158 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc < %s -mtriple=arm-linux -mcpu=generic -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=ARM ; RUN: llc < %s -mtriple=thumbv6m-eabi -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=THUMBV6 ; RUN: llc < %s -mtriple=thumbv7-eabi -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=THUMBV7 define i32 @uadd_overflow(i32 %a, i32 %b) #0 { +; ARM-LABEL: uadd_overflow: +; ARM: @ %bb.0: +; ARM-NEXT: adds r0, r0, r1 +; ARM-NEXT: mov r2, #0 +; ARM-NEXT: adc r0, r2, #0 +; ARM-NEXT: mov pc, lr +; +; THUMBV6-LABEL: uadd_overflow: +; THUMBV6: @ %bb.0: +; THUMBV6-NEXT: movs r2, #0 +; THUMBV6-NEXT: adds r0, r0, r1 +; THUMBV6-NEXT: adcs r2, r2 +; THUMBV6-NEXT: mov r0, r2 +; THUMBV6-NEXT: bx lr +; +; THUMBV7-LABEL: uadd_overflow: +; THUMBV7: @ %bb.0: +; THUMBV7-NEXT: adds r0, r0, r1 +; THUMBV7-NEXT: mov.w r2, #0 +; THUMBV7-NEXT: adc r0, r2, #0 +; THUMBV7-NEXT: bx lr %sadd = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) %1 = extractvalue { i32, i1 } %sadd, 1 %2 = zext i1 %1 to i32 ret i32 %2 - ; CHECK-LABEL: uadd_overflow: - ; ARM: adds r[[R0:[0-9]+]], r[[R0]], r[[R1:[0-9]+]] - ; ARM: mov r[[R2:[0-9]+]], #0 - ; ARM: adc r[[R0]], r[[R2]], #0 - ; THUMBV6: movs r[[R2:[0-9]+]], #0 - ; THUMBV6: adds r[[R0:[0-9]+]], r[[R0]], r[[R1:[0-9]+]] - ; THUMBV6: adcs r[[R2]], r[[R2]] - ; THUMBV6: mov r[[R0]], r[[R2]] - ; THUMBV7: adds r[[R0:[0-9]+]], r[[R0]], r[[R1:[0-9]+]] - ; THUMBV7: mov.w r[[R2:[0-9]+]], #0 - ; THUMBV7: adc r[[R0]], r[[R2]], #0 } define i32 @sadd_overflow(i32 %a, i32 %b) #0 { +; ARM-LABEL: sadd_overflow: +; ARM: @ %bb.0: +; ARM-NEXT: mov r2, #1 +; ARM-NEXT: adds r0, r0, r1 +; ARM-NEXT: movvc r2, #0 +; ARM-NEXT: mov r0, r2 +; ARM-NEXT: mov pc, lr +; +; THUMBV6-LABEL: sadd_overflow: +; THUMBV6: @ %bb.0: +; THUMBV6-NEXT: adds r0, r0, r1 +; THUMBV6-NEXT: bvc .LBB1_2 +; THUMBV6-NEXT: @ %bb.1: +; THUMBV6-NEXT: movs r0, #1 +; THUMBV6-NEXT: bx lr +; THUMBV6-NEXT: .LBB1_2: +; THUMBV6-NEXT: movs r0, #0 +; THUMBV6-NEXT: bx lr +; +; THUMBV7-LABEL: sadd_overflow: +; THUMBV7: @ %bb.0: +; THUMBV7-NEXT: movs r2, #1 +; THUMBV7-NEXT: adds r0, r0, r1 +; THUMBV7-NEXT: it vc +; THUMBV7-NEXT: movvc r2, #0 +; THUMBV7-NEXT: mov r0, r2 +; THUMBV7-NEXT: bx lr %sadd = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) %1 = extractvalue { i32, i1 } %sadd, 1 %2 = zext i1 %1 to i32 ret i32 %2 - ; CHECK-LABEL: sadd_overflow: - ; ARM: adds r[[R2:[0-9]+]], r[[R0:[0-9]+]], r[[R1:[0-9]+]] - ; ARM: mov r[[R0]], #1 - ; ARM: movvc r[[R0]], #0 - ; ARM: mov pc, lr - ; THUMBV6: adds r0, r0, r1 - ; THUMBV6: bvc .LBB1_2 - ; THUMBV7: adds r[[R2:[0-9]+]], r[[R0]], r[[R1:[0-9]+]] - ; THUMBV7: mov.w r[[R0:[0-9]+]], #1 - ; THUMBV7: it vc - ; THUMBV7: movvc r[[R0]], #0 } define i32 @usub_overflow(i32 %a, i32 %b) #0 { +; ARM-LABEL: usub_overflow: +; ARM: @ %bb.0: +; ARM-NEXT: subs r0, r0, r1 +; ARM-NEXT: mov r2, #0 +; ARM-NEXT: adc r0, r2, #0 +; ARM-NEXT: eor r0, r0, #1 +; ARM-NEXT: mov pc, lr +; +; THUMBV6-LABEL: usub_overflow: +; THUMBV6: @ %bb.0: +; THUMBV6-NEXT: movs r2, #0 +; THUMBV6-NEXT: subs r0, r0, r1 +; THUMBV6-NEXT: adcs r2, r2 +; THUMBV6-NEXT: movs r0, #1 +; THUMBV6-NEXT: eors r0, r2 +; THUMBV6-NEXT: bx lr +; +; THUMBV7-LABEL: usub_overflow: +; THUMBV7: @ %bb.0: +; THUMBV7-NEXT: subs r0, r0, r1 +; THUMBV7-NEXT: mov.w r2, #0 +; THUMBV7-NEXT: adc r0, r2, #0 +; THUMBV7-NEXT: eor r0, r0, #1 +; THUMBV7-NEXT: bx lr %sadd = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) %1 = extractvalue { i32, i1 } %sadd, 1 %2 = zext i1 %1 to i32 ret i32 %2 - ; CHECK-LABEL: usub_overflow: - ; ARM: subs r[[R0:[0-9]+]], r[[R0]], r[[R1:[0-9]+]] - ; ARM: mov r[[R2:[0-9]+]], #0 - ; ARM: adc r[[R0]], r[[R2]], #0 - ; ARM: eor r[[R0]], r[[R0]], #1 - ; THUMBV6: movs r[[R2:[0-9]+]], #0 - ; THUMBV6: subs r[[R0:[0-9]+]], r[[R0]], r[[R1:[0-9]+]] - ; THUMBV6: adcs r[[R2]], r[[R2]] - ; THUMBV6: movs r[[R0]], #1 - ; THUMBV6: eors r[[R0]], r[[R2]] - ; THUMBV7: subs r[[R0:[0-9]+]], r[[R0]], r[[R1:[0-9]+]] - ; THUMBV7: mov.w r[[R2:[0-9]+]], #0 - ; THUMBV7: adc r[[R0]], r[[R2]], #0 - ; THUMBV7: eor r[[R0]], r[[R0]], #1 ; We should know that the overflow is just 1 bit, ; no need to clear any other bit - ; CHECK-NOT: and } define i32 @ssub_overflow(i32 %a, i32 %b) #0 { +; ARM-LABEL: ssub_overflow: +; ARM: @ %bb.0: +; ARM-NEXT: mov r2, #1 +; ARM-NEXT: subs r0, r0, r1 +; ARM-NEXT: movvc r2, #0 +; ARM-NEXT: mov r0, r2 +; ARM-NEXT: mov pc, lr +; +; THUMBV6-LABEL: ssub_overflow: +; THUMBV6: @ %bb.0: +; THUMBV6-NEXT: cmp r0, r1 +; THUMBV6-NEXT: bvc .LBB3_2 +; THUMBV6-NEXT: @ %bb.1: +; THUMBV6-NEXT: movs r0, #1 +; THUMBV6-NEXT: bx lr +; THUMBV6-NEXT: .LBB3_2: +; THUMBV6-NEXT: movs r0, #0 +; THUMBV6-NEXT: bx lr +; +; THUMBV7-LABEL: ssub_overflow: +; THUMBV7: @ %bb.0: +; THUMBV7-NEXT: movs r2, #1 +; THUMBV7-NEXT: subs r0, r0, r1 +; THUMBV7-NEXT: it vc +; THUMBV7-NEXT: movvc r2, #0 +; THUMBV7-NEXT: mov r0, r2 +; THUMBV7-NEXT: bx lr %sadd = tail call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) %1 = extractvalue { i32, i1 } %sadd, 1 %2 = zext i1 %1 to i32 ret i32 %2 - ; CHECK-LABEL: ssub_overflow: - ; ARM: mov r[[R2]], #1 - ; ARM: cmp r[[R0]], r[[R1]] - ; ARM: movvc r[[R2]], #0 - ; THUMBV6: cmp r0, r1 - ; THUMBV6: bvc .LBB3_2 - ; THUMBV7: movs r[[R2:[0-9]+]], #1 - ; THUMBV7: cmp r[[R0:[0-9]+]], r[[R1:[0-9]+]] - ; THUMBV7: it vc - ; THUMBV7: movvc r[[R2]], #0 - ; THUMBV7: mov r[[R0]], r[[R2]] } declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1 declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) #2 declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #3 declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) #4 +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/ARM/sadd_sat.ll b/llvm/test/CodeGen/ARM/sadd_sat.ll index b8f7a2daaeaba..5b993034e6d54 100644 --- a/llvm/test/CodeGen/ARM/sadd_sat.ll +++ b/llvm/test/CodeGen/ARM/sadd_sat.ll @@ -31,9 +31,9 @@ define i32 @func(i32 %x, i32 %y) nounwind { ; CHECK-T2NODSP-LABEL: func: ; CHECK-T2NODSP: @ %bb.0: ; CHECK-T2NODSP-NEXT: adds r0, r0, r1 -; CHECK-T2NODSP-NEXT: mov.w r1, #-2147483648 +; CHECK-T2NODSP-NEXT: mov.w r2, #-2147483648 ; CHECK-T2NODSP-NEXT: it vs -; CHECK-T2NODSP-NEXT: eorvs.w r0, r1, r0, asr #31 +; CHECK-T2NODSP-NEXT: eorvs.w r0, r2, r0, asr #31 ; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: func: @@ -44,8 +44,8 @@ define i32 @func(i32 %x, i32 %y) nounwind { ; CHECK-ARMNODPS-LABEL: func: ; CHECK-ARMNODPS: @ %bb.0: ; CHECK-ARMNODPS-NEXT: adds r0, r0, r1 -; CHECK-ARMNODPS-NEXT: mov r1, #-2147483648 -; CHECK-ARMNODPS-NEXT: eorvs r0, r1, r0, asr #31 +; CHECK-ARMNODPS-NEXT: mov r2, #-2147483648 +; CHECK-ARMNODPS-NEXT: eorvs r0, r2, r0, asr #31 ; CHECK-ARMNODPS-NEXT: bx lr ; ; CHECK-ARMBASEDSP-LABEL: func: @@ -149,28 +149,28 @@ define i64 @func2(i64 %x, i64 %y) nounwind { } define signext i16 @func16(i16 signext %x, i16 signext %y) nounwind { -; CHECK-T1-LABEL: func16: -; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: adds r0, r0, r1 -; CHECK-T1-NEXT: ldr r1, .LCPI2_0 -; CHECK-T1-NEXT: cmp r0, r1 -; CHECK-T1-NEXT: blt .LBB2_2 -; CHECK-T1-NEXT: @ %bb.1: -; CHECK-T1-NEXT: {{movs|mov}} r0, r1 -; CHECK-T1-NEXT: .LBB2_2: -; CHECK-T1-NEXT: ldr r1, .LCPI2_1 -; CHECK-T1-NEXT: cmp r0, r1 -; CHECK-T1-NEXT: bgt .LBB2_4 -; CHECK-T1-NEXT: @ %bb.3: -; CHECK-T1-NEXT: {{movs|mov}} r0, r1 -; CHECK-T1-NEXT: .LBB2_4: -; CHECK-T1-NEXT: bx lr -; CHECK-T1-NEXT: .p2align 2 -; CHECK-T1-NEXT: @ %bb.5: -; CHECK-T1-NEXT: .LCPI2_0: -; CHECK-T1-NEXT: .long 32767 @ 0x7fff -; CHECK-T1-NEXT: .LCPI2_1: -; CHECK-T1-NEXT: .long 4294934528 @ 0xffff8000 +; CHECK-T16-LABEL: func16: +; CHECK-T16: @ %bb.0: +; CHECK-T16-NEXT: adds r0, r0, r1 +; CHECK-T16-NEXT: ldr r1, .LCPI2_0 +; CHECK-T16-NEXT: cmp r0, r1 +; CHECK-T16-NEXT: blt .LBB2_2 +; CHECK-T16-NEXT: @ %bb.1: +; CHECK-T16-NEXT: mov r0, r1 +; CHECK-T16-NEXT: .LBB2_2: +; CHECK-T16-NEXT: ldr r1, .LCPI2_1 +; CHECK-T16-NEXT: cmp r0, r1 +; CHECK-T16-NEXT: bgt .LBB2_4 +; CHECK-T16-NEXT: @ %bb.3: +; CHECK-T16-NEXT: mov r0, r1 +; CHECK-T16-NEXT: .LBB2_4: +; CHECK-T16-NEXT: bx lr +; CHECK-T16-NEXT: .p2align 2 +; CHECK-T16-NEXT: @ %bb.5: +; CHECK-T16-NEXT: .LCPI2_0: +; CHECK-T16-NEXT: .long 32767 @ 0x7fff +; CHECK-T16-NEXT: .LCPI2_1: +; CHECK-T16-NEXT: .long 4294934528 @ 0xffff8000 ; ; CHECK-T2NODSP-LABEL: func16: ; CHECK-T2NODSP: @ %bb.0: @@ -208,6 +208,29 @@ define signext i16 @func16(i16 signext %x, i16 signext %y) nounwind { ; CHECK-ARMBASEDSP-NEXT: asr r0, r0, #16 ; CHECK-ARMBASEDSP-NEXT: bx lr ; +; CHECK-T15TE-LABEL: func16: +; CHECK-T15TE: @ %bb.0: +; CHECK-T15TE-NEXT: adds r0, r0, r1 +; CHECK-T15TE-NEXT: ldr r1, .LCPI2_0 +; CHECK-T15TE-NEXT: cmp r0, r1 +; CHECK-T15TE-NEXT: blt .LBB2_2 +; CHECK-T15TE-NEXT: @ %bb.1: +; CHECK-T15TE-NEXT: movs r0, r1 +; CHECK-T15TE-NEXT: .LBB2_2: +; CHECK-T15TE-NEXT: ldr r1, .LCPI2_1 +; CHECK-T15TE-NEXT: cmp r0, r1 +; CHECK-T15TE-NEXT: bgt .LBB2_4 +; CHECK-T15TE-NEXT: @ %bb.3: +; CHECK-T15TE-NEXT: movs r0, r1 +; CHECK-T15TE-NEXT: .LBB2_4: +; CHECK-T15TE-NEXT: bx lr +; CHECK-T15TE-NEXT: .p2align 2 +; CHECK-T15TE-NEXT: @ %bb.5: +; CHECK-T15TE-NEXT: .LCPI2_0: +; CHECK-T15TE-NEXT: .long 32767 @ 0x7fff +; CHECK-T15TE-NEXT: .LCPI2_1: +; CHECK-T15TE-NEXT: .long 4294934528 @ 0xffff8000 +; ; CHECK-ARMDSP-LABEL: func16: ; CHECK-ARMDSP: @ %bb.0: ; CHECK-ARMDSP-NEXT: qadd16 r0, r0, r1 @@ -218,22 +241,22 @@ define signext i16 @func16(i16 signext %x, i16 signext %y) nounwind { } define signext i8 @func8(i8 signext %x, i8 signext %y) nounwind { -; CHECK-T1-LABEL: func8: -; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: adds r0, r0, r1 -; CHECK-T1-NEXT: movs r1, #127 -; CHECK-T1-NEXT: cmp r0, #127 -; CHECK-T1-NEXT: blt .LBB3_2 -; CHECK-T1-NEXT: @ %bb.1: -; CHECK-T1-NEXT: {{movs|mov}} r0, r1 -; CHECK-T1-NEXT: .LBB3_2: -; CHECK-T1-NEXT: mvns r1, r1 -; CHECK-T1-NEXT: cmp r0, r1 -; CHECK-T1-NEXT: bgt .LBB3_4 -; CHECK-T1-NEXT: @ %bb.3: -; CHECK-T1-NEXT: {{movs|mov}} r0, r1 -; CHECK-T1-NEXT: .LBB3_4: -; CHECK-T1-NEXT: bx lr +; CHECK-T16-LABEL: func8: +; CHECK-T16: @ %bb.0: +; CHECK-T16-NEXT: adds r0, r0, r1 +; CHECK-T16-NEXT: movs r1, #127 +; CHECK-T16-NEXT: cmp r0, #127 +; CHECK-T16-NEXT: blt .LBB3_2 +; CHECK-T16-NEXT: @ %bb.1: +; CHECK-T16-NEXT: mov r0, r1 +; CHECK-T16-NEXT: .LBB3_2: +; CHECK-T16-NEXT: mvns r1, r1 +; CHECK-T16-NEXT: cmp r0, r1 +; CHECK-T16-NEXT: bgt .LBB3_4 +; CHECK-T16-NEXT: @ %bb.3: +; CHECK-T16-NEXT: mov r0, r1 +; CHECK-T16-NEXT: .LBB3_4: +; CHECK-T16-NEXT: bx lr ; ; CHECK-T2NODSP-LABEL: func8: ; CHECK-T2NODSP: @ %bb.0: @@ -264,6 +287,23 @@ define signext i8 @func8(i8 signext %x, i8 signext %y) nounwind { ; CHECK-ARMBASEDSP-NEXT: asr r0, r0, #24 ; CHECK-ARMBASEDSP-NEXT: bx lr ; +; CHECK-T15TE-LABEL: func8: +; CHECK-T15TE: @ %bb.0: +; CHECK-T15TE-NEXT: adds r0, r0, r1 +; CHECK-T15TE-NEXT: movs r1, #127 +; CHECK-T15TE-NEXT: cmp r0, #127 +; CHECK-T15TE-NEXT: blt .LBB3_2 +; CHECK-T15TE-NEXT: @ %bb.1: +; CHECK-T15TE-NEXT: movs r0, r1 +; CHECK-T15TE-NEXT: .LBB3_2: +; CHECK-T15TE-NEXT: mvns r1, r1 +; CHECK-T15TE-NEXT: cmp r0, r1 +; CHECK-T15TE-NEXT: bgt .LBB3_4 +; CHECK-T15TE-NEXT: @ %bb.3: +; CHECK-T15TE-NEXT: movs r0, r1 +; CHECK-T15TE-NEXT: .LBB3_4: +; CHECK-T15TE-NEXT: bx lr +; ; CHECK-ARMDSP-LABEL: func8: ; CHECK-ARMDSP: @ %bb.0: ; CHECK-ARMDSP-NEXT: qadd8 r0, r0, r1 @@ -274,22 +314,22 @@ define signext i8 @func8(i8 signext %x, i8 signext %y) nounwind { } define signext i4 @func3(i4 signext %x, i4 signext %y) nounwind { -; CHECK-T1-LABEL: func3: -; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: adds r0, r0, r1 -; CHECK-T1-NEXT: movs r1, #7 -; CHECK-T1-NEXT: cmp r0, #7 -; CHECK-T1-NEXT: blt .LBB4_2 -; CHECK-T1-NEXT: @ %bb.1: -; CHECK-T1-NEXT: {{movs|mov}} r0, r1 -; CHECK-T1-NEXT: .LBB4_2: -; CHECK-T1-NEXT: mvns r1, r1 -; CHECK-T1-NEXT: cmp r0, r1 -; CHECK-T1-NEXT: bgt .LBB4_4 -; CHECK-T1-NEXT: @ %bb.3: -; CHECK-T1-NEXT: {{movs|mov}} r0, r1 -; CHECK-T1-NEXT: .LBB4_4: -; CHECK-T1-NEXT: bx lr +; CHECK-T16-LABEL: func3: +; CHECK-T16: @ %bb.0: +; CHECK-T16-NEXT: adds r0, r0, r1 +; CHECK-T16-NEXT: movs r1, #7 +; CHECK-T16-NEXT: cmp r0, #7 +; CHECK-T16-NEXT: blt .LBB4_2 +; CHECK-T16-NEXT: @ %bb.1: +; CHECK-T16-NEXT: mov r0, r1 +; CHECK-T16-NEXT: .LBB4_2: +; CHECK-T16-NEXT: mvns r1, r1 +; CHECK-T16-NEXT: cmp r0, r1 +; CHECK-T16-NEXT: bgt .LBB4_4 +; CHECK-T16-NEXT: @ %bb.3: +; CHECK-T16-NEXT: mov r0, r1 +; CHECK-T16-NEXT: .LBB4_4: +; CHECK-T16-NEXT: bx lr ; ; CHECK-T2NODSP-LABEL: func3: ; CHECK-T2NODSP: @ %bb.0: @@ -322,6 +362,23 @@ define signext i4 @func3(i4 signext %x, i4 signext %y) nounwind { ; CHECK-ARMBASEDSP-NEXT: asr r0, r0, #28 ; CHECK-ARMBASEDSP-NEXT: bx lr ; +; CHECK-T15TE-LABEL: func3: +; CHECK-T15TE: @ %bb.0: +; CHECK-T15TE-NEXT: adds r0, r0, r1 +; CHECK-T15TE-NEXT: movs r1, #7 +; CHECK-T15TE-NEXT: cmp r0, #7 +; CHECK-T15TE-NEXT: blt .LBB4_2 +; CHECK-T15TE-NEXT: @ %bb.1: +; CHECK-T15TE-NEXT: movs r0, r1 +; CHECK-T15TE-NEXT: .LBB4_2: +; CHECK-T15TE-NEXT: mvns r1, r1 +; CHECK-T15TE-NEXT: cmp r0, r1 +; CHECK-T15TE-NEXT: bgt .LBB4_4 +; CHECK-T15TE-NEXT: @ %bb.3: +; CHECK-T15TE-NEXT: movs r0, r1 +; CHECK-T15TE-NEXT: .LBB4_4: +; CHECK-T15TE-NEXT: bx lr +; ; CHECK-ARMDSP-LABEL: func3: ; CHECK-ARMDSP: @ %bb.0: ; CHECK-ARMDSP-NEXT: lsl r0, r0, #28 diff --git a/llvm/test/CodeGen/ARM/sadd_sat_plus.ll b/llvm/test/CodeGen/ARM/sadd_sat_plus.ll index 0ddb64fc3f2d1..26fa25dafc132 100644 --- a/llvm/test/CodeGen/ARM/sadd_sat_plus.ll +++ b/llvm/test/CodeGen/ARM/sadd_sat_plus.ll @@ -26,12 +26,11 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind { ; ; CHECK-T2NODSP-LABEL: func32: ; CHECK-T2NODSP: @ %bb.0: -; CHECK-T2NODSP-NEXT: mla r1, r1, r2, r0 +; CHECK-T2NODSP-NEXT: muls r1, r2, r1 ; CHECK-T2NODSP-NEXT: mov.w r2, #-2147483648 -; CHECK-T2NODSP-NEXT: cmp r1, r0 +; CHECK-T2NODSP-NEXT: adds r0, r0, r1 ; CHECK-T2NODSP-NEXT: it vs -; CHECK-T2NODSP-NEXT: eorvs.w r1, r2, r1, asr #31 -; CHECK-T2NODSP-NEXT: mov r0, r1 +; CHECK-T2NODSP-NEXT: eorvs.w r0, r2, r0, asr #31 ; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: func32: diff --git a/llvm/test/CodeGen/ARM/ssub_sat.ll b/llvm/test/CodeGen/ARM/ssub_sat.ll index 0978bfd1f0140..e04d61f782293 100644 --- a/llvm/test/CodeGen/ARM/ssub_sat.ll +++ b/llvm/test/CodeGen/ARM/ssub_sat.ll @@ -29,9 +29,9 @@ define i32 @func(i32 %x, i32 %y) nounwind { ; CHECK-T2NODSP-LABEL: func: ; CHECK-T2NODSP: @ %bb.0: ; CHECK-T2NODSP-NEXT: subs r0, r0, r1 -; CHECK-T2NODSP-NEXT: mov.w r1, #-2147483648 +; CHECK-T2NODSP-NEXT: mov.w r2, #-2147483648 ; CHECK-T2NODSP-NEXT: it vs -; CHECK-T2NODSP-NEXT: eorvs.w r0, r1, r0, asr #31 +; CHECK-T2NODSP-NEXT: eorvs.w r0, r2, r0, asr #31 ; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: func: @@ -42,8 +42,8 @@ define i32 @func(i32 %x, i32 %y) nounwind { ; CHECK-ARMNODPS-LABEL: func: ; CHECK-ARMNODPS: @ %bb.0: ; CHECK-ARMNODPS-NEXT: subs r0, r0, r1 -; CHECK-ARMNODPS-NEXT: mov r1, #-2147483648 -; CHECK-ARMNODPS-NEXT: eorvs r0, r1, r0, asr #31 +; CHECK-ARMNODPS-NEXT: mov r2, #-2147483648 +; CHECK-ARMNODPS-NEXT: eorvs r0, r2, r0, asr #31 ; CHECK-ARMNODPS-NEXT: bx lr ; ; CHECK-ARMBASEDSP-LABEL: func: @@ -347,10 +347,10 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-T2NODSP: @ %bb.0: ; CHECK-T2NODSP-NEXT: .save {r7, lr} ; CHECK-T2NODSP-NEXT: push {r7, lr} -; CHECK-T2NODSP-NEXT: ldr.w r12, [sp, #8] -; CHECK-T2NODSP-NEXT: ldr.w lr, [sp, #12] -; CHECK-T2NODSP-NEXT: subs.w r0, r0, r12 +; CHECK-T2NODSP-NEXT: ldr.w lr, [sp, #8] ; CHECK-T2NODSP-NEXT: mov.w r12, #-2147483648 +; CHECK-T2NODSP-NEXT: subs.w r0, r0, lr +; CHECK-T2NODSP-NEXT: ldr.w lr, [sp, #12] ; CHECK-T2NODSP-NEXT: it vs ; CHECK-T2NODSP-NEXT: eorvs.w r0, r12, r0, asr #31 ; CHECK-T2NODSP-NEXT: subs.w r1, r1, lr @@ -382,10 +382,10 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-ARMNODPS: @ %bb.0: ; CHECK-ARMNODPS-NEXT: .save {r11, lr} ; CHECK-ARMNODPS-NEXT: push {r11, lr} -; CHECK-ARMNODPS-NEXT: ldr r12, [sp, #8] -; CHECK-ARMNODPS-NEXT: ldr lr, [sp, #12] -; CHECK-ARMNODPS-NEXT: subs r0, r0, r12 +; CHECK-ARMNODPS-NEXT: ldr lr, [sp, #8] ; CHECK-ARMNODPS-NEXT: mov r12, #-2147483648 +; CHECK-ARMNODPS-NEXT: subs r0, r0, lr +; CHECK-ARMNODPS-NEXT: ldr lr, [sp, #12] ; CHECK-ARMNODPS-NEXT: eorvs r0, r12, r0, asr #31 ; CHECK-ARMNODPS-NEXT: subs r1, r1, lr ; CHECK-ARMNODPS-NEXT: ldr lr, [sp, #16] diff --git a/llvm/test/CodeGen/ARM/ssub_sat_plus.ll b/llvm/test/CodeGen/ARM/ssub_sat_plus.ll index adf6cafc6ccb8..2ea85f2697d97 100644 --- a/llvm/test/CodeGen/ARM/ssub_sat_plus.ll +++ b/llvm/test/CodeGen/ARM/ssub_sat_plus.ll @@ -26,13 +26,11 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind { ; ; CHECK-T2NODSP-LABEL: func32: ; CHECK-T2NODSP: @ %bb.0: -; CHECK-T2NODSP-NEXT: mls r3, r1, r2, r0 -; CHECK-T2NODSP-NEXT: mov.w r12, #-2147483648 ; CHECK-T2NODSP-NEXT: muls r1, r2, r1 -; CHECK-T2NODSP-NEXT: cmp r0, r1 +; CHECK-T2NODSP-NEXT: mov.w r2, #-2147483648 +; CHECK-T2NODSP-NEXT: subs r0, r0, r1 ; CHECK-T2NODSP-NEXT: it vs -; CHECK-T2NODSP-NEXT: eorvs.w r3, r12, r3, asr #31 -; CHECK-T2NODSP-NEXT: mov r0, r3 +; CHECK-T2NODSP-NEXT: eorvs.w r0, r2, r0, asr #31 ; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: func32: diff --git a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll index 43ed5eefbf4c7..963fec79aff81 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll @@ -10,24 +10,22 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32 ; CHECK-NEXT: cmp r2, #16 ; CHECK-NEXT: blo .LBB0_5 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader -; CHECK-NEXT: movs r6, #2 -; CHECK-NEXT: lsrs r7, r2, #3 -; CHECK-NEXT: rsb r6, r6, r2, lsr #3 -; CHECK-NEXT: cmp r7, #2 -; CHECK-NEXT: mov.w r5, #0 -; CHECK-NEXT: csel r7, r6, r5, hs -; CHECK-NEXT: add.w lr, r7, #1 -; CHECK-NEXT: mov r4, r5 +; CHECK-NEXT: movs r7, #2 +; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: rsbs r7, r7, r2, lsr #3 ; CHECK-NEXT: vldrh.u16 q0, [r0], #32 -; CHECK-NEXT: movs r7, #0 -; CHECK-NEXT: mov r8, r5 +; CHECK-NEXT: csel r7, r7, r5, hs +; CHECK-NEXT: add.w lr, r7, #1 +; CHECK-NEXT: mov r6, r5 ; CHECK-NEXT: vldrh.u16 q1, [r1], #32 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1 +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: vmlsldava.s16 r6, r7, q0, q1 ; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1 +; CHECK-NEXT: mov r8, r5 ; CHECK-NEXT: vldrh.u16 q3, [r1, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1 ; CHECK-NEXT: vldrh.u16 q0, [r1], #32 +; CHECK-NEXT: vmlsldava.s16 r6, r7, q2, q3 ; CHECK-NEXT: sub.w lr, lr, #1 ; CHECK-NEXT: cmp.w lr, #0 ; CHECK-NEXT: vldrh.u16 q1, [r0], #32 @@ -37,30 +35,30 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3 ; CHECK-NEXT: vldrh.u16 q3, [r1, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0 +; CHECK-NEXT: vmlsldava.s16 r6, r7, q1, q0 ; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] ; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0 ; CHECK-NEXT: vldrh.u16 q1, [r0], #32 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3 +; CHECK-NEXT: vmlsldava.s16 r6, r7, q2, q3 ; CHECK-NEXT: vldrh.u16 q0, [r1], #32 ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: ; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3 -; CHECK-NEXT: movs r6, #14 -; CHECK-NEXT: and.w r2, r6, r2, lsl #1 +; CHECK-NEXT: movs r4, #14 +; CHECK-NEXT: and.w r2, r4, r2, lsl #1 ; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0 ; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0 +; CHECK-NEXT: vmlsldava.s16 r6, r7, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r1, #-16] ; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q0 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q0 +; CHECK-NEXT: vmlsldava.s16 r6, r7, q2, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrht.u16 q1, [r0] ; CHECK-NEXT: cmp r2, #9 ; CHECK-NEXT: vpsttt ; CHECK-NEXT: vldrht.u16 q0, [r1] -; CHECK-NEXT: vmlsldavat.s16 r4, r7, q1, q0 +; CHECK-NEXT: vmlsldavat.s16 r6, r7, q1, q0 ; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q1, q0 ; CHECK-NEXT: blo .LBB0_10 ; CHECK-NEXT: @ %bb.4: @ %do.body.1 @@ -69,40 +67,42 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32 ; CHECK-NEXT: vpstttt ; CHECK-NEXT: vldrht.u16 q0, [r0, #16] ; CHECK-NEXT: vldrht.u16 q1, [r1, #16] -; CHECK-NEXT: vmlsldavat.s16 r4, r7, q0, q1 +; CHECK-NEXT: vmlsldavat.s16 r6, r7, q0, q1 ; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q0, q1 ; CHECK-NEXT: b .LBB0_10 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB0_5: @ %if.else -; CHECK-NEXT: mov.w r4, #0 +; CHECK-NEXT: mov.w r6, #0 ; CHECK-NEXT: cbz r2, .LBB0_9 ; CHECK-NEXT: @ %bb.6: @ %while.body14.preheader -; CHECK-NEXT: lsls r6, r2, #1 -; CHECK-NEXT: mov r5, r4 -; CHECK-NEXT: mov r7, r4 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: dlstp.16 lr, r6 +; CHECK-NEXT: movs r7, #8 +; CHECK-NEXT: rsbs r7, r7, r2, lsl #1 +; CHECK-NEXT: lsl.w r2, r2, #1 +; CHECK-NEXT: mov.w r4, #0 +; CHECK-NEXT: mov r5, r6 +; CHECK-NEXT: mov r7, r6 +; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB0_7: @ %while.body14 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16 -; CHECK-NEXT: vmlsldava.s16 r2, r7, q0, q1 -; CHECK-NEXT: vmlaldavax.s16 r4, r5, q0, q1 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1 +; CHECK-NEXT: vmlaldavax.s16 r6, r5, q0, q1 ; CHECK-NEXT: letp lr, .LBB0_7 ; CHECK-NEXT: @ %bb.8: @ %if.end.loopexit177 -; CHECK-NEXT: mov r8, r4 -; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: mov r8, r6 +; CHECK-NEXT: mov r6, r4 ; CHECK-NEXT: b .LBB0_10 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB0_9: -; CHECK-NEXT: mov r7, r4 +; CHECK-NEXT: mov r7, r6 ; CHECK-NEXT: mov.w r8, #0 -; CHECK-NEXT: mov r5, r4 +; CHECK-NEXT: mov r5, r6 ; CHECK-NEXT: .LBB0_10: @ %if.end -; CHECK-NEXT: asrl r4, r7, #6 +; CHECK-NEXT: asrl r6, r7, #6 ; CHECK-NEXT: asrl r8, r5, #6 -; CHECK-NEXT: str r4, [r3] +; CHECK-NEXT: str r6, [r3] ; CHECK-NEXT: str.w r8, [r12] ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: