diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 6f060800f760d..ee4d973b2326e 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -3530,9 +3530,6 @@ static bool mayUsePostIncMode(const TargetTransformInfo &TTI, const SCEV *LoopStep = AR->getStepRecurrence(SE); if (!isa(LoopStep)) return false; - if (LU.AccessTy.getType()->getScalarSizeInBits() != - LoopStep->getType()->getScalarSizeInBits()) - return false; // Check if a post-indexed load/store can be used. if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) || TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) { diff --git a/llvm/test/CodeGen/Hexagon/addrmode-align.ll b/llvm/test/CodeGen/Hexagon/addrmode-align.ll index 1a4df00d47cbd..f39019a0b40ef 100644 --- a/llvm/test/CodeGen/Hexagon/addrmode-align.ll +++ b/llvm/test/CodeGen/Hexagon/addrmode-align.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=hexagon < %s | FileCheck %s ; CHECK: [[REG0:(r[0-9]+)]] = add(r29 -; CHECK: [[REG1:(r[0-9]+)]] = add([[REG0]],#4) +; CHECK: [[REG1:(r[0-9]+)]] = add([[REG0]],#8) ; CHECK-DAG: memd([[REG1]]+#8) = ; CHECK-DAG: memd([[REG1]]+#0) = diff --git a/llvm/test/CodeGen/Hexagon/lsr-postinc-nested-loop.ll b/llvm/test/CodeGen/Hexagon/lsr-postinc-nested-loop.ll new file mode 100644 index 0000000000000..8fbf913a22cbb --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/lsr-postinc-nested-loop.ll @@ -0,0 +1,50 @@ +; RUN: llc -O3 -march=hexagon < %s | FileCheck %s +; Test to ensure LSR does not optimize out the addrec of the outer loop. +; This will help to generate post-increment instructions; otherwise +; it ends up as an extra reg+reg add inside the loop. 
+; CHECK: loop0(.LBB0_[[LOOP:.]], +; CHECK: .LBB0_[[LOOP]]: +; CHECK: memuh{{.*}}++ +; CHECK: endloop + + +define dso_local signext i16 @foo(i16* nocapture readonly %filt, i16* nocapture readonly %inp, i32 %c1, i32 %c2) local_unnamed_addr { +entry: + %cmp28 = icmp sgt i32 %c1, 0 + %cmp221 = icmp sgt i32 %c2, 0 + %or.cond = and i1 %cmp28, %cmp221 + br i1 %or.cond, label %for.cond1.preheader.us, label %for.cond.cleanup + +for.cond1.preheader.us: ; preds = %entry, %for.cond1.for.cond.cleanup3_crit_edge.us + %filt.addr.032.us = phi i16* [ %scevgep, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %filt, %entry ] + %inp.addr.031.us = phi i16* [ %scevgep35, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %inp, %entry ] + %l.030.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %entry ] + %sum0.029.us = phi i16 [ %add8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %entry ] + %scevgep = getelementptr i16, i16* %filt.addr.032.us, i32 %c2 + br label %for.body4.us + +for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us + %z.025.us = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us, %for.body4.us ] + %filt.addr.124.us = phi i16* [ %filt.addr.032.us, %for.cond1.preheader.us ], [ %incdec.ptr.us, %for.body4.us ] + %inp.addr.123.us = phi i16* [ %inp.addr.031.us, %for.cond1.preheader.us ], [ %incdec.ptr5.us, %for.body4.us ] + %sum0.122.us = phi i16 [ %sum0.029.us, %for.cond1.preheader.us ], [ %add8.us, %for.body4.us ] + %incdec.ptr.us = getelementptr inbounds i16, i16* %filt.addr.124.us, i32 1 + %0 = load i16, i16* %filt.addr.124.us, align 2 + %incdec.ptr5.us = getelementptr inbounds i16, i16* %inp.addr.123.us, i32 1 + %1 = load i16, i16* %inp.addr.123.us, align 2 + %add.us = add i16 %0, %sum0.122.us + %add8.us = add i16 %add.us, %1 + %inc.us = add nuw nsw i32 %z.025.us, 1 + %exitcond = icmp eq i32 %inc.us, %c2 + br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us + 
+for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us + %scevgep35 = getelementptr i16, i16* %inp.addr.031.us, i32 %c2 + %inc11.us = add nuw nsw i32 %l.030.us, 1 + %exitcond36 = icmp eq i32 %inc11.us, %c1 + br i1 %exitcond36, label %for.cond.cleanup, label %for.cond1.preheader.us + +for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry + %sum0.0.lcssa = phi i16 [ 0, %entry ], [ %add8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ] + ret i16 %sum0.0.lcssa +} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll index ebd93db9bdbe9..905b6d14bf080 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -1778,11 +1778,11 @@ for.body: ; preds = %for.body, %for.body define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) { ; CHECK-LABEL: half_short_mac: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: cbz r2, .LBB11_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: subs r3, r2, #1 -; CHECK-NEXT: and r7, r2, #3 +; CHECK-NEXT: and r6, r2, #3 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhs .LBB11_4 ; CHECK-NEXT: @ %bb.2: @@ -1799,33 +1799,33 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n ; CHECK-NEXT: vldr s0, .LCPI11_0 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2 -; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: adds r3, r1, #4 +; CHECK-NEXT: adds r2, r0, #4 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB11_5: @ %for.body -; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r2, r1, r3 -; CHECK-NEXT: adds r6, r0, r3 -; CHECK-NEXT: vldr.16 s2, [r6, #6] +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldrsh.w r4, [r3, #2] +; CHECK-NEXT: vldr.16 s2, [r2, #2] ; 
CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: ldrsh.w r4, [r2, #2] -; CHECK-NEXT: ldrsh.w r5, [r2, #4] -; CHECK-NEXT: ldrsh.w r2, [r2, #6] -; CHECK-NEXT: vmov s8, r4 -; CHECK-NEXT: vmov s6, r5 -; CHECK-NEXT: vmov s4, r2 +; CHECK-NEXT: vmov s4, r4 ; CHECK-NEXT: vcvt.f16.s32 s4, s4 +; CHECK-NEXT: ldrsh.w r4, [r3] ; CHECK-NEXT: vmul.f16 s2, s2, s4 -; CHECK-NEXT: vldr.16 s4, [r6, #4] +; CHECK-NEXT: vldr.16 s4, [r2] +; CHECK-NEXT: vmov s6, r4 ; CHECK-NEXT: vcvt.f16.s32 s6, s6 +; CHECK-NEXT: ldrsh r5, [r3, #-2] +; CHECK-NEXT: ldrsh r4, [r3, #-4] ; CHECK-NEXT: vmul.f16 s4, s4, s6 -; CHECK-NEXT: vldr.16 s6, [r6, #2] +; CHECK-NEXT: vldr.16 s6, [r2, #-2] +; CHECK-NEXT: adds r3, #8 +; CHECK-NEXT: vmov s8, r5 ; CHECK-NEXT: vcvt.f16.s32 s8, s8 -; CHECK-NEXT: ldrsh r2, [r1, r3] +; CHECK-NEXT: vmov s10, r4 ; CHECK-NEXT: vmul.f16 s6, s6, s8 -; CHECK-NEXT: vldr.16 s8, [r6] -; CHECK-NEXT: adds r3, #8 -; CHECK-NEXT: vmov s10, r2 +; CHECK-NEXT: vldr.16 s8, [r2, #-4] ; CHECK-NEXT: vcvt.f16.s32 s10, s10 +; CHECK-NEXT: adds r2, #8 ; CHECK-NEXT: vmul.f16 s8, s8, s10 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8 ; CHECK-NEXT: vcvtb.f32.f16 s6, s6 @@ -1837,11 +1837,11 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n ; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: le lr, .LBB11_5 ; CHECK-NEXT: .LBB11_6: @ %for.cond.cleanup.loopexit.unr-lcssa -; CHECK-NEXT: wls lr, r7, .LBB11_9 +; CHECK-NEXT: wls lr, r6, .LBB11_9 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader ; CHECK-NEXT: add.w r0, r0, r12, lsl #1 ; CHECK-NEXT: add.w r1, r1, r12, lsl #1 -; CHECK-NEXT: mov lr, r7 +; CHECK-NEXT: mov lr, r6 ; CHECK-NEXT: .LBB11_8: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsh r2, [r1], #2 @@ -1854,7 +1854,7 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n ; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: le lr, .LBB11_8 ; CHECK-NEXT: .LBB11_9: @ %for.cond.cleanup -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; 
CHECK-NEXT: pop {r4, r5, r6, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.10: ; CHECK-NEXT: .LCPI11_0: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll index 5fd03a7813226..0b8a20e825694 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -372,29 +372,29 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB5_11 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: add.w r4, r3, r12, lsl #2 -; CHECK-NEXT: add.w r5, r1, r12 -; CHECK-NEXT: cmp r4, r1 -; CHECK-NEXT: add.w r6, r0, r12 -; CHECK-NEXT: cset r7, hi -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: cmp r4, r0 +; CHECK-NEXT: add.w r6, r3, r12, lsl #2 +; CHECK-NEXT: add.w r4, r1, r12 +; CHECK-NEXT: cmp r6, r1 +; CHECK-NEXT: add.w r5, r0, r12 +; CHECK-NEXT: cset lr, hi +; CHECK-NEXT: cmp r4, r3 ; CHECK-NEXT: cset r4, hi -; CHECK-NEXT: cmp r6, r3 +; CHECK-NEXT: cmp r6, r0 ; CHECK-NEXT: cset r6, hi -; CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: ands r6, r4 -; CHECK-NEXT: lsls r6, r6, #31 -; CHECK-NEXT: itt eq -; CHECK-NEXT: andeq.w r4, r5, r7 -; CHECK-NEXT: lslseq.w r4, r4, #31 -; CHECK-NEXT: beq .LBB5_4 +; CHECK-NEXT: cmp r5, r3 +; CHECK-NEXT: cset r5, hi +; CHECK-NEXT: ands r5, r6 +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: lsls r5, r5, #31 +; CHECK-NEXT: itt eq +; CHECK-NEXT: andeq.w r5, r4, lr +; CHECK-NEXT: lslseq.w r5, r5, #31 +; CHECK-NEXT: beq .LBB5_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: and r9, r12, #3 -; CHECK-NEXT: cmp r4, #3 -; CHECK-NEXT: bhs .LBB5_6 +; CHECK-NEXT: sub.w r5, r12, #1 +; CHECK-NEXT: and r9, r12, #3 +; CHECK-NEXT: cmp r5, #3 +; CHECK-NEXT: bhs .LBB5_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB5_8 @@ -409,35 
+409,37 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-NEXT: letp lr, .LBB5_5 ; CHECK-NEXT: b .LBB5_11 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new -; CHECK-NEXT: bic r7, r12, #3 +; CHECK-NEXT: bic r5, r12, #3 ; CHECK-NEXT: add.w r4, r3, #8 -; CHECK-NEXT: subs r7, #4 +; CHECK-NEXT: subs r5, #4 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: add.w lr, lr, r7, lsr #2 +; CHECK-NEXT: add.w lr, r6, r5, lsr #2 +; CHECK-NEXT: adds r5, r0, #3 +; CHECK-NEXT: adds r6, r1, #1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_7: @ %for.body -; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrb.w r5, [r0, r12] -; CHECK-NEXT: add.w r7, r1, r12 -; CHECK-NEXT: ldrb.w r6, [r1, r12] -; CHECK-NEXT: smlabb r5, r6, r5, r2 -; CHECK-NEXT: str r5, [r4, #-8] -; CHECK-NEXT: add.w r5, r0, r12 -; CHECK-NEXT: ldrb r6, [r7, #1] -; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: ldrb.w r8, [r5, #1] -; CHECK-NEXT: smlabb r6, r6, r8, r2 -; CHECK-NEXT: str r6, [r4, #-4] -; CHECK-NEXT: ldrb.w r8, [r5, #2] -; CHECK-NEXT: ldrb r6, [r7, #2] -; CHECK-NEXT: smlabb r6, r6, r8, r2 -; CHECK-NEXT: str r6, [r4] -; CHECK-NEXT: ldrb r5, [r5, #3] -; CHECK-NEXT: ldrb r6, [r7, #3] -; CHECK-NEXT: smlabb r5, r6, r5, r2 -; CHECK-NEXT: str r5, [r4, #4] -; CHECK-NEXT: adds r4, #16 -; CHECK-NEXT: le lr, .LBB5_7 +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldrb r8, [r5, #-3] +; CHECK-NEXT: add.w r12, r12, #4 +; CHECK-NEXT: ldrb r7, [r6, #-1] +; CHECK-NEXT: smlabb r7, r7, r8, r2 +; CHECK-NEXT: str r7, [r4, #-8] +; CHECK-NEXT: ldrb r8, [r5, #-2] +; CHECK-NEXT: ldrb r7, [r6] +; CHECK-NEXT: smlabb r7, r7, r8, r2 +; CHECK-NEXT: str r7, [r4, #-4] +; CHECK-NEXT: ldrb r8, [r5, #-1] +; CHECK-NEXT: ldrb r7, [r6, #1] +; CHECK-NEXT: smlabb r7, r7, r8, r2 +; CHECK-NEXT: str r7, [r4] +; CHECK-NEXT: ldrb.w r8, [r5] +; CHECK-NEXT: adds r5, #4 +; CHECK-NEXT: ldrb r7, [r6, #2] +; CHECK-NEXT: adds r6, #4 +; CHECK-NEXT: smlabb r7, r7, r8, r2 +; CHECK-NEXT: str r7, 
[r4, #4] +; CHECK-NEXT: adds r4, #16 +; CHECK-NEXT: le lr, .LBB5_7 ; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup.loopexit.unr-lcssa ; CHECK-NEXT: wls lr, r9, .LBB5_11 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader @@ -447,10 +449,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-NEXT: mov lr, r9 ; CHECK-NEXT: .LBB5_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrb r7, [r0], #1 -; CHECK-NEXT: ldrb r6, [r1], #1 -; CHECK-NEXT: smlabb r7, r6, r7, r2 -; CHECK-NEXT: str r7, [r3], #4 +; CHECK-NEXT: ldrb r6, [r0], #1 +; CHECK-NEXT: ldrb r5, [r1], #1 +; CHECK-NEXT: smlabb r6, r5, r6, r2 +; CHECK-NEXT: str r6, [r3], #4 ; CHECK-NEXT: le lr, .LBB5_10 ; CHECK-NEXT: .LBB5_11: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} @@ -663,28 +665,28 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB7_11 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: add.w r4, r3, r12, lsl #2 -; CHECK-NEXT: add.w r5, r1, r12 -; CHECK-NEXT: cmp r4, r1 -; CHECK-NEXT: add.w r6, r0, r12 -; CHECK-NEXT: cset r7, hi -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: cmp r4, r0 +; CHECK-NEXT: add.w r6, r3, r12, lsl #2 +; CHECK-NEXT: add.w r4, r1, r12 +; CHECK-NEXT: cmp r6, r1 +; CHECK-NEXT: add.w r5, r0, r12 +; CHECK-NEXT: cset lr, hi +; CHECK-NEXT: cmp r4, r3 ; CHECK-NEXT: cset r4, hi -; CHECK-NEXT: cmp r6, r3 +; CHECK-NEXT: cmp r6, r0 ; CHECK-NEXT: cset r6, hi -; CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: ands r6, r4 -; CHECK-NEXT: lsls r6, r6, #31 +; CHECK-NEXT: cmp r5, r3 +; CHECK-NEXT: cset r5, hi +; CHECK-NEXT: ands r5, r6 +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: lsls r5, r5, #31 ; CHECK-NEXT: itt eq -; CHECK-NEXT: andeq.w r4, r5, r7 -; CHECK-NEXT: lslseq.w r4, r4, #31 +; CHECK-NEXT: andeq.w r5, r4, lr +; CHECK-NEXT: lslseq.w r5, r5, #31 ; CHECK-NEXT: beq .LBB7_4 ; CHECK-NEXT: @ %bb.2: @ 
%for.body.preheader -; CHECK-NEXT: sub.w r4, r12, #1 +; CHECK-NEXT: sub.w r5, r12, #1 ; CHECK-NEXT: and r9, r12, #3 -; CHECK-NEXT: cmp r4, #3 +; CHECK-NEXT: cmp r5, #3 ; CHECK-NEXT: bhs .LBB7_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: mov.w r12, #0 @@ -700,33 +702,35 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-NEXT: letp lr, .LBB7_5 ; CHECK-NEXT: b .LBB7_11 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new -; CHECK-NEXT: bic r7, r12, #3 +; CHECK-NEXT: bic r5, r12, #3 ; CHECK-NEXT: add.w r4, r3, #8 -; CHECK-NEXT: subs r7, #4 +; CHECK-NEXT: subs r5, #4 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: add.w lr, lr, r7, lsr #2 +; CHECK-NEXT: add.w lr, r6, r5, lsr #2 +; CHECK-NEXT: adds r5, r0, #3 +; CHECK-NEXT: adds r6, r1, #1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrb.w r5, [r0, r12] -; CHECK-NEXT: add.w r7, r1, r12 -; CHECK-NEXT: ldrb.w r6, [r1, r12] -; CHECK-NEXT: smlabb r5, r6, r5, r2 -; CHECK-NEXT: str r5, [r4, #-8] -; CHECK-NEXT: add.w r5, r0, r12 -; CHECK-NEXT: ldrb r6, [r7, #1] +; CHECK-NEXT: ldrb r8, [r5, #-3] ; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: ldrb.w r8, [r5, #1] -; CHECK-NEXT: smlabb r6, r6, r8, r2 -; CHECK-NEXT: str r6, [r4, #-4] -; CHECK-NEXT: ldrb.w r8, [r5, #2] -; CHECK-NEXT: ldrb r6, [r7, #2] -; CHECK-NEXT: smlabb r6, r6, r8, r2 -; CHECK-NEXT: str r6, [r4] -; CHECK-NEXT: ldrb r5, [r5, #3] -; CHECK-NEXT: ldrb r6, [r7, #3] -; CHECK-NEXT: smlabb r5, r6, r5, r2 -; CHECK-NEXT: str r5, [r4, #4] +; CHECK-NEXT: ldrb r7, [r6, #-1] +; CHECK-NEXT: smlabb r7, r7, r8, r2 +; CHECK-NEXT: str r7, [r4, #-8] +; CHECK-NEXT: ldrb r8, [r5, #-2] +; CHECK-NEXT: ldrb r7, [r6] +; CHECK-NEXT: smlabb r7, r7, r8, r2 +; CHECK-NEXT: str r7, [r4, #-4] +; CHECK-NEXT: ldrb r8, [r5, #-1] +; CHECK-NEXT: ldrb r7, [r6, #1] +; CHECK-NEXT: smlabb r7, r7, r8, r2 +; CHECK-NEXT: str r7, [r4] +; CHECK-NEXT: ldrb.w r8, [r5] +; CHECK-NEXT: adds r5, #4 +; 
CHECK-NEXT: ldrb r7, [r6, #2] +; CHECK-NEXT: adds r6, #4 +; CHECK-NEXT: smlabb r7, r7, r8, r2 +; CHECK-NEXT: str r7, [r4, #4] ; CHECK-NEXT: adds r4, #16 ; CHECK-NEXT: le lr, .LBB7_7 ; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup.loopexit.unr-lcssa @@ -738,10 +742,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-NEXT: mov lr, r9 ; CHECK-NEXT: .LBB7_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrb r7, [r0], #1 -; CHECK-NEXT: ldrb r6, [r1], #1 -; CHECK-NEXT: smlabb r7, r6, r7, r2 -; CHECK-NEXT: str r7, [r3], #4 +; CHECK-NEXT: ldrb r6, [r0], #1 +; CHECK-NEXT: ldrb r5, [r1], #1 +; CHECK-NEXT: smlabb r6, r5, r6, r2 +; CHECK-NEXT: str r6, [r3], #4 ; CHECK-NEXT: le lr, .LBB7_10 ; CHECK-NEXT: .LBB7_11: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}