Skip to content

Commit

Permalink
[ARM] Move VPTBlock pass after post-ra scheduling
Browse files Browse the repository at this point in the history
Currently, when tail-predicating loops, VPT blocks need to be created
with the VCTP predicate in case we need to revert to the non-tail-predicated
form. This has the unfortunate side effect of sometimes severely hampering
post-RA scheduling, because the instructions are already locked into VPT
blocks and cannot be reordered independently.

This patch addresses that by moving the creation of VPT blocks
later in the pipeline, after post-RA scheduling has been performed. This
allows better post-RA scheduling before the VPT blocks are
created, leading to better tail-predicated loops.

Differential Revision: https://reviews.llvm.org/D113094
  • Loading branch information
davemgreen committed Nov 4, 2021
1 parent 9b6f8b9 commit 0912440
Show file tree
Hide file tree
Showing 32 changed files with 365 additions and 363 deletions.
2 changes: 1 addition & 1 deletion llvm/lib/Target/ARM/ARMTargetMachine.cpp
Expand Up @@ -541,7 +541,6 @@ void ARMPassConfig::addPreSched2() {
return !MF.getSubtarget<ARMSubtarget>().isThumb1Only();
}));
}
addPass(createMVEVPTBlockPass());
addPass(createThumb2ITBlockPass());

// Add both scheduling passes to give the subtarget an opportunity to pick
Expand All @@ -551,6 +550,7 @@ void ARMPassConfig::addPreSched2() {
addPass(&PostRASchedulerID);
}

addPass(createMVEVPTBlockPass());
addPass(createARMIndirectThunks());
addPass(createARMSLSHardeningPass());
}
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/ARM/O3-pipeline.ll
Expand Up @@ -151,12 +151,12 @@
; CHECK-NEXT: Machine Natural Loop Construction
; CHECK-NEXT: Machine Block Frequency Analysis
; CHECK-NEXT: If Converter
; CHECK-NEXT: MVE VPT block insertion pass
; CHECK-NEXT: Thumb IT blocks insertion pass
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: Machine Natural Loop Construction
; CHECK-NEXT: PostRA Machine Instruction Scheduler
; CHECK-NEXT: Post RA top-down list latency scheduler
; CHECK-NEXT: MVE VPT block insertion pass
; CHECK-NEXT: ARM Indirect Thunks
; CHECK-NEXT: ARM sls hardening pass
; CHECK-NEXT: Analyze Machine Code For Garbage Collection
Expand Down
Expand Up @@ -44,8 +44,8 @@ define void @arm_cmplx_dot_prod_f32(float* %pSrcA, float* %pSrcB, i32 %numSample
; CHECK-NEXT: vcmla.f32 q0, q2, q1, #90
; CHECK-NEXT: cbz r2, .LBB0_8
; CHECK-NEXT: @ %bb.4: @ %while.body9
; CHECK-NEXT: cmp r2, #4
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: cmp r2, #4
; CHECK-NEXT: vpstttt
; CHECK-NEXT: vldrwt.u32 q1, [r1]
; CHECK-NEXT: vldrwt.u32 q2, [r0]
Expand Down
Expand Up @@ -20,20 +20,20 @@ define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: and r4, r12, #15
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vctp.32 r3
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q1, [r2], #16
; CHECK-NEXT: vldrwt.u32 q2, [r1], #16
; CHECK-NEXT: vdup.32 q3, r4
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vpt.i32 eq, q3, zr
; CHECK-NEXT: vmovt q1, q2
; CHECK-NEXT: vctp.32 r3
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
; CHECK-NEXT: vmul.i32 q1, q1, q2
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vmul.i32 q1, q1, q2
; CHECK-NEXT: vadd.i32 q1, q1, q0
; CHECK-NEXT: le lr, .LBB0_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
Expand Down Expand Up @@ -101,22 +101,22 @@ define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a,
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r12
; CHECK-NEXT: and r5, r4, #15
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vctp.32 r12
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
; CHECK-NEXT: vldrwt.u32 q2, [r3], #16
; CHECK-NEXT: vldrwt.u32 q3, [r2], #16
; CHECK-NEXT: vdup.32 q4, r5
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vpt.i32 eq, q4, zr
; CHECK-NEXT: vsubt.i32 q1, q3, q2
; CHECK-NEXT: vctp.32 r12
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
; CHECK-NEXT: vmul.i32 q1, q1, q2
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vmul.i32 q1, q1, q2
; CHECK-NEXT: vadd.i32 q1, q1, q0
; CHECK-NEXT: le lr, .LBB1_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
Expand Down Expand Up @@ -200,8 +200,8 @@ define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32
; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
; CHECK-NEXT: vsub.i32 q1, q2, q1
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vsub.i32 q1, q2, q1
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vcmpt.i32 eq, q1, zr
; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
Expand Down Expand Up @@ -288,13 +288,13 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32*
; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
; CHECK-NEXT: vsub.i32 q1, q2, q1
; CHECK-NEXT: vpnot
; CHECK-NEXT: vsub.i32 q1, q2, q1
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vpstee
; CHECK-NEXT: vcmpt.i32 ne, q1, zr
; CHECK-NEXT: vldrwe.u32 q1, [r3], #16
; CHECK-NEXT: vldrwe.u32 q2, [r2], #16
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vmul.i32 q1, q2, q1
; CHECK-NEXT: vadd.i32 q1, q1, q0
; CHECK-NEXT: le lr, .LBB3_2
Expand Down Expand Up @@ -415,8 +415,9 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i
; CHECK-NEXT: .LBB5_2: @ %bb12
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vptt.i32 ne, q0, zr
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vcmpt.s32 le, q0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
; CHECK-NEXT: vmul.i32 q0, q1, q0
; CHECK-NEXT: vpst
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
Expand Up @@ -214,10 +214,10 @@ define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
; CHECK-NEXT: vldrwt.u32 q3, [r1], #16
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vfma.f32 q0, q3, q2
; CHECK-NEXT: le lr, .LBB1_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
Expand Up @@ -21,9 +21,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture re
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrbt.u32 q2, [r1], #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmla.u32 q0, q2, r0
; CHECK-NEXT: le lr, .LBB0_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
Expand Down Expand Up @@ -86,9 +86,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrht.s32 q2, [r1], #8
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmla.u32 q0, q2, r0
; CHECK-NEXT: le lr, .LBB1_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
Expand Down Expand Up @@ -151,9 +151,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture r
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrbt.u32 q2, [r1], #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmla.u32 q0, q2, r0
; CHECK-NEXT: le lr, .LBB2_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
Expand Down Expand Up @@ -216,9 +216,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocaptur
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrht.u32 q2, [r1], #8
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmla.u32 q0, q2, r0
; CHECK-NEXT: le lr, .LBB3_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
Expand Down Expand Up @@ -281,9 +281,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q2, [r1], #16
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmla.u32 q0, q2, r0
; CHECK-NEXT: le lr, .LBB4_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
Expand Down
7 changes: 3 additions & 4 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
Expand Up @@ -78,12 +78,11 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_add_add_v8i16(i8* nocaptu
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.16 r2
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vpst
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrbt.u16 q1, [r0], #8
; CHECK-NEXT: vldrbt.u16 q2, [r1], #8
; CHECK-NEXT: subs r2, #8
; CHECK-NEXT: vadd.i16 q1, q0, q1
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrbt.u16 q2, [r1], #8
; CHECK-NEXT: vadd.i16 q1, q1, q2
; CHECK-NEXT: le lr, .LBB1_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
Expand Down Expand Up @@ -559,9 +558,9 @@ define dso_local arm_aapcs_vfpcc void @two_reductions_mul_add_v8i16(i8* nocaptur
; CHECK-NEXT: vldrbt.u16 q1, [r3], #8
; CHECK-NEXT: vldrbt.u16 q4, [r4], #8
; CHECK-NEXT: vmov q2, q3
; CHECK-NEXT: subs r2, #8
; CHECK-NEXT: vsub.i16 q3, q4, q1
; CHECK-NEXT: vmul.i16 q1, q4, q1
; CHECK-NEXT: subs r2, #8
; CHECK-NEXT: vadd.i16 q3, q3, q2
; CHECK-NEXT: vadd.i16 q1, q1, q0
; CHECK-NEXT: le lr, .LBB7_2
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
Expand Up @@ -57,10 +57,10 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(i16* noalias nocaptur
; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vctp.16 r6
; CHECK-NEXT: subs r6, #8
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrht.u16 q0, [r5]
; CHECK-NEXT: vshr.u16 q1, q0, #3
; CHECK-NEXT: subs r6, #8
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vmov q2, q4
; CHECK-NEXT: vmla.u16 q2, q1, r2
Expand Down Expand Up @@ -237,10 +237,10 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(i16* noalias no
; CHECK-NEXT: @ Parent Loop BB1_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrh.u16 q0, [r5]
; CHECK-NEXT: vshl.i16 q1, q0, #3
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vmov.f64 d6, d4
; CHECK-NEXT: vmov.f64 d7, d5
; CHECK-NEXT: vshl.i16 q1, q0, #3
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vmov q2, q4
; CHECK-NEXT: vmla.u16 q2, q1, r3
; CHECK-NEXT: vshr.u16 q1, q0, #3
Expand All @@ -265,10 +265,10 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(i16* noalias no
; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload
; CHECK-NEXT: vmov.f64 d8, d10
; CHECK-NEXT: vmov.f64 d9, d11
; CHECK-NEXT: vmov.f64 d10, d14
; CHECK-NEXT: vmov.f64 d11, d15
; CHECK-NEXT: vand q1, q1, q3
; CHECK-NEXT: vorr q0, q0, q1
; CHECK-NEXT: vmov.f64 d10, d14
; CHECK-NEXT: vmov.f64 d11, d15
; CHECK-NEXT: vstrh.16 q0, [r5], #16
; CHECK-NEXT: letp lr, .LBB1_4
; CHECK-NEXT: @ %bb.5: @ %for.cond3.for.cond.cleanup7_crit_edge.us
Expand Down
Expand Up @@ -57,13 +57,13 @@ define dso_local void @check_option(i32* noalias nocapture %A, i32* noalias noca
; DISABLED-NEXT: .LBB0_3: @ %vector.body
; DISABLED-NEXT: @ Parent Loop BB0_2 Depth=1
; DISABLED-NEXT: @ => This Inner Loop Header: Depth=2
; DISABLED-NEXT: mov lr, r7
; DISABLED-NEXT: vctp.32 r6
; DISABLED-NEXT: subs r7, #1
; DISABLED-NEXT: subs r6, #4
; DISABLED-NEXT: mov lr, r7
; DISABLED-NEXT: vpstt
; DISABLED-NEXT: vldrwt.u32 q0, [r5], #16
; DISABLED-NEXT: vldrwt.u32 q1, [r4], #16
; DISABLED-NEXT: subs r7, #1
; DISABLED-NEXT: subs r6, #4
; DISABLED-NEXT: vadd.i32 q0, q1, q0
; DISABLED-NEXT: vpst
; DISABLED-NEXT: vstrwt.32 q0, [r12], #16
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
Expand Up @@ -63,9 +63,9 @@ define i32 @bad(i32* readonly %x, i32* nocapture readonly %y, i32 %n) {
; CHECK-NEXT: .LBB1_1: @ %do.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmlava.s32 r12, q0, q1
; CHECK-NEXT: le lr, .LBB1_1
Expand Down
Expand Up @@ -71,10 +71,10 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
; ENABLED-NEXT: vldrht.s32 q1, [r0], #8
; ENABLED-NEXT: vldrht.s32 q2, [r7], #8
; ENABLED-NEXT: mov lr, r6
; ENABLED-NEXT: vmul.i32 q1, q2, q1
; ENABLED-NEXT: subs r6, #1
; ENABLED-NEXT: vshl.s32 q1, r5
; ENABLED-NEXT: vmul.i32 q1, q2, q1
; ENABLED-NEXT: subs r4, #4
; ENABLED-NEXT: vshl.s32 q1, r5
; ENABLED-NEXT: vadd.i32 q1, q1, q0
; ENABLED-NEXT: le lr, .LBB0_6
; ENABLED-NEXT: @ %bb.7: @ %middle.block
Expand Down Expand Up @@ -142,10 +142,10 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
; NOREDUCTIONS-NEXT: vldrht.s32 q1, [r0], #8
; NOREDUCTIONS-NEXT: vldrht.s32 q2, [r7], #8
; NOREDUCTIONS-NEXT: mov lr, r6
; NOREDUCTIONS-NEXT: vmul.i32 q1, q2, q1
; NOREDUCTIONS-NEXT: subs r6, #1
; NOREDUCTIONS-NEXT: vshl.s32 q1, r5
; NOREDUCTIONS-NEXT: vmul.i32 q1, q2, q1
; NOREDUCTIONS-NEXT: subs r4, #4
; NOREDUCTIONS-NEXT: vshl.s32 q1, r5
; NOREDUCTIONS-NEXT: vadd.i32 q1, q1, q0
; NOREDUCTIONS-NEXT: le lr, .LBB0_6
; NOREDUCTIONS-NEXT: @ %bb.7: @ %middle.block
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/Thumb2/active_lane_mask.ll
Expand Up @@ -342,11 +342,11 @@ define <16 x i8> @v16i8(i32 %index, i32 %TC, <16 x i8> %V1, <16 x i8> %V2) {
; CHECK-NEXT: add r0, sp, #88
; CHECK-NEXT: vcmp.i8 ne, q3, zr
; CHECK-NEXT: vldr d1, [sp, #80]
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vpnot
; CHECK-NEXT: vmov d0, r2, r3
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vpst
; CHECK-NEXT: vcmpt.i8 ne, q2, zr
; CHECK-NEXT: vmov d0, r2, r3
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov r2, r3, d1
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
Expand Up @@ -453,8 +453,8 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: @ Parent Loop BB1_8 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
; CHECK-NEXT: vshl.i32 q2, q1, #2
; CHECK-NEXT: vadd.i32 q1, q1, q6
; CHECK-NEXT: vadd.i32 q2, q2, r10
; CHECK-NEXT: vadd.i32 q1, q1, q6
; CHECK-NEXT: vstrw.32 q0, [q2]
; CHECK-NEXT: letp lr, .LBB1_10
; CHECK-NEXT: b .LBB1_13
Expand All @@ -467,8 +467,8 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: @ Parent Loop BB1_8 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
; CHECK-NEXT: vshl.i32 q2, q1, #2
; CHECK-NEXT: vadd.i32 q1, q1, q5
; CHECK-NEXT: vadd.i32 q2, q2, r10
; CHECK-NEXT: vadd.i32 q1, q1, q5
; CHECK-NEXT: vstrw.32 q0, [q2]
; CHECK-NEXT: letp lr, .LBB1_12
; CHECK-NEXT: .LBB1_13: @ %for.cond9.for.cond15.preheader_crit_edge.us
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
Expand Up @@ -835,10 +835,10 @@ define void @arm_fir_f32_1_4_mve(%struct.arm_fir_instance_f32* nocapture readonl
; CHECK-NEXT: and r5, r3, #3
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vctp.16 r5
; CHECK-NEXT: add.w r1, r10, #2
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrht.16 q0, [r4]
; CHECK-NEXT: vldrw.u32 q0, [r10]
; CHECK-NEXT: add.w r1, r10, #2
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: add.w r1, r10, #6
; CHECK-NEXT: vmul.f16 q0, q0, r7
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
Expand Up @@ -1481,9 +1481,9 @@ define void @shlor(i32* nocapture %x, i32* noalias nocapture readonly %y, i32 %n
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q4, [q3, #128]!
; CHECK-NEXT: vldrw.u32 q5, [q2, #128]!
; CHECK-NEXT: vldrw.u32 q6, [q0, #128]!
; CHECK-NEXT: vadd.i32 q4, q5, q4
; CHECK-NEXT: vldrw.u32 q5, [q1, #128]!
; CHECK-NEXT: vldrw.u32 q6, [q0, #128]!
; CHECK-NEXT: vadd.i32 q4, q4, q5
; CHECK-NEXT: vadd.i32 q4, q4, q6
; CHECK-NEXT: vstrw.32 q4, [r0], #16
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
Expand Up @@ -231,8 +231,8 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru_icmp1(i16* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: scaled_v8i16_i16_passthru_icmp1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i16 q0, #0x1
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: vmov.i16 q0, #0x1
; CHECK-NEXT: vpt.s16 gt, q1, zr
; CHECK-NEXT: vldrht.u16 q2, [r0, q1, uxtw #1]
; CHECK-NEXT: vpsel q0, q2, q0
Expand Down

0 comments on commit 0912440

Please sign in to comment.