@@ -7,7 +7,7 @@
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv8.1m.main-arm-unknown-eabi"

-define dso_local void @use_before_def(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+define dso_local void @use_before_def(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
%cmp8 = icmp sgt i32 %N, 0
%0 = add i32 %N, 3
@@ -23,23 +23,23 @@
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
-%lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
-%lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
-%lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
+%lsr.iv17 = phi ptr [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
+%lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
+%lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %vector.ph ]
%6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ]
%7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
-%lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
-%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
-%lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
+%lsr.iv13 = bitcast ptr %lsr.iv to ptr
+%lsr.iv1416 = bitcast ptr %lsr.iv14 to ptr
+%lsr.iv1719 = bitcast ptr %lsr.iv17 to ptr
%8 = call <4 x i1> @llvm.arm.vctp32(i32 %7)
%9 = sub i32 %7, 4
-%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef)
-%wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef)
+%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef)
+%wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef)
%10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
-call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8)
-%scevgep = getelementptr i32, i32* %lsr.iv, i32 4
-%scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
-%scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
+call void @llvm.masked.store.v4i32.p0(<4 x i32> %10, ptr %lsr.iv1719, i32 4, <4 x i1> %8)
+%scevgep = getelementptr i32, ptr %lsr.iv, i32 4
+%scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
+%scevgep18 = getelementptr i32, ptr %lsr.iv17, i32 4
%11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
%12 = icmp ne i32 %11, 0
br i1 %12, label %vector.body, label %for.cond.cleanup
@@ -50,9 +50,9 @@
declare i32 @llvm.start.loop.iterations.i32(i32)
declare <4 x i1> @llvm.arm.vctp32(i32)
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
-declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
-declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
-declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
+declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
+declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)

...
---
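The edits across all of these test files are one mechanical migration: LLVM's switch from typed pointers to opaque pointers. Every i32*, i16*, i8*, float* and <N x iM>* becomes plain ptr, the pointer-to-pointer bitcasts degenerate into ptr-to-ptr no-ops that survive only to keep the downstream value names stable, and the pointee type drops out of the intrinsic name mangling, so masked.load.v4i32.p0v4i32 becomes masked.load.v4i32.p0. A minimal before/after sketch of the pattern; the value names here are illustrative rather than taken from any one test:

  ; before (typed pointers)
  %cast = bitcast i32* %base to <4 x i32>*
  %v = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %cast, i32 4, <4 x i1> %mask, <4 x i32> undef)

  ; after (opaque pointers, identical semantics)
  %cast = bitcast ptr %base to ptr
  %v = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %cast, i32 4, <4 x i1> %mask, <4 x i32> undef)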
178 changes: 89 additions & 89 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/emptyblock.mir

Large diffs are not rendered by default.

20 changes: 10 additions & 10 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir
@@ -5,7 +5,7 @@
# predication.

--- |
-define dso_local i32 @no_vpsel_liveout(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) local_unnamed_addr #0 {
+define dso_local i32 @no_vpsel_liveout(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) local_unnamed_addr #0 {
entry:
%cmp9 = icmp eq i32 %N, 0
%tmp = add i32 %N, 3
@@ -22,22 +22,22 @@

vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
-%lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
-%lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
+%lsr.iv18 = phi ptr [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
+%lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %a, %vector.ph ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp13, %vector.body ]
%tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
-%lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
-%lsr.iv1820 = bitcast i16* %lsr.iv18 to <4 x i16>*
+%lsr.iv17 = bitcast ptr %lsr.iv to ptr
+%lsr.iv1820 = bitcast ptr %lsr.iv18 to ptr
%tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7)
%tmp9 = sub i32 %tmp7, 4
-%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
-%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp11 = sext <4 x i16> %wide.masked.load14 to <4 x i32>
%tmp12 = mul nsw <4 x i32> %tmp11, %tmp10
%tmp13 = add <4 x i32> %tmp12, %vec.phi
-%scevgep = getelementptr i16, i16* %lsr.iv, i32 4
-%scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 4
+%scevgep = getelementptr i16, ptr %lsr.iv, i32 4
+%scevgep19 = getelementptr i16, ptr %lsr.iv18, i32 4
%tmp14 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
%tmp15 = icmp ne i32 %tmp14, 0
%lsr.iv.next = add nsw i32 %lsr.iv1, -1
@@ -51,7 +51,7 @@
%res.0.lcssa = phi i32 [ 0, %entry ], [ %tmp16, %middle.block ]
ret i32 %res.0.lcssa
}
-declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32 immarg, <4 x i1>, <4 x i16>)
declare i32 @llvm.start.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
30 changes: 15 additions & 15 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-16.mir
@@ -1,7 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
--- |
-define dso_local void @incorrect_sub_16(i16* noalias nocapture %A, i16* noalias nocapture readonly %B, i16* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+define dso_local void @incorrect_sub_16(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
%cmp8 = icmp sgt i32 %N, 0
%0 = add i32 %N, 3
@@ -17,23 +17,23 @@
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
-%lsr.iv17 = phi i16* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
-%lsr.iv14 = phi i16* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
-%lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
+%lsr.iv17 = phi ptr [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
+%lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
+%lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %vector.ph ]
%6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ]
%7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
-%lsr.iv13 = bitcast i16* %lsr.iv to <8 x i16>*
-%lsr.iv1416 = bitcast i16* %lsr.iv14 to <8 x i16>*
-%lsr.iv1719 = bitcast i16* %lsr.iv17 to <8 x i16>*
+%lsr.iv13 = bitcast ptr %lsr.iv to ptr
+%lsr.iv1416 = bitcast ptr %lsr.iv14 to ptr
+%lsr.iv1719 = bitcast ptr %lsr.iv17 to ptr
%8 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %7)
%9 = sub i32 %7, 7
-%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv13, i32 4, <8 x i1> %8, <8 x i16> undef)
-%wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv1416, i32 4, <8 x i1> %8, <8 x i16> undef)
+%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %lsr.iv13, i32 4, <8 x i1> %8, <8 x i16> undef)
+%wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %lsr.iv1416, i32 4, <8 x i1> %8, <8 x i16> undef)
%10 = add nsw <8 x i16> %wide.masked.load12, %wide.masked.load
-call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %10, <8 x i16>* %lsr.iv1719, i32 4, <8 x i1> %8)
-%scevgep = getelementptr i16, i16* %lsr.iv, i32 8
-%scevgep15 = getelementptr i16, i16* %lsr.iv14, i32 8
-%scevgep18 = getelementptr i16, i16* %lsr.iv17, i32 8
+call void @llvm.masked.store.v8i16.p0(<8 x i16> %10, ptr %lsr.iv1719, i32 4, <8 x i1> %8)
+%scevgep = getelementptr i16, ptr %lsr.iv, i32 8
+%scevgep15 = getelementptr i16, ptr %lsr.iv14, i32 8
+%scevgep18 = getelementptr i16, ptr %lsr.iv17, i32 8
%11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
%12 = icmp ne i32 %11, 0
br i1 %12, label %vector.body, label %for.cond.cleanup
@@ -44,8 +44,8 @@
declare i32 @llvm.start.loop.iterations.i32(i32)
declare <8 x i1> @llvm.arm.mve.vctp16(i32)
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
-declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
-declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>)
+declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32 immarg, <8 x i1>)
...
---
name: incorrect_sub_16
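The three incorrect-sub tests probe one invariant of tail predication: the per-iteration decrement of the element counter must equal the number of lanes the VCTP intrinsic predicates. vctp16 predicates 8 halfword lanes, so the file above deliberately subtracts 7; the two files that follow play the same trick for word lanes (vctp32 with a subtract of 5 instead of 4) and byte lanes (vctp8 with a subtract of 15 instead of 16). A sketch of the shape the pass does accept, assuming the surrounding loop is unchanged and with illustrative value names:

  %mask = call <8 x i1> @llvm.arm.mve.vctp16(i32 %elts)  ; predicates 8 halfword lanes
  %elts.next = sub i32 %elts, 8                          ; decrement matches the lane count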
30 changes: 15 additions & 15 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-32.mir
@@ -8,7 +8,7 @@
# We should optimise away the SUB

--- |
-define dso_local void @incorrect_sub_32(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+define dso_local void @incorrect_sub_32(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
%cmp8 = icmp sgt i32 %N, 0
%0 = add i32 %N, 3
@@ -24,23 +24,23 @@
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
-%lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
-%lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
-%lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
+%lsr.iv17 = phi ptr [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
+%lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
+%lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %vector.ph ]
%6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ]
%7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
-%lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
-%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
-%lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
+%lsr.iv13 = bitcast ptr %lsr.iv to ptr
+%lsr.iv1416 = bitcast ptr %lsr.iv14 to ptr
+%lsr.iv1719 = bitcast ptr %lsr.iv17 to ptr
%8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7)
%9 = sub i32 %7, 5
-%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef)
-%wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef)
+%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef)
+%wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef)
%10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
-call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8)
-%scevgep = getelementptr i32, i32* %lsr.iv, i32 4
-%scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
-%scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
+call void @llvm.masked.store.v4i32.p0(<4 x i32> %10, ptr %lsr.iv1719, i32 4, <4 x i1> %8)
+%scevgep = getelementptr i32, ptr %lsr.iv, i32 4
+%scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
+%scevgep18 = getelementptr i32, ptr %lsr.iv17, i32 4
%11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
%12 = icmp ne i32 %11, 0
br i1 %12, label %vector.body, label %for.cond.cleanup
@@ -51,8 +51,8 @@
declare i32 @llvm.start.loop.iterations.i32(i32)
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
-declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
-declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
+declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
+declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)

...
---
32 changes: 16 additions & 16 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/incorrect-sub-8.mir
@@ -1,7 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
--- |
-define dso_local void @incorrect_sub_8(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, i8* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+define dso_local void @incorrect_sub_8(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
%cmp8 = icmp sgt i32 %N, 0
%0 = add i32 %N, 3
@@ -17,23 +17,23 @@
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
-%lsr.iv17 = phi i8* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
-%lsr.iv14 = phi i8* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
-%lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
+%lsr.iv17 = phi ptr [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
+%lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
+%lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %vector.ph ]
%6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ]
%7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
-%lsr.iv13 = bitcast i8* %lsr.iv to <16 x i8>*
-%lsr.iv1416 = bitcast i8* %lsr.iv14 to <16 x i8>*
-%lsr.iv1719 = bitcast i8* %lsr.iv17 to <16 x i8>*
+%lsr.iv13 = bitcast ptr %lsr.iv to ptr
+%lsr.iv1416 = bitcast ptr %lsr.iv14 to ptr
+%lsr.iv1719 = bitcast ptr %lsr.iv17 to ptr
%8 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %7)
%9 = sub i32 %7, 15
-%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv13, i32 4, <16 x i1> %8, <16 x i8> undef)
-%wide.masked.load12 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv1416, i32 4, <16 x i1> %8, <16 x i8> undef)
+%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %lsr.iv13, i32 4, <16 x i1> %8, <16 x i8> undef)
+%wide.masked.load12 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %lsr.iv1416, i32 4, <16 x i1> %8, <16 x i8> undef)
%10 = add nsw <16 x i8> %wide.masked.load12, %wide.masked.load
-call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %10, <16 x i8>* %lsr.iv1719, i32 4, <16 x i1> %8)
-%scevgep = getelementptr i8, i8* %lsr.iv, i32 16
-%scevgep15 = getelementptr i8, i8* %lsr.iv14, i32 16
-%scevgep18 = getelementptr i8, i8* %lsr.iv17, i32 16
+call void @llvm.masked.store.v16i8.p0(<16 x i8> %10, ptr %lsr.iv1719, i32 4, <16 x i1> %8)
+%scevgep = getelementptr i8, ptr %lsr.iv, i32 16
+%scevgep15 = getelementptr i8, ptr %lsr.iv14, i32 16
+%scevgep18 = getelementptr i8, ptr %lsr.iv17, i32 16
%11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
%12 = icmp ne i32 %11, 0
br i1 %12, label %vector.body, label %for.cond.cleanup
@@ -44,9 +44,9 @@
declare i32 @llvm.start.loop.iterations.i32(i32)
declare <16 x i1> @llvm.arm.mve.vctp8(i32)
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
-declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
-declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
-declare void @llvm.stackprotector(i8*, i8**)
+declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>)
+declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32 immarg, <16 x i1>)
+declare void @llvm.stackprotector(ptr, ptr)
...
---
name: incorrect_sub_8
46 changes: 23 additions & 23 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-1.mir
@@ -4,7 +4,7 @@
# Test that VPNOTs cannot be within a tail predicated loop.

--- |
-define dso_local void @inloop_vpnot(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %c, i16* nocapture readonly %d, i32* nocapture %e, i32 %N) local_unnamed_addr #0 {
+define dso_local void @inloop_vpnot(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture readonly %c, ptr nocapture readonly %d, ptr nocapture %e, i32 %N) local_unnamed_addr #0 {
entry:
%cmp9 = icmp eq i32 %N, 0
%tmp = add i32 %N, 3
@@ -21,39 +21,39 @@

vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
-%lsr.iv.e = phi i32* [ %scevgep.e, %vector.body ], [ %e, %vector.ph ]
-%lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
-%lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
-%lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
-%lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
+%lsr.iv.e = phi ptr [ %scevgep.e, %vector.body ], [ %e, %vector.ph ]
+%lsr.iv.d = phi ptr [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
+%lsr.iv.c = phi ptr [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
+%lsr.iv18 = phi ptr [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
+%lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %a, %vector.ph ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp14, %vector.body ]
%tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
-%lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
-%lsr.iv1820 = bitcast i16* %lsr.iv18 to <4 x i16>*
-%lsr.iv1820.c = bitcast i16* %lsr.iv.c to <4 x i16>*
-%lsr.iv17.d = bitcast i16* %lsr.iv.d to <4 x i16>*
-%lsr.cast.e = bitcast i32* %lsr.iv.e to <4 x i32>*
+%lsr.iv17 = bitcast ptr %lsr.iv to ptr
+%lsr.iv1820 = bitcast ptr %lsr.iv18 to ptr
+%lsr.iv1820.c = bitcast ptr %lsr.iv.c to ptr
+%lsr.iv17.d = bitcast ptr %lsr.iv.d to ptr
+%lsr.cast.e = bitcast ptr %lsr.iv.e to ptr
%tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7)
%tmp9 = sub i32 %tmp7, 4
-%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
-%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp11 = sext <4 x i16> %wide.masked.load14 to <4 x i32>
-%wide.masked.load.c = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820.c, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load.c = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv1820.c, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%sext.load.c = sext <4 x i16> %wide.masked.load.c to <4 x i32>
-%wide.masked.load.d = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17.d, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load.d = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv17.d, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%sext.load.d = sext <4 x i16> %wide.masked.load.d to <4 x i32>
%tmp12 = mul nsw <4 x i32> %tmp11, %tmp10
%mul.2 = mul nsw <4 x i32> %sext.load.c, %sext.load.d
%tmp13 = add <4 x i32> %tmp12, %mul.2
%tmp14 = add <4 x i32> %tmp13, %vec.phi
%vpnot = xor <4 x i1> %tmp8, <i1 true, i1 true, i1 true, i1 true>
-call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp14, <4 x i32>* %lsr.cast.e, i32 4, <4 x i1> %vpnot)
-%scevgep = getelementptr i16, i16* %lsr.iv, i32 4
-%scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 4
-%scevgep.c = getelementptr i16, i16* %lsr.iv.c, i32 4
-%scevgep.d = getelementptr i16, i16* %lsr.iv.d, i32 4
-%scevgep.e = getelementptr i32, i32* %lsr.iv.e, i32 4
+call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp14, ptr %lsr.cast.e, i32 4, <4 x i1> %vpnot)
+%scevgep = getelementptr i16, ptr %lsr.iv, i32 4
+%scevgep19 = getelementptr i16, ptr %lsr.iv18, i32 4
+%scevgep.c = getelementptr i16, ptr %lsr.iv.c, i32 4
+%scevgep.d = getelementptr i16, ptr %lsr.iv.d, i32 4
+%scevgep.e = getelementptr i32, ptr %lsr.iv.e, i32 4
%tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
%lsr.iv.next = add nsw i32 %lsr.iv1, -1
@@ -62,8 +62,8 @@
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
-declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
-declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
+declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32 immarg, <4 x i1>, <4 x i16>) #1
+declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>) #2
declare i32 @llvm.start.loop.iterations.i32(i32) #3
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
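The xor of the VCTP mask against an all-true splat is the IR form of the VPNOT these tests are named for, and it is why the loops must stay non-tail-predicated: once a loop is tail predicated the VCTP mask becomes implicit, and inverting it would enable exactly the lanes beyond the element count that tail predication exists to keep inactive. The offending pattern, lifted from the IR above:

  %tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7)
  %vpnot = xor <4 x i1> %tmp8, <i1 true, i1 true, i1 true, i1 true>
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp14, ptr %lsr.cast.e, i32 4, <4 x i1> %vpnot)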
46 changes: 23 additions & 23 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-2.mir
@@ -4,7 +4,7 @@
# Test that a predicated VPNOT cannot be in a tail predicated loop.

--- |
-define dso_local void @inloop_vpnot(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %c, i16* nocapture readonly %d, i32* nocapture %e, i32 %N) local_unnamed_addr #0 {
+define dso_local void @inloop_vpnot(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture readonly %c, ptr nocapture readonly %d, ptr nocapture %e, i32 %N) local_unnamed_addr #0 {
entry:
%cmp9 = icmp eq i32 %N, 0
%tmp = add i32 %N, 3
@@ -21,39 +21,39 @@

vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
-%lsr.iv.e = phi i32* [ %scevgep.e, %vector.body ], [ %e, %vector.ph ]
-%lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
-%lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
-%lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
-%lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
+%lsr.iv.e = phi ptr [ %scevgep.e, %vector.body ], [ %e, %vector.ph ]
+%lsr.iv.d = phi ptr [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
+%lsr.iv.c = phi ptr [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
+%lsr.iv18 = phi ptr [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
+%lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %a, %vector.ph ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp14, %vector.body ]
%tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
-%lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
-%lsr.iv1820 = bitcast i16* %lsr.iv18 to <4 x i16>*
-%lsr.iv1820.c = bitcast i16* %lsr.iv.c to <4 x i16>*
-%lsr.iv17.d = bitcast i16* %lsr.iv.d to <4 x i16>*
-%lsr.cast.e = bitcast i32* %lsr.iv.e to <4 x i32>*
+%lsr.iv17 = bitcast ptr %lsr.iv to ptr
+%lsr.iv1820 = bitcast ptr %lsr.iv18 to ptr
+%lsr.iv1820.c = bitcast ptr %lsr.iv.c to ptr
+%lsr.iv17.d = bitcast ptr %lsr.iv.d to ptr
+%lsr.cast.e = bitcast ptr %lsr.iv.e to ptr
%tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7)
%tmp9 = sub i32 %tmp7, 4
-%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
-%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp11 = sext <4 x i16> %wide.masked.load14 to <4 x i32>
-%wide.masked.load.c = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820.c, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load.c = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv1820.c, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%sext.load.c = sext <4 x i16> %wide.masked.load.c to <4 x i32>
-%wide.masked.load.d = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17.d, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load.d = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv17.d, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%sext.load.d = sext <4 x i16> %wide.masked.load.d to <4 x i32>
%tmp12 = mul nsw <4 x i32> %tmp11, %tmp10
%mul.2 = mul nsw <4 x i32> %sext.load.c, %sext.load.d
%tmp13 = add <4 x i32> %tmp12, %mul.2
%tmp14 = add <4 x i32> %tmp13, %vec.phi
%vpnot = xor <4 x i1> %tmp8, <i1 true, i1 true, i1 true, i1 true>
-call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp14, <4 x i32>* %lsr.cast.e, i32 4, <4 x i1> %vpnot)
-%scevgep = getelementptr i16, i16* %lsr.iv, i32 4
-%scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 4
-%scevgep.c = getelementptr i16, i16* %lsr.iv.c, i32 4
-%scevgep.d = getelementptr i16, i16* %lsr.iv.d, i32 4
-%scevgep.e = getelementptr i32, i32* %lsr.iv.e, i32 4
+call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp14, ptr %lsr.cast.e, i32 4, <4 x i1> %vpnot)
+%scevgep = getelementptr i16, ptr %lsr.iv, i32 4
+%scevgep19 = getelementptr i16, ptr %lsr.iv18, i32 4
+%scevgep.c = getelementptr i16, ptr %lsr.iv.c, i32 4
+%scevgep.d = getelementptr i16, ptr %lsr.iv.d, i32 4
+%scevgep.e = getelementptr i32, ptr %lsr.iv.e, i32 4
%tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
%lsr.iv.next = add nsw i32 %lsr.iv1, -1
@@ -62,8 +62,8 @@
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
-declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
-declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
+declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32 immarg, <4 x i1>, <4 x i16>) #1
+declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>) #2
declare i32 @llvm.start.loop.iterations.i32(i32) #3
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
46 changes: 23 additions & 23 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpnot-3.mir
@@ -4,7 +4,7 @@
# Test that a VPNOT is not added to a max sized VPT block.

--- |
-define dso_local void @inloop_vpnot(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %c, i16* nocapture readonly %d, i32* nocapture %e, i32 %N) local_unnamed_addr #0 {
+define dso_local void @inloop_vpnot(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture readonly %c, ptr nocapture readonly %d, ptr nocapture %e, i32 %N) local_unnamed_addr #0 {
entry:
%cmp9 = icmp eq i32 %N, 0
%tmp = add i32 %N, 3
@@ -21,39 +21,39 @@

vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
-%lsr.iv.e = phi i32* [ %scevgep.e, %vector.body ], [ %e, %vector.ph ]
-%lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
-%lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
-%lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
-%lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
+%lsr.iv.e = phi ptr [ %scevgep.e, %vector.body ], [ %e, %vector.ph ]
+%lsr.iv.d = phi ptr [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
+%lsr.iv.c = phi ptr [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
+%lsr.iv18 = phi ptr [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
+%lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %a, %vector.ph ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp14, %vector.body ]
%tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
-%lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
-%lsr.iv1820 = bitcast i16* %lsr.iv18 to <4 x i16>*
-%lsr.iv1820.c = bitcast i16* %lsr.iv.c to <4 x i16>*
-%lsr.iv17.d = bitcast i16* %lsr.iv.d to <4 x i16>*
-%lsr.cast.e = bitcast i32* %lsr.iv.e to <4 x i32>*
+%lsr.iv17 = bitcast ptr %lsr.iv to ptr
+%lsr.iv1820 = bitcast ptr %lsr.iv18 to ptr
+%lsr.iv1820.c = bitcast ptr %lsr.iv.c to ptr
+%lsr.iv17.d = bitcast ptr %lsr.iv.d to ptr
+%lsr.cast.e = bitcast ptr %lsr.iv.e to ptr
%tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7)
%tmp9 = sub i32 %tmp7, 4
-%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
-%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp11 = sext <4 x i16> %wide.masked.load14 to <4 x i32>
-%wide.masked.load.c = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820.c, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load.c = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv1820.c, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%sext.load.c = sext <4 x i16> %wide.masked.load.c to <4 x i32>
-%wide.masked.load.d = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17.d, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load.d = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv17.d, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%sext.load.d = sext <4 x i16> %wide.masked.load.d to <4 x i32>
%tmp12 = mul nsw <4 x i32> %tmp11, %tmp10
%mul.2 = mul nsw <4 x i32> %sext.load.c, %sext.load.d
%tmp13 = add <4 x i32> %tmp12, %mul.2
%tmp14 = add <4 x i32> %tmp13, %vec.phi
%vpnot = xor <4 x i1> %tmp8, <i1 true, i1 true, i1 true, i1 true>
-call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp14, <4 x i32>* %lsr.cast.e, i32 4, <4 x i1> %vpnot)
-%scevgep = getelementptr i16, i16* %lsr.iv, i32 4
-%scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 4
-%scevgep.c = getelementptr i16, i16* %lsr.iv.c, i32 4
-%scevgep.d = getelementptr i16, i16* %lsr.iv.d, i32 4
-%scevgep.e = getelementptr i32, i32* %lsr.iv.e, i32 4
+call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp14, ptr %lsr.cast.e, i32 4, <4 x i1> %vpnot)
+%scevgep = getelementptr i16, ptr %lsr.iv, i32 4
+%scevgep19 = getelementptr i16, ptr %lsr.iv18, i32 4
+%scevgep.c = getelementptr i16, ptr %lsr.iv.c, i32 4
+%scevgep.d = getelementptr i16, ptr %lsr.iv.d, i32 4
+%scevgep.e = getelementptr i32, ptr %lsr.iv.e, i32 4
%tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
%lsr.iv.next = add nsw i32 %lsr.iv1, -1
@@ -62,8 +62,8 @@
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
-declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
-declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
+declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32 immarg, <4 x i1>, <4 x i16>) #1
+declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>) #2
declare i32 @llvm.start.loop.iterations.i32(i32) #3
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
36 changes: 18 additions & 18 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir
@@ -4,7 +4,7 @@
# General test for vpsel exclusion from tail predication

--- |
-define dso_local i32 @vpsel_after_vpt(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %c, i16* nocapture readonly %d, i32 %N) local_unnamed_addr #0 {
+define dso_local i32 @vpsel_after_vpt(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture readonly %c, ptr nocapture readonly %d, i32 %N) local_unnamed_addr #0 {
entry:
%cmp9 = icmp eq i32 %N, 0
%tmp = add i32 %N, 3
@@ -21,35 +21,35 @@

vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
-%lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
-%lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
-%lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
-%lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
+%lsr.iv.d = phi ptr [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
+%lsr.iv.c = phi ptr [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
+%lsr.iv18 = phi ptr [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
+%lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %a, %vector.ph ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp14, %vector.body ]
%tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
-%lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
-%lsr.iv1820 = bitcast i16* %lsr.iv18 to <4 x i16>*
-%lsr.iv1820.c = bitcast i16* %lsr.iv.c to <4 x i16>*
-%lsr.iv17.d = bitcast i16* %lsr.iv.d to <4 x i16>*
+%lsr.iv17 = bitcast ptr %lsr.iv to ptr
+%lsr.iv1820 = bitcast ptr %lsr.iv18 to ptr
+%lsr.iv1820.c = bitcast ptr %lsr.iv.c to ptr
+%lsr.iv17.d = bitcast ptr %lsr.iv.d to ptr
%tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7)
%tmp9 = sub i32 %tmp7, 4
-%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
-%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp11 = sext <4 x i16> %wide.masked.load14 to <4 x i32>
-%wide.masked.load.c = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820.c, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load.c = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv1820.c, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%sext.load.c = sext <4 x i16> %wide.masked.load.c to <4 x i32>
-%wide.masked.load.d = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17.d, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load.d = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv17.d, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%sext.load.d = sext <4 x i16> %wide.masked.load.d to <4 x i32>
%tmp12 = mul nsw <4 x i32> %tmp11, %tmp10
%mul.2 = mul nsw <4 x i32> %sext.load.c, %sext.load.d
%tmp13 = add <4 x i32> %tmp12, %mul.2
%acc = add <4 x i32> %tmp13, %vec.phi
%tmp14 = select <4 x i1> %tmp8, <4 x i32> %acc, <4 x i32> %vec.phi
-%scevgep = getelementptr i16, i16* %lsr.iv, i32 4
-%scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 4
-%scevgep.c = getelementptr i16, i16* %lsr.iv.c, i32 4
-%scevgep.d = getelementptr i16, i16* %lsr.iv.d, i32 4
+%scevgep = getelementptr i16, ptr %lsr.iv, i32 4
+%scevgep19 = getelementptr i16, ptr %lsr.iv18, i32 4
+%scevgep.c = getelementptr i16, ptr %lsr.iv.c, i32 4
+%scevgep.d = getelementptr i16, ptr %lsr.iv.d, i32 4
%tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
%lsr.iv.next = add nsw i32 %lsr.iv1, -1
@@ -63,7 +63,7 @@
%res.0.lcssa = phi i32 [ 0, %entry ], [ %tmp17, %middle.block ]
ret i32 %res.0.lcssa
}
-declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
+declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32 immarg, <4 x i1>, <4 x i16>) #1
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2
declare i32 @llvm.start.loop.iterations.i32(i32) #3
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
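The select on the VCTP mask is the IR form of the VPSEL in these test names: it merges the new accumulator value into the reduction only in the active lanes and keeps the previous %vec.phi in the inactive ones. Whether that select can be elided under tail predication depends on the inactive lanes being provably irrelevant, which is what the two vpsel tests exercise. The reduction idiom, lifted from the IR above:

  %acc = add <4 x i32> %tmp13, %vec.phi
  %tmp14 = select <4 x i1> %tmp8, <4 x i32> %acc, <4 x i32> %vec.phi  ; VPSEL: inactive lanes keep %vec.phi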
36 changes: 18 additions & 18 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir
@@ -6,7 +6,7 @@
# the block.

--- |
-define dso_local i32 @vpsel_after_vpt(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %c, i16* nocapture readonly %d, i32 %N) local_unnamed_addr #0 {
+define dso_local i32 @vpsel_after_vpt(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture readonly %c, ptr nocapture readonly %d, i32 %N) local_unnamed_addr #0 {
entry:
%cmp9 = icmp eq i32 %N, 0
%tmp = add i32 %N, 3
@@ -23,35 +23,35 @@

vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
-%lsr.iv.d = phi i16* [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
-%lsr.iv.c = phi i16* [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
-%lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
-%lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
+%lsr.iv.d = phi ptr [ %scevgep.d, %vector.body ], [ %d, %vector.ph ]
+%lsr.iv.c = phi ptr [ %scevgep.c, %vector.body ], [ %c, %vector.ph ]
+%lsr.iv18 = phi ptr [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
+%lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %a, %vector.ph ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp14, %vector.body ]
%tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
-%lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
-%lsr.iv1820 = bitcast i16* %lsr.iv18 to <4 x i16>*
-%lsr.iv1820.c = bitcast i16* %lsr.iv.c to <4 x i16>*
-%lsr.iv17.d = bitcast i16* %lsr.iv.d to <4 x i16>*
+%lsr.iv17 = bitcast ptr %lsr.iv to ptr
+%lsr.iv1820 = bitcast ptr %lsr.iv18 to ptr
+%lsr.iv1820.c = bitcast ptr %lsr.iv.c to ptr
+%lsr.iv17.d = bitcast ptr %lsr.iv.d to ptr
%tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7)
%tmp9 = sub i32 %tmp7, 4
-%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
-%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp11 = sext <4 x i16> %wide.masked.load14 to <4 x i32>
-%wide.masked.load.c = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820.c, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load.c = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv1820.c, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%sext.load.c = sext <4 x i16> %wide.masked.load.c to <4 x i32>
-%wide.masked.load.d = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17.d, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load.d = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv17.d, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%sext.load.d = sext <4 x i16> %wide.masked.load.d to <4 x i32>
%tmp12 = mul nsw <4 x i32> %tmp11, %tmp10
%mul.2 = mul nsw <4 x i32> %sext.load.c, %sext.load.d
%tmp13 = add <4 x i32> %tmp12, %mul.2
%acc = add <4 x i32> %tmp13, %vec.phi
%tmp14 = select <4 x i1> %tmp8, <4 x i32> %acc, <4 x i32> %vec.phi
-%scevgep = getelementptr i16, i16* %lsr.iv, i32 4
-%scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 4
-%scevgep.c = getelementptr i16, i16* %lsr.iv.c, i32 4
-%scevgep.d = getelementptr i16, i16* %lsr.iv.d, i32 4
+%scevgep = getelementptr i16, ptr %lsr.iv, i32 4
+%scevgep19 = getelementptr i16, ptr %lsr.iv18, i32 4
+%scevgep.c = getelementptr i16, ptr %lsr.iv.c, i32 4
+%scevgep.d = getelementptr i16, ptr %lsr.iv.d, i32 4
%tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
%lsr.iv.next = add nsw i32 %lsr.iv1, -1
@@ -65,7 +65,7 @@
%res.0.lcssa = phi i32 [ 0, %entry ], [ %tmp17, %middle.block ]
ret i32 %res.0.lcssa
}
-declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1
+declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32 immarg, <4 x i1>, <4 x i16>) #1
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2
declare i32 @llvm.start.loop.iterations.i32(i32) #3
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
42 changes: 21 additions & 21 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir
@@ -2,7 +2,7 @@
# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s

--- |
-define dso_local <4 x i32> @invariant_use_store(i16* nocapture readonly %a, i32* %c, i32 %N, <4 x i32> %pass) {
+define dso_local <4 x i32> @invariant_use_store(ptr nocapture readonly %a, ptr %c, i32 %N, <4 x i32> %pass) {
entry:
%cmp9 = icmp eq i32 %N, 0
%tmp = add i32 %N, 3
@@ -19,21 +19,21 @@

vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
-%lsr.iv20 = phi i32* [ %scevgep20, %vector.body ], [ %c, %vector.ph ]
-%lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
+%lsr.iv20 = phi ptr [ %scevgep20, %vector.body ], [ %c, %vector.ph ]
+%lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %a, %vector.ph ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp13, %vector.body ]
%tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
-%lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
-%lsr.store = bitcast i32* %lsr.iv20 to <4 x i32>*
+%lsr.iv17 = bitcast ptr %lsr.iv to ptr
+%lsr.store = bitcast ptr %lsr.iv20 to ptr
%tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7)
%tmp9 = sub i32 %tmp7, 4
-%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
%tmp12 = mul nsw <4 x i32> %pass, %tmp10
%tmp13 = add <4 x i32> %tmp12, %vec.phi
-call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp13, <4 x i32>* %lsr.store, i32 4, <4 x i1> %tmp8)
-%scevgep = getelementptr i16, i16* %lsr.iv, i32 4
-%scevgep20 = getelementptr i32, i32* %lsr.iv20, i32 4
+call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp13, ptr %lsr.store, i32 4, <4 x i1> %tmp8)
+%scevgep = getelementptr i16, ptr %lsr.iv, i32 4
+%scevgep20 = getelementptr i32, ptr %lsr.iv20, i32 4
%tmp14 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
%tmp15 = icmp ne i32 %tmp14, 0
%lsr.iv.next = add nsw i32 %lsr.iv1, -1
@@ -43,7 +43,7 @@
ret <4 x i32> %pass
}

-define dso_local i32 @invariant_mul_use_reduce(i16* nocapture readonly %a, i32* %c, i32 %N, <4 x i32> %pass) {
+define dso_local i32 @invariant_mul_use_reduce(ptr nocapture readonly %a, ptr %c, i32 %N, <4 x i32> %pass) {
entry:
%cmp9 = icmp eq i32 %N, 0
%tmp = add i32 %N, 3
@@ -60,16 +60,16 @@

vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
-%lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
+%lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %a, %vector.ph ]
%tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
-%lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
+%lsr.iv17 = bitcast ptr %lsr.iv to ptr
%tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7)
%tmp9 = sub i32 %tmp7, 4
-%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
%tmp12 = mul nsw <4 x i32> %pass, %tmp10
%tmp13 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp12)
-%scevgep = getelementptr i16, i16* %lsr.iv, i32 4
+%scevgep = getelementptr i16, ptr %lsr.iv, i32 4
%tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
%lsr.iv.next = add nsw i32 %lsr.iv1, -1
@@ -80,7 +80,7 @@
ret i32 %res
}

-define dso_local i32 @invariant_add_use_reduce(i16* nocapture readonly %a, i32* %c, i32 %N, <4 x i32> %pass) {
+define dso_local i32 @invariant_add_use_reduce(ptr nocapture readonly %a, ptr %c, i32 %N, <4 x i32> %pass) {
entry:
%cmp9 = icmp eq i32 %N, 0
%tmp = add i32 %N, 3
@@ -97,16 +97,16 @@

vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
-%lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
+%lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %a, %vector.ph ]
%tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
-%lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
+%lsr.iv17 = bitcast ptr %lsr.iv to ptr
%tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7)
%tmp9 = sub i32 %tmp7, 4
-%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
+%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
%tmp12 = add nsw <4 x i32> %pass, %tmp10
%tmp13 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp12)
-%scevgep = getelementptr i16, i16* %lsr.iv, i32 4
+%scevgep = getelementptr i16, ptr %lsr.iv, i32 4
%tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
%lsr.iv.next = add nsw i32 %lsr.iv1, -1
@@ -118,8 +118,8 @@
}

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
-declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
-declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32 immarg, <4 x i1>, <4 x i16>)
+declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
declare i32 @llvm.start.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
44 changes: 22 additions & 22 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain-store.mir
@@ -2,7 +2,7 @@
# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s

--- |
-define hidden arm_aapcs_vfpcc void @it_block_store_count_before_start(float* %pSrc, float* %pDst, i32 %blockSize, i32* %iter.addr) #0 {
+define hidden arm_aapcs_vfpcc void @it_block_store_count_before_start(ptr %pSrc, ptr %pDst, i32 %blockSize, ptr %iter.addr) #0 {
entry:
%mul = shl i32 %blockSize, 1
%0 = add i32 %mul, 3
@@ -11,23 +11,23 @@
%2 = sub i32 %0, %smin
%3 = lshr i32 %2, 2
%4 = add nuw nsw i32 %3, 1
-store i32 %4, i32* %iter.addr, align 4
+store i32 %4, ptr %iter.addr, align 4
%start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
br label %do.body

do.body: ; preds = %do.body, %entry
%lsr.iv = phi i32 [ %lsr.iv.next, %do.body ], [ %start, %entry ]
%blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ]
-%pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ]
-%pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ]
-%pDst.addr.01 = bitcast float* %pDst.addr.0 to <4 x float>*
-%pSrc.addr.02 = bitcast float* %pSrc.addr.0 to <4 x float>*
+%pDst.addr.0 = phi ptr [ %pDst, %entry ], [ %add.ptr4, %do.body ]
+%pSrc.addr.0 = phi ptr [ %pSrc, %entry ], [ %add.ptr, %do.body ]
+%pDst.addr.01 = bitcast ptr %pDst.addr.0 to ptr
+%pSrc.addr.02 = bitcast ptr %pSrc.addr.0 to ptr
%5 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
-%6 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %pSrc.addr.02, i32 4, <4 x i1> %5, <4 x float> undef)
+%6 = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %pSrc.addr.02, i32 4, <4 x i1> %5, <4 x float> undef)
%7 = fmul <4 x float> %6, %6
-tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %7, <4 x float>* %pDst.addr.01, i32 4, <4 x i1> %5)
-%add.ptr = getelementptr inbounds float, float* %pSrc.addr.0, i32 4
-%add.ptr4 = getelementptr inbounds float, float* %pDst.addr.0, i32 4
+tail call void @llvm.masked.store.v4f32.p0(<4 x float> %7, ptr %pDst.addr.01, i32 4, <4 x i1> %5)
+%add.ptr = getelementptr inbounds float, ptr %pSrc.addr.0, i32 4
+%add.ptr4 = getelementptr inbounds float, ptr %pDst.addr.0, i32 4
%sub = add nsw i32 %blkCnt.0, -4
%8 = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1)
%9 = icmp ne i32 %8, 0
@@ -38,7 +38,7 @@
ret void
}

-define hidden arm_aapcs_vfpcc void @it_block_store_count_after_start(float* %pSrc, float* %pDst, i32 %blockSize, i32* %iter.addr) #0 {
+define hidden arm_aapcs_vfpcc void @it_block_store_count_after_start(ptr %pSrc, ptr %pDst, i32 %blockSize, ptr %iter.addr) #0 {
entry:
%mul = shl i32 %blockSize, 1
%0 = add i32 %mul, 3
@@ -48,22 +48,22 @@
%3 = lshr i32 %2, 2
%4 = add nuw nsw i32 %3, 1
%start = call i32 @llvm.start.loop.iterations.i32(i32 %4)
-store i32 %4, i32* %iter.addr, align 4
+store i32 %4, ptr %iter.addr, align 4
br label %do.body

do.body: ; preds = %do.body, %entry
%lsr.iv = phi i32 [ %lsr.iv.next, %do.body ], [ %start, %entry ]
%blkCnt.0 = phi i32 [ %mul, %entry ], [ %sub, %do.body ]
-%pDst.addr.0 = phi float* [ %pDst, %entry ], [ %add.ptr4, %do.body ]
-%pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ]
-%pDst.addr.01 = bitcast float* %pDst.addr.0 to <4 x float>*
-%pSrc.addr.02 = bitcast float* %pSrc.addr.0 to <4 x float>*
+%pDst.addr.0 = phi ptr [ %pDst, %entry ], [ %add.ptr4, %do.body ]
+%pSrc.addr.0 = phi ptr [ %pSrc, %entry ], [ %add.ptr, %do.body ]
+%pDst.addr.01 = bitcast ptr %pDst.addr.0 to ptr
+%pSrc.addr.02 = bitcast ptr %pSrc.addr.0 to ptr
%5 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
-%6 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %pSrc.addr.02, i32 4, <4 x i1> %5, <4 x float> undef)
+%6 = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %pSrc.addr.02, i32 4, <4 x i1> %5, <4 x float> undef)
%7 = fmul <4 x float> %6, %6
-tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %7, <4 x float>* %pDst.addr.01, i32 4, <4 x i1> %5)
-%add.ptr = getelementptr inbounds float, float* %pSrc.addr.0, i32 4
-%add.ptr4 = getelementptr inbounds float, float* %pDst.addr.0, i32 4
+tail call void @llvm.masked.store.v4f32.p0(<4 x float> %7, ptr %pDst.addr.01, i32 4, <4 x i1> %5)
+%add.ptr = getelementptr inbounds float, ptr %pSrc.addr.0, i32 4
+%add.ptr4 = getelementptr inbounds float, ptr %pDst.addr.0, i32 4
%sub = add nsw i32 %blkCnt.0, -4
%8 = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1)
%9 = icmp ne i32 %8, 0
@@ -78,10 +78,10 @@
declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1

; Function Attrs: argmemonly nounwind readonly willreturn
-declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) #2
+declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>) #2

; Function Attrs: argmemonly nounwind willreturn writeonly
-declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) #3
+declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32 immarg, <4 x i1>) #3

; Function Attrs: noduplicate nounwind
declare i32 @llvm.start.loop.iterations.i32(i32) #4
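The two functions above are identical except for ordering: one stores the computed iteration count to %iter.addr before the call to llvm.start.loop.iterations and the other after it. The count therefore has a use besides the loop-start intrinsic, and the transformation has to keep that store (and, judging by the test name, the IT-block chain that predicates it at the MIR level) correct in either position. The shared pattern, lifted from the IR above:

  %4 = add nuw nsw i32 %3, 1                                 ; iteration count
  store i32 %4, ptr %iter.addr, align 4                      ; extra, non-loop use of the count
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %4)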
22 changes: 11 additions & 11 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp-reordered.mir
@@ -4,7 +4,7 @@
# TODO: We should be able to handle the VCMP -> VPST -> VCMP -> VCTP case.

--- |
-define dso_local arm_aapcs_vfpcc void @test(i32* noalias nocapture %a, i32* nocapture readonly %b, i32 %N) local_unnamed_addr #0 {
+define dso_local arm_aapcs_vfpcc void @test(ptr noalias nocapture %a, ptr nocapture readonly %b, i32 %N) local_unnamed_addr #0 {
entry:
%cmp9 = icmp eq i32 %N, 0
%tmp = add i32 %N, 3
@@ -25,25 +25,25 @@

vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
-%lsr.iv3 = phi i32* [ %scevgep4, %vector.body ], [ %b, %vector.ph ]
-%lsr.iv1 = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
+%lsr.iv3 = phi ptr [ %scevgep4, %vector.body ], [ %b, %vector.ph ]
+%lsr.iv1 = phi ptr [ %scevgep, %vector.body ], [ %a, %vector.ph ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%elts.rem = phi i32 [ %N, %vector.ph ], [ %elts.rem.next, %vector.body ]
-%lsr.iv12 = bitcast i32* %lsr.iv1 to <4 x i32>*
-%lsr.iv35 = bitcast i32* %lsr.iv3 to <4 x i32>*
+%lsr.iv12 = bitcast ptr %lsr.iv1 to ptr
+%lsr.iv35 = bitcast ptr %lsr.iv3 to ptr
%tmp7 = insertelement <4 x i32> undef, i32 %div, i32 0
%tmp8 = shufflevector <4 x i32> %tmp7, <4 x i32> undef, <4 x i32> zeroinitializer
%tmp9 = icmp ult <4 x i32> %vec.ind, %tmp8
%lower = icmp uge <4 x i32> %vec.ind, <i32 1, i32 1, i32 1, i32 1>
%tmp10 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elts.rem)
%tmp11 = and <4 x i1> %tmp9, %tmp10
%pred = and <4 x i1> %tmp11, %lower
-%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv35, i32 4, <4 x i1> %pred, <4 x i32> undef)
-call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %wide.masked.load, <4 x i32>* %lsr.iv12, i32 4, <4 x i1> %pred)
+%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv35, i32 4, <4 x i1> %pred, <4 x i32> undef)
+call void @llvm.masked.store.v4i32.p0(<4 x i32> %wide.masked.load, ptr %lsr.iv12, i32 4, <4 x i1> %pred)
%vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
%elts.rem.next = sub i32 %elts.rem, 4
-%scevgep = getelementptr i32, i32* %lsr.iv1, i32 4
-%scevgep4 = getelementptr i32, i32* %lsr.iv3, i32 4
+%scevgep = getelementptr i32, ptr %lsr.iv1, i32 4
+%scevgep4 = getelementptr i32, ptr %lsr.iv3, i32 4
%tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv, i32 1)
%tmp13 = icmp ne i32 %tmp12, 0
%lsr.iv.next = add nsw i32 %lsr.iv, -1
@@ -53,8 +53,8 @@
ret void
}

-declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
-declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
+declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
+declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
declare i32 @llvm.start.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
Expand Down
22 changes: 11 additions & 11 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-two-vcmp.mir
@@ -2,7 +2,7 @@
# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - --verify-machineinstrs | FileCheck %s

--- |
-define dso_local arm_aapcs_vfpcc void @test(i32* noalias nocapture %a, i32* nocapture readonly %b, i32 %N) local_unnamed_addr #0 {
+define dso_local arm_aapcs_vfpcc void @test(ptr noalias nocapture %a, ptr nocapture readonly %b, i32 %N) local_unnamed_addr #0 {
entry:
%cmp9 = icmp eq i32 %N, 0
%tmp = add i32 %N, 3
@@ -23,25 +23,25 @@

vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
-%lsr.iv3 = phi i32* [ %scevgep4, %vector.body ], [ %b, %vector.ph ]
-%lsr.iv1 = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
+%lsr.iv3 = phi ptr [ %scevgep4, %vector.body ], [ %b, %vector.ph ]
+%lsr.iv1 = phi ptr [ %scevgep, %vector.body ], [ %a, %vector.ph ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%elts.rem = phi i32 [ %N, %vector.ph ], [ %elts.rem.next, %vector.body ]
-%lsr.iv12 = bitcast i32* %lsr.iv1 to <4 x i32>*
-%lsr.iv35 = bitcast i32* %lsr.iv3 to <4 x i32>*
+%lsr.iv12 = bitcast ptr %lsr.iv1 to ptr
+%lsr.iv35 = bitcast ptr %lsr.iv3 to ptr
%tmp7 = insertelement <4 x i32> undef, i32 %div, i32 0
%tmp8 = shufflevector <4 x i32> %tmp7, <4 x i32> undef, <4 x i32> zeroinitializer
%tmp9 = icmp ult <4 x i32> %vec.ind, %tmp8
%lower = icmp uge <4 x i32> %vec.ind, <i32 1, i32 1, i32 1, i32 1>
%tmp10 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elts.rem)
%tmp11 = and <4 x i1> %tmp9, %tmp10
%pred = and <4 x i1> %tmp11, %lower
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv35, i32 4, <4 x i1> %pred, <4 x i32> undef)
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %wide.masked.load, <4 x i32>* %lsr.iv12, i32 4, <4 x i1> %pred)
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv35, i32 4, <4 x i1> %pred, <4 x i32> undef)
call void @llvm.masked.store.v4i32.p0(<4 x i32> %wide.masked.load, ptr %lsr.iv12, i32 4, <4 x i1> %pred)
%vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
%elts.rem.next = sub i32 %elts.rem, 4
%scevgep = getelementptr i32, i32* %lsr.iv1, i32 4
%scevgep4 = getelementptr i32, i32* %lsr.iv3, i32 4
%scevgep = getelementptr i32, ptr %lsr.iv1, i32 4
%scevgep4 = getelementptr i32, ptr %lsr.iv3, i32 4
%tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv, i32 1)
%tmp13 = icmp ne i32 %tmp12, 0
%lsr.iv.next = add nsw i32 %lsr.iv, -1
@@ -51,8 +51,8 @@
ret void
}

declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
declare i32 @llvm.start.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
22 changes: 11 additions & 11 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/iv-vcmp.mir
@@ -3,7 +3,7 @@
--- |

; Function Attrs: nofree norecurse nounwind
define dso_local arm_aapcs_vfpcc void @test(i32* noalias nocapture %a, i32* nocapture readonly %b, i32 %N) local_unnamed_addr #0 {
define dso_local arm_aapcs_vfpcc void @test(ptr noalias nocapture %a, ptr nocapture readonly %b, i32 %N) local_unnamed_addr #0 {
entry:
%cmp9 = icmp eq i32 %N, 0
%0 = add i32 %N, 3
@@ -23,24 +23,24 @@
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv3 = phi i32* [ %scevgep4, %vector.body ], [ %b, %vector.ph ]
%lsr.iv1 = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
%lsr.iv3 = phi ptr [ %scevgep4, %vector.body ], [ %b, %vector.ph ]
%lsr.iv1 = phi ptr [ %scevgep, %vector.body ], [ %a, %vector.ph ]
%vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
%elts.rem = phi i32 [ %N, %vector.ph ], [ %elts.rem.next, %vector.body ]
%6 = phi i32 [ %start, %vector.ph ], [ %12, %vector.body ]
%lsr.iv35 = bitcast i32* %lsr.iv3 to <4 x i32>*
%lsr.iv12 = bitcast i32* %lsr.iv1 to <4 x i32>*
%lsr.iv35 = bitcast ptr %lsr.iv3 to ptr
%lsr.iv12 = bitcast ptr %lsr.iv1 to ptr
%7 = insertelement <4 x i32> undef, i32 %div, i32 0
%8 = shufflevector <4 x i32> %7, <4 x i32> undef, <4 x i32> zeroinitializer
%9 = icmp ult <4 x i32> %vec.ind, %8
%10 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elts.rem)
%11 = and <4 x i1> %9, %10
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv35, i32 4, <4 x i1> %11, <4 x i32> undef)
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %wide.masked.load, <4 x i32>* %lsr.iv12, i32 4, <4 x i1> %11)
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv35, i32 4, <4 x i1> %11, <4 x i32> undef)
call void @llvm.masked.store.v4i32.p0(<4 x i32> %wide.masked.load, ptr %lsr.iv12, i32 4, <4 x i1> %11)
%vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
%elts.rem.next = sub i32 %elts.rem, 4
%scevgep = getelementptr i32, i32* %lsr.iv1, i32 4
%scevgep4 = getelementptr i32, i32* %lsr.iv3, i32 4
%scevgep = getelementptr i32, ptr %lsr.iv1, i32 4
%scevgep4 = getelementptr i32, ptr %lsr.iv3, i32 4
%12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
%13 = icmp ne i32 %12, 0
br i1 %13, label %vector.body, label %for.cond.cleanup
@@ -49,8 +49,8 @@
ret void
}

declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
declare i32 @llvm.start.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
30 changes: 15 additions & 15 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/livereg-no-loop-def.mir
@@ -2,7 +2,7 @@
# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - --verify-machineinstrs | FileCheck %s

--- |
define dso_local <4 x i32> @exit_liveout(i16* nocapture readonly %a, i16* nocapture readonly %b, i32* %c, i32 %N, <4 x i32> %pass) {
define dso_local <4 x i32> @exit_liveout(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr %c, i32 %N, <4 x i32> %pass) {
entry:
%cmp9 = icmp eq i32 %N, 0
%tmp = add i32 %N, 3
@@ -19,26 +19,26 @@

vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
%lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
%lsr.iv20 = phi i32* [ %scevgep20, %vector.body ], [ %c, %vector.ph ]
%lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
%lsr.iv18 = phi ptr [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
%lsr.iv20 = phi ptr [ %scevgep20, %vector.body ], [ %c, %vector.ph ]
%lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %a, %vector.ph ]
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp13, %vector.body ]
%tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
%lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
%lsr.store = bitcast i32* %lsr.iv20 to <4 x i32>*
%lsr.iv1820 = bitcast i16* %lsr.iv18 to <4 x i16>*
%lsr.iv17 = bitcast ptr %lsr.iv to ptr
%lsr.store = bitcast ptr %lsr.iv20 to ptr
%lsr.iv1820 = bitcast ptr %lsr.iv18 to ptr
%tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7)
%tmp9 = sub i32 %tmp7, 4
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
%tmp11 = sext <4 x i16> %wide.masked.load14 to <4 x i32>
%tmp12 = mul nsw <4 x i32> %tmp11, %tmp10
%tmp13 = add <4 x i32> %tmp12, %vec.phi
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp13, <4 x i32>* %lsr.store, i32 4, <4 x i1> %tmp8)
%scevgep = getelementptr i16, i16* %lsr.iv, i32 4
%scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 4
%scevgep20 = getelementptr i32, i32* %lsr.iv20, i32 4
call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp13, ptr %lsr.store, i32 4, <4 x i1> %tmp8)
%scevgep = getelementptr i16, ptr %lsr.iv, i32 4
%scevgep19 = getelementptr i16, ptr %lsr.iv18, i32 4
%scevgep20 = getelementptr i32, ptr %lsr.iv20, i32 4
%tmp14 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
%tmp15 = icmp ne i32 %tmp14, 0
%lsr.iv.next = add nsw i32 %lsr.iv1, -1
@@ -48,8 +48,8 @@
ret <4 x i32> %pass
}

declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32 immarg, <4 x i1>, <4 x i16>)
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
declare i32 @llvm.start.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
154 changes: 77 additions & 77 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-copy-chain.mir
@@ -2,7 +2,7 @@
# RUN: llc -mtriple=thumbv8.1m.main -run-pass=arm-low-overhead-loops %s -o - --verify-machineinstrs | FileCheck %s

--- |
define dso_local arm_aapcscc void @test1(i32* nocapture %arg, i32* nocapture readonly %arg1, i32* nocapture readonly %arg2, i32 %arg3) {
define dso_local arm_aapcscc void @test1(ptr nocapture %arg, ptr nocapture readonly %arg1, ptr nocapture readonly %arg2, i32 %arg3) {
bb:
%tmp = icmp eq i32 %arg3, 0
br i1 %tmp, label %bb27, label %bb4
@@ -28,15 +28,15 @@
br i1 %tmp15, label %bb27, label %bb16

bb16: ; preds = %bb13
%tmp17 = getelementptr inbounds i32, i32* %arg1, i32 %tmp14
%tmp18 = load i32, i32* %tmp17, align 4
%tmp19 = getelementptr inbounds i32, i32* %arg2, i32 %tmp14
%tmp20 = load i32, i32* %tmp19, align 4
%tmp17 = getelementptr inbounds i32, ptr %arg1, i32 %tmp14
%tmp18 = load i32, ptr %tmp17, align 4
%tmp19 = getelementptr inbounds i32, ptr %arg2, i32 %tmp14
%tmp20 = load i32, ptr %tmp19, align 4
%tmp21 = xor i32 %tmp20, %tmp18
%tmp22 = getelementptr inbounds i32, i32* %arg, i32 %tmp14
%tmp23 = load i32, i32* %tmp22, align 4
%tmp22 = getelementptr inbounds i32, ptr %arg, i32 %tmp14
%tmp23 = load i32, ptr %tmp22, align 4
%tmp24 = add nsw i32 %tmp23, %tmp21
store i32 %tmp24, i32* %tmp22, align 4
store i32 %tmp24, ptr %tmp22, align 4
%tmp25 = add nuw i32 %tmp14, 1
%tmp26 = icmp eq i32 %tmp6, 1
br i1 %tmp26, label %bb27, label %bb57
@@ -48,69 +48,69 @@
%lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %start, %bb12 ]
%lsr.iv = phi i32 [ %lsr.iv.next, %bb28 ], [ 0, %bb12 ]
%tmp29 = phi i32 [ 0, %bb12 ], [ %tmp54, %bb28 ]
%0 = bitcast i32* %arg1 to i8*
%1 = bitcast i32* %arg2 to i8*
%2 = bitcast i32* %arg to i8*
%uglygep14 = getelementptr i8, i8* %0, i32 %lsr.iv
%uglygep1415 = bitcast i8* %uglygep14 to i32*
%scevgep617 = bitcast i32* %uglygep1415 to i32*
%tmp34 = load i32, i32* %scevgep617, align 4
%uglygep8 = getelementptr i8, i8* %1, i32 %lsr.iv
%uglygep89 = bitcast i8* %uglygep8 to i32*
%scevgep418 = bitcast i32* %uglygep89 to i32*
%tmp35 = load i32, i32* %scevgep418, align 4
%0 = bitcast ptr %arg1 to ptr
%1 = bitcast ptr %arg2 to ptr
%2 = bitcast ptr %arg to ptr
%uglygep14 = getelementptr i8, ptr %0, i32 %lsr.iv
%uglygep1415 = bitcast ptr %uglygep14 to ptr
%scevgep617 = bitcast ptr %uglygep1415 to ptr
%tmp34 = load i32, ptr %scevgep617, align 4
%uglygep8 = getelementptr i8, ptr %1, i32 %lsr.iv
%uglygep89 = bitcast ptr %uglygep8 to ptr
%scevgep418 = bitcast ptr %uglygep89 to ptr
%tmp35 = load i32, ptr %scevgep418, align 4
%tmp36 = xor i32 %tmp35, %tmp34
%uglygep2 = getelementptr i8, i8* %2, i32 %lsr.iv
%uglygep23 = bitcast i8* %uglygep2 to i32*
%scevgep219 = bitcast i32* %uglygep23 to i32*
%tmp37 = load i32, i32* %scevgep219, align 4
%uglygep2 = getelementptr i8, ptr %2, i32 %lsr.iv
%uglygep23 = bitcast ptr %uglygep2 to ptr
%scevgep219 = bitcast ptr %uglygep23 to ptr
%tmp37 = load i32, ptr %scevgep219, align 4
%tmp38 = add nsw i32 %tmp37, %tmp36
store i32 %tmp38, i32* %scevgep219, align 4
%uglygep33 = getelementptr i8, i8* %0, i32 %lsr.iv
%uglygep3334 = bitcast i8* %uglygep33 to i32*
%scevgep14 = getelementptr i32, i32* %uglygep3334, i32 1
%tmp39 = load i32, i32* %scevgep14, align 4
%uglygep27 = getelementptr i8, i8* %1, i32 %lsr.iv
%uglygep2728 = bitcast i8* %uglygep27 to i32*
%scevgep11 = getelementptr i32, i32* %uglygep2728, i32 1
%tmp40 = load i32, i32* %scevgep11, align 4
store i32 %tmp38, ptr %scevgep219, align 4
%uglygep33 = getelementptr i8, ptr %0, i32 %lsr.iv
%uglygep3334 = bitcast ptr %uglygep33 to ptr
%scevgep14 = getelementptr i32, ptr %uglygep3334, i32 1
%tmp39 = load i32, ptr %scevgep14, align 4
%uglygep27 = getelementptr i8, ptr %1, i32 %lsr.iv
%uglygep2728 = bitcast ptr %uglygep27 to ptr
%scevgep11 = getelementptr i32, ptr %uglygep2728, i32 1
%tmp40 = load i32, ptr %scevgep11, align 4
%tmp41 = xor i32 %tmp40, %tmp39
%uglygep20 = getelementptr i8, i8* %2, i32 %lsr.iv
%uglygep2021 = bitcast i8* %uglygep20 to i32*
%scevgep9 = getelementptr i32, i32* %uglygep2021, i32 1
%tmp42 = load i32, i32* %scevgep9, align 4
%uglygep20 = getelementptr i8, ptr %2, i32 %lsr.iv
%uglygep2021 = bitcast ptr %uglygep20 to ptr
%scevgep9 = getelementptr i32, ptr %uglygep2021, i32 1
%tmp42 = load i32, ptr %scevgep9, align 4
%tmp43 = add nsw i32 %tmp42, %tmp41
store i32 %tmp43, i32* %scevgep9, align 4
%uglygep30 = getelementptr i8, i8* %0, i32 %lsr.iv
%uglygep3031 = bitcast i8* %uglygep30 to i32*
%scevgep12 = getelementptr i32, i32* %uglygep3031, i32 2
%tmp44 = load i32, i32* %scevgep12, align 4
%uglygep24 = getelementptr i8, i8* %1, i32 %lsr.iv
%uglygep2425 = bitcast i8* %uglygep24 to i32*
%scevgep10 = getelementptr i32, i32* %uglygep2425, i32 2
%tmp45 = load i32, i32* %scevgep10, align 4
store i32 %tmp43, ptr %scevgep9, align 4
%uglygep30 = getelementptr i8, ptr %0, i32 %lsr.iv
%uglygep3031 = bitcast ptr %uglygep30 to ptr
%scevgep12 = getelementptr i32, ptr %uglygep3031, i32 2
%tmp44 = load i32, ptr %scevgep12, align 4
%uglygep24 = getelementptr i8, ptr %1, i32 %lsr.iv
%uglygep2425 = bitcast ptr %uglygep24 to ptr
%scevgep10 = getelementptr i32, ptr %uglygep2425, i32 2
%tmp45 = load i32, ptr %scevgep10, align 4
%tmp46 = xor i32 %tmp45, %tmp44
%uglygep17 = getelementptr i8, i8* %2, i32 %lsr.iv
%uglygep1718 = bitcast i8* %uglygep17 to i32*
%scevgep8 = getelementptr i32, i32* %uglygep1718, i32 2
%tmp47 = load i32, i32* %scevgep8, align 4
%uglygep17 = getelementptr i8, ptr %2, i32 %lsr.iv
%uglygep1718 = bitcast ptr %uglygep17 to ptr
%scevgep8 = getelementptr i32, ptr %uglygep1718, i32 2
%tmp47 = load i32, ptr %scevgep8, align 4
%tmp48 = add nsw i32 %tmp47, %tmp46
store i32 %tmp48, i32* %scevgep8, align 4
%uglygep11 = getelementptr i8, i8* %0, i32 %lsr.iv
%uglygep1112 = bitcast i8* %uglygep11 to i32*
%scevgep5 = getelementptr i32, i32* %uglygep1112, i32 3
%tmp49 = load i32, i32* %scevgep5, align 4
%uglygep5 = getelementptr i8, i8* %1, i32 %lsr.iv
%uglygep56 = bitcast i8* %uglygep5 to i32*
%scevgep3 = getelementptr i32, i32* %uglygep56, i32 3
%tmp50 = load i32, i32* %scevgep3, align 4
store i32 %tmp48, ptr %scevgep8, align 4
%uglygep11 = getelementptr i8, ptr %0, i32 %lsr.iv
%uglygep1112 = bitcast ptr %uglygep11 to ptr
%scevgep5 = getelementptr i32, ptr %uglygep1112, i32 3
%tmp49 = load i32, ptr %scevgep5, align 4
%uglygep5 = getelementptr i8, ptr %1, i32 %lsr.iv
%uglygep56 = bitcast ptr %uglygep5 to ptr
%scevgep3 = getelementptr i32, ptr %uglygep56, i32 3
%tmp50 = load i32, ptr %scevgep3, align 4
%tmp51 = xor i32 %tmp50, %tmp49
%uglygep = getelementptr i8, i8* %2, i32 %lsr.iv
%uglygep1 = bitcast i8* %uglygep to i32*
%scevgep1 = getelementptr i32, i32* %uglygep1, i32 3
%tmp52 = load i32, i32* %scevgep1, align 4
%uglygep = getelementptr i8, ptr %2, i32 %lsr.iv
%uglygep1 = bitcast ptr %uglygep to ptr
%scevgep1 = getelementptr i32, ptr %uglygep1, i32 3
%tmp52 = load i32, ptr %scevgep1, align 4
%tmp53 = add nsw i32 %tmp52, %tmp51
store i32 %tmp53, i32* %scevgep1, align 4
store i32 %tmp53, ptr %scevgep1, align 4
%tmp54 = add nuw i32 %tmp29, 4
%lsr.iv.next = add i32 %lsr.iv, 16
%loop.dec = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv15, i32 1)
@@ -119,29 +119,29 @@
br i1 %tmp56, label %bb28, label %bb13

bb57: ; preds = %bb16
%tmp58 = getelementptr inbounds i32, i32* %arg1, i32 %tmp25
%tmp59 = load i32, i32* %tmp58, align 4
%tmp60 = getelementptr inbounds i32, i32* %arg2, i32 %tmp25
%tmp61 = load i32, i32* %tmp60, align 4
%tmp58 = getelementptr inbounds i32, ptr %arg1, i32 %tmp25
%tmp59 = load i32, ptr %tmp58, align 4
%tmp60 = getelementptr inbounds i32, ptr %arg2, i32 %tmp25
%tmp61 = load i32, ptr %tmp60, align 4
%tmp62 = xor i32 %tmp61, %tmp59
%tmp63 = getelementptr inbounds i32, i32* %arg, i32 %tmp25
%tmp64 = load i32, i32* %tmp63, align 4
%tmp63 = getelementptr inbounds i32, ptr %arg, i32 %tmp25
%tmp64 = load i32, ptr %tmp63, align 4
%tmp65 = add nsw i32 %tmp64, %tmp62
store i32 %tmp65, i32* %tmp63, align 4
store i32 %tmp65, ptr %tmp63, align 4
%tmp66 = add nuw i32 %tmp14, 2
%tmp67 = icmp eq i32 %tmp6, 2
br i1 %tmp67, label %bb27, label %bb68

bb68: ; preds = %bb57
%tmp69 = getelementptr inbounds i32, i32* %arg1, i32 %tmp66
%tmp70 = load i32, i32* %tmp69, align 4
%tmp71 = getelementptr inbounds i32, i32* %arg2, i32 %tmp66
%tmp72 = load i32, i32* %tmp71, align 4
%tmp69 = getelementptr inbounds i32, ptr %arg1, i32 %tmp66
%tmp70 = load i32, ptr %tmp69, align 4
%tmp71 = getelementptr inbounds i32, ptr %arg2, i32 %tmp66
%tmp72 = load i32, ptr %tmp71, align 4
%tmp73 = xor i32 %tmp72, %tmp70
%tmp74 = getelementptr inbounds i32, i32* %arg, i32 %tmp66
%tmp75 = load i32, i32* %tmp74, align 4
%tmp74 = getelementptr inbounds i32, ptr %arg, i32 %tmp66
%tmp75 = load i32, ptr %tmp74, align 4
%tmp76 = add nsw i32 %tmp75, %tmp73
store i32 %tmp76, i32* %tmp74, align 4
store i32 %tmp76, ptr %tmp74, align 4
br label %bb27
}

@@ -2,7 +2,7 @@
# RUN: llc -mtriple=thumbv8.1m.main -run-pass=arm-low-overhead-loops %s -o - --verify-machineinstrs | FileCheck %s

--- |
define dso_local arm_aapcscc i32 @test1(i32* nocapture %arg, i32* nocapture readonly %arg1, i32* nocapture readonly %arg2, i32 %arg3) {
define dso_local arm_aapcscc i32 @test1(ptr nocapture %arg, ptr nocapture readonly %arg1, ptr nocapture readonly %arg2, i32 %arg3) {
bb:
%tmp = icmp eq i32 %arg3, 0
br i1 %tmp, label %bb27, label %bb4
@@ -28,15 +28,15 @@
br i1 %tmp15, label %bb27, label %bb16

bb16: ; preds = %bb13
%tmp17 = getelementptr inbounds i32, i32* %arg1, i32 %tmp14
%tmp18 = load i32, i32* %tmp17, align 4
%tmp19 = getelementptr inbounds i32, i32* %arg2, i32 %tmp14
%tmp20 = load i32, i32* %tmp19, align 4
%tmp17 = getelementptr inbounds i32, ptr %arg1, i32 %tmp14
%tmp18 = load i32, ptr %tmp17, align 4
%tmp19 = getelementptr inbounds i32, ptr %arg2, i32 %tmp14
%tmp20 = load i32, ptr %tmp19, align 4
%tmp21 = xor i32 %tmp20, %tmp18
%tmp22 = getelementptr inbounds i32, i32* %arg, i32 %tmp14
%tmp23 = load i32, i32* %tmp22, align 4
%tmp22 = getelementptr inbounds i32, ptr %arg, i32 %tmp14
%tmp23 = load i32, ptr %tmp22, align 4
%tmp24 = add nsw i32 %tmp23, %tmp21
store i32 %tmp24, i32* %tmp22, align 4
store i32 %tmp24, ptr %tmp22, align 4
%tmp25 = add nuw i32 %tmp14, 1
%tmp26 = icmp eq i32 %tmp6, 1
br i1 %tmp26, label %bb27, label %bb57
@@ -49,69 +49,69 @@
%lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %start, %bb12 ]
%lsr.iv = phi i32 [ %lsr.iv.next, %bb28 ], [ 0, %bb12 ]
%tmp29 = phi i32 [ 0, %bb12 ], [ %tmp54, %bb28 ]
%0 = bitcast i32* %arg1 to i8*
%1 = bitcast i32* %arg2 to i8*
%2 = bitcast i32* %arg to i8*
%uglygep14 = getelementptr i8, i8* %0, i32 %lsr.iv
%uglygep1415 = bitcast i8* %uglygep14 to i32*
%scevgep617 = bitcast i32* %uglygep1415 to i32*
%tmp34 = load i32, i32* %scevgep617, align 4
%uglygep8 = getelementptr i8, i8* %1, i32 %lsr.iv
%uglygep89 = bitcast i8* %uglygep8 to i32*
%scevgep418 = bitcast i32* %uglygep89 to i32*
%tmp35 = load i32, i32* %scevgep418, align 4
%0 = bitcast ptr %arg1 to ptr
%1 = bitcast ptr %arg2 to ptr
%2 = bitcast ptr %arg to ptr
%uglygep14 = getelementptr i8, ptr %0, i32 %lsr.iv
%uglygep1415 = bitcast ptr %uglygep14 to ptr
%scevgep617 = bitcast ptr %uglygep1415 to ptr
%tmp34 = load i32, ptr %scevgep617, align 4
%uglygep8 = getelementptr i8, ptr %1, i32 %lsr.iv
%uglygep89 = bitcast ptr %uglygep8 to ptr
%scevgep418 = bitcast ptr %uglygep89 to ptr
%tmp35 = load i32, ptr %scevgep418, align 4
%tmp36 = xor i32 %tmp35, %tmp34
%uglygep2 = getelementptr i8, i8* %2, i32 %lsr.iv
%uglygep23 = bitcast i8* %uglygep2 to i32*
%scevgep219 = bitcast i32* %uglygep23 to i32*
%tmp37 = load i32, i32* %scevgep219, align 4
%uglygep2 = getelementptr i8, ptr %2, i32 %lsr.iv
%uglygep23 = bitcast ptr %uglygep2 to ptr
%scevgep219 = bitcast ptr %uglygep23 to ptr
%tmp37 = load i32, ptr %scevgep219, align 4
%tmp38 = add nsw i32 %tmp37, %tmp36
store i32 %tmp38, i32* %scevgep219, align 4
%uglygep33 = getelementptr i8, i8* %0, i32 %lsr.iv
%uglygep3334 = bitcast i8* %uglygep33 to i32*
%scevgep14 = getelementptr i32, i32* %uglygep3334, i32 1
%tmp39 = load i32, i32* %scevgep14, align 4
%uglygep27 = getelementptr i8, i8* %1, i32 %lsr.iv
%uglygep2728 = bitcast i8* %uglygep27 to i32*
%scevgep11 = getelementptr i32, i32* %uglygep2728, i32 1
%tmp40 = load i32, i32* %scevgep11, align 4
store i32 %tmp38, ptr %scevgep219, align 4
%uglygep33 = getelementptr i8, ptr %0, i32 %lsr.iv
%uglygep3334 = bitcast ptr %uglygep33 to ptr
%scevgep14 = getelementptr i32, ptr %uglygep3334, i32 1
%tmp39 = load i32, ptr %scevgep14, align 4
%uglygep27 = getelementptr i8, ptr %1, i32 %lsr.iv
%uglygep2728 = bitcast ptr %uglygep27 to ptr
%scevgep11 = getelementptr i32, ptr %uglygep2728, i32 1
%tmp40 = load i32, ptr %scevgep11, align 4
%tmp41 = xor i32 %tmp40, %tmp39
%uglygep20 = getelementptr i8, i8* %2, i32 %lsr.iv
%uglygep2021 = bitcast i8* %uglygep20 to i32*
%scevgep9 = getelementptr i32, i32* %uglygep2021, i32 1
%tmp42 = load i32, i32* %scevgep9, align 4
%uglygep20 = getelementptr i8, ptr %2, i32 %lsr.iv
%uglygep2021 = bitcast ptr %uglygep20 to ptr
%scevgep9 = getelementptr i32, ptr %uglygep2021, i32 1
%tmp42 = load i32, ptr %scevgep9, align 4
%tmp43 = add nsw i32 %tmp42, %tmp41
store i32 %tmp43, i32* %scevgep9, align 4
%uglygep30 = getelementptr i8, i8* %0, i32 %lsr.iv
%uglygep3031 = bitcast i8* %uglygep30 to i32*
%scevgep12 = getelementptr i32, i32* %uglygep3031, i32 2
%tmp44 = load i32, i32* %scevgep12, align 4
%uglygep24 = getelementptr i8, i8* %1, i32 %lsr.iv
%uglygep2425 = bitcast i8* %uglygep24 to i32*
%scevgep10 = getelementptr i32, i32* %uglygep2425, i32 2
%tmp45 = load i32, i32* %scevgep10, align 4
store i32 %tmp43, ptr %scevgep9, align 4
%uglygep30 = getelementptr i8, ptr %0, i32 %lsr.iv
%uglygep3031 = bitcast ptr %uglygep30 to ptr
%scevgep12 = getelementptr i32, ptr %uglygep3031, i32 2
%tmp44 = load i32, ptr %scevgep12, align 4
%uglygep24 = getelementptr i8, ptr %1, i32 %lsr.iv
%uglygep2425 = bitcast ptr %uglygep24 to ptr
%scevgep10 = getelementptr i32, ptr %uglygep2425, i32 2
%tmp45 = load i32, ptr %scevgep10, align 4
%tmp46 = xor i32 %tmp45, %tmp44
%uglygep17 = getelementptr i8, i8* %2, i32 %lsr.iv
%uglygep1718 = bitcast i8* %uglygep17 to i32*
%scevgep8 = getelementptr i32, i32* %uglygep1718, i32 2
%tmp47 = load i32, i32* %scevgep8, align 4
%uglygep17 = getelementptr i8, ptr %2, i32 %lsr.iv
%uglygep1718 = bitcast ptr %uglygep17 to ptr
%scevgep8 = getelementptr i32, ptr %uglygep1718, i32 2
%tmp47 = load i32, ptr %scevgep8, align 4
%tmp48 = add nsw i32 %tmp47, %tmp46
store i32 %tmp48, i32* %scevgep8, align 4
%uglygep11 = getelementptr i8, i8* %0, i32 %lsr.iv
%uglygep1112 = bitcast i8* %uglygep11 to i32*
%scevgep5 = getelementptr i32, i32* %uglygep1112, i32 3
%tmp49 = load i32, i32* %scevgep5, align 4
%uglygep5 = getelementptr i8, i8* %1, i32 %lsr.iv
%uglygep56 = bitcast i8* %uglygep5 to i32*
%scevgep3 = getelementptr i32, i32* %uglygep56, i32 3
%tmp50 = load i32, i32* %scevgep3, align 4
store i32 %tmp48, ptr %scevgep8, align 4
%uglygep11 = getelementptr i8, ptr %0, i32 %lsr.iv
%uglygep1112 = bitcast ptr %uglygep11 to ptr
%scevgep5 = getelementptr i32, ptr %uglygep1112, i32 3
%tmp49 = load i32, ptr %scevgep5, align 4
%uglygep5 = getelementptr i8, ptr %1, i32 %lsr.iv
%uglygep56 = bitcast ptr %uglygep5 to ptr
%scevgep3 = getelementptr i32, ptr %uglygep56, i32 3
%tmp50 = load i32, ptr %scevgep3, align 4
%tmp51 = xor i32 %tmp50, %tmp49
%uglygep = getelementptr i8, i8* %2, i32 %lsr.iv
%uglygep1 = bitcast i8* %uglygep to i32*
%scevgep1 = getelementptr i32, i32* %uglygep1, i32 3
%tmp52 = load i32, i32* %scevgep1, align 4
%uglygep = getelementptr i8, ptr %2, i32 %lsr.iv
%uglygep1 = bitcast ptr %uglygep to ptr
%scevgep1 = getelementptr i32, ptr %uglygep1, i32 3
%tmp52 = load i32, ptr %scevgep1, align 4
%tmp53 = add nsw i32 %tmp52, %tmp51
store i32 %tmp53, i32* %scevgep1, align 4
store i32 %tmp53, ptr %scevgep1, align 4
%tmp54 = add nuw i32 %tmp29, 4
%lsr.iv.next = add i32 %lsr.iv, 16
%loop.dec = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv15, i32 1)
@@ -120,29 +120,29 @@
br i1 %tmp56, label %bb28, label %bb13

bb57: ; preds = %bb16
%tmp58 = getelementptr inbounds i32, i32* %arg1, i32 %tmp25
%tmp59 = load i32, i32* %tmp58, align 4
%tmp60 = getelementptr inbounds i32, i32* %arg2, i32 %tmp25
%tmp61 = load i32, i32* %tmp60, align 4
%tmp58 = getelementptr inbounds i32, ptr %arg1, i32 %tmp25
%tmp59 = load i32, ptr %tmp58, align 4
%tmp60 = getelementptr inbounds i32, ptr %arg2, i32 %tmp25
%tmp61 = load i32, ptr %tmp60, align 4
%tmp62 = xor i32 %tmp61, %tmp59
%tmp63 = getelementptr inbounds i32, i32* %arg, i32 %tmp25
%tmp64 = load i32, i32* %tmp63, align 4
%tmp63 = getelementptr inbounds i32, ptr %arg, i32 %tmp25
%tmp64 = load i32, ptr %tmp63, align 4
%tmp65 = add nsw i32 %tmp64, %tmp62
store i32 %tmp65, i32* %tmp63, align 4
store i32 %tmp65, ptr %tmp63, align 4
%tmp66 = add nuw i32 %tmp14, 2
%tmp67 = icmp eq i32 %tmp6, 2
br i1 %tmp67, label %bb27, label %bb68

bb68: ; preds = %bb57
%tmp69 = getelementptr inbounds i32, i32* %arg1, i32 %tmp66
%tmp70 = load i32, i32* %tmp69, align 4
%tmp71 = getelementptr inbounds i32, i32* %arg2, i32 %tmp66
%tmp72 = load i32, i32* %tmp71, align 4
%tmp69 = getelementptr inbounds i32, ptr %arg1, i32 %tmp66
%tmp70 = load i32, ptr %tmp69, align 4
%tmp71 = getelementptr inbounds i32, ptr %arg2, i32 %tmp66
%tmp72 = load i32, ptr %tmp71, align 4
%tmp73 = xor i32 %tmp72, %tmp70
%tmp74 = getelementptr inbounds i32, i32* %arg, i32 %tmp66
%tmp75 = load i32, i32* %tmp74, align 4
%tmp74 = getelementptr inbounds i32, ptr %arg, i32 %tmp66
%tmp75 = load i32, ptr %tmp74, align 4
%tmp76 = add nsw i32 %tmp75, %tmp73
store i32 %tmp76, i32* %tmp74, align 4
store i32 %tmp76, ptr %tmp74, align 4
br label %bb27
}

154 changes: 77 additions & 77 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-dec-liveout.mir
@@ -2,7 +2,7 @@
# RUN: llc -mtriple=thumbv8.1m.main -run-pass=arm-low-overhead-loops %s -o - --verify-machineinstrs | FileCheck %s

--- |
define dso_local arm_aapcscc i32 @test1(i32* nocapture %arg, i32* nocapture readonly %arg1, i32* nocapture readonly %arg2, i32 %arg3) {
define dso_local arm_aapcscc i32 @test1(ptr nocapture %arg, ptr nocapture readonly %arg1, ptr nocapture readonly %arg2, i32 %arg3) {
bb:
%tmp = icmp eq i32 %arg3, 0
br i1 %tmp, label %bb27, label %bb4
@@ -28,15 +28,15 @@
br i1 %tmp15, label %bb27, label %bb16

bb16: ; preds = %bb13
%tmp17 = getelementptr inbounds i32, i32* %arg1, i32 %tmp14
%tmp18 = load i32, i32* %tmp17, align 4
%tmp19 = getelementptr inbounds i32, i32* %arg2, i32 %tmp14
%tmp20 = load i32, i32* %tmp19, align 4
%tmp17 = getelementptr inbounds i32, ptr %arg1, i32 %tmp14
%tmp18 = load i32, ptr %tmp17, align 4
%tmp19 = getelementptr inbounds i32, ptr %arg2, i32 %tmp14
%tmp20 = load i32, ptr %tmp19, align 4
%tmp21 = xor i32 %tmp20, %tmp18
%tmp22 = getelementptr inbounds i32, i32* %arg, i32 %tmp14
%tmp23 = load i32, i32* %tmp22, align 4
%tmp22 = getelementptr inbounds i32, ptr %arg, i32 %tmp14
%tmp23 = load i32, ptr %tmp22, align 4
%tmp24 = add nsw i32 %tmp23, %tmp21
store i32 %tmp24, i32* %tmp22, align 4
store i32 %tmp24, ptr %tmp22, align 4
%tmp25 = add nuw i32 %tmp14, 1
%tmp26 = icmp eq i32 %tmp6, 1
br i1 %tmp26, label %bb27, label %bb57
@@ -49,69 +49,69 @@
%lsr.iv15 = phi i32 [ %lsr.iv.next16, %bb28 ], [ %start, %bb12 ]
%lsr.iv = phi i32 [ %lsr.iv.next, %bb28 ], [ 0, %bb12 ]
%tmp29 = phi i32 [ 0, %bb12 ], [ %tmp54, %bb28 ]
%0 = bitcast i32* %arg1 to i8*
%1 = bitcast i32* %arg2 to i8*
%2 = bitcast i32* %arg to i8*
%uglygep14 = getelementptr i8, i8* %0, i32 %lsr.iv
%uglygep1415 = bitcast i8* %uglygep14 to i32*
%scevgep617 = bitcast i32* %uglygep1415 to i32*
%tmp34 = load i32, i32* %scevgep617, align 4
%uglygep8 = getelementptr i8, i8* %1, i32 %lsr.iv
%uglygep89 = bitcast i8* %uglygep8 to i32*
%scevgep418 = bitcast i32* %uglygep89 to i32*
%tmp35 = load i32, i32* %scevgep418, align 4
%0 = bitcast ptr %arg1 to ptr
%1 = bitcast ptr %arg2 to ptr
%2 = bitcast ptr %arg to ptr
%uglygep14 = getelementptr i8, ptr %0, i32 %lsr.iv
%uglygep1415 = bitcast ptr %uglygep14 to ptr
%scevgep617 = bitcast ptr %uglygep1415 to ptr
%tmp34 = load i32, ptr %scevgep617, align 4
%uglygep8 = getelementptr i8, ptr %1, i32 %lsr.iv
%uglygep89 = bitcast ptr %uglygep8 to ptr
%scevgep418 = bitcast ptr %uglygep89 to ptr
%tmp35 = load i32, ptr %scevgep418, align 4
%tmp36 = xor i32 %tmp35, %tmp34
%uglygep2 = getelementptr i8, i8* %2, i32 %lsr.iv
%uglygep23 = bitcast i8* %uglygep2 to i32*
%scevgep219 = bitcast i32* %uglygep23 to i32*
%tmp37 = load i32, i32* %scevgep219, align 4
%uglygep2 = getelementptr i8, ptr %2, i32 %lsr.iv
%uglygep23 = bitcast ptr %uglygep2 to ptr
%scevgep219 = bitcast ptr %uglygep23 to ptr
%tmp37 = load i32, ptr %scevgep219, align 4
%tmp38 = add nsw i32 %tmp37, %tmp36
store i32 %tmp38, i32* %scevgep219, align 4
%uglygep33 = getelementptr i8, i8* %0, i32 %lsr.iv
%uglygep3334 = bitcast i8* %uglygep33 to i32*
%scevgep14 = getelementptr i32, i32* %uglygep3334, i32 1
%tmp39 = load i32, i32* %scevgep14, align 4
%uglygep27 = getelementptr i8, i8* %1, i32 %lsr.iv
%uglygep2728 = bitcast i8* %uglygep27 to i32*
%scevgep11 = getelementptr i32, i32* %uglygep2728, i32 1
%tmp40 = load i32, i32* %scevgep11, align 4
store i32 %tmp38, ptr %scevgep219, align 4
%uglygep33 = getelementptr i8, ptr %0, i32 %lsr.iv
%uglygep3334 = bitcast ptr %uglygep33 to ptr
%scevgep14 = getelementptr i32, ptr %uglygep3334, i32 1
%tmp39 = load i32, ptr %scevgep14, align 4
%uglygep27 = getelementptr i8, ptr %1, i32 %lsr.iv
%uglygep2728 = bitcast ptr %uglygep27 to ptr
%scevgep11 = getelementptr i32, ptr %uglygep2728, i32 1
%tmp40 = load i32, ptr %scevgep11, align 4
%tmp41 = xor i32 %tmp40, %tmp39
%uglygep20 = getelementptr i8, i8* %2, i32 %lsr.iv
%uglygep2021 = bitcast i8* %uglygep20 to i32*
%scevgep9 = getelementptr i32, i32* %uglygep2021, i32 1
%tmp42 = load i32, i32* %scevgep9, align 4
%uglygep20 = getelementptr i8, ptr %2, i32 %lsr.iv
%uglygep2021 = bitcast ptr %uglygep20 to ptr
%scevgep9 = getelementptr i32, ptr %uglygep2021, i32 1
%tmp42 = load i32, ptr %scevgep9, align 4
%tmp43 = add nsw i32 %tmp42, %tmp41
store i32 %tmp43, i32* %scevgep9, align 4
%uglygep30 = getelementptr i8, i8* %0, i32 %lsr.iv
%uglygep3031 = bitcast i8* %uglygep30 to i32*
%scevgep12 = getelementptr i32, i32* %uglygep3031, i32 2
%tmp44 = load i32, i32* %scevgep12, align 4
%uglygep24 = getelementptr i8, i8* %1, i32 %lsr.iv
%uglygep2425 = bitcast i8* %uglygep24 to i32*
%scevgep10 = getelementptr i32, i32* %uglygep2425, i32 2
%tmp45 = load i32, i32* %scevgep10, align 4
store i32 %tmp43, ptr %scevgep9, align 4
%uglygep30 = getelementptr i8, ptr %0, i32 %lsr.iv
%uglygep3031 = bitcast ptr %uglygep30 to ptr
%scevgep12 = getelementptr i32, ptr %uglygep3031, i32 2
%tmp44 = load i32, ptr %scevgep12, align 4
%uglygep24 = getelementptr i8, ptr %1, i32 %lsr.iv
%uglygep2425 = bitcast ptr %uglygep24 to ptr
%scevgep10 = getelementptr i32, ptr %uglygep2425, i32 2
%tmp45 = load i32, ptr %scevgep10, align 4
%tmp46 = xor i32 %tmp45, %tmp44
%uglygep17 = getelementptr i8, i8* %2, i32 %lsr.iv
%uglygep1718 = bitcast i8* %uglygep17 to i32*
%scevgep8 = getelementptr i32, i32* %uglygep1718, i32 2
%tmp47 = load i32, i32* %scevgep8, align 4
%uglygep17 = getelementptr i8, ptr %2, i32 %lsr.iv
%uglygep1718 = bitcast ptr %uglygep17 to ptr
%scevgep8 = getelementptr i32, ptr %uglygep1718, i32 2
%tmp47 = load i32, ptr %scevgep8, align 4
%tmp48 = add nsw i32 %tmp47, %tmp46
store i32 %tmp48, i32* %scevgep8, align 4
%uglygep11 = getelementptr i8, i8* %0, i32 %lsr.iv
%uglygep1112 = bitcast i8* %uglygep11 to i32*
%scevgep5 = getelementptr i32, i32* %uglygep1112, i32 3
%tmp49 = load i32, i32* %scevgep5, align 4
%uglygep5 = getelementptr i8, i8* %1, i32 %lsr.iv
%uglygep56 = bitcast i8* %uglygep5 to i32*
%scevgep3 = getelementptr i32, i32* %uglygep56, i32 3
%tmp50 = load i32, i32* %scevgep3, align 4
store i32 %tmp48, ptr %scevgep8, align 4
%uglygep11 = getelementptr i8, ptr %0, i32 %lsr.iv
%uglygep1112 = bitcast ptr %uglygep11 to ptr
%scevgep5 = getelementptr i32, ptr %uglygep1112, i32 3
%tmp49 = load i32, ptr %scevgep5, align 4
%uglygep5 = getelementptr i8, ptr %1, i32 %lsr.iv
%uglygep56 = bitcast ptr %uglygep5 to ptr
%scevgep3 = getelementptr i32, ptr %uglygep56, i32 3
%tmp50 = load i32, ptr %scevgep3, align 4
%tmp51 = xor i32 %tmp50, %tmp49
%uglygep = getelementptr i8, i8* %2, i32 %lsr.iv
%uglygep1 = bitcast i8* %uglygep to i32*
%scevgep1 = getelementptr i32, i32* %uglygep1, i32 3
%tmp52 = load i32, i32* %scevgep1, align 4
%uglygep = getelementptr i8, ptr %2, i32 %lsr.iv
%uglygep1 = bitcast ptr %uglygep to ptr
%scevgep1 = getelementptr i32, ptr %uglygep1, i32 3
%tmp52 = load i32, ptr %scevgep1, align 4
%tmp53 = add nsw i32 %tmp52, %tmp51
store i32 %tmp53, i32* %scevgep1, align 4
store i32 %tmp53, ptr %scevgep1, align 4
%tmp54 = add nuw i32 %tmp29, 4
%lsr.iv.next = add i32 %lsr.iv, 16
%loop.dec = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv15, i32 1)
@@ -120,29 +120,29 @@
br i1 %tmp56, label %bb28, label %bb13

bb57: ; preds = %bb16
%tmp58 = getelementptr inbounds i32, i32* %arg1, i32 %tmp25
%tmp59 = load i32, i32* %tmp58, align 4
%tmp60 = getelementptr inbounds i32, i32* %arg2, i32 %tmp25
%tmp61 = load i32, i32* %tmp60, align 4
%tmp58 = getelementptr inbounds i32, ptr %arg1, i32 %tmp25
%tmp59 = load i32, ptr %tmp58, align 4
%tmp60 = getelementptr inbounds i32, ptr %arg2, i32 %tmp25
%tmp61 = load i32, ptr %tmp60, align 4
%tmp62 = xor i32 %tmp61, %tmp59
%tmp63 = getelementptr inbounds i32, i32* %arg, i32 %tmp25
%tmp64 = load i32, i32* %tmp63, align 4
%tmp63 = getelementptr inbounds i32, ptr %arg, i32 %tmp25
%tmp64 = load i32, ptr %tmp63, align 4
%tmp65 = add nsw i32 %tmp64, %tmp62
store i32 %tmp65, i32* %tmp63, align 4
store i32 %tmp65, ptr %tmp63, align 4
%tmp66 = add nuw i32 %tmp14, 2
%tmp67 = icmp eq i32 %tmp6, 2
br i1 %tmp67, label %bb27, label %bb68

bb68: ; preds = %bb57
%tmp69 = getelementptr inbounds i32, i32* %arg1, i32 %tmp66
%tmp70 = load i32, i32* %tmp69, align 4
%tmp71 = getelementptr inbounds i32, i32* %arg2, i32 %tmp66
%tmp72 = load i32, i32* %tmp71, align 4
%tmp69 = getelementptr inbounds i32, ptr %arg1, i32 %tmp66
%tmp70 = load i32, ptr %tmp69, align 4
%tmp71 = getelementptr inbounds i32, ptr %arg2, i32 %tmp66
%tmp72 = load i32, ptr %tmp71, align 4
%tmp73 = xor i32 %tmp72, %tmp70
%tmp74 = getelementptr inbounds i32, i32* %arg, i32 %tmp66
%tmp75 = load i32, i32* %tmp74, align 4
%tmp74 = getelementptr inbounds i32, ptr %arg, i32 %tmp66
%tmp75 = load i32, ptr %tmp74, align 4
%tmp76 = add nsw i32 %tmp75, %tmp73
store i32 %tmp76, i32* %tmp74, align 4
store i32 %tmp76, ptr %tmp74, align 4
br label %bb27
}
