469 changes: 469 additions & 0 deletions llvm/lib/Target/ARM/MVETailPredication.cpp

Large diffs are not rendered by default.

385 changes: 385 additions & 0 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll

Large diffs are not rendered by default.

152 changes: 152 additions & 0 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
; RUN: opt -mtriple=armv8.1m.main -mattr=+mve -S -mve-tail-predication -disable-mve-tail-predication=false %s -o - | FileCheck %s

; TODO: Support extending loads
; CHECK-LABEL: mat_vec_sext_i16
; CHECK-NOT: call {{.*}} @llvm.arm.vctp
;
; Nested-loop input. The outer loop (for.cond1.preheader.us / middle.block)
; steps %i over the i16* entries of %A and the i32 slots of %C. The inner
; vector.body loop performs masked <4 x i16> loads from the current row and
; from %B, sign-extends both to <4 x i32>, multiplies them and accumulates.
; The FileCheck directives above expect no call to an @llvm.arm.vctp*
; intrinsic in the output for this function.
define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
entry:
; Skip both loops entirely when N == 0.
%cmp24 = icmp eq i32 %N, 0
br i1 %cmp24, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader

for.cond1.preheader.us.preheader: ; preds = %entry
; %broadcast.splat29 is N - 1 in every lane (bound for the lane predicate).
; Inner iteration count: %tmp2 = ((((N + 3) & -4) - 4) >> 2) + 1.
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert28 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat29 = shufflevector <4 x i32> %broadcast.splatinsert28, <4 x i32> undef, <4 x i32> zeroinitializer
%tmp = add i32 %n.vec, -4
%tmp1 = lshr i32 %tmp, 2
%tmp2 = add nuw nsw i32 %tmp1, 1
br label %for.cond1.preheader.us

for.cond1.preheader.us: ; preds = %middle.block, %for.cond1.preheader.us.preheader
; Outer-loop header: load the row pointer A[i] and seed the accumulator with
; C[i] in lane 0 and zero in the remaining lanes.
%i.025.us = phi i32 [ %inc10.us, %middle.block ], [ 0, %for.cond1.preheader.us.preheader ]
%arrayidx.us = getelementptr inbounds i16*, i16** %A, i32 %i.025.us
%tmp3 = load i16*, i16** %arrayidx.us, align 4
%arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.025.us
%arrayidx8.promoted.us = load i32, i32* %arrayidx8.us, align 4
%tmp4 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %arrayidx8.promoted.us, i32 0
; Hand the inner-loop iteration count to llvm.set.loop.iterations.
call void @llvm.set.loop.iterations.i32(i32 %tmp2)
br label %vector.body

vector.body: ; preds = %vector.body, %for.cond1.preheader.us
%index = phi i32 [ 0, %for.cond1.preheader.us ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ %tmp4, %for.cond1.preheader.us ], [ %tmp14, %vector.body ]
%tmp5 = phi i32 [ %tmp2, %for.cond1.preheader.us ], [ %tmp15, %vector.body ]
; %induction = <index, index+1, index+2, index+3>.
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp6 = getelementptr inbounds i16, i16* %tmp3, i32 %index
; Lane predicate: lane is active while (index + lane) ule (N - 1).
%tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat29
%tmp8 = bitcast i16* %tmp6 to <4 x i16>*
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tmp8, i32 2, <4 x i1> %tmp7, <4 x i16> undef)
%tmp9 = sext <4 x i16> %wide.masked.load to <4 x i32>
%tmp10 = getelementptr inbounds i16, i16* %B, i32 %index
%tmp11 = bitcast i16* %tmp10 to <4 x i16>*
%wide.masked.load30 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tmp11, i32 2, <4 x i1> %tmp7, <4 x i16> undef)
%tmp12 = sext <4 x i16> %wide.masked.load30 to <4 x i32>
%tmp13 = mul nsw <4 x i32> %tmp12, %tmp9
%tmp14 = add nsw <4 x i32> %tmp13, %vec.phi
%index.next = add i32 %index, 4
; Decrement the remaining-iteration counter by 1; loop while it is non-zero.
%tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp5, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
br i1 %tmp16, label %vector.body, label %middle.block

middle.block: ; preds = %vector.body
; Take the new sum only in lanes where %tmp7 is true (previous accumulator
; otherwise), add-reduce to a scalar and store it back to C[i].
%tmp17 = select <4 x i1> %tmp7, <4 x i32> %tmp14, <4 x i32> %vec.phi
%tmp18 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp17)
store i32 %tmp18, i32* %arrayidx8.us, align 4
%inc10.us = add nuw i32 %i.025.us, 1
%exitcond27 = icmp eq i32 %inc10.us, %N
br i1 %exitcond27, label %for.cond.cleanup, label %for.cond1.preheader.us

for.cond.cleanup: ; preds = %middle.block, %entry
ret void
}

; CHECK-LABEL: mat_vec_i32
; CHECK: phi
; CHECK: phi
; CHECK: phi
; CHECK: [[IV:%[^ ]+]] = phi i32 [ %N, %for.cond1.preheader.us ], [ [[REM:%[^ ]+]], %vector.body ]
; CHECK: [[REM]] = sub i32 [[IV]], 4
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REM]])
; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
;
; Nested-loop input using only <4 x i32> data. The outer loop steps %i over the
; i32* entries of %A and the i32 slots of %C; the inner vector.body loop does
; two masked <4 x i32> loads (row of A and %B), multiplies and accumulates.
; The FileCheck directives above expect a fourth phi in vector.body that
; starts at %N and is decremented by 4, a vctp32 call on the decremented value,
; and both masked loads using that vctp result as their mask.
define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
entry:
; Skip both loops entirely when N == 0.
%cmp23 = icmp eq i32 %N, 0
br i1 %cmp23, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader

for.cond1.preheader.us.preheader: ; preds = %entry
; %broadcast.splat28 is N - 1 in every lane (bound for the lane predicate).
; Inner iteration count: %tmp2 = ((((N + 3) & -4) - 4) >> 2) + 1.
%n.rnd.up = add i32 %N, 3
%n.vec = and i32 %n.rnd.up, -4
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert27 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat28 = shufflevector <4 x i32> %broadcast.splatinsert27, <4 x i32> undef, <4 x i32> zeroinitializer
%tmp = add i32 %n.vec, -4
%tmp1 = lshr i32 %tmp, 2
%tmp2 = add nuw nsw i32 %tmp1, 1
br label %for.cond1.preheader.us

for.cond1.preheader.us: ; preds = %middle.block, %for.cond1.preheader.us.preheader
; Outer-loop header: load the row pointer A[i] and seed the accumulator with
; C[i] in lane 0 and zero in the remaining lanes.
%i.024.us = phi i32 [ %inc9.us, %middle.block ], [ 0, %for.cond1.preheader.us.preheader ]
%arrayidx.us = getelementptr inbounds i32*, i32** %A, i32 %i.024.us
%tmp3 = load i32*, i32** %arrayidx.us, align 4
%arrayidx7.us = getelementptr inbounds i32, i32* %C, i32 %i.024.us
%arrayidx7.promoted.us = load i32, i32* %arrayidx7.us, align 4
%tmp4 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %arrayidx7.promoted.us, i32 0
; Hand the inner-loop iteration count to llvm.set.loop.iterations.
call void @llvm.set.loop.iterations.i32(i32 %tmp2)
br label %vector.body

vector.body: ; preds = %vector.body, %for.cond1.preheader.us
%index = phi i32 [ 0, %for.cond1.preheader.us ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ %tmp4, %for.cond1.preheader.us ], [ %tmp12, %vector.body ]
%tmp5 = phi i32 [ %tmp2, %for.cond1.preheader.us ], [ %tmp13, %vector.body ]
; %induction = <index, index+1, index+2, index+3>.
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp6 = getelementptr inbounds i32, i32* %tmp3, i32 %index
; Lane predicate: lane is active while (index + lane) ule (N - 1).
%tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat28
%tmp8 = bitcast i32* %tmp6 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %tmp7, <4 x i32> undef)
%tmp9 = getelementptr inbounds i32, i32* %B, i32 %index
%tmp10 = bitcast i32* %tmp9 to <4 x i32>*
%wide.masked.load29 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp10, i32 4, <4 x i1> %tmp7, <4 x i32> undef)
%tmp11 = mul nsw <4 x i32> %wide.masked.load29, %wide.masked.load
%tmp12 = add nsw <4 x i32> %vec.phi, %tmp11
%index.next = add i32 %index, 4
; Decrement the remaining-iteration counter by 1; loop while it is non-zero.
%tmp13 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp5, i32 1)
%tmp14 = icmp ne i32 %tmp13, 0
br i1 %tmp14, label %vector.body, label %middle.block

middle.block: ; preds = %vector.body
; Take the new sum only in lanes where %tmp7 is true (previous accumulator
; otherwise), add-reduce to a scalar and store it back to C[i].
%tmp15 = select <4 x i1> %tmp7, <4 x i32> %tmp12, <4 x i32> %vec.phi
%tmp16 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp15)
store i32 %tmp16, i32* %arrayidx7.us, align 4
%inc9.us = add nuw i32 %i.024.us, 1
%exitcond26 = icmp eq i32 %inc9.us, %N
br i1 %exitcond26, label %for.cond.cleanup, label %for.cond1.preheader.us

for.cond.cleanup: ; preds = %middle.block, %entry
ret void
}

; Function Attrs: argmemonly nounwind readonly willreturn
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #0

; Function Attrs: argmemonly nounwind readonly willreturn
declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #0

; Function Attrs: nounwind readnone willreturn
declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #1

; Function Attrs: noduplicate nounwind
declare void @llvm.set.loop.iterations.i32(i32) #2

; Function Attrs: noduplicate nounwind
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #2

attributes #0 = { argmemonly nounwind readonly willreturn }
attributes #1 = { nounwind readnone willreturn }
attributes #2 = { noduplicate nounwind }
54 changes: 54 additions & 0 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-narrow.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve,+lob %s -S -o - | FileCheck %s

; TODO: We should be able to generate a vctp for the loads.
; CHECK-LABEL: trunc_v4i32_v4i16
; CHECK-NOT: call {{.*}} @llvm.arm.vctp
;
; Single-loop input: two masked <4 x i32> loads are multiplied, truncated to
; <4 x i16> and written with a masked <4 x i16> store, all under one predicate.
; The negative directive above matches a call to any @llvm.arm.vctp* intrinsic
; whatever its result type, so the test fails if the pass starts
; tail-predicating this loop.
define void @trunc_v4i32_v4i16(i32* readonly %a, i32* readonly %b, i16* %c, i32 %N) {
entry:
; Iteration count: %tmp13 = (((((N + 3) >> 2) << 2) - 4) >> 2) + 1.
; The loop is skipped when N == 0.
%cmp8 = icmp eq i32 %N, 0
%tmp8 = add i32 %N, 3
%tmp9 = lshr i32 %tmp8, 2
%tmp10 = shl nuw i32 %tmp9, 2
%tmp11 = add i32 %tmp10, -4
%tmp12 = lshr i32 %tmp11, 2
%tmp13 = add nuw nsw i32 %tmp12, 1
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
; %broadcast.splat11 is N - 1 in every lane (bound for the lane predicate).
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
; Hand the iteration count to llvm.set.loop.iterations.
call void @llvm.set.loop.iterations.i32(i32 %tmp13)
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
; %induction = <index, index+1, index+2, index+3>.
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
; Lane predicate: lane is active while (index + lane) ule (N - 1).
%tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
%tmp4 = bitcast i32* %tmp3 to <4 x i32>*
%wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%mul = mul nsw <4 x i32> %wide.masked.load2, %wide.masked.load
; Narrow the product to 16-bit lanes before the masked store.
%trunc = trunc <4 x i32> %mul to <4 x i16>
%tmp6 = getelementptr inbounds i16, i16* %c, i32 %index
%tmp7 = bitcast i16* %tmp6 to <4 x i16>*
tail call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %trunc, <4 x i16>* %tmp7, i32 4, <4 x i1> %tmp1)
%index.next = add i32 %index, 4
; Decrement the remaining-iteration counter by 1; loop while it is non-zero.
%tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}

declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32 immarg, <4 x i1>)
declare void @llvm.set.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
505 changes: 505 additions & 0 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll

Large diffs are not rendered by default.

173 changes: 173 additions & 0 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve,+lob %s -S -o - | FileCheck %s

; CHECK-LABEL: expand_v8i16_v8i32
; CHECK-NOT: call {{.*}} @llvm.arm.vctp
;
; Single-loop input: two masked <8 x i16> loads are zero-extended to
; <8 x i32>, multiplied and written with a masked <8 x i32> store.
; The negative directive above matches a call to any @llvm.arm.vctp* intrinsic
; whatever its result type (the vctp intrinsics return a vector of i1, not
; i32), so it actually fails if a vctp is emitted for this loop.
define void @expand_v8i16_v8i32(i16* noalias nocapture readonly %a, i16* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
; Iteration count: %tmp13 = (((((N + 7) >> 3) << 3) - 8) >> 3) + 1.
; The loop is skipped when N == 0.
%cmp8 = icmp eq i32 %N, 0
%tmp8 = add i32 %N, 7
%tmp9 = lshr i32 %tmp8, 3
%tmp10 = shl nuw i32 %tmp9, 3
%tmp11 = add i32 %tmp10, -8
%tmp12 = lshr i32 %tmp11, 3
%tmp13 = add nuw nsw i32 %tmp12, 1
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
; %broadcast.splat11 is N - 1 in every lane (bound for the lane predicate).
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
; Hand the iteration count to llvm.set.loop.iterations.
call void @llvm.set.loop.iterations.i32(i32 %tmp13)
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
; %induction = <index, index+1, ..., index+7>.
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tmp = getelementptr inbounds i16, i16* %a, i32 %index
; Lane predicate: lane is active while (index + lane) ule (N - 1).
%tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
%tmp2 = bitcast i16* %tmp to <8 x i16>*
%wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
%tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
%tmp4 = bitcast i16* %tmp3 to <8 x i16>*
%wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
; Widen both operands to 32-bit lanes; the result vector is <8 x i32>.
%expand.1 = zext <8 x i16> %wide.masked.load to <8 x i32>
%expand.2 = zext <8 x i16> %wide.masked.load2 to <8 x i32>
%mul = mul nsw <8 x i32> %expand.2, %expand.1
%tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
%tmp7 = bitcast i32* %tmp6 to <8 x i32>*
tail call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %mul, <8 x i32>* %tmp7, i32 4, <8 x i1> %tmp1)
%index.next = add i32 %index, 8
; Decrement the remaining-iteration counter by 1; loop while it is non-zero.
%tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}

; CHECK-LABEL: expand_v8i16_v4i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS_REM:%[^ ]+]], %vector.body ]
; CHECK: [[ELEMS_REM]] = sub i32 [[ELEMS]], 8
; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS_REM]])
; CHECK: tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: %store.pred = icmp ule <4 x i32> %induction.store
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %store.pred)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %store.pred)
;
; Single-loop input with two predicates of different widths: an <8 x i1>
; predicate (%tmp1) on the <8 x i16> loads and a separate <4 x i1> predicate
; (%store.pred) on the two <4 x i32> stores. The FileCheck directives above
; expect a vctp16 on a masked load while both stores keep the original
; icmp-based %store.pred.
define void @expand_v8i16_v4i32(i16* readonly %a, i16* readonly %b, i32* %c, i32* %d, i32 %N) {
entry:
; Iteration count: %tmp13 = (((((N + 7) >> 3) << 3) - 8) >> 3) + 1.
; The loop is skipped when N == 0.
%cmp8 = icmp eq i32 %N, 0
%tmp8 = add i32 %N, 7
%tmp9 = lshr i32 %tmp8, 3
%tmp10 = shl nuw i32 %tmp9, 3
%tmp11 = add i32 %tmp10, -8
%tmp12 = lshr i32 %tmp11, 3
%tmp13 = add nuw nsw i32 %tmp12, 1
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
; N - 1 is splatted twice: 8 lanes for the load predicate and 4 lanes for the
; store predicate.
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
%broadcast.splatinsert10.store = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11.store = shufflevector <4 x i32> %broadcast.splatinsert10.store, <4 x i32> undef, <4 x i32> zeroinitializer
; Hand the iteration count to llvm.set.loop.iterations.
call void @llvm.set.loop.iterations.i32(i32 %tmp13)
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
; %index advances by 8 per iteration (loads); %store.idx advances by 4 (stores).
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%store.idx = phi i32 [ 0, %vector.ph ], [ %store.idx.next, %vector.body ]
%tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tmp = getelementptr inbounds i16, i16* %a, i32 %index
; Load predicate: lane is active while (index + lane) ule (N - 1).
%tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
%tmp2 = bitcast i16* %tmp to <8 x i16>*
; NOTE(review): %wide.masked.load has no uses below; both halves are
; extracted from %wide.masked.load2 - confirm that is intended.
%wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
%tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
%tmp4 = bitcast i16* %tmp3 to <8 x i16>*
%wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
; Split the second load into lanes 0-3 and lanes 4-7, then widen each half.
%extract.2.low = shufflevector <8 x i16> %wide.masked.load2, <8 x i16> undef, < 4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract.2.high = shufflevector <8 x i16> %wide.masked.load2, <8 x i16> undef, < 4 x i32> <i32 4, i32 5, i32 6, i32 7>
%expand.1 = zext <4 x i16> %extract.2.low to <4 x i32>
%expand.2 = zext <4 x i16> %extract.2.high to <4 x i32>
%mul = mul nsw <4 x i32> %expand.2, %expand.1
; NOTE(review): %sub is produced by a mul (operands swapped), not a sub.
%sub = mul nsw <4 x i32> %expand.1, %expand.2
%broadcast.splatinsert.store = insertelement <4 x i32> undef, i32 %store.idx, i32 0
%broadcast.splat.store = shufflevector <4 x i32> %broadcast.splatinsert.store, <4 x i32> undef, <4 x i32> zeroinitializer
%induction.store = add <4 x i32> %broadcast.splat.store, <i32 0, i32 1, i32 2, i32 3>
; Store predicate: lane is active while (store.idx + lane) ule (N - 1).
%store.pred = icmp ule <4 x i32> %induction.store, %broadcast.splat11.store
%tmp6 = getelementptr inbounds i32, i32* %c, i32 %store.idx
%tmp7 = bitcast i32* %tmp6 to <4 x i32>*
tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %store.pred)
%gep = getelementptr inbounds i32, i32* %d, i32 %store.idx
%cast.gep = bitcast i32* %gep to <4 x i32>*
tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %sub, <4 x i32>* %cast.gep, i32 4, <4 x i1> %store.pred)
%store.idx.next = add i32 %store.idx, 4
%index.next = add i32 %index, 8
; Decrement the remaining-iteration counter by 1; loop while it is non-zero.
%tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}

; CHECK-LABEL: expand_v4i32_v4i64
; CHECK-NOT: call {{.*}} @llvm.arm.vctp
;
; Single-loop input: two masked <4 x i32> loads are zero-extended to
; <4 x i64>, multiplied and written with a masked <4 x i64> store.
; The negative directive above matches a call to any @llvm.arm.vctp* intrinsic
; whatever its result type (the vctp intrinsics return a vector of i1, not
; i32), so it actually fails if a vctp is emitted for this loop.
define void @expand_v4i32_v4i64(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i64* noalias nocapture %c, i32 %N) {
entry:
; Iteration count: %tmp13 = (((((N + 3) >> 2) << 2) - 4) >> 2) + 1.
; The loop is skipped when N == 0.
%cmp8 = icmp eq i32 %N, 0
%tmp8 = add i32 %N, 3
%tmp9 = lshr i32 %tmp8, 2
%tmp10 = shl nuw i32 %tmp9, 2
%tmp11 = add i32 %tmp10, -4
%tmp12 = lshr i32 %tmp11, 2
%tmp13 = add nuw nsw i32 %tmp12, 1
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
; %broadcast.splat11 is N - 1 in every lane (bound for the lane predicate).
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
; Hand the iteration count to llvm.set.loop.iterations.
call void @llvm.set.loop.iterations.i32(i32 %tmp13)
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
; %induction = <index, index+1, index+2, index+3>.
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
; Lane predicate: lane is active while (index + lane) ule (N - 1).
%tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
%tmp2 = bitcast i32* %tmp to <4 x i32>*
%wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
%tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
%tmp4 = bitcast i32* %tmp3 to <4 x i32>*
%wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
; Widen both operands to 64-bit lanes; the result vector is <4 x i64>.
%expand.1 = zext <4 x i32> %wide.masked.load to <4 x i64>
%expand.2 = zext <4 x i32> %wide.masked.load2 to <4 x i64>
%mul = mul nsw <4 x i64> %expand.2, %expand.1
%tmp6 = getelementptr inbounds i64, i64* %c, i32 %index
%tmp7 = bitcast i64* %tmp6 to <4 x i64>*
tail call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %mul, <4 x i64>* %tmp7, i32 4, <4 x i1> %tmp1)
%index.next = add i32 %index, 4
; Decrement the remaining-iteration counter by 1; loop while it is non-zero.
%tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}

declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32 immarg, <8 x i1>)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32 immarg, <4 x i1>)
declare void @llvm.set.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
118 changes: 118 additions & 0 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve %s -S -o - | FileCheck %s

; CHECK-LABEL: reduction_i32
; CHECK: phi i32 [ 0, %entry ]
; CHECK: phi <8 x i16> [ zeroinitializer, %entry ]
; CHECK: phi i32
; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ]
; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp6, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
;
; Reduction input: the loop sums masked <8 x i16> loads from %A and %B into a
; vector accumulator; middle.block selects on the loop predicate and reduces
; the vector to a single i16 with a shuffle/add sequence.
; The FileCheck directives above expect an element-count phi starting at %N
; that is decremented by 8, a vctp16 call on the decremented value, and both
; masked loads using that vctp result as their mask.
define i16 @reduction_i32(i16* nocapture readonly %A, i16* nocapture readonly %B, i32 %N) {
entry:
; %broadcast.splat2 is N - 1 in every lane (bound for the lane predicate).
; Iteration count: %2 = (((((N - 1) + 8) & -8) - 8) >> 3) + 1.
; There is no N == 0 guard in this function.
%tmp = add i32 %N, -1
%n.rnd.up = add nuw nsw i32 %tmp, 8
%n.vec = and i32 %n.rnd.up, -8
%broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0
%broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer
%0 = add i32 %n.vec, -8
%1 = lshr i32 %0, 3
%2 = add nuw nsw i32 %1, 1
; Hand the iteration count to llvm.set.loop.iterations.
call void @llvm.set.loop.iterations.i32(i32 %2)
br label %vector.body

vector.body: ; preds = %vector.body, %entry
%index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %tmp8, %vector.body ]
%3 = phi i32 [ %2, %entry ], [ %4, %vector.body ]
; %induction = <index, index+1, ..., index+7>.
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
; Lane predicate: lane is active while (index + lane) ule (N - 1).
%tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
%tmp4 = bitcast i16* %tmp2 to <8 x i16>*
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
%tmp5 = getelementptr inbounds i16, i16* %B, i32 %index
%tmp6 = bitcast i16* %tmp5 to <8 x i16>*
%wide.masked.load3 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp6, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
; accumulator' = A-lanes + accumulator + B-lanes.
%tmp7 = add <8 x i16> %wide.masked.load, %vec.phi
%tmp8 = add <8 x i16> %tmp7, %wide.masked.load3
%index.next = add nuw nsw i32 %index, 8
; Decrement the remaining-iteration counter by 1; loop while it is non-zero.
%4 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %3, i32 1)
%5 = icmp ne i32 %4, 0
br i1 %5, label %vector.body, label %middle.block

middle.block: ; preds = %vector.body
; Single-entry phis carry the accumulator, predicate and new sum out of the loop.
%vec.phi.lcssa = phi <8 x i16> [ %vec.phi, %vector.body ]
%.lcssa3 = phi <8 x i1> [ %tmp3, %vector.body ]
%.lcssa = phi <8 x i16> [ %tmp8, %vector.body ]
; Take the new sum only in lanes where the predicate is true.
%tmp10 = select <8 x i1> %.lcssa3, <8 x i16> %.lcssa, <8 x i16> %vec.phi.lcssa
; Fold lanes 4-7 onto 0-3, then 2-3 onto 0-1, then 1 onto 0; lane 0 is the result.
%rdx.shuf = shufflevector <8 x i16> %tmp10, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx = add <8 x i16> %rdx.shuf, %tmp10
%rdx.shuf4 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx5 = add <8 x i16> %rdx.shuf4, %bin.rdx
%rdx.shuf6 = shufflevector <8 x i16> %bin.rdx5, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx7 = add <8 x i16> %rdx.shuf6, %bin.rdx5
%tmp11 = extractelement <8 x i16> %bin.rdx7, i32 0
ret i16 %tmp11
}

; CHECK-LABEL: reduction_i32_with_scalar
; CHECK: phi i32 [ 0, %entry ]
; CHECK: phi <8 x i16> [ zeroinitializer, %entry ]
; CHECK: phi i32
; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ]
; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
;
; Reduction input with a scalar operand: each iteration adds a splat of the
; i16 argument %B and one masked <8 x i16> load from %A into a vector
; accumulator; middle.block selects on the loop predicate and reduces the
; vector to a single i16 with a shuffle/add sequence.
; The FileCheck directives above expect an element-count phi starting at %N
; that is decremented by 8, a vctp16 call on the decremented value, and the
; masked load using that vctp result as its mask.
define i16 @reduction_i32_with_scalar(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr {
entry:
; %broadcast.splat2 is N - 1 in every lane; %broadcast.splat4 is %B in every lane.
; Iteration count: %2 = (((((N - 1) + 8) & -8) - 8) >> 3) + 1.
; There is no N == 0 guard in this function.
%tmp = add i32 %N, -1
%n.rnd.up = add nuw nsw i32 %tmp, 8
%n.vec = and i32 %n.rnd.up, -8
%broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0
%broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer
%broadcast.splatinsert3 = insertelement <8 x i16> undef, i16 %B, i32 0
%broadcast.splat4 = shufflevector <8 x i16> %broadcast.splatinsert3, <8 x i16> undef, <8 x i32> zeroinitializer
%0 = add i32 %n.vec, -8
%1 = lshr i32 %0, 3
%2 = add nuw nsw i32 %1, 1
; Hand the iteration count to llvm.set.loop.iterations.
call void @llvm.set.loop.iterations.i32(i32 %2)
br label %vector.body

vector.body: ; preds = %vector.body, %entry
%index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
%vec.phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %tmp6, %vector.body ]
%3 = phi i32 [ %2, %entry ], [ %4, %vector.body ]
; %induction = <index, index+1, ..., index+7>.
%broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
%induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
; Lane predicate: lane is active while (index + lane) ule (N - 1).
%tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
%tmp4 = bitcast i16* %tmp2 to <8 x i16>*
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
; accumulator' = accumulator + splat(B) + A-lanes.
%tmp5 = add <8 x i16> %vec.phi, %broadcast.splat4
%tmp6 = add <8 x i16> %tmp5, %wide.masked.load
%index.next = add nuw nsw i32 %index, 8
; Decrement the remaining-iteration counter by 1; loop while it is non-zero.
%4 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %3, i32 1)
%5 = icmp ne i32 %4, 0
br i1 %5, label %vector.body, label %middle.block

middle.block: ; preds = %vector.body
; Take the new sum only in lanes where %tmp3 is true (previous accumulator otherwise).
%tmp8 = select <8 x i1> %tmp3, <8 x i16> %tmp6, <8 x i16> %vec.phi
; Fold lanes 4-7 onto 0-3, then 2-3 onto 0-1, then 1 onto 0; lane 0 is the result.
%rdx.shuf = shufflevector <8 x i16> %tmp8, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx = add <8 x i16> %rdx.shuf, %tmp8
%rdx.shuf5 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx6 = add <8 x i16> %rdx.shuf5, %bin.rdx
%rdx.shuf7 = shufflevector <8 x i16> %bin.rdx6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx8 = add <8 x i16> %rdx.shuf7, %bin.rdx6
%tmp9 = extractelement <8 x i16> %bin.rdx8, i32 0
ret i16 %tmp9
}

declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
declare void @llvm.set.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)


118 changes: 118 additions & 0 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-unroll.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve,+lob %s -S -o - | FileCheck %s

; TODO: The unrolled pattern is preventing the transform
; CHECK-LABEL: mul_v16i8_unroll
; CHECK-NOT: call {{.*}} @llvm.arm.vctp
;
; Input whose vector.body holds two copies of a masked <16 x i8>
; load/load/mul/store sequence (at %index and at %index.next), followed by a
; one-shot epilogue block for an odd remaining iteration.
; The negative directive above spells the intrinsic name correctly and matches
; a call to any @llvm.arm.vctp* intrinsic whatever its result type, so the test
; fails if a vctp is emitted anywhere in this function.
define void @mul_v16i8_unroll(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) {
entry:
; Iteration count: %tmp13 = (((((N + 15) >> 4) << 4) - 16) >> 4) + 1.
; Everything is skipped when N == 0.
%cmp8 = icmp eq i32 %N, 0
%tmp8 = add i32 %N, 15
%tmp9 = lshr i32 %tmp8, 4
%tmp10 = shl nuw i32 %tmp9, 4
%tmp11 = add i32 %tmp10, -16
%tmp12 = lshr i32 %tmp11, 4
%tmp13 = add nuw nsw i32 %tmp12, 1
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
; %broadcast.splat11 is N - 1 in every lane (bound for every lane predicate).
%trip.count.minus.1 = add i32 %N, -1
%broadcast.splatinsert10 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
%broadcast.splat11 = shufflevector <16 x i32> %broadcast.splatinsert10, <16 x i32> undef, <16 x i32> zeroinitializer
; %xtraiter is the low bit of the iteration count; when %tmp12 is 0 the
; two-copy loop is bypassed.
%xtraiter = and i32 %tmp13, 1
%0 = icmp ult i32 %tmp12, 1
br i1 %0, label %for.cond.cleanup.loopexit.unr-lcssa, label %vector.ph.new

vector.ph.new: ; preds = %vector.ph
; llvm.set.loop.iterations receives %tmp13, while the loop counter phi
; (%niter) starts from %unroll_iter = %tmp13 - %xtraiter.
call void @llvm.set.loop.iterations.i32(i32 %tmp13)
%unroll_iter = sub i32 %tmp13, %xtraiter
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph.new
%index = phi i32 [ 0, %vector.ph.new ], [ %index.next.1, %vector.body ]
%niter = phi i32 [ %unroll_iter, %vector.ph.new ], [ %niter.nsub.1, %vector.body ]
; First copy: lanes index .. index+15 under predicate %tmp1.
%broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
%broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
%induction = add <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tmp = getelementptr inbounds i8, i8* %a, i32 %index
%tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
%tmp2 = bitcast i8* %tmp to <16 x i8>*
%wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %tmp1, <16 x i8> undef)
%tmp3 = getelementptr inbounds i8, i8* %b, i32 %index
%tmp4 = bitcast i8* %tmp3 to <16 x i8>*
%wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4, i32 4, <16 x i1> %tmp1, <16 x i8> undef)
%mul = mul nsw <16 x i8> %wide.masked.load2, %wide.masked.load
%tmp6 = getelementptr inbounds i8, i8* %c, i32 %index
%tmp7 = bitcast i8* %tmp6 to <16 x i8>*
tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %tmp1)
%index.next = add nuw nsw i32 %index, 16
; The first decrement is a plain sub; only the second one below uses
; llvm.loop.decrement.reg.
%niter.nsub = sub i32 %niter, 1
; Second copy: lanes index.next .. index.next+15 under predicate %tmp1.1.
%broadcast.splatinsert.1 = insertelement <16 x i32> undef, i32 %index.next, i32 0
%broadcast.splat.1 = shufflevector <16 x i32> %broadcast.splatinsert.1, <16 x i32> undef, <16 x i32> zeroinitializer
%induction.1 = add <16 x i32> %broadcast.splat.1, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tmp.1 = getelementptr inbounds i8, i8* %a, i32 %index.next
%tmp1.1 = icmp ule <16 x i32> %induction.1, %broadcast.splat11
%tmp2.1 = bitcast i8* %tmp.1 to <16 x i8>*
%wide.masked.load.1 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2.1, i32 4, <16 x i1> %tmp1.1, <16 x i8> undef)
%tmp3.1 = getelementptr inbounds i8, i8* %b, i32 %index.next
%tmp4.1 = bitcast i8* %tmp3.1 to <16 x i8>*
%wide.masked.load2.1 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4.1, i32 4, <16 x i1> %tmp1.1, <16 x i8> undef)
%mul.1 = mul nsw <16 x i8> %wide.masked.load2.1, %wide.masked.load.1
%tmp6.1 = getelementptr inbounds i8, i8* %c, i32 %index.next
%tmp7.1 = bitcast i8* %tmp6.1 to <16 x i8>*
tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul.1, <16 x i8>* %tmp7.1, i32 4, <16 x i1> %tmp1.1)
%index.next.1 = add i32 %index.next, 16
%niter.nsub.1 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %niter.nsub, i32 1)
%niter.ncmp.1 = icmp ne i32 %niter.nsub.1, 0
br i1 %niter.ncmp.1, label %vector.body, label %for.cond.cleanup.loopexit.unr-lcssa.loopexit

for.cond.cleanup.loopexit.unr-lcssa.loopexit: ; preds = %vector.body
%index.unr.ph = phi i32 [ %index.next.1, %vector.body ]
%tmp14.unr.ph = phi i32 [ -2, %vector.body ]
br label %for.cond.cleanup.loopexit.unr-lcssa

for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.cond.cleanup.loopexit.unr-lcssa.loopexit, %vector.ph
; Run the epilogue copy only when the iteration count was odd (%xtraiter != 0).
%index.unr = phi i32 [ 0, %vector.ph ], [ %index.unr.ph, %for.cond.cleanup.loopexit.unr-lcssa.loopexit ]
%tmp14.unr = phi i32 [ %tmp13, %vector.ph ], [ %tmp14.unr.ph, %for.cond.cleanup.loopexit.unr-lcssa.loopexit ]
%lcmp.mod = icmp ne i32 %xtraiter, 0
br i1 %lcmp.mod, label %vector.body.epil.preheader, label %for.cond.cleanup.loopexit

vector.body.epil.preheader: ; preds = %for.cond.cleanup.loopexit.unr-lcssa
br label %vector.body.epil

vector.body.epil: ; preds = %vector.body.epil.preheader
; Epilogue: one more masked load/load/mul/store at %index.epil; the block
; ends in an unconditional branch, so it executes at most once.
%index.epil = phi i32 [ %index.unr, %vector.body.epil.preheader ]
%tmp14.epil = phi i32 [ %tmp14.unr, %vector.body.epil.preheader ]
%broadcast.splatinsert.epil = insertelement <16 x i32> undef, i32 %index.epil, i32 0
%broadcast.splat.epil = shufflevector <16 x i32> %broadcast.splatinsert.epil, <16 x i32> undef, <16 x i32> zeroinitializer
%induction.epil = add <16 x i32> %broadcast.splat.epil, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tmp.epil = getelementptr inbounds i8, i8* %a, i32 %index.epil
%tmp1.epil = icmp ule <16 x i32> %induction.epil, %broadcast.splat11
%tmp2.epil = bitcast i8* %tmp.epil to <16 x i8>*
%wide.masked.load.epil = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2.epil, i32 4, <16 x i1> %tmp1.epil, <16 x i8> undef)
%tmp3.epil = getelementptr inbounds i8, i8* %b, i32 %index.epil
%tmp4.epil = bitcast i8* %tmp3.epil to <16 x i8>*
%wide.masked.load2.epil = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4.epil, i32 4, <16 x i1> %tmp1.epil, <16 x i8> undef)
%mul.epil = mul nsw <16 x i8> %wide.masked.load2.epil, %wide.masked.load.epil
%tmp6.epil = getelementptr inbounds i8, i8* %c, i32 %index.epil
%tmp7.epil = bitcast i8* %tmp6.epil to <16 x i8>*
tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul.epil, <16 x i8>* %tmp7.epil, i32 4, <16 x i1> %tmp1.epil)
%index.next.epil = add i32 %index.epil, 16
%tmp15.epil = add nuw nsw i32 %tmp14.epil, -1
%tmp16.epil = icmp ne i32 %tmp15.epil, 0
br label %for.cond.cleanup.loopexit.epilog-lcssa

for.cond.cleanup.loopexit.epilog-lcssa: ; preds = %vector.body.epil
br label %for.cond.cleanup.loopexit

for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.cond.cleanup.loopexit.epilog-lcssa
br label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
ret void
}

declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #1
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) #2
declare void @llvm.set.loop.iterations.i32(i32) #3
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3