diff --git a/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
new file mode 100644
index 0000000000000..9efe97004fd92
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
@@ -0,0 +1,199 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple aarch64-none-linux-gnu < %s | FileCheck %s
+
+; Codegen baseline for a two-level loop nest (an outer loop over %indvars.iv30
+; wrapping a vectorized inner loop and its scalar remainder loop) computing
+;   out[i * limit + j] += y[i] * y[j]
+; The outer-loop scalar y[i] (w15 in the expected output) is splatted for the
+; vector body. Its register is written only in the outer-loop header, yet the
+; expected output places the splat (dup v0.8h, w15) inside the inner vector
+; loop, so it is re-executed on every inner iteration. Presumably this pins the
+; current behaviour ahead of MachineLICM hoisting out of sub-loops (going by
+; the file name); regenerate the assertions with update_llc_test_checks.py if
+; that changes.
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable
+define dso_local void @foo(i32 noundef %limit, ptr nocapture noundef %out, ptr nocapture noundef readonly %y) local_unnamed_addr #0 {
+; CHECK-LABEL: foo:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    cmp w0, #1
+; CHECK-NEXT:    b.lt .LBB0_10
+; CHECK-NEXT:  // %bb.1: // %for.cond1.preheader.us.preheader
+; CHECK-NEXT:    mov w10, w0
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    mov x9, xzr
+; CHECK-NEXT:    and x11, x10, #0xfffffff0
+; CHECK-NEXT:    add x12, x1, #32
+; CHECK-NEXT:    ubfiz x13, x0, #2, #32
+; CHECK-NEXT:    add x14, x2, #16
+; CHECK-NEXT:    b .LBB0_3
+; CHECK-NEXT:    .p2align 5, , 16
+; CHECK-NEXT:  .LBB0_2: // %for.cond1.for.cond.cleanup3_crit_edge.us
+; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    add x9, x9, #1
+; CHECK-NEXT:    add x12, x12, x13
+; CHECK-NEXT:    add x8, x8, x10
+; CHECK-NEXT:    cmp x9, x10
+; CHECK-NEXT:    b.eq .LBB0_10
+; CHECK-NEXT:  .LBB0_3: // %for.cond1.preheader.us
+; CHECK-NEXT:    // =>This Loop Header: Depth=1
+; CHECK-NEXT:    // Child Loop BB0_6 Depth 2
+; CHECK-NEXT:    // Child Loop BB0_9 Depth 2
+; CHECK-NEXT:    ldrsh w15, [x2, x9, lsl #1]
+; CHECK-NEXT:    cmp w0, #16
+; CHECK-NEXT:    b.hs .LBB0_5
+; CHECK-NEXT:  // %bb.4: // in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    mov x18, xzr
+; CHECK-NEXT:    b .LBB0_8
+; CHECK-NEXT:    .p2align 5, , 16
+; CHECK-NEXT:  .LBB0_5: // %vector.ph
+; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    mov x16, x14
+; CHECK-NEXT:    mov x17, x12
+; CHECK-NEXT:    mov x18, x11
+; CHECK-NEXT:    .p2align 5, , 16
+; CHECK-NEXT:  .LBB0_6: // %vector.body
+; CHECK-NEXT:    // Parent Loop BB0_3 Depth=1
+; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    dup v0.8h, w15
+; CHECK-NEXT:    subs x18, x18, #16
+; CHECK-NEXT:    ldp q1, q2, [x16, #-16]
+; CHECK-NEXT:    add x16, x16, #32
+; CHECK-NEXT:    ldp q4, q3, [x17, #-32]
+; CHECK-NEXT:    smlal v4.4s, v0.4h, v1.4h
+; CHECK-NEXT:    smlal2 v3.4s, v0.8h, v1.8h
+; CHECK-NEXT:    ldp q6, q5, [x17]
+; CHECK-NEXT:    smlal v6.4s, v0.4h, v2.4h
+; CHECK-NEXT:    smlal2 v5.4s, v0.8h, v2.8h
+; CHECK-NEXT:    stp q4, q3, [x17, #-32]
+; CHECK-NEXT:    stp q6, q5, [x17], #64
+; CHECK-NEXT:    b.ne .LBB0_6
+; CHECK-NEXT:  // %bb.7: // %middle.block
+; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    cmp x11, x10
+; CHECK-NEXT:    mov x18, x11
+; CHECK-NEXT:    b.eq .LBB0_2
+; CHECK-NEXT:  .LBB0_8: // %for.body4.us.preheader
+; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    add x17, x18, x8
+; CHECK-NEXT:    sub x16, x10, x18
+; CHECK-NEXT:    add x18, x2, x18, lsl #1
+; CHECK-NEXT:    add x17, x1, x17, lsl #2
+; CHECK-NEXT:    .p2align 5, , 16
+; CHECK-NEXT:  .LBB0_9: // %for.body4.us
+; CHECK-NEXT:    // Parent Loop BB0_3 Depth=1
+; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    ldrsh w3, [x18], #2
+; CHECK-NEXT:    subs x16, x16, #1
+; CHECK-NEXT:    ldr w4, [x17]
+; CHECK-NEXT:    madd w3, w3, w15, w4
+; CHECK-NEXT:    str w3, [x17], #4
+; CHECK-NEXT:    b.ne .LBB0_9
+; CHECK-NEXT:    b .LBB0_2
+; CHECK-NEXT:  .LBB0_10: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %cmp26 = icmp sgt i32 %limit, 0
+  br i1 %cmp26, label %for.cond1.preheader.us.preheader, label %for.cond.cleanup
+
+for.cond1.preheader.us.preheader:                 ; preds = %entry
+  %0 = zext i32 %limit to i64
+  %wide.trip.count34 = zext i32 %limit to i64
+  %min.iters.check = icmp ult i32 %limit, 16
+  %n.vec = and i64 %wide.trip.count34, 4294967280
+  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count34
+  br label %for.cond1.preheader.us
+
+for.cond1.preheader.us:                           ; preds = %for.cond1.preheader.us.preheader, %for.cond1.for.cond.cleanup3_crit_edge.us
+  %indvars.iv30 = phi i64 [ 0, %for.cond1.preheader.us.preheader ], [ %indvars.iv.next31, %for.cond1.for.cond.cleanup3_crit_edge.us ]
+  %arrayidx.us = getelementptr inbounds i16, ptr %y, i64 %indvars.iv30
+  %1 = load i16, ptr %arrayidx.us, align 2, !tbaa !6
+  %conv.us = sext i16 %1 to i32
+  %2 = mul nsw i64 %indvars.iv30, %0
+  br i1 %min.iters.check, label %for.body4.us.preheader, label %vector.ph
+
+vector.ph:                                        ; preds = %for.cond1.preheader.us
+  %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %conv.us, i64 0
+  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
+  %broadcast.splatinsert37 = insertelement <8 x i32> poison, i32 %conv.us, i64 0
+  %broadcast.splat38 = shufflevector <8 x i32> %broadcast.splatinsert37, <8 x i32> poison, <8 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %3 = getelementptr inbounds i16, ptr %y, i64 %index
+  %wide.load = load <8 x i16>, ptr %3, align 2, !tbaa !6
+  %4 = getelementptr inbounds i16, ptr %3, i64 8
+  %wide.load36 = load <8 x i16>, ptr %4, align 2, !tbaa !6
+  %5 = sext <8 x i16> %wide.load to <8 x i32>
+  %6 = sext <8 x i16> %wide.load36 to <8 x i32>
+  %7 = mul nsw <8 x i32> %broadcast.splat, %5
+  %8 = mul nsw <8 x i32> %broadcast.splat38, %6
+  %9 = add nuw nsw i64 %index, %2
+  %10 = getelementptr inbounds i32, ptr %out, i64 %9
+  %wide.load39 = load <8 x i32>, ptr %10, align 4, !tbaa !10
+  %11 = getelementptr inbounds i32, ptr %10, i64 8
+  %wide.load40 = load <8 x i32>, ptr %11, align 4, !tbaa !10
+  %12 = add nsw <8 x i32> %7, %wide.load39
+  %13 = add nsw <8 x i32> %8, %wide.load40
+  store <8 x i32> %12, ptr %10, align 4, !tbaa !10
+  store <8 x i32> %13, ptr %11, align 4, !tbaa !10
+  %index.next = add nuw i64 %index, 16
+  %14 = icmp eq i64 %index.next, %n.vec
+  br i1 %14, label %middle.block, label %vector.body, !llvm.loop !12
+
+middle.block:                                     ; preds = %vector.body
+  br i1 %cmp.n, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.preheader
+
+for.body4.us.preheader:                           ; preds = %for.cond1.preheader.us, %middle.block
+  %indvars.iv.ph = phi i64 [ 0, %for.cond1.preheader.us ], [ %n.vec, %middle.block ]
+  br label %for.body4.us
+
+for.body4.us:                                     ; preds = %for.body4.us.preheader, %for.body4.us
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4.us ], [ %indvars.iv.ph, %for.body4.us.preheader ]
+  %arrayidx6.us = getelementptr inbounds i16, ptr %y, i64 %indvars.iv
+  %15 = load i16, ptr %arrayidx6.us, align 2, !tbaa !6
+  %conv7.us = sext i16 %15 to i32
+  %mul.us = mul nsw i32 %conv7.us, %conv.us
+  %16 = add nuw nsw i64 %indvars.iv, %2
+  %arrayidx10.us = getelementptr inbounds i32, ptr %out, i64 %16
+  %17 = load i32, ptr %arrayidx10.us, align 4, !tbaa !10
+  %add11.us = add nsw i32 %mul.us, %17
+  store i32 %add11.us, ptr %arrayidx10.us, align 4, !tbaa !10
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count34
+  br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us, !llvm.loop !16
+
+for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us, %middle.block
+  %indvars.iv.next31 = add nuw nsw i64 %indvars.iv30, 1
+  %exitcond35.not = icmp eq i64 %indvars.iv.next31, %wide.trip.count34
+  br i1 %exitcond35.not, label %for.cond.cleanup, label %for.cond1.preheader.us, !llvm.loop !17
+
+for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
+  ret void
+}
+
+attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-n1" "target-features"="+aes,+crc,+dotprod,+fp-armv8,+fullfp16,+lse,+neon,+outline-atomics,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+v8.1a,+v8.2a,+v8a,-fmv" }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!5}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"PIE Level", i32 2}
+!3 = !{i32 7, !"uwtable", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 1}
+!5 = !{!"clang version 17.0.0"}
+!6 = !{!7, !7, i64 0}
+!7 = !{!"short", !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C/C++ TBAA"}
+!10 = !{!11, !11, i64 0}
+!11 = !{!"int", !8, i64 0}
+!12 = distinct !{!12, !13, !14, !15}
+!13 = !{!"llvm.loop.mustprogress"}
+!14 = !{!"llvm.loop.isvectorized", i32 1}
+!15 = !{!"llvm.loop.unroll.runtime.disable"}
+!16 = distinct !{!16, !13, !15, !14}
+!17 = distinct !{!17, !13}