diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index c04cbc80bc5b6..230ad821a8a1e 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4894,15 +4894,14 @@ void AArch64TTIImpl::getUnrollingPreferences(
   // Disable partial & runtime unrolling on -Os.
   UP.PartialOptSizeThreshold = 0;
 
+  // No need to unroll auto-vectorized loops
+  if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
+    return;
+
   // Scan the loop: don't unroll loops with calls as this could prevent
-  // inlining. Don't unroll vector loops either, as they don't benefit much from
-  // unrolling.
+  // inlining.
   for (auto *BB : L->getBlocks()) {
     for (auto &I : *BB) {
-      // Don't unroll vectorised loop.
-      if (I.getType()->isVectorTy())
-        return;
-
       if (isa<CallBase>(I)) {
         if (isa<CallInst>(I) || isa<InvokeInst>(I))
           if (const Function *F = cast<CallBase>(I).getCalledFunction())
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
new file mode 100644
index 0000000000000..8baded897fd7d
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll
@@ -0,0 +1,496 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
+; RUN: opt -p loop-unroll -mtriple=aarch64-unknown-linux -mcpu=cortex-a55 -S %s | FileCheck %s -check-prefix=CORTEXA55
+
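+; @reverse operates on <4 x float> values but carries no
+; "llvm.loop.isvectorized" metadata, so it is still unrolled: the checks
+; below expect an unroll factor of 8 on apple-m1 and 4 on cortex-a55.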
+define void @reverse(ptr %dst, ptr %src, i64 %len) {
+; APPLE-LABEL: define void @reverse(
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
+; APPLE-NEXT:  [[ENTRY:.*]]:
+; APPLE-NEXT:    [[TMP0:%.*]] = add i64 [[LEN]], -1
+; APPLE-NEXT:    [[XTRAITER:%.*]] = and i64 [[LEN]], 7
+; APPLE-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
+; APPLE-NEXT:    br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; APPLE:       [[ENTRY_NEW]]:
+; APPLE-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
+; APPLE-NEXT:    br label %[[FOR_BODY:.*]]
+; APPLE:       [[FOR_BODY]]:
+; APPLE-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
+; APPLE-NEXT:    [[TMP2:%.*]] = sub nsw i64 [[LEN]], [[IV]]
+; APPLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP2]]
+; APPLE-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
+; APPLE-NEXT:    store <4 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 16
+; APPLE-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; APPLE-NEXT:    [[TMP4:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT]]
+; APPLE-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP4]]
+; APPLE-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT]]
+; APPLE-NEXT:    store <4 x float> [[TMP5]], ptr [[ARRAYIDX2_1]], align 16
+; APPLE-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; APPLE-NEXT:    [[TMP6:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_1]]
+; APPLE-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP6]]
+; APPLE-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_1]]
+; APPLE-NEXT:    store <4 x float> [[TMP7]], ptr [[ARRAYIDX2_2]], align 16
+; APPLE-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; APPLE-NEXT:    [[TMP8:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_2]]
+; APPLE-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP8]]
+; APPLE-NEXT:    [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_2]]
+; APPLE-NEXT:    store <4 x float> [[TMP9]], ptr [[ARRAYIDX2_3]], align 16
+; APPLE-NEXT:    [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV]], 4
+; APPLE-NEXT:    [[TMP10:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_3]]
+; APPLE-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP10]]
+; APPLE-NEXT:    [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_3]]
+; APPLE-NEXT:    store <4 x float> [[TMP11]], ptr [[ARRAYIDX2_4]], align 16
+; APPLE-NEXT:    [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV]], 5
+; APPLE-NEXT:    [[TMP12:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_4]]
+; APPLE-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP12]]
+; APPLE-NEXT:    [[TMP13:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_4]]
+; APPLE-NEXT:    store <4 x float> [[TMP13]], ptr [[ARRAYIDX2_5]], align 16
+; APPLE-NEXT:    [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV]], 6
+; APPLE-NEXT:    [[TMP14:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_5]]
+; APPLE-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP14]]
+; APPLE-NEXT:    [[TMP15:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_5]]
+; APPLE-NEXT:    store <4 x float> [[TMP15]], ptr [[ARRAYIDX2_6]], align 16
+; APPLE-NEXT:    [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV]], 7
+; APPLE-NEXT:    [[TMP16:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_6]]
+; APPLE-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP16]]
+; APPLE-NEXT:    [[TMP17:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_6]]
+; APPLE-NEXT:    store <4 x float> [[TMP17]], ptr [[ARRAYIDX2_7]], align 16
+; APPLE-NEXT:    [[IV_NEXT_7]] = add nuw nsw i64 [[IV]], 8
+; APPLE-NEXT:    [[NITER_NEXT_7]] = add i64 [[NITER]], 8
+; APPLE-NEXT:    [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
+; APPLE-NEXT:    br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; APPLE:       [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; APPLE-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[FOR_BODY]] ]
+; APPLE-NEXT:    br label %[[EXIT_UNR_LCSSA]]
+; APPLE:       [[EXIT_UNR_LCSSA]]:
+; APPLE-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; APPLE-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; APPLE-NEXT:    br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
+; APPLE:       [[FOR_BODY_EPIL_PREHEADER]]:
+; APPLE-NEXT:    br label %[[FOR_BODY_EPIL:.*]]
+; APPLE:       [[FOR_BODY_EPIL]]:
+; APPLE-NEXT:    [[IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
+; APPLE-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
+; APPLE-NEXT:    [[TMP18:%.*]] = sub nsw i64 [[LEN]], [[IV_EPIL]]
+; APPLE-NEXT:    [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP18]]
+; APPLE-NEXT:    [[TMP19:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
+; APPLE-NEXT:    [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_EPIL]]
+; APPLE-NEXT:    store <4 x float> [[TMP19]], ptr [[ARRAYIDX2_EPIL]], align 16
+; APPLE-NEXT:    [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
+; APPLE-NEXT:    [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[LEN]]
+; APPLE-NEXT:    [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
+; APPLE-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
+; APPLE-NEXT:    br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; APPLE:       [[EXIT_EPILOG_LCSSA]]:
+; APPLE-NEXT:    br label %[[EXIT]]
+; APPLE:       [[EXIT]]:
+; APPLE-NEXT:    ret void
+;
+; CORTEXA55-LABEL: define void @reverse(
+; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
+; CORTEXA55-NEXT:  [[ENTRY:.*]]:
+; CORTEXA55-NEXT:    [[TMP0:%.*]] = add i64 [[LEN]], -1
+; CORTEXA55-NEXT:    [[XTRAITER:%.*]] = and i64 [[LEN]], 3
+; CORTEXA55-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 3
+; CORTEXA55-NEXT:    br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; CORTEXA55:       [[ENTRY_NEW]]:
+; CORTEXA55-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
+; CORTEXA55-NEXT:    br label %[[FOR_BODY:.*]]
+; CORTEXA55:       [[FOR_BODY]]:
+; CORTEXA55-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_3:%.*]], %[[FOR_BODY]] ]
+; CORTEXA55-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[FOR_BODY]] ]
+; CORTEXA55-NEXT:    [[TMP2:%.*]] = sub nsw i64 [[LEN]], [[IV]]
+; CORTEXA55-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP2]]
+; CORTEXA55-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+; CORTEXA55-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
+; CORTEXA55-NEXT:    store <4 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 16
+; CORTEXA55-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CORTEXA55-NEXT:    [[TMP4:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT]]
+; CORTEXA55-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP4]]
+; CORTEXA55-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
+; CORTEXA55-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT]]
+; CORTEXA55-NEXT:    store <4 x float> [[TMP5]], ptr [[ARRAYIDX2_1]], align 16
+; CORTEXA55-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; CORTEXA55-NEXT:    [[TMP6:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_1]]
+; CORTEXA55-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP6]]
+; CORTEXA55-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
+; CORTEXA55-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_1]]
+; CORTEXA55-NEXT:    store <4 x float> [[TMP7]], ptr [[ARRAYIDX2_2]], align 16
+; CORTEXA55-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; CORTEXA55-NEXT:    [[TMP8:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_2]]
+; CORTEXA55-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP8]]
+; CORTEXA55-NEXT:    [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
+; CORTEXA55-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_2]]
+; CORTEXA55-NEXT:    store <4 x float> [[TMP9]], ptr [[ARRAYIDX2_3]], align 16
+; CORTEXA55-NEXT:    [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
+; CORTEXA55-NEXT:    [[NITER_NEXT_3]] = add i64 [[NITER]], 4
+; CORTEXA55-NEXT:    [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]]
+; CORTEXA55-NEXT:    br i1 [[NITER_NCMP_3]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CORTEXA55:       [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; CORTEXA55-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_3]], %[[FOR_BODY]] ]
+; CORTEXA55-NEXT:    br label %[[EXIT_UNR_LCSSA]]
+; CORTEXA55:       [[EXIT_UNR_LCSSA]]:
+; CORTEXA55-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CORTEXA55-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CORTEXA55-NEXT:    br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
+; CORTEXA55:       [[FOR_BODY_EPIL_PREHEADER]]:
+; CORTEXA55-NEXT:    br label %[[FOR_BODY_EPIL:.*]]
+; CORTEXA55:       [[FOR_BODY_EPIL]]:
+; CORTEXA55-NEXT:    [[TMP10:%.*]] = sub nsw i64 [[LEN]], [[IV_UNR]]
+; CORTEXA55-NEXT:    [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP10]]
+; CORTEXA55-NEXT:    [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
+; CORTEXA55-NEXT:    [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_UNR]]
+; CORTEXA55-NEXT:    store <4 x float> [[TMP11]], ptr [[ARRAYIDX2_EPIL]], align 16
+; CORTEXA55-NEXT:    [[IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[IV_UNR]], 1
+; CORTEXA55-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp ne i64 1, [[XTRAITER]]
+; CORTEXA55-NEXT:    br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL_1:.*]], label %[[EXIT_EPILOG_LCSSA:.*]]
+; CORTEXA55:       [[FOR_BODY_EPIL_1]]:
+; CORTEXA55-NEXT:    [[TMP12:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL]]
+; CORTEXA55-NEXT:    [[ARRAYIDX_EPIL_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP12]]
+; CORTEXA55-NEXT:    [[TMP13:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_1]], align 16
+; CORTEXA55-NEXT:    [[ARRAYIDX2_EPIL_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL]]
+; CORTEXA55-NEXT:    store <4 x float> [[TMP13]], ptr [[ARRAYIDX2_EPIL_1]], align 16
+; CORTEXA55-NEXT:    [[IV_NEXT_EPIL_1:%.*]] = add nuw nsw i64 [[IV_UNR]], 2
+; CORTEXA55-NEXT:    [[EPIL_ITER_CMP_1:%.*]] = icmp ne i64 2, [[XTRAITER]]
+; CORTEXA55-NEXT:    br i1 [[EPIL_ITER_CMP_1]], label %[[FOR_BODY_EPIL_2:.*]], label %[[EXIT_EPILOG_LCSSA]]
+; CORTEXA55:       [[FOR_BODY_EPIL_2]]:
+; CORTEXA55-NEXT:    [[TMP14:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_1]]
+; CORTEXA55-NEXT:    [[ARRAYIDX_EPIL_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP14]]
+; CORTEXA55-NEXT:    [[TMP15:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_2]], align 16
+; CORTEXA55-NEXT:    [[ARRAYIDX2_EPIL_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_1]]
+; CORTEXA55-NEXT:    store <4 x float> [[TMP15]], ptr [[ARRAYIDX2_EPIL_2]], align 16
+; CORTEXA55-NEXT:    br label %[[EXIT_EPILOG_LCSSA]]
+; CORTEXA55:       [[EXIT_EPILOG_LCSSA]]:
+; CORTEXA55-NEXT:    br label %[[EXIT]]
+; CORTEXA55:       [[EXIT]]:
+; CORTEXA55-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %1 = sub nsw i64 %len, %iv
+  %arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %1
+  %2 = load <4 x float>, ptr %arrayidx, align 16
+  %arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %iv
+  store <4 x float> %2, ptr %arrayidx2, align 16
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %len
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:                                             ; preds = %for.body
+  ret void
+}
+
+
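+; A vector loop with a constant trip count of 2 (8 floats at VF 4); both
+; targets fully unroll it, leaving straight-line code with no back-edge.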
+define void @saxpy_tripcount8_full_unroll(ptr %dst, ptr %src, float %a) {
+; APPLE-LABEL: define void @saxpy_tripcount8_full_unroll(
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; APPLE-NEXT:  [[ENTRY:.*:]]
+; APPLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; APPLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; APPLE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; APPLE:       [[VECTOR_BODY]]:
+; APPLE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[SRC]], align 4
+; APPLE-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[DST]], align 4
+; APPLE-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; APPLE-NEXT:    store <4 x float> [[TMP0]], ptr [[DST]], align 4
+; APPLE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 4
+; APPLE-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; APPLE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 4
+; APPLE-NEXT:    [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; APPLE-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
+; APPLE-NEXT:    store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
+; APPLE-NEXT:    ret void
+;
+; CORTEXA55-LABEL: define void @saxpy_tripcount8_full_unroll(
+; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; CORTEXA55-NEXT:  [[ENTRY:.*:]]
+; CORTEXA55-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; CORTEXA55-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CORTEXA55-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CORTEXA55:       [[VECTOR_BODY]]:
+; CORTEXA55-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[SRC]], align 4
+; CORTEXA55-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[DST]], align 4
+; CORTEXA55-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP0]], ptr [[DST]], align 4
+; CORTEXA55-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 4
+; CORTEXA55-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CORTEXA55-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 4
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CORTEXA55-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
+; CORTEXA55-NEXT:    ret void
+;
+entry:
+  %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
+  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds nuw float, ptr %src, i64 %index
+  %wide.load = load <4 x float>, ptr %0, align 4
+  %1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
+  %wide.load12 = load <4 x float>, ptr %1, align 4
+  %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
+  store <4 x float> %2, ptr %1, align 4
+  %index.next = add nuw i64 %index, 4
+  %3 = icmp eq i64 %index.next, 8
+  br i1 %3, label %exit, label %vector.body
+
+exit:                                             ; preds = %vector.body
+  ret void
+}
+
+
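+; A manually written vector loop (trip count 256) without
+; "llvm.loop.isvectorized": apple-m1 leaves it alone, while cortex-a55
+; still unrolls it 16 times.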
+define void @saxpy_tripcount1K_av0(ptr %dst, ptr %src, float %a) {
+; APPLE-LABEL: define void @saxpy_tripcount1K_av0(
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; APPLE-NEXT:  [[ENTRY:.*]]:
+; APPLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; APPLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; APPLE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; APPLE:       [[VECTOR_BODY]]:
+; APPLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; APPLE-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
+; APPLE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; APPLE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; APPLE-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; APPLE-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; APPLE-NEXT:    store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
+; APPLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; APPLE-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; APPLE-NEXT:    br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]]
+; APPLE:       [[EXIT]]:
+; APPLE-NEXT:    ret void
+;
+; CORTEXA55-LABEL: define void @saxpy_tripcount1K_av0(
+; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; CORTEXA55-NEXT:  [[ENTRY:.*]]:
+; CORTEXA55-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; CORTEXA55-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CORTEXA55-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CORTEXA55:       [[VECTOR_BODY]]:
+; CORTEXA55-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT_15:%.*]], %[[VECTOR_BODY]] ]
+; CORTEXA55-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; CORTEXA55-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CORTEXA55-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT:%.*]] = add nuw nsw i64 [[INDEX]], 4
+; CORTEXA55-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; CORTEXA55-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CORTEXA55-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP5]], ptr [[TMP4]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_1:%.*]] = add nuw nsw i64 [[INDEX]], 8
+; CORTEXA55-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_1]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
+; CORTEXA55-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_1]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_2:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
+; CORTEXA55-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_2]], <4 x float> [[WIDE_LOAD12_2]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP8]], ptr [[TMP7]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_2:%.*]] = add nuw nsw i64 [[INDEX]], 12
+; CORTEXA55-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_2]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x float>, ptr [[TMP9]], align 4
+; CORTEXA55-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_2]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_3:%.*]] = load <4 x float>, ptr [[TMP10]], align 4
+; CORTEXA55-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_3]], <4 x float> [[WIDE_LOAD12_3]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP11]], ptr [[TMP10]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_3:%.*]] = add nuw nsw i64 [[INDEX]], 16
+; CORTEXA55-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_3]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x float>, ptr [[TMP12]], align 4
+; CORTEXA55-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_3]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_4:%.*]] = load <4 x float>, ptr [[TMP13]], align 4
+; CORTEXA55-NEXT:    [[TMP14:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_4]], <4 x float> [[WIDE_LOAD12_4]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP14]], ptr [[TMP13]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_4:%.*]] = add nuw nsw i64 [[INDEX]], 20
+; CORTEXA55-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_4]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x float>, ptr [[TMP15]], align 4
+; CORTEXA55-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_4]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_5:%.*]] = load <4 x float>, ptr [[TMP16]], align 4
+; CORTEXA55-NEXT:    [[TMP17:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_5]], <4 x float> [[WIDE_LOAD12_5]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP17]], ptr [[TMP16]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_5:%.*]] = add nuw nsw i64 [[INDEX]], 24
+; CORTEXA55-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_5]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x float>, ptr [[TMP18]], align 4
+; CORTEXA55-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_5]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_6:%.*]] = load <4 x float>, ptr [[TMP19]], align 4
+; CORTEXA55-NEXT:    [[TMP20:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_6]], <4 x float> [[WIDE_LOAD12_6]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP20]], ptr [[TMP19]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_6:%.*]] = add nuw nsw i64 [[INDEX]], 28
+; CORTEXA55-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_6]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x float>, ptr [[TMP21]], align 4
+; CORTEXA55-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_6]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_7:%.*]] = load <4 x float>, ptr [[TMP22]], align 4
+; CORTEXA55-NEXT:    [[TMP23:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_7]], <4 x float> [[WIDE_LOAD12_7]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP23]], ptr [[TMP22]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_7:%.*]] = add nuw nsw i64 [[INDEX]], 32
+; CORTEXA55-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_7]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x float>, ptr [[TMP24]], align 4
+; CORTEXA55-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_7]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_8:%.*]] = load <4 x float>, ptr [[TMP25]], align 4
+; CORTEXA55-NEXT:    [[TMP26:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_8]], <4 x float> [[WIDE_LOAD12_8]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP26]], ptr [[TMP25]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_8:%.*]] = add nuw nsw i64 [[INDEX]], 36
+; CORTEXA55-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_8]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x float>, ptr [[TMP27]], align 4
+; CORTEXA55-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_8]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_9:%.*]] = load <4 x float>, ptr [[TMP28]], align 4
+; CORTEXA55-NEXT:    [[TMP29:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_9]], <4 x float> [[WIDE_LOAD12_9]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP29]], ptr [[TMP28]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_9:%.*]] = add nuw nsw i64 [[INDEX]], 40
+; CORTEXA55-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_9]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x float>, ptr [[TMP30]], align 4
+; CORTEXA55-NEXT:    [[TMP31:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_9]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_10:%.*]] = load <4 x float>, ptr [[TMP31]], align 4
+; CORTEXA55-NEXT:    [[TMP32:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_10]], <4 x float> [[WIDE_LOAD12_10]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP32]], ptr [[TMP31]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_10:%.*]] = add nuw nsw i64 [[INDEX]], 44
+; CORTEXA55-NEXT:    [[TMP33:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_10]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x float>, ptr [[TMP33]], align 4
+; CORTEXA55-NEXT:    [[TMP34:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_10]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_11:%.*]] = load <4 x float>, ptr [[TMP34]], align 4
+; CORTEXA55-NEXT:    [[TMP35:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_11]], <4 x float> [[WIDE_LOAD12_11]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP35]], ptr [[TMP34]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_11:%.*]] = add nuw nsw i64 [[INDEX]], 48
+; CORTEXA55-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_11]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x float>, ptr [[TMP36]], align 4
+; CORTEXA55-NEXT:    [[TMP37:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_11]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_12:%.*]] = load <4 x float>, ptr [[TMP37]], align 4
+; CORTEXA55-NEXT:    [[TMP38:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_12]], <4 x float> [[WIDE_LOAD12_12]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP38]], ptr [[TMP37]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_12:%.*]] = add nuw nsw i64 [[INDEX]], 52
+; CORTEXA55-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_12]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x float>, ptr [[TMP39]], align 4
+; CORTEXA55-NEXT:    [[TMP40:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_12]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_13:%.*]] = load <4 x float>, ptr [[TMP40]], align 4
+; CORTEXA55-NEXT:    [[TMP41:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_13]], <4 x float> [[WIDE_LOAD12_13]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP41]], ptr [[TMP40]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_13:%.*]] = add nuw nsw i64 [[INDEX]], 56
+; CORTEXA55-NEXT:    [[TMP42:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_13]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x float>, ptr [[TMP42]], align 4
+; CORTEXA55-NEXT:    [[TMP43:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_13]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_14:%.*]] = load <4 x float>, ptr [[TMP43]], align 4
+; CORTEXA55-NEXT:    [[TMP44:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_14]], <4 x float> [[WIDE_LOAD12_14]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP44]], ptr [[TMP43]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_14:%.*]] = add nuw nsw i64 [[INDEX]], 60
+; CORTEXA55-NEXT:    [[TMP45:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_14]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x float>, ptr [[TMP45]], align 4
+; CORTEXA55-NEXT:    [[TMP46:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_14]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12_15:%.*]] = load <4 x float>, ptr [[TMP46]], align 4
+; CORTEXA55-NEXT:    [[TMP47:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_15]], <4 x float> [[WIDE_LOAD12_15]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP47]], ptr [[TMP46]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT_15]] = add nuw nsw i64 [[INDEX]], 64
+; CORTEXA55-NEXT:    [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT_15]], 1024
+; CORTEXA55-NEXT:    br i1 [[TMP48]], label %[[EXIT:.*]], label %[[VECTOR_BODY]]
+; CORTEXA55:       [[EXIT]]:
+; CORTEXA55-NEXT:    ret void
+;
+entry:
+  %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
+  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds nuw float, ptr %src, i64 %index
+  %wide.load = load <4 x float>, ptr %0, align 4
+  %1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
+  %wide.load12 = load <4 x float>, ptr %1, align 4
+  %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
+  store <4 x float> %2, ptr %1, align 4
+  %index.next = add nuw i64 %index, 4
+  %3 = icmp eq i64 %index.next, 1024
+  br i1 %3, label %exit, label %vector.body
+
+exit:                                             ; preds = %vector.body
+  ret void
+}
+
+
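+; The same loop as @saxpy_tripcount1K_av0, but annotated with
+; "llvm.loop.isvectorized" metadata; neither target unrolls it.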
+define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
+; APPLE-LABEL: define void @saxpy_tripcount1K_av1(
+; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; APPLE-NEXT:  [[ENTRY:.*]]:
+; APPLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; APPLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; APPLE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; APPLE:       [[VECTOR_BODY]]:
+; APPLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; APPLE-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
+; APPLE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; APPLE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; APPLE-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; APPLE-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; APPLE-NEXT:    store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
+; APPLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; APPLE-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; APPLE-NEXT:    br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; APPLE:       [[EXIT]]:
+; APPLE-NEXT:    ret void
+;
+; CORTEXA55-LABEL: define void @saxpy_tripcount1K_av1(
+; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
+; CORTEXA55-NEXT:  [[ENTRY:.*]]:
+; CORTEXA55-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+; CORTEXA55-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CORTEXA55-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CORTEXA55:       [[VECTOR_BODY]]:
+; CORTEXA55-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CORTEXA55-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+; CORTEXA55-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; CORTEXA55-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CORTEXA55-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
+; CORTEXA55-NEXT:    store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
+; CORTEXA55-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CORTEXA55-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CORTEXA55-NEXT:    br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CORTEXA55:       [[EXIT]]:
+; CORTEXA55-NEXT:    ret void
+;
+entry:
+  %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
+  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds nuw float, ptr %src, i64 %index
+  %wide.load = load <4 x float>, ptr %0, align 4
+  %1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
+  %wide.load12 = load <4 x float>, ptr %1, align 4
+  %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
+  store <4 x float> %2, ptr %1, align 4
+  %index.next = add nuw i64 %index, 4
+  %3 = icmp eq i64 %index.next, 1024
+  br i1 %3, label %exit, label %vector.body, !llvm.loop !0
+
+exit:                                             ; preds = %vector.body
+  ret void
+}
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.isvectorized", i32 1}
+
+;.
+; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
+; APPLE: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
+; APPLE: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
+;.
+; CORTEXA55: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CORTEXA55: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+;.