diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index c6233461be655..5374339570132 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4897,16 +4897,18 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, // Limit to loops with trip counts that are cheap to expand. UP.SCEVExpansionBudget = 1; - // Try to unroll small, single block loops, if they have load/store - // dependencies, to expose more parallel memory access streams. + // Try to unroll small loops, of few-blocks with low budget, if they have + // load/store dependencies, to expose more parallel memory access streams, + // or if they do little work inside a block (i.e. load -> X -> store pattern). BasicBlock *Header = L->getHeader(); if (Header == L->getLoopLatch()) { // Estimate the size of the loop. unsigned Size; - if (!isLoopSizeWithinBudget(L, TTI, 8, &Size)) + unsigned Width = 10; + if (!isLoopSizeWithinBudget(L, TTI, Width, &Size)) return; - SmallPtrSet<Value *, 8> LoadedValues; + SmallPtrSet<Value *, 8> LoadedValuesPlus; SmallVector<StoreInst *> Stores; for (auto *BB : L->blocks()) { for (auto &I : *BB) { @@ -4916,9 +4918,13 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, const SCEV *PtrSCEV = SE.getSCEV(Ptr); if (SE.isLoopInvariant(PtrSCEV, L)) continue; - if (isa<LoadInst>(&I)) - LoadedValues.insert(&I); - else + if (isa<LoadInst>(&I)) { + LoadedValuesPlus.insert(&I); + // Include in-loop 1st users of loaded values. 
+ for (auto *U : I.users()) + if (L->contains(cast<Instruction>(U))) + LoadedValuesPlus.insert(U); + } else Stores.push_back(cast<StoreInst>(&I)); } } @@ -4941,8 +4947,8 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, UC++; } - if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) { - return LoadedValues.contains(SI->getOperand(0)); + if (BestUC == 1 || none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) { + return LoadedValuesPlus.contains(SI->getOperand(0)); })) return; diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll index 0b78beea54aa9..a62e45d27a5c5 100644 --- a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll +++ b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll @@ -165,6 +165,204 @@ exit: ret void } +define void @load_op_store_loop(ptr %src, ptr %dst, i64 %N, i64 %scale, float %k) { +; APPLE-LABEL: define void @load_op_store_loop( +; APPLE-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]], float [[K:%.*]]) #[[ATTR0]] { +; APPLE-NEXT: [[ENTRY:.*]]: +; APPLE-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1 +; APPLE-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1 +; APPLE-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]] +; APPLE: [[ENTRY_NEW]]: +; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]] +; APPLE-NEXT: br label %[[LOOP:.*]] +; APPLE: [[LOOP]]: +; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[SCALED_IV:%.*]] = mul nuw nsw i64 [[IV]], [[SCALE]] +; APPLE-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV]] +; APPLE-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; APPLE-NEXT: [[O:%.*]] = fadd float [[L]], [[K]] +; APPLE-NEXT: [[GEP_DST:%.*]] = 
getelementptr inbounds float, ptr [[DST]], i64 [[IV]] +; APPLE-NEXT: store float [[O]], ptr [[GEP_DST]], align 4 +; APPLE-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; APPLE-NEXT: [[SCALED_IV_1:%.*]] = mul nuw nsw i64 [[IV_NEXT]], [[SCALE]] +; APPLE-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_1]] +; APPLE-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4 +; APPLE-NEXT: [[O_1:%.*]] = fadd float [[L_1]], [[K]] +; APPLE-NEXT: [[GEP_DST_1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT]] +; APPLE-NEXT: store float [[O_1]], ptr [[GEP_DST_1]], align 4 +; APPLE-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2 +; APPLE-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2 +; APPLE-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]] +; APPLE-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]] +; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT]]: +; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ] +; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA]] +; APPLE: [[EXIT_UNR_LCSSA]]: +; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]] +; APPLE: [[LOOP_EPIL_PREHEADER]]: +; APPLE-NEXT: br label %[[LOOP_EPIL:.*]] +; APPLE: [[LOOP_EPIL]]: +; APPLE-NEXT: [[SCALED_IV_EPIL:%.*]] = mul nuw nsw i64 [[IV_UNR]], [[SCALE]] +; APPLE-NEXT: [[GEP_SRC_EPIL:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL]] +; APPLE-NEXT: [[L_EPIL:%.*]] = load float, ptr [[GEP_SRC_EPIL]], align 4 +; APPLE-NEXT: [[O_EPIL:%.*]] = fadd float [[L_EPIL]], [[K]] +; APPLE-NEXT: [[GEP_DST_EPIL:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_UNR]] +; APPLE-NEXT: store float [[O_EPIL]], ptr [[GEP_DST_EPIL]], align 4 +; APPLE-NEXT: br label %[[EXIT]] +; APPLE: [[EXIT]]: +; 
APPLE-NEXT: ret void +; +; OTHER-LABEL: define void @load_op_store_loop( +; OTHER-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]], float [[K:%.*]]) #[[ATTR0]] { +; OTHER-NEXT: [[ENTRY:.*]]: +; OTHER-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; OTHER-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1 +; OTHER-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1 +; OTHER-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]] +; OTHER: [[ENTRY_NEW]]: +; OTHER-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]] +; OTHER-NEXT: br label %[[LOOP:.*]] +; OTHER: [[LOOP]]: +; OTHER-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[SCALED_IV:%.*]] = mul nuw nsw i64 [[IV]], [[SCALE]] +; OTHER-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV]] +; OTHER-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; OTHER-NEXT: [[O:%.*]] = fadd float [[L]], [[K]] +; OTHER-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV]] +; OTHER-NEXT: store float [[O]], ptr [[GEP_DST]], align 4 +; OTHER-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; OTHER-NEXT: [[SCALED_IV_1:%.*]] = mul nuw nsw i64 [[IV_NEXT]], [[SCALE]] +; OTHER-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_1]] +; OTHER-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4 +; OTHER-NEXT: [[O_1:%.*]] = fadd float [[L_1]], [[K]] +; OTHER-NEXT: [[GEP_DST_1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT]] +; OTHER-NEXT: store float [[O_1]], ptr [[GEP_DST_1]], align 4 +; OTHER-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2 +; OTHER-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2 +; OTHER-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]] +; OTHER-NEXT: br i1 [[NITER_NCMP_1]], label 
%[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]] +; OTHER: [[EXIT_UNR_LCSSA_LOOPEXIT]]: +; OTHER-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ] +; OTHER-NEXT: br label %[[EXIT_UNR_LCSSA]] +; OTHER: [[EXIT_UNR_LCSSA]]: +; OTHER-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; OTHER-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; OTHER-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]] +; OTHER: [[LOOP_EPIL_PREHEADER]]: +; OTHER-NEXT: br label %[[LOOP_EPIL:.*]] +; OTHER: [[LOOP_EPIL]]: +; OTHER-NEXT: [[SCALED_IV_EPIL:%.*]] = mul nuw nsw i64 [[IV_UNR]], [[SCALE]] +; OTHER-NEXT: [[GEP_SRC_EPIL:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL]] +; OTHER-NEXT: [[L_EPIL:%.*]] = load float, ptr [[GEP_SRC_EPIL]], align 4 +; OTHER-NEXT: [[O_EPIL:%.*]] = fadd float [[L_EPIL]], [[K]] +; OTHER-NEXT: [[GEP_DST_EPIL:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_UNR]] +; OTHER-NEXT: store float [[O_EPIL]], ptr [[GEP_DST_EPIL]], align 4 +; OTHER-NEXT: br label %[[EXIT]] +; OTHER: [[EXIT]]: +; OTHER-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %scaled.iv = mul nuw nsw i64 %iv, %scale + %gep.src = getelementptr inbounds float, ptr %src, i64 %scaled.iv + %l = load float, ptr %gep.src, align 4 + %o = fadd float %l, %k + %gep.dst = getelementptr inbounds float, ptr %dst, i64 %iv + store float %o, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %N + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @load_op_store_loop_multiblock(ptr %src, ptr %dst, i64 %N, i64 %scale, float %k) { +; APPLE-LABEL: define void @load_op_store_loop_multiblock( +; APPLE-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]], float [[K:%.*]]) #[[ATTR0]] { +; APPLE-NEXT: [[ENTRY:.*]]: +; APPLE-NEXT: br label %[[LOOP:.*]] +; APPLE: 
[[LOOP]]: +; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOPCONT:.*]] ] +; APPLE-NEXT: [[SCALED_IV:%.*]] = mul nuw nsw i64 [[IV]], [[SCALE]] +; APPLE-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV]] +; APPLE-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; APPLE-NEXT: [[AND:%.*]] = and i64 [[IV]], 1 +; APPLE-NEXT: [[ODD:%.*]] = icmp eq i64 [[AND]], 1 +; APPLE-NEXT: br i1 [[ODD]], label %[[LOOPODD:.*]], label %[[LOOPCONT]] +; APPLE: [[LOOPCONT]]: +; APPLE-NEXT: [[D:%.*]] = phi float [ [[L2:%.*]], %[[LOOPODD]] ], [ [[L]], %[[LOOP]] ] +; APPLE-NEXT: [[O:%.*]] = fadd float [[D]], [[K]] +; APPLE-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV]] +; APPLE-NEXT: store float [[O]], ptr [[GEP_DST]], align 4 +; APPLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; APPLE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; APPLE-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; APPLE: [[LOOPODD]]: +; APPLE-NEXT: [[L2]] = fneg float [[L]] +; APPLE-NEXT: br label %[[LOOPCONT]] +; APPLE: [[EXIT]]: +; APPLE-NEXT: ret void +; +; OTHER-LABEL: define void @load_op_store_loop_multiblock( +; OTHER-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]], float [[K:%.*]]) #[[ATTR0]] { +; OTHER-NEXT: [[ENTRY:.*]]: +; OTHER-NEXT: br label %[[LOOP:.*]] +; OTHER: [[LOOP]]: +; OTHER-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOPCONT:.*]] ] +; OTHER-NEXT: [[SCALED_IV:%.*]] = mul nuw nsw i64 [[IV]], [[SCALE]] +; OTHER-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV]] +; OTHER-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; OTHER-NEXT: [[AND:%.*]] = and i64 [[IV]], 1 +; OTHER-NEXT: [[ODD:%.*]] = icmp eq i64 [[AND]], 1 +; OTHER-NEXT: br i1 [[ODD]], label %[[LOOPODD:.*]], label %[[LOOPCONT]] +; OTHER: [[LOOPCONT]]: +; OTHER-NEXT: [[D:%.*]] = phi float [ [[L2:%.*]], %[[LOOPODD]] ], [ [[L]], %[[LOOP]] 
] +; OTHER-NEXT: [[O:%.*]] = fadd float [[D]], [[K]] +; OTHER-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV]] +; OTHER-NEXT: store float [[O]], ptr [[GEP_DST]], align 4 +; OTHER-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; OTHER-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; OTHER-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; OTHER: [[LOOPODD]]: +; OTHER-NEXT: [[L2]] = fneg float [[L]] +; OTHER-NEXT: br label %[[LOOPCONT]] +; OTHER: [[EXIT]]: +; OTHER-NEXT: ret void +; +entry: + br label %loop +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loopcont ] + %scaled.iv = mul nuw nsw i64 %iv, %scale + %gep.src = getelementptr inbounds float, ptr %src, i64 %scaled.iv + %l1 = load float, ptr %gep.src, align 4 + %and = and i64 %iv, 1 + %odd = icmp eq i64 %and, 1 + br i1 %odd, label %loopodd, label %loopcont +loopcont: + %d = phi float [ %l2, %loopodd ], [ %l1, %loop] + %o = fadd float %d, %k + %gep.dst = getelementptr inbounds float, ptr %dst, i64 %iv + store float %o, ptr %gep.dst, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %N + br i1 %ec, label %exit, label %loop +loopodd: + %l2 = fneg float %l1 + br label %loopcont +exit: + ret void +} + @A = external constant [9 x i8], align 1 @B = external constant [8 x i32], align 4 @C = external constant [8 x i32], align 4