diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index bc07c6740326f..e1d0a2d0c5dcd 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -52,6 +52,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" @@ -70,6 +71,12 @@ STATISTIC(NumChanged, "Number of insts reassociated"); STATISTIC(NumAnnihil, "Number of expr tree annihilated"); STATISTIC(NumFactor , "Number of multiplies factored"); +static cl::opt<bool> + UseCSELocalOpt(DEBUG_TYPE "-use-cse-local", + cl::desc("Only reorder expressions within a basic block " + "when exposing CSE opportunities"), + cl::init(true), cl::Hidden); + #ifndef NDEBUG /// Print out the expression identified in the Ops list. static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) { @@ -2407,8 +2414,67 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) { unsigned BestRank = 0; std::pair<unsigned, unsigned> BestPair; unsigned Idx = I->getOpcode() - Instruction::BinaryOpsBegin; - for (unsigned i = 0; i < Ops.size() - 1; ++i) - for (unsigned j = i + 1; j < Ops.size(); ++j) { + unsigned LimitIdx = 0; + // With the CSE-driven heuristic, we are about to slap two values at the + // beginning of the expression whereas they could live very late in the CFG. + // When using the CSE-local heuristic we avoid creating dependences from + // completely unrelated part of the CFG by limiting the expression + // reordering on the values that live in the first seen basic block. + // The main idea is that we want to avoid forming expressions that would + // become loop dependent. 
+ if (UseCSELocalOpt) { + const BasicBlock *FirstSeenBB = nullptr; + int StartIdx = Ops.size() - 1; + // Skip the first value of the expression since we need at least two + // values to materialize an expression. I.e., even if this value is + // anchored in a different basic block, the actual first sub expression + // will be anchored on the second value. + for (int i = StartIdx - 1; i != -1; --i) { + const Value *Val = Ops[i].Op; + const auto *CurrLeafInstr = dyn_cast<Instruction>(Val); + const BasicBlock *SeenBB = nullptr; + if (!CurrLeafInstr) { + // The value is free of any CFG dependencies. + // Do as if it lives in the entry block. + // + // We do this to make sure all the values falling on this path are + // seen through the same anchor point. The rationale is these values + // can be combined together to form a sub expression free of any CFG + // dependencies so we want them to stay together. + // We could be cleverer and postpone the anchor down to the first + // anchored value, but that's likely complicated to get right. + // E.g., we wouldn't want to do that if that means being stuck in a + // loop. + // + // For instance, we wouldn't want to change: + // res = arg1 op arg2 op arg3 op ... op loop_val1 op loop_val2 ... + // into + // res = loop_val1 op arg1 op arg2 op arg3 op ... op loop_val2 ... + // Because all the sub expressions with arg2..N would be stuck between + // two loop dependent values. + SeenBB = &I->getParent()->getParent()->getEntryBlock(); + } else { + SeenBB = CurrLeafInstr->getParent(); + } + + if (!FirstSeenBB) { + FirstSeenBB = SeenBB; + continue; + } + if (FirstSeenBB != SeenBB) { + // ith value is in a different basic block. + // Rewind the index once to point to the last value on the same basic + // block. 
+ LimitIdx = i + 1; + LLVM_DEBUG(dbgs() << "CSE reordering: Consider values between [" + << LimitIdx << ", " << StartIdx << "]\n"); + break; + } + } + } + for (unsigned i = Ops.size() - 1; i > LimitIdx; --i) { + // We must use int type to go below zero when LimitIdx is 0. + for (int j = i - 1; j >= (int)LimitIdx; --j) { unsigned Score = 0; Value *Op0 = Ops[i].Op; Value *Op1 = Ops[j].Op; @@ -2426,12 +2492,26 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) { } unsigned MaxRank = std::max(Ops[i].Rank, Ops[j].Rank); + + // By construction, the operands are sorted in reverse order of their + // topological order. + // So we tend to form (sub) expressions with values that are close to + // each other. + // + // Now to expose more CSE opportunities we want to expose the pair of + // operands that occur the most (as statically computed in + // BuildPairMap.) as the first sub-expression. + // + // If two pairs occur as many times, we pick the one with the + // lowest rank, meaning the one with both operands appearing first in + // the topological order. if (Score > Max || (Score == Max && MaxRank < BestRank)) { - BestPair = {i, j}; + BestPair = {j, i}; Max = Score; BestRank = MaxRank; } } + } if (Max > 1) { auto Op0 = Ops[BestPair.first]; auto Op1 = Ops[BestPair.second]; @@ -2441,6 +2521,8 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) { Ops.push_back(Op1); } } + LLVM_DEBUG(dbgs() << "RAOut after CSE reorder:\t"; PrintOps(I, Ops); + dbgs() << '\n'); // Now that we ordered and optimized the expressions, splat them back into // the expression tree, removing any unneeded nodes. 
RewriteExprTree(I, Ops); diff --git a/llvm/test/Transforms/Reassociate/defeat-licm.ll b/llvm/test/Transforms/Reassociate/defeat-licm.ll new file mode 100644 index 0000000000000..fadb20207716e --- /dev/null +++ b/llvm/test/Transforms/Reassociate/defeat-licm.ll @@ -0,0 +1,69 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; Check that the default heuristic uses the local cse constraints. +; RUN: opt -S -passes=reassociate %s -o - | FileCheck %s -check-prefix=LOCAL_CSE +; RUN: opt -S -passes=reassociate %s -reassociate-use-cse-local=true -o - | FileCheck %s -check-prefix=LOCAL_CSE +; RUN: opt -S -passes=reassociate %s -reassociate-use-cse-local=false -o - | FileCheck %s -check-prefix=CSE + +; In the local CSE mode, we check that we don't create loop dependent +; expressions in order to expose more CSE opportunities. +; This can be seen with %inv4 and %inv5 that should stay in the entry block. +; +; For the non-local CSE mode, we form CSE-able expressions regardless of where +; they would be materialized. In this case, %inv4 and %inv5 are pushed +; down the loop body in order to make loop_dependent, %inv2 appear as a +; sub-expression. +; +; Related to issue PR61458. 
+define void @reassoc_defeats_licm(i64 %inv1, i64 %inv2, i64 %inv3) { +; LOCAL_CSE-LABEL: define void @reassoc_defeats_licm +; LOCAL_CSE-SAME: (i64 [[INV1:%.*]], i64 [[INV2:%.*]], i64 [[INV3:%.*]]) { +; LOCAL_CSE-NEXT: bb: +; LOCAL_CSE-NEXT: [[INV4:%.*]] = add nuw nsw i64 [[INV2]], [[INV1]] +; LOCAL_CSE-NEXT: [[INV5:%.*]] = add nuw nsw i64 [[INV3]], [[INV2]] +; LOCAL_CSE-NEXT: br label [[BB214:%.*]] +; LOCAL_CSE: bb214: +; LOCAL_CSE-NEXT: [[IV1:%.*]] = phi i64 [ [[IV2:%.*]], [[BB214]] ], [ 0, [[BB:%.*]] ] +; LOCAL_CSE-NEXT: [[IV2]] = phi i64 [ [[IV2_PLUS_1:%.*]], [[BB214]] ], [ 1, [[BB]] ] +; LOCAL_CSE-NEXT: [[LOOP_DEPENDENT:%.*]] = shl nuw nsw i64 [[IV1]], 13 +; LOCAL_CSE-NEXT: [[LOOP_DEPENDENT2:%.*]] = add nsw i64 [[INV4]], [[LOOP_DEPENDENT]] +; LOCAL_CSE-NEXT: call void @keep_alive(i64 [[LOOP_DEPENDENT2]]) +; LOCAL_CSE-NEXT: [[LOOP_DEPENDENT3:%.*]] = add i64 [[INV5]], [[LOOP_DEPENDENT]] +; LOCAL_CSE-NEXT: call void @keep_alive(i64 [[LOOP_DEPENDENT3]]) +; LOCAL_CSE-NEXT: [[IV2_PLUS_1]] = add i64 [[IV2]], 1 +; LOCAL_CSE-NEXT: br label [[BB214]] +; +; CSE-LABEL: define void @reassoc_defeats_licm +; CSE-SAME: (i64 [[INV1:%.*]], i64 [[INV2:%.*]], i64 [[INV3:%.*]]) { +; CSE-NEXT: bb: +; CSE-NEXT: br label [[BB214:%.*]] +; CSE: bb214: +; CSE-NEXT: [[IV1:%.*]] = phi i64 [ [[IV2:%.*]], [[BB214]] ], [ 0, [[BB:%.*]] ] +; CSE-NEXT: [[IV2]] = phi i64 [ [[IV2_PLUS_1:%.*]], [[BB214]] ], [ 1, [[BB]] ] +; CSE-NEXT: [[LOOP_DEPENDENT:%.*]] = shl nuw nsw i64 [[IV1]], 13 +; CSE-NEXT: [[INV4:%.*]] = add i64 [[LOOP_DEPENDENT]], [[INV2]] +; CSE-NEXT: [[LOOP_DEPENDENT2:%.*]] = add i64 [[INV4]], [[INV1]] +; CSE-NEXT: call void @keep_alive(i64 [[LOOP_DEPENDENT2]]) +; CSE-NEXT: [[INV5:%.*]] = add i64 [[LOOP_DEPENDENT]], [[INV2]] +; CSE-NEXT: [[LOOP_DEPENDENT3:%.*]] = add i64 [[INV5]], [[INV3]] +; CSE-NEXT: call void @keep_alive(i64 [[LOOP_DEPENDENT3]]) +; CSE-NEXT: [[IV2_PLUS_1]] = add i64 [[IV2]], 1 +; CSE-NEXT: br label [[BB214]] +; +bb: + %inv4 = add nuw nsw i64 %inv1, %inv2 + %inv5 = 
add nuw nsw i64 %inv2, %inv3 + br label %bb214 + +bb214: ; preds = %bb214, %bb + %iv1 = phi i64 [ %iv2, %bb214 ], [ 0, %bb ] + %iv2 = phi i64 [ %iv2_plus_1, %bb214 ], [ 1, %bb ] + %loop_dependent = shl nuw nsw i64 %iv1, 13 + %loop_dependent2 = add nsw i64 %inv4, %loop_dependent + call void @keep_alive(i64 %loop_dependent2) + %loop_dependent3 = add i64 %inv5, %loop_dependent + call void @keep_alive(i64 %loop_dependent3) + %iv2_plus_1 = add i64 %iv2, 1 + br label %bb214 +} + +declare void @keep_alive(i64) diff --git a/llvm/test/Transforms/Reassociate/local-cse.ll b/llvm/test/Transforms/Reassociate/local-cse.ll new file mode 100644 index 0000000000000..aef199ec20d48 --- /dev/null +++ b/llvm/test/Transforms/Reassociate/local-cse.ll @@ -0,0 +1,160 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; Check that the default heuristic uses the local cse constraints. +; RUN: opt -S -passes=reassociate,early-cse %s -o - | FileCheck %s -check-prefix=LOCAL_CSE +; RUN: opt -S -passes=reassociate,early-cse %s -reassociate-use-cse-local=true -o - | FileCheck %s -check-prefix=LOCAL_CSE +; RUN: opt -S -passes=reassociate,early-cse %s -reassociate-use-cse-local=false -o - | FileCheck %s -check-prefix=CSE + + +; Check that when we use the heuristic to expose only local (to the first +; encountered block) CSE opportunities, we choose the right sub expression +; to expose. +; +; In these examples we have three chains of expressions: +; chain a: inv1, val_bb2, inv2, inv4 +; chain b: inv1, val_bb2, inv2, inv5 +; chain c: inv1, val_bb2, inv3 +; +; The CSE-able pairs with their respective occurrences are: +; inv1, val_bb2: 3 +; inv1, inv2: 2 +; +; val_bb2 is anchored in bb2 but inv1 and inv2 can start in bb1. +; With the local heuristic we will push inv1, inv2 at the beginning +; of chain_a and chain_b. +; With the non-local heuristic we will push inv1, val_bb2. 
+define void @chain_spanning_several_blocks(i64 %inv1, i64 %inv2, i64 %inv3, i64 %inv4, i64 %inv5) { +; LOCAL_CSE-LABEL: define void @chain_spanning_several_blocks +; LOCAL_CSE-SAME: (i64 [[INV1:%.*]], i64 [[INV2:%.*]], i64 [[INV3:%.*]], i64 [[INV4:%.*]], i64 [[INV5:%.*]]) { +; LOCAL_CSE-NEXT: bb1: +; LOCAL_CSE-NEXT: [[CHAIN_A0:%.*]] = add i64 [[INV2]], [[INV1]] +; LOCAL_CSE-NEXT: br label [[BB2:%.*]] +; LOCAL_CSE: bb2: +; LOCAL_CSE-NEXT: [[VAL_BB2:%.*]] = call i64 @get_val() +; LOCAL_CSE-NEXT: [[CHAIN_A1:%.*]] = add i64 [[CHAIN_A0]], [[INV4]] +; LOCAL_CSE-NEXT: [[CHAIN_A2:%.*]] = add i64 [[CHAIN_A1]], [[VAL_BB2]] +; LOCAL_CSE-NEXT: [[CHAIN_B1:%.*]] = add i64 [[CHAIN_A0]], [[INV5]] +; LOCAL_CSE-NEXT: [[CHAIN_B2:%.*]] = add i64 [[CHAIN_B1]], [[VAL_BB2]] +; LOCAL_CSE-NEXT: [[CHAIN_C0:%.*]] = add i64 [[INV3]], [[INV1]] +; LOCAL_CSE-NEXT: [[CHAIN_C1:%.*]] = add i64 [[CHAIN_C0]], [[VAL_BB2]] +; LOCAL_CSE-NEXT: call void @keep_alive(i64 [[CHAIN_A2]]) +; LOCAL_CSE-NEXT: call void @keep_alive(i64 [[CHAIN_B2]]) +; LOCAL_CSE-NEXT: call void @keep_alive(i64 [[CHAIN_C1]]) +; LOCAL_CSE-NEXT: ret void +; +; CSE-LABEL: define void @chain_spanning_several_blocks +; CSE-SAME: (i64 [[INV1:%.*]], i64 [[INV2:%.*]], i64 [[INV3:%.*]], i64 [[INV4:%.*]], i64 [[INV5:%.*]]) { +; CSE-NEXT: bb1: +; CSE-NEXT: br label [[BB2:%.*]] +; CSE: bb2: +; CSE-NEXT: [[VAL_BB2:%.*]] = call i64 @get_val() +; CSE-NEXT: [[CHAIN_A0:%.*]] = add i64 [[VAL_BB2]], [[INV1]] +; CSE-NEXT: [[CHAIN_A1:%.*]] = add i64 [[CHAIN_A0]], [[INV2]] +; CSE-NEXT: [[CHAIN_A2:%.*]] = add i64 [[CHAIN_A1]], [[INV4]] +; CSE-NEXT: [[CHAIN_B2:%.*]] = add nuw nsw i64 [[CHAIN_A1]], [[INV5]] +; CSE-NEXT: [[CHAIN_C1:%.*]] = add i64 [[CHAIN_A0]], [[INV3]] +; CSE-NEXT: call void @keep_alive(i64 [[CHAIN_A2]]) +; CSE-NEXT: call void @keep_alive(i64 [[CHAIN_B2]]) +; CSE-NEXT: call void @keep_alive(i64 [[CHAIN_C1]]) +; CSE-NEXT: ret void +; +bb1: + %chain_a0 = add nuw nsw i64 %inv1, %inv2 + br label %bb2 + +bb2: + %val_bb2 = call i64 @get_val() 
+ %chain_a1 = add nuw nsw i64 %chain_a0, %val_bb2 + %chain_a2 = add nuw nsw i64 %chain_a1, %inv4 + %chain_b0 = add nuw nsw i64 %val_bb2, %inv1 + %chain_b1 = add nuw nsw i64 %chain_b0, %inv2 + %chain_b2 = add nuw nsw i64 %chain_b1, %inv5 + %chain_c0 = add nuw nsw i64 %val_bb2, %inv3 + %chain_c1 = add nuw nsw i64 %chain_c0, %inv1 + call void @keep_alive(i64 %chain_a2) + call void @keep_alive(i64 %chain_b2) + call void @keep_alive(i64 %chain_c1) + ret void +} + +; Same as @chain_spanning_several_blocks, but with values that are all anchored +; on the non-entry block. +; I.e., same pair map as previous but with invX_bbY instead of invX. +; Note: Although %inv2_bb0 is anchored in the entry block, it doesn't constrain +; the sub expressions on the entry block because we need to see at least two +; values to be able to form a sub-expression and thus only the second one +; adds a constraint. +define void @chain_spanning_several_blocks_no_entry_anchor() { +; LOCAL_CSE-LABEL: define void @chain_spanning_several_blocks_no_entry_anchor() { +; LOCAL_CSE-NEXT: bb0: +; LOCAL_CSE-NEXT: [[INV2_BB0:%.*]] = call i64 @get_val() +; LOCAL_CSE-NEXT: br label [[BB1:%.*]] +; LOCAL_CSE: bb1: +; LOCAL_CSE-NEXT: [[INV1_BB1:%.*]] = call i64 @get_val() +; LOCAL_CSE-NEXT: [[CHAIN_A0:%.*]] = add i64 [[INV1_BB1]], [[INV2_BB0]] +; LOCAL_CSE-NEXT: br label [[BB2:%.*]] +; LOCAL_CSE: bb2: +; LOCAL_CSE-NEXT: [[INV3_BB2:%.*]] = call i64 @get_val() +; LOCAL_CSE-NEXT: [[INV4_BB2:%.*]] = call i64 @get_val() +; LOCAL_CSE-NEXT: [[INV5_BB2:%.*]] = call i64 @get_val() +; LOCAL_CSE-NEXT: [[VAL_BB2:%.*]] = call i64 @get_val() +; LOCAL_CSE-NEXT: [[CHAIN_A1:%.*]] = add i64 [[CHAIN_A0]], [[INV4_BB2]] +; LOCAL_CSE-NEXT: [[CHAIN_A2:%.*]] = add i64 [[CHAIN_A1]], [[VAL_BB2]] +; LOCAL_CSE-NEXT: [[CHAIN_B1:%.*]] = add i64 [[CHAIN_A0]], [[INV5_BB2]] +; LOCAL_CSE-NEXT: [[CHAIN_B2:%.*]] = add i64 [[CHAIN_B1]], [[VAL_BB2]] +; LOCAL_CSE-NEXT: [[CHAIN_C0:%.*]] = add i64 [[VAL_BB2]], [[INV1_BB1]] +; LOCAL_CSE-NEXT: 
[[CHAIN_C1:%.*]] = add i64 [[CHAIN_C0]], [[INV3_BB2]] +; LOCAL_CSE-NEXT: call void @keep_alive(i64 [[CHAIN_A2]]) +; LOCAL_CSE-NEXT: call void @keep_alive(i64 [[CHAIN_B2]]) +; LOCAL_CSE-NEXT: call void @keep_alive(i64 [[CHAIN_C1]]) +; LOCAL_CSE-NEXT: ret void +; +; CSE-LABEL: define void @chain_spanning_several_blocks_no_entry_anchor() { +; CSE-NEXT: bb0: +; CSE-NEXT: [[INV2_BB0:%.*]] = call i64 @get_val() +; CSE-NEXT: br label [[BB1:%.*]] +; CSE: bb1: +; CSE-NEXT: [[INV1_BB1:%.*]] = call i64 @get_val() +; CSE-NEXT: br label [[BB2:%.*]] +; CSE: bb2: +; CSE-NEXT: [[INV3_BB2:%.*]] = call i64 @get_val() +; CSE-NEXT: [[INV4_BB2:%.*]] = call i64 @get_val() +; CSE-NEXT: [[INV5_BB2:%.*]] = call i64 @get_val() +; CSE-NEXT: [[VAL_BB2:%.*]] = call i64 @get_val() +; CSE-NEXT: [[CHAIN_A0:%.*]] = add i64 [[VAL_BB2]], [[INV1_BB1]] +; CSE-NEXT: [[CHAIN_A1:%.*]] = add i64 [[CHAIN_A0]], [[INV2_BB0]] +; CSE-NEXT: [[CHAIN_A2:%.*]] = add i64 [[CHAIN_A1]], [[INV4_BB2]] +; CSE-NEXT: [[CHAIN_B2:%.*]] = add nuw nsw i64 [[CHAIN_A1]], [[INV5_BB2]] +; CSE-NEXT: [[CHAIN_C1:%.*]] = add i64 [[CHAIN_A0]], [[INV3_BB2]] +; CSE-NEXT: call void @keep_alive(i64 [[CHAIN_A2]]) +; CSE-NEXT: call void @keep_alive(i64 [[CHAIN_B2]]) +; CSE-NEXT: call void @keep_alive(i64 [[CHAIN_C1]]) +; CSE-NEXT: ret void +; +bb0: + %inv2_bb0 = call i64 @get_val() + br label %bb1 + +bb1: + %inv1_bb1 = call i64 @get_val() + %chain_a0 = add nuw nsw i64 %inv1_bb1, %inv2_bb0 + br label %bb2 + +bb2: + %inv3_bb2 = call i64 @get_val() + %inv4_bb2 = call i64 @get_val() + %inv5_bb2 = call i64 @get_val() + %val_bb2 = call i64 @get_val() + %chain_a1 = add nuw nsw i64 %chain_a0, %val_bb2 + %chain_a2 = add nuw nsw i64 %chain_a1, %inv4_bb2 + %chain_b0 = add nuw nsw i64 %val_bb2, %inv1_bb1 + %chain_b1 = add nuw nsw i64 %chain_b0, %inv2_bb0 + %chain_b2 = add nuw nsw i64 %chain_b1, %inv5_bb2 + %chain_c0 = add nuw nsw i64 %val_bb2, %inv3_bb2 + %chain_c1 = add nuw nsw i64 %chain_c0, %inv1_bb1 + call void @keep_alive(i64 %chain_a2) + call 
void @keep_alive(i64 %chain_b2) + call void @keep_alive(i64 %chain_c1) + ret void +} +declare i64 @get_val() +declare void @keep_alive(i64)