diff --git a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
index 5a682e8c7b5eb..84bdd5c2379f1 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
@@ -93,11 +93,14 @@
 #include "llvm/Transforms/Utils/SizeOpts.h"
 
 namespace llvm {
-// Map of potential specializations for each function. The FunctionSpecializer
-// keeps the discovered specialisation opportunities for the module in a single
-// vector, where the specialisations of each function form a contiguous range.
-// This map's value is the beginning and the end of that range.
-using SpecMap = DenseMap<Function *, std::pair<unsigned, unsigned>>;
+struct Spec;
+
+// Map of potential specializations for each function.
+using SpecMap = DenseMap<Function *, SmallVector<unsigned>>;
+
+using CallUserT = SmallMapVector<
+    CallBase *,
+    std::pair<SmallVector<std::pair<unsigned, Constant *>, 4>, Function *>, 4>;
 
 // Just a shorter abbreviation to improve indentation.
 using Cost = InstructionCost;
@@ -124,6 +127,14 @@ struct SpecSig {
   }
 };
 
+enum CallSiteStatusT { AWAITING_PARENT, HAS_PARENT, NO_PARENT };
+
+struct SpecCall {
+  CallBase *CallSite;
+  CallSiteStatusT Status;
+  unsigned Parent;
+};
+
 // Specialization instance.
 struct Spec {
   // Original function.
@@ -141,13 +152,47 @@ struct Spec {
   // Number of instructions in the specialization.
   unsigned CodeSize;
 
+  // Cumulative function size of the chain.
+  unsigned FuncSize;
+
+  // Latency savings.
+  unsigned Latency;
+
+  // Benefit from inlining.
+  unsigned InlineScore;
+
   // List of call sites, matching this specialization.
-  SmallVector<CallBase *> CallSites;
+  SmallVector<SpecCall> CallSites;
 
-  Spec(Function *F, const SpecSig &S, unsigned Score, unsigned CodeSize)
-      : F(F), Sig(S), Score(Score), CodeSize(CodeSize) {}
-  Spec(Function *F, const SpecSig &&S, unsigned Score, unsigned CodeSize)
-      : F(F), Sig(S), Score(Score), CodeSize(CodeSize) {}
+  bool AllChains = true;
+
+  void addCall(SpecCall SC) {
+    CallSites.push_back(SC);
+    AllChains = AllChains && SC.Status != CallSiteStatusT::NO_PARENT;
+  }
+
+  // List of sub-specializations.
+  SmallVector<unsigned> SubSpecs;
+
+  // Index within AllSpecs.
+  unsigned Loc = 0;
+
+  bool SpecializeOnOwn = true;
+
+  Spec(Function *F, CallBase *CallSite, const SpecSig &S,
+       CallSiteStatusT Status)
+      : F(F), Clone(nullptr), Sig(S), Score(), CodeSize(), FuncSize(),
+        InlineScore(), CallSites() {
+    addCall({CallSite, Status, /*Parent*/ 0});
+  }
+  Spec(Function *F, CallBase *CallSite, CallSiteStatusT Status)
+      : F(F), Clone(nullptr), Sig(), Score(), CodeSize(), FuncSize(),
+        InlineScore(), CallSites() {
+    addCall({CallSite, Status, /*Parent*/ 0});
+  }
+  Spec(Function *F)
+      : F(F), Clone(nullptr), Sig(), Score(), CodeSize(), FuncSize(),
+        InlineScore(), CallSites() {}
 };
 
 class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
@@ -180,9 +225,11 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
     return Solver.isBlockExecutable(BB) && !DeadBlocks.contains(BB);
   }
 
-  LLVM_ABI Cost getCodeSizeSavingsForArg(Argument *A, Constant *C);
+  LLVM_ABI Cost getCodeSizeSavingsForArg(Argument *A, Constant *C,
+                                         CallUserT *CallUsers = nullptr);
 
-  LLVM_ABI Cost getCodeSizeSavingsFromPendingPHIs();
+  LLVM_ABI Cost
+  getCodeSizeSavingsFromPendingPHIs(CallUserT *CallUsers = nullptr);
 
   LLVM_ABI Cost getLatencySavingsForKnownConstants();
 
@@ -194,7 +241,9 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
   bool canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ) const;
 
   Cost getCodeSizeSavingsForUser(Instruction *User, Value *Use = nullptr,
-                                 Constant *C = nullptr);
+                                 Constant *C = nullptr,
+                                 CallUserT *CallUsers = nullptr,
+                                 llvm::Use *UseEdge = nullptr);
 
   Cost estimateBasicBlocks(SmallVectorImpl<BasicBlock *> &WorkList);
   Cost estimateSwitchInst(SwitchInst &I);
@@ -247,6 +296,7 @@ class FunctionSpecializer {
   SmallPtrSet<Function *, 32> Specializations;
   SmallPtrSet<Function *, 32> DeadFunctions;
+  SmallPtrSet<Function *, 32> VisitedFunctions;
   DenseMap<Function *, CodeMetrics> FunctionMetrics;
   DenseMap<Function *, unsigned> FunctionGrowth;
   unsigned NGlobals = 0;
@@ -295,9 +345,26 @@ class FunctionSpecializer {
   /// @param FuncSize Cost of specializing a function.
   /// @param AllSpecs A vector to add potential specializations to.
   /// @param SM A map for a function's specialisation range
+  /// @param InS Specialization being built for this function.
+  /// @param UniqueSpecs Map of existing specializations.
+  /// @param CurrentChain Current chain of function calls.
+  /// @return True, if any potential specializations were found
+  bool findSpecializations(unsigned FuncSize, SmallVectorImpl<Spec> &AllSpecs,
+                           SpecMap &SM, Spec &InS,
+                           DenseMap<SpecSig, unsigned> &UniqueSpecs,
+                           SmallPtrSet<Function *, 8> &CurrentChain);
+
+  /// @brief Find specialization opportunities for a given function.
+  /// @param S Specialization to complete, possibly with a Callsite attached.
+  /// @param Chained Is this call part of a chain build?
+  /// @param SM A map for a function's specialisation range
+  /// @param AllSpecs A vector to add potential specializations to.
+  /// @param UniqueSpecs Map of existing specializations.
+  /// @param CurrentChain Current chain of function calls leading to this call
+  /// site.
   /// @return True, if any potential specializations were found
-  bool findSpecializations(Function *F, unsigned FuncSize,
-                           SmallVectorImpl<Spec> &AllSpecs, SpecMap &SM);
+  bool runOneSpec(Spec &S, bool Chained, SpecMap &SM,
+                  SmallVectorImpl<Spec> &AllSpecs,
+                  DenseMap<SpecSig, unsigned> &UniqueSpecs,
+                  SmallPtrSet<Function *, 8> CurrentChain);
 
   /// Compute the inlining bonus for replacing argument \p A with constant \p C.
   unsigned getInliningBonus(Argument *A, Constant *C);
@@ -308,7 +375,8 @@ class FunctionSpecializer {
   /// @param F Function to specialize
   /// @param S Which specialization to create
   /// @return The new, cloned function
-  Function *createSpecialization(Function *F, const SpecSig &S);
+  Function *createSpecialization(Function *F, const SpecSig &S,
+                                 ValueToValueMapTy &Mappings);
 
   /// Determine if it is possible to specialise the function for constant values
   /// of the formal parameter \p A.
@@ -320,9 +388,9 @@ class FunctionSpecializer {
 
   /// @brief Find and update calls to \p F, which match a specialization
   /// @param F Orginal function
-  /// @param Begin Start of a range of possibly matching specialisations
-  /// @param End End of a range (exclusive) of possibly matching specialisations
-  void updateCallSites(Function *F, const Spec *Begin, const Spec *End);
+  /// @param Specs Vector of possibly matching specialisations
+  void updateCallSites(Function *F, const SmallVector<unsigned> &Specs,
+                       const SmallVector<Spec> &AllSpecs);
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 150a2dc5d48e2..746193eb2e547 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -154,27 +154,31 @@ Constant *InstCostVisitor::findConstantFor(Value *V) const {
   return KnownConstants.lookup(V);
 }
 
-Cost InstCostVisitor::getCodeSizeSavingsFromPendingPHIs() {
+Cost InstCostVisitor::getCodeSizeSavingsFromPendingPHIs(CallUserT *CallUsers) {
  Cost CodeSize;
  while (!PendingPHIs.empty()) {
    Instruction *Phi = PendingPHIs.pop_back_val();
    // The pending PHIs could have been proven dead by now.
     if (isBlockExecutable(Phi->getParent()))
-      CodeSize += getCodeSizeSavingsForUser(Phi);
+      CodeSize +=
+          getCodeSizeSavingsForUser(Phi, nullptr, nullptr, CallUsers, nullptr);
   }
   return CodeSize;
 }
 
 /// Compute the codesize savings for replacing argument \p A with constant \p C.
-Cost InstCostVisitor::getCodeSizeSavingsForArg(Argument *A, Constant *C) {
+Cost InstCostVisitor::getCodeSizeSavingsForArg(Argument *A, Constant *C,
+                                               CallUserT *CallUsers) {
   LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: "
                     << C->getNameOrAsOperand() << "\n");
   Cost CodeSize;
-  for (auto *U : A->users())
-    if (auto *UI = dyn_cast<Instruction>(U))
+  for (Use &UseEdge : A->uses()) {
+    User *U = UseEdge.getUser();
+    if (auto *UI = dyn_cast<Instruction>(U)) {
       if (isBlockExecutable(UI->getParent()))
-        CodeSize += getCodeSizeSavingsForUser(UI, A, C);
-
+        CodeSize += getCodeSizeSavingsForUser(UI, A, C, CallUsers, &UseEdge);
+    }
+  }
   LLVM_DEBUG(dbgs() << "FnSpecialization: Accumulated bonus {CodeSize = "
                     << CodeSize << "} for argument " << *A << "\n");
   return CodeSize;
@@ -217,7 +221,9 @@ Cost InstCostVisitor::getLatencySavingsForKnownConstants() {
 }
 
 Cost InstCostVisitor::getCodeSizeSavingsForUser(Instruction *User, Value *Use,
-                                                Constant *C) {
+                                                Constant *C,
+                                                CallUserT *CallUsers,
+                                                llvm::Use *UseEdge) {
   // We have already propagated a constant for this user.
   if (KnownConstants.contains(User))
     return 0;
@@ -227,10 +233,45 @@ Cost InstCostVisitor::getCodeSizeSavingsForUser(Instruction *User, Value *Use,
                     : KnownConstants.end();
 
   Cost CodeSize = 0;
+  auto isChainableCall = [&](Instruction *I) -> bool {
+    if (!CallUsers || !UseEdge)
+      return false;
+    if (CallInst *CI = dyn_cast<CallInst>(I);
+        CI && CI->getIntrinsicID() == llvm::Intrinsic::not_intrinsic) {
+      LLVM_DEBUG(
+          dbgs() << "FnSpecialization: Found constant forwarded via a call "
+                 << *C << "\n");
+      Function *F = CI->getCalledFunction();
+      if (F) { // Avoid function pointers.
+        unsigned Idx = CI->getArgOperandNo(UseEdge);
+        LLVM_DEBUG(dbgs() << "FnSpecialization: Function called: "
+                          << F->getName() << " argument number: " << Idx
+                          << "\n");
+        (*CallUsers)[CI].first.push_back({Idx, C});
+        (*CallUsers)[CI].second = F;
+        return true;
+      } else if (Use == CI->getCalledOperand()) {
+        LLVM_DEBUG(dbgs() << "FnSpecialization: Found call to constant "
+                             "function pointer.\n");
+        Function *CF = dyn_cast<Function>(C);
+        assert(CF && "Indirect call to a non-Function type");
+        (*CallUsers)[CI].second = CF;
+      } else {
+        LLVM_DEBUG(
+            dbgs() << "FnSpecialization: Could not find call function.\n");
+        unsigned Idx = CI->getArgOperandNo(UseEdge);
+        (*CallUsers)[CI].first.push_back({Idx, C});
+      }
+    }
+    return false;
+  };
   if (auto *I = dyn_cast<SwitchInst>(User)) {
     CodeSize = estimateSwitchInst(*I);
   } else if (auto *I = dyn_cast<BranchInst>(User)) {
     CodeSize = estimateBranchInst(*I);
+  } else if (isChainableCall(User)) {
+    // Will get benefit from the recursive call to findSpecializations().
+    return 0;
   } else {
     C = visit(*User);
     if (!C)
@@ -246,11 +287,12 @@ Cost InstCostVisitor::getCodeSizeSavingsForUser(Instruction *User, Value *Use,
 
   LLVM_DEBUG(dbgs() << "FnSpecialization: {CodeSize = " << CodeSize
                     << "} for user " << *User << "\n");
-
-  for (auto *U : User->users())
+  for (llvm::Use &UE : User->uses()) {
+    llvm::User *U = UE.getUser();
     if (auto *UI = dyn_cast<Instruction>(U))
       if (UI != User && isBlockExecutable(UI->getParent()))
-        CodeSize += getCodeSizeSavingsForUser(UI, User, C);
+        CodeSize += getCodeSizeSavingsForUser(UI, User, C, CallUsers, &UE);
+  }
 
   return CodeSize;
 }
@@ -668,6 +710,82 @@ static unsigned getCostValue(const Cost &C) {
 
   return static_cast<unsigned>(Value);
 }
 
+bool FunctionSpecializer::runOneSpec(Spec &S, bool Chained, SpecMap &SM,
+                                     SmallVectorImpl<Spec> &AllSpecs,
+                                     DenseMap<SpecSig, unsigned> &UniqueSpecs,
+                                     SmallPtrSet<Function *, 8> CurrentChain) {
+  Function &F = *(S.F);
+  if (!isCandidateFunction(&F))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "FnSpecialization: Trying function " << F.getName()
+                    << ", Chain=" << Chained << "\n");
+
+  auto [It, Inserted] = FunctionMetrics.try_emplace(&F);
+  CodeMetrics &Metrics = It->second;
+  // Analyze the function.
+  if (Inserted) {
+    SmallPtrSet<const Value *, 32> EphValues;
+    CodeMetrics::collectEphemeralValues(&F, &GetAC(F), EphValues);
+    for (BasicBlock &BB : F)
+      Metrics.analyzeBasicBlock(&BB, GetTTI(F), EphValues);
+  }
+
+  // When specializing literal constants is enabled, always require functions
+  // to be larger than MinFunctionSize, to prevent excessive specialization.
+  const bool RequireMinSize =
+      !ForceSpecialization &&
+      (SpecializeLiteralConstant || !F.hasFnAttribute(Attribute::NoInline));
+
+  // If the code metrics reveal that we shouldn't duplicate the function,
+  // or if the code size implies that this function is easy to get inlined,
+  // then we shouldn't specialize it.
+  if (Metrics.notDuplicatable || !Metrics.NumInsts.isValid())
+    return false;
+
+  if (RequireMinSize && Metrics.NumInsts < MinFunctionSize) {
+    if (Chained) {
+      // Still specialize as part of the chain, so that we can assess the
+      // chain specialization more accurately.
+      S.SpecializeOnOwn = false;
+    } else {
+      return false;
+    }
+  }
+
+  // When specialization on literal constants is disabled, only consider
+  // recursive functions when running multiple times to save wasted analysis,
+  // as we will not be able to specialize on any newly found literal constant
+  // return values.
+  if (!Chained && !SpecializeLiteralConstant &&
+      VisitedFunctions.contains(&F) && !Metrics.isRecursive)
+    return false;
+
+  // Don't mistake this chain visit for a full check of all of F's call sites.
+  if (!Chained)
+    VisitedFunctions.insert(&F);
+
+  int64_t Sz = Metrics.NumInsts.getValue();
+  assert(Sz > 0 && "CodeSize should be positive");
+  // It is safe to down cast from int64_t, NumInsts is always positive.
+  unsigned FuncSize = static_cast<unsigned>(Sz);
+
+  LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization cost for "
+                    << F.getName() << " is " << FuncSize << "\n");
+
+  if (Inserted && Metrics.isRecursive)
+    promoteConstantStackValues(&F);
+
+  if (!findSpecializations(FuncSize, AllSpecs, SM, S, UniqueSpecs,
+                           CurrentChain)) {
+    LLVM_DEBUG(
+        dbgs() << "FnSpecialization: No possible specializations found for "
+               << F.getName() << "\n");
+    return false;
+  }
+  return true;
+}
+
 /// Attempt to specialize functions in the module to enable constant
 /// propagation across function boundaries.
 ///
@@ -676,63 +794,24 @@ bool FunctionSpecializer::run() {
   // Find possible specializations for each function.
   SpecMap SM;
   SmallVector<Spec, 32> AllSpecs;
+  // A mapping from a specialisation signature to the index of the respective
+  // entry in the all specialisation array. Used to ensure uniqueness of
+  // specialisations.
+  DenseMap<SpecSig, unsigned> UniqueSpecs;
   unsigned NumCandidates = 0;
   for (Function &F : M) {
-    if (!isCandidateFunction(&F))
-      continue;
-
-    auto [It, Inserted] = FunctionMetrics.try_emplace(&F);
-    CodeMetrics &Metrics = It->second;
-    // Analyze the function.
-    if (Inserted) {
-      SmallPtrSet<const Value *, 32> EphValues;
-      CodeMetrics::collectEphemeralValues(&F, &GetAC(F), EphValues);
-      for (BasicBlock &BB : F)
-        Metrics.analyzeBasicBlock(&BB, GetTTI(F), EphValues);
-    }
-
-    // When specializing literal constants is enabled, always require functions
-    // to be larger than MinFunctionSize, to prevent excessive specialization.
-    const bool RequireMinSize =
-        !ForceSpecialization &&
-        (SpecializeLiteralConstant || !F.hasFnAttribute(Attribute::NoInline));
-
-    // If the code metrics reveal that we shouldn't duplicate the function,
-    // or if the code size implies that this function is easy to get inlined,
-    // then we shouldn't specialize it.
-    if (Metrics.notDuplicatable || !Metrics.NumInsts.isValid() ||
-        (RequireMinSize && Metrics.NumInsts < MinFunctionSize))
-      continue;
-
-    // When specialization on literal constants is disabled, only consider
-    // recursive functions when running multiple times to save wasted analysis,
-    // as we will not be able to specialize on any newly found literal constant
-    // return values.
-    if (!SpecializeLiteralConstant && !Inserted && !Metrics.isRecursive)
-      continue;
-
-    int64_t Sz = Metrics.NumInsts.getValue();
-    assert(Sz > 0 && "CodeSize should be positive");
-    // It is safe to down cast from int64_t, NumInsts is always positive.
-    unsigned FuncSize = static_cast<unsigned>(Sz);
-
-    LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization cost for "
-                      << F.getName() << " is " << FuncSize << "\n");
-
-    if (Inserted && Metrics.isRecursive)
-      promoteConstantStackValues(&F);
-
-    if (!findSpecializations(&F, FuncSize, AllSpecs, SM)) {
-      LLVM_DEBUG(
-          dbgs() << "FnSpecialization: No possible specializations found for "
-                 << F.getName() << "\n");
-      continue;
-    }
-
-    ++NumCandidates;
+    Spec S(&F);
+    SmallPtrSet<Function *, 8> CurrentChain;
+    if (runOneSpec(S, /*Chained*/ false, SM, AllSpecs, UniqueSpecs,
+                   CurrentChain))
+      ++NumCandidates;
   }
 
-  if (!NumCandidates) {
+  unsigned IndepSpecs = 0;
+  for (auto &S : AllSpecs)
+    if (S.SpecializeOnOwn && !S.AllChains)
+      ++IndepSpecs;
+  if (!NumCandidates || !IndepSpecs) {
     LLVM_DEBUG(
         dbgs()
         << "FnSpecialization: No possible specializations found in module\n");
@@ -743,12 +822,16 @@ bool FunctionSpecializer::run() {
   // specialization budget, which is derived from maximum number of
   // specializations per specialization candidate function.
   auto CompareScore = [&AllSpecs](unsigned I, unsigned J) {
+    if (!AllSpecs[J].SpecializeOnOwn || AllSpecs[J].AllChains)
+      return true;
+    if (!AllSpecs[I].SpecializeOnOwn || AllSpecs[I].AllChains)
+      return false;
     if (AllSpecs[I].Score != AllSpecs[J].Score)
       return AllSpecs[I].Score > AllSpecs[J].Score;
     return I > J;
   };
-  const unsigned NSpecs =
-      std::min(NumCandidates * MaxClones, unsigned(AllSpecs.size()));
+  const unsigned NSpecs = std::min(
+      {NumCandidates * MaxClones, unsigned(AllSpecs.size()), IndepSpecs});
   SmallVector<unsigned> BestSpecs(NSpecs + 1);
   std::iota(BestSpecs.begin(), BestSpecs.begin() + NSpecs, 0);
   if (AllSpecs.size() > NSpecs) {
@@ -780,47 +863,90 @@ bool FunctionSpecializer::run() {
   // Create the chosen specializations.
   SmallPtrSet<Function *, 8> OriginalFuncs;
   SmallVector<Function *> Clones;
+  // TODO: Does this also need to include the base function in the hash, or is
+  // the SpecSig sufficient?
+  DenseMap<SpecSig, Function *> UniqueClones;
   for (unsigned I = 0; I < NSpecs; ++I) {
     Spec &S = AllSpecs[BestSpecs[I]];
 
-    // Accumulate the codesize growth for the function, now we are creating the
-    // specialization.
-    FunctionGrowth[S.F] += S.CodeSize;
-
-    S.Clone = createSpecialization(S.F, S.Sig);
-    // Update the known call sites to call the clone.
-    for (CallBase *Call : S.CallSites) {
-      Function *Clone = S.Clone;
-      LLVM_DEBUG(dbgs() << "FnSpecialization: Redirecting " << *Call
-                        << " to call " << Clone->getName() << "\n");
-      Call->setCalledFunction(S.Clone);
-      auto &BFI = GetBFI(*Call->getFunction());
-      std::optional<uint64_t> Count =
-          BFI.getBlockProfileCount(Call->getParent());
-      if (Count && !ProfcheckDisableMetadataFixes) {
-        std::optional<Function::ProfileCount> MaybeCloneCount =
-            Clone->getEntryCount();
-        if (MaybeCloneCount) {
-          uint64_t CallCount = *Count + MaybeCloneCount->getCount();
-          Clone->setEntryCount(CallCount);
-          if (std::optional<Function::ProfileCount> MaybeOriginalCount =
-                  S.F->getEntryCount()) {
-            uint64_t OriginalCount = MaybeOriginalCount->getCount();
-            if (OriginalCount >= *Count) {
-              S.F->setEntryCount(OriginalCount - *Count);
-            } else {
-              // This should generally not happen as that would mean there are
-              // more computed calls to the function than what was recorded.
-              LLVM_DEBUG(S.F->setEntryCount(0));
+    ValueToValueMapTy Mappings;
+
+    auto actuallySpecialize = [&](auto &&actuallySpecialize, Spec &S,
+                                  CallSiteStatusT Status, unsigned Parent,
+                                  ValueToValueMapTy &Mappings) -> void {
+      if (Status == CallSiteStatusT::HAS_PARENT) {
+        for (auto &CS : S.CallSites) {
+          if (CS.Status == Status && CS.Parent == Parent) {
+            CallBase *&Call = CS.CallSite;
+            Value *V = Mappings[Call];
+            Call = dyn_cast<CallBase>(V);
+          }
+        }
+      }
+
+      bool NewClone;
+      ValueToValueMapTy CurrMappings;
+      if (auto It = UniqueClones.find(S.Sig); It != UniqueClones.end()) {
+        NewClone = false;
+        S.Clone = It->second;
+      } else {
+        NewClone = true;
+        S.Clone = createSpecialization(S.F, S.Sig, CurrMappings);
+
+        // Accumulate the codesize growth for the function, now we are creating
+        // the specialization.
+        FunctionGrowth[S.F] += S.CodeSize;
+
+        UniqueClones[S.Sig] = S.Clone;
+        Clones.push_back(S.Clone);
+        OriginalFuncs.insert(S.F);
+      }
+      for (auto &CS : S.CallSites) {
+        if (CS.Status == Status && CS.Parent == Parent) {
+          Function *Clone = S.Clone;
+          CallBase *&Call = CS.CallSite;
+          LLVM_DEBUG(dbgs() << "FnSpecialization: Redirecting " << *Call
+                            << " to call " << Clone->getName() << "\n");
+          Call->setCalledFunction(Clone);
+          auto &BFI = GetBFI(*Call->getFunction());
+          std::optional<uint64_t> Count =
+              BFI.getBlockProfileCount(Call->getParent());
+          if (Count && !ProfcheckDisableMetadataFixes) {
+            std::optional<Function::ProfileCount> MaybeCloneCount =
+                Clone->getEntryCount();
+            if (MaybeCloneCount) {
+              uint64_t CallCount = *Count + MaybeCloneCount->getCount();
+              Clone->setEntryCount(CallCount);
+              if (std::optional<Function::ProfileCount>
+                      MaybeOriginalCount = S.F->getEntryCount()) {
+                uint64_t OriginalCount = MaybeOriginalCount->getCount();
+                if (OriginalCount >= *Count) {
+                  S.F->setEntryCount(OriginalCount - *Count);
+                } else {
+                  // This should generally not happen as that would mean there
+                  // are more computed calls to the function than what was
+                  // recorded.
+                  LLVM_DEBUG(S.F->setEntryCount(0));
+                }
+              }
+            }
+          }
+        }
+      }
-    }
+      if (!NewClone)
+        return;
+      for (auto &SSI : S.SubSpecs) {
+        Spec &SS = AllSpecs[SSI];
+        actuallySpecialize(actuallySpecialize, SS,
+                           /*Status*/ CallSiteStatusT::HAS_PARENT,
+                           /*Parent*/ S.Loc, CurrMappings);
+      }
+    };
 
-    Clones.push_back(S.Clone);
-    OriginalFuncs.insert(S.F);
+    actuallySpecialize(actuallySpecialize, S,
+                       /*Status*/ CallSiteStatusT::NO_PARENT, /*Parent*/ 0,
+                       Mappings);
   }
 
   Solver.solveWhileResolvedUndefsIn(Clones);
@@ -828,10 +954,8 @@ bool FunctionSpecializer::run() {
   // Update the rest of the call sites - these are the recursive calls, calls
   // to discarded specialisations and calls that may match a specialisation
   // after the solver runs.
-  for (Function *F : OriginalFuncs) {
-    auto [Begin, End] = SM[F];
-    updateCallSites(F, AllSpecs.begin() + Begin, AllSpecs.begin() + End);
-  }
+  for (Function *F : OriginalFuncs)
+    updateCallSites(F, SM[F], AllSpecs);
 
   for (Function *F : Clones) {
     if (F->getReturnType()->isVoidTy())
@@ -890,21 +1014,20 @@ void FunctionSpecializer::removeDeadFunctions() {
 
 /// Clone the function \p F and remove the ssa_copy intrinsics added by
 /// the SCCPSolver in the cloned version.
-static Function *cloneCandidateFunction(Function *F, unsigned NSpecs) {
-  ValueToValueMapTy Mappings;
+static Function *cloneCandidateFunction(Function *F, unsigned NSpecs,
+                                        ValueToValueMapTy &Mappings) {
   Function *Clone = CloneFunction(F, Mappings);
   Clone->setName(F->getName() + ".specialized." + Twine(NSpecs));
   removeSSACopy(*Clone);
   return Clone;
 }
 
-bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
-                                              SmallVectorImpl<Spec> &AllSpecs,
-                                              SpecMap &SM) {
-  // A mapping from a specialisation signature to the index of the respective
-  // entry in the all specialisation array. Used to ensure uniqueness of
-  // specialisations.
-  DenseMap<SpecSig, unsigned> UniqueSpecs;
+bool FunctionSpecializer::findSpecializations(
+    unsigned FuncSize, SmallVectorImpl<Spec> &AllSpecs, SpecMap &SM, Spec &InS,
+    DenseMap<SpecSig, unsigned> &UniqueSpecs,
+    SmallPtrSet<Function *, 8> &CurrentChain) {
+  Function *F = InS.F;
+  bool FoundSpecialization = false;
 
   // Get a list of interesting arguments.
   SmallVector<Argument *, 4> Args;
@@ -915,15 +1038,32 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
   if (Args.empty())
     return false;
 
-  for (User *U : F->users()) {
-    if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
-      continue;
-    auto &CS = *cast<CallBase>(U);
+  SmallVector<CallBase *> CallSites;
+  CallSiteStatusT Status;
+  if (InS.CallSites.size()) {
+    assert(InS.CallSites.size() == 1 &&
+           "Should only be passing single call spec as part of a chain");
+    CallSites.push_back(InS.CallSites[0].CallSite);
+    Status = CallSiteStatusT::AWAITING_PARENT;
+  } else {
+    Status = CallSiteStatusT::NO_PARENT;
+    for (User *U : F->users()) {
+      // If there are multiple candidate functions, check that the user
+      // actually calls this one.
+      if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
+        continue;
+      auto *CS = cast<CallBase>(U);
 
-    // The user instruction does not call our function.
-    if (CS.getCalledFunction() != F)
-      continue;
+      // The user instruction does not call our function.
+      if (CS->getCalledFunction() != F)
+        continue;
+      CallSites.push_back(CS);
+    }
+  }
+
+  for (auto *CSP : CallSites) {
+    auto &CS = *CSP;
+    Spec Chain(F, /*CallSite*/ CSP, Status);
 
     // If the call site has attribute minsize set, that callsite won't be
     // specialized.
     if (CS.hasFnAttr(Attribute::MinSize))
@@ -938,18 +1078,44 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
     // constant operands of this call site.
     SpecSig S;
     for (Argument *A : Args) {
-      Constant *C = getCandidateConstant(CS.getArgOperand(A->getArgNo()));
-      if (!C)
-        continue;
-      LLVM_DEBUG(dbgs() << "FnSpecialization: Found interesting argument "
-                        << A->getName() << " : " << C->getNameOrAsOperand()
-                        << "\n");
-      S.Args.push_back({A, C});
+      // Check if this argument is constant from the call chain propagation.
+      unsigned Idx;
+      auto &As = InS.Sig.Args;
+      for (Idx = 0; Idx < As.size(); ++Idx) {
+        if (As[Idx].Formal == A)
+          break;
+      }
+      if (As.size() == Idx) {
+        unsigned ArgNo = A->getArgNo();
+        if (ArgNo >= CS.arg_size())
+          continue;
+        Value *PossC = CS.getArgOperand(ArgNo);
+        Constant *C = getCandidateConstant(PossC);
+        if (!C)
+          continue;
+        LLVM_DEBUG(dbgs() << "FnSpecialization: Found interesting argument "
                          << A->getName() << " : " << C->getNameOrAsOperand()
                          << "\n");
+        S.Args.push_back({A, C});
+        if (InS.CallSites.size()) {
+          assert(InS.CallSites.size() == 1 &&
+                 "Should only be passing single call spec as part of a chain");
+          InS.Sig.Args.push_back({A, C});
+        }
+      } else {
+        Constant *C = InS.Sig.Args[Idx].Actual;
+        S.Args.push_back({A, C});
+        LLVM_DEBUG(dbgs() << "FnSpecialization: Found passed argument "
+                          << A->getName() << " : " << C->getNameOrAsOperand()
+                          << "\n");
+      }
     }
 
     if (S.Args.empty())
       continue;
 
+    CallUserT CallUsers;
+
     // Check if we have encountered the same specialisation already.
     if (auto It = UniqueSpecs.find(S); It != UniqueSpecs.end()) {
       // Existing specialisation. Add the call to the list to rewrite, unless
@@ -961,21 +1127,80 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
       if (CS.getFunction() == F)
         continue;
       const unsigned Index = It->second;
-      AllSpecs[Index].CallSites.push_back(&CS);
+      AllSpecs[Index].addCall({&CS, Status, /*Parent*/ 0});
     } else {
       // Calculate the specialisation gain.
       Cost CodeSize;
       unsigned Score = 0;
       InstCostVisitor Visitor = getInstCostVisitorFor(F);
       for (ArgInfo &A : S.Args) {
-        CodeSize += Visitor.getCodeSizeSavingsForArg(A.Formal, A.Actual);
+        CodeSize +=
+            Visitor.getCodeSizeSavingsForArg(A.Formal, A.Actual, &CallUsers);
         Score += getInliningBonus(A.Formal, A.Actual);
       }
-      CodeSize += Visitor.getCodeSizeSavingsFromPendingPHIs();
+
+      CodeSize += Visitor.getCodeSizeSavingsFromPendingPHIs(&CallUsers);
+      CurrentChain.insert(F);
+
+      for (auto &CU : CallUsers) {
+        Function *NewF = CU.second.second;
+
+        // Recurse only if constants were found for the function.
+        if (!NewF || NewF->isVarArg())
+          continue;
+
+        // Don't allow any recursion in chains.
+        bool isRecursion = CurrentChain.contains(NewF);
+        if (isRecursion)
+          continue;
+
+        LLVM_DEBUG(
+            dbgs() << "FnSpecialization: Recursively calling runOneSpec() on "
+                   << NewF->getName() << "\n");
+
+        // Since the function might not yet be known when processing the
+        // constants due to a function pointer, wait to extract the argument
+        // pointer at a given index.
+        SpecSig NewS;
+        for (auto &P : CU.second.first)
+          NewS.Args.push_back({NewF->getArg(P.first), P.second});
+
+        Spec CallSpec(NewF, /*CallSite*/ CU.first, NewS,
+                      /*Status*/ CallSiteStatusT::AWAITING_PARENT);
+        runOneSpec(CallSpec, /*Chained*/ true, SM, AllSpecs, UniqueSpecs,
+                   CurrentChain);
+
+        // Use CallSpec.Sig, since it may have been extended within
+        // findSpecializations().
+        if (auto It = UniqueSpecs.find(CallSpec.Sig); It != UniqueSpecs.end()) {
+          const unsigned Index = It->second;
+          Chain.SubSpecs.push_back(Index);
+        }
+      }
 
       unsigned CodeSizeSavings = getCostValue(CodeSize);
       unsigned SpecSize = FuncSize - CodeSizeSavings;
 
+      // Cache savings information in the chain to use for profitability
+      // analysis of the entire chain.
+      Chain.CodeSize = SpecSize;
+      Chain.InlineScore = Score;
+      Chain.FuncSize = FuncSize;
+      unsigned CumulCodeSize = 0;
+      unsigned CumulFuncSize = 0;
+      unsigned CumulInlineScore = 0;
+      unsigned CumulLatency = 0;
+      auto getCumulScores = [&](auto &&getCumulScores, Spec &CurrSpec) -> void {
+        CumulCodeSize += CurrSpec.CodeSize;
+        CumulFuncSize += CurrSpec.FuncSize;
+        CumulInlineScore += CurrSpec.InlineScore;
+        CumulLatency += CurrSpec.Latency;
+        for (auto SSI : CurrSpec.SubSpecs) {
+          getCumulScores(getCumulScores, AllSpecs[SSI]);
+        }
+      };
+      getCumulScores(getCumulScores, Chain);
+      unsigned CumulCodeSizeSavings = CumulFuncSize - CumulCodeSize;
+
       auto IsProfitable = [&]() -> bool {
         // No check required.
         if (ForceSpecialization)
@@ -984,56 +1209,105 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
         LLVM_DEBUG(
             dbgs() << "FnSpecialization: Specialization bonus {Inlining = "
                    << Score << " (" << (Score * 100 / FuncSize) << "%)}\n");
+        LLVM_DEBUG(
+            dbgs()
+            << "FnSpecialization: Chain specialization bonus {Inlining = "
+            << CumulInlineScore << " ("
+            << (CumulInlineScore * 100 / CumulFuncSize) << "%)}\n");
 
         // Minimum inlining bonus.
-        if (Score > MinInliningBonus * FuncSize / 100)
+        if ((Score > MinInliningBonus * FuncSize / 100) &&
+            (CumulInlineScore > MinInliningBonus * CumulFuncSize / 100))
           return true;
 
         LLVM_DEBUG(
             dbgs() << "FnSpecialization: Specialization bonus {CodeSize = "
                    << CodeSizeSavings << " ("
                    << (CodeSizeSavings * 100 / FuncSize) << "%)}\n");
+        LLVM_DEBUG(dbgs() << "FnSpecialization: Cumulative specialization "
+                             "bonus {CodeSize = "
+                          << CumulCodeSizeSavings << " ("
+                          << (CumulCodeSizeSavings * 100 / CumulFuncSize)
+                          << "%)}\n");
 
         // Minimum codesize savings.
-        if (CodeSizeSavings < MinCodeSizeSavings * FuncSize / 100)
+        if ((CodeSizeSavings <= MinCodeSizeSavings * FuncSize / 100) &&
+            (CumulCodeSizeSavings <= MinCodeSizeSavings * CumulFuncSize / 100))
          return false;
 
        // Lazily compute the Latency, to avoid unnecessarily computing BFI.
-        unsigned LatencySavings =
+        Chain.Latency =
            getCostValue(Visitor.getLatencySavingsForKnownConstants());
+        CumulLatency += Chain.Latency;
 
        LLVM_DEBUG(
            dbgs() << "FnSpecialization: Specialization bonus {Latency = "
-                   << LatencySavings << " ("
-                   << (LatencySavings * 100 / FuncSize) << "%)}\n");
+                   << CumulLatency << " ("
+                   << (CumulLatency * 100 / CumulFuncSize) << "%)}\n");
 
        // Minimum latency savings.
-        if (LatencySavings < MinLatencySavings * FuncSize / 100)
+        if (CumulLatency < MinLatencySavings * CumulFuncSize / 100)
          return false;
        // Maximum codesize growth.
         if ((FunctionGrowth[F] + SpecSize) / FuncSize > MaxCodeSizeGrowth)
           return false;
 
-        Score += std::max(CodeSizeSavings, LatencySavings);
+        Score = CumulInlineScore + std::max(CumulCodeSizeSavings, CumulLatency);
         return true;
       };
 
-      // Discard unprofitable specialisations.
-      if (!IsProfitable())
+      auto RemoveFromSubSpecs = [&](Spec &S) -> void {
+        for (unsigned &SSI : S.SubSpecs) {
+          Spec &SS = AllSpecs[SSI];
+          auto NewEnd = std::remove_if(
+              SS.CallSites.begin(), SS.CallSites.end(),
+              [&](SpecCall &SC) -> bool {
+                return SC.Status == CallSiteStatusT::AWAITING_PARENT;
+              });
+          SS.CallSites.erase(NewEnd, SS.CallSites.end());
+        }
+      };
+
+      // Discard unprofitable specialisations.
+      if (!IsProfitable()) {
+        RemoveFromSubSpecs(Chain); // Remove the parent from its sub-specs.
         continue;
+      }
+
+      auto AddParentToSubSpecs = [&](Spec &S) -> void {
+        for (unsigned &SSI : S.SubSpecs) {
+          Spec &SS = AllSpecs[SSI];
+          for (SpecCall &SC : SS.CallSites) {
+            if (SC.Status == CallSiteStatusT::AWAITING_PARENT) {
+              SC.Status = CallSiteStatusT::HAS_PARENT;
+              SC.Parent = S.Loc;
+            }
+          }
+        }
+      };
 
       // Create a new specialisation entry.
-      auto &Spec = AllSpecs.emplace_back(F, S, Score, SpecSize);
-      if (CS.getFunction() != F)
-        Spec.CallSites.push_back(&CS);
+      auto &Spec = AllSpecs.emplace_back(Chain);
       const unsigned Index = AllSpecs.size() - 1;
+      Spec.Loc = Index;
+      AddParentToSubSpecs(Spec);
+      // Update the chain's Sig for any new constants at this level.
+      Spec.Sig = S;
+      Spec.Score = Score;
+
+      if (CS.getFunction() == F && !Spec.CallSites[0].Parent) {
+        Spec.CallSites.clear();
+        // Don't reset AllChains, since this can be specialized standalone.
+      }
       UniqueSpecs[S] = Index;
-      if (auto [It, Inserted] = SM.try_emplace(F, Index, Index + 1); !Inserted)
-        It->second.second = Index + 1;
+
+      FoundSpecialization = true;
+
+      SM[F].push_back(Index);
     }
   }
 
-  return !UniqueSpecs.empty();
+  return FoundSpecialization;
 }
 
 bool FunctionSpecializer::isCandidateFunction(Function *F) {
@@ -1065,9 +1339,11 @@ bool FunctionSpecializer::isCandidateFunction(Function *F) {
   return true;
 }
 
-Function *FunctionSpecializer::createSpecialization(Function *F,
-                                                    const SpecSig &S) {
-  Function *Clone = cloneCandidateFunction(F, Specializations.size() + 1);
+Function *
+FunctionSpecializer::createSpecialization(Function *F, const SpecSig &S,
+                                          ValueToValueMapTy &Mappings) {
+  Function *Clone =
+      cloneCandidateFunction(F, Specializations.size() + 1, Mappings);
 
   // The original function does not neccessarily have internal linkage, but the
   // clone must.
@@ -1207,8 +1483,9 @@ Constant *FunctionSpecializer::getCandidateConstant(Value *V) {
   return C;
 }
 
-void FunctionSpecializer::updateCallSites(Function *F, const Spec *Begin,
-                                          const Spec *End) {
+void FunctionSpecializer::updateCallSites(Function *F,
+                                          const SmallVector<unsigned> &Specs,
+                                          const SmallVector<Spec> &AllSpecs) {
   // Collect the call sites that need updating.
   SmallVector<CallBase *, 8> ToUpdate;
   for (User *U : F->users())
@@ -1223,7 +1500,8 @@ void FunctionSpecializer::updateCallSites(Function *F, const Spec *Begin,
 
     // Find the best matching specialisation.
const Spec *BestSpec = nullptr; - for (const Spec &S : make_range(Begin, End)) { + for (const unsigned SI : Specs) { + const auto &S = AllSpecs[SI]; if (!S.Clone || (BestSpec && S.Score <= BestSpec->Score)) continue; diff --git a/llvm/test/Transforms/FunctionSpecialization/compiler-crash-60191.ll b/llvm/test/Transforms/FunctionSpecialization/compiler-crash-60191.ll index 668929824cc6f..456480b2cc674 100644 --- a/llvm/test/Transforms/FunctionSpecialization/compiler-crash-60191.ll +++ b/llvm/test/Transforms/FunctionSpecialization/compiler-crash-60191.ll @@ -60,7 +60,7 @@ define i32 @f2(i32 %offset) { } ; Tests that `func` has been specialized and it didn't cause compiler crash. +; CHECK-DAG: func.specialized.4 +; CHECK-DAG: func.specialized.5 ; CHECK-DAG: func.specialized.1 -; CHECK-DAG: func.specialized.2 -; CHECK-DAG: func.specialized.3 diff --git a/llvm/test/Transforms/FunctionSpecialization/dead-gv-load.ll b/llvm/test/Transforms/FunctionSpecialization/dead-gv-load.ll index 134a79d349035..337780d0de2e4 100644 --- a/llvm/test/Transforms/FunctionSpecialization/dead-gv-load.ll +++ b/llvm/test/Transforms/FunctionSpecialization/dead-gv-load.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=ipsccp --funcspec-min-function-size=1 -S < %s | FileCheck %s - +; RUN: opt -passes=ipsccp --funcspec-min-function-size=1 \ +; RUN: -funcspec-min-codesize-savings=1 -S < %s | FileCheck %s @gv = internal global ptr null define i8 @caller() { diff --git a/llvm/test/Transforms/FunctionSpecialization/maxgrowth.ll b/llvm/test/Transforms/FunctionSpecialization/maxgrowth.ll index 82d1f7ae4a6e1..7dc7e8ec69f50 100644 --- a/llvm/test/Transforms/FunctionSpecialization/maxgrowth.ll +++ b/llvm/test/Transforms/FunctionSpecialization/maxgrowth.ll @@ -26,7 +26,7 @@ entry: ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[NOTSPEC0:%.*]] = call i32 @add(i32 0, i32 [[N]]) ; CHECK-NEXT: [[NOTSPEC1:%.*]] = call i32 @add(i32 1, i32 [[N]]) -; CHECK-NEXT: [[SPEC:%.*]] = call i32 @add.specialized.1(i32 1, i32 1) +; CHECK-NEXT: [[SPEC:%.*]] = call i32 @add(i32 1, i32 1) ; CHECK-NEXT: ret void ; ; @@ -36,9 +36,3 @@ entry: ; CHECK-NEXT: [[RES:%.*]] = add i32 [[X]], [[Y]] ; CHECK-NEXT: ret i32 [[RES]] ; -; -; CHECK-LABEL: define internal i32 @add.specialized.1( -; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: ret i32 poison -; diff --git a/llvm/test/Transforms/FunctionSpecialization/recursive-penalty.ll b/llvm/test/Transforms/FunctionSpecialization/recursive-penalty.ll index fc17387dec94d..ff90634ddd424 100644 --- a/llvm/test/Transforms/FunctionSpecialization/recursive-penalty.ll +++ b/llvm/test/Transforms/FunctionSpecialization/recursive-penalty.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; REQUIRES: asserts ; RUN: opt -passes="ipsccp,inline,instcombine,simplifycfg" -S \ ; RUN: -funcspec-min-function-size=23 -funcspec-max-iters=100 \ @@ -6,11 +7,40 @@ ; Make sure the number of specializations created are not ; linear to the number of iterations (funcspec-max-iters). 
-; CHECK: FnSpecialization: Created 4 specializations in module - @Global = internal constant i32 1, align 4 define internal void @recursiveFunc(ptr readonly %arg) { +; CHECK-LABEL: define internal void @recursiveFunc( +; CHECK-SAME: ptr readonly [[ARG:%.*]]) { +; CHECK-NEXT: [[TEMP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[ARG_LOAD:%.*]] = load i32, ptr [[ARG]], align 4 +; CHECK-NEXT: [[ARG_CMP:%.*]] = icmp slt i32 [[ARG_LOAD]], 10000 +; CHECK-NEXT: br i1 [[ARG_CMP]], label %[[LOOP1:.*]], label %[[RET_BLOCK:.*]] +; CHECK: [[LOOP1]]: +; CHECK-NEXT: br label %[[LOOP2:.*]] +; CHECK: [[LOOP2]]: +; CHECK-NEXT: br label %[[LOOP3:.*]] +; CHECK: [[LOOP3]]: +; CHECK-NEXT: br label %[[LOOP4:.*]] +; CHECK: [[LOOP4]]: +; CHECK-NEXT: call void @print_val(i32 [[ARG_LOAD]]) +; CHECK-NEXT: [[ARG_ADD:%.*]] = add nsw i32 [[ARG_LOAD]], 1 +; CHECK-NEXT: store i32 [[ARG_ADD]], ptr [[TEMP]], align 4 +; CHECK-NEXT: call void @recursiveFunc(ptr nonnull [[TEMP]]) +; CHECK-NEXT: [[EXIT_COND1:%.*]] = call i1 @exit_cond() +; CHECK-NEXT: br i1 [[EXIT_COND1]], label %[[LOOP4]], label %[[LOOP3_END:.*]] +; CHECK: [[LOOP3_END]]: +; CHECK-NEXT: [[EXIT_COND2:%.*]] = call i1 @exit_cond() +; CHECK-NEXT: br i1 [[EXIT_COND2]], label %[[LOOP3]], label %[[LOOP2_END:.*]] +; CHECK: [[LOOP2_END]]: +; CHECK-NEXT: [[EXIT_COND3:%.*]] = call i1 @exit_cond() +; CHECK-NEXT: br i1 [[EXIT_COND3]], label %[[LOOP2]], label %[[LOOP1_END:.*]] +; CHECK: [[LOOP1_END]]: +; CHECK-NEXT: [[EXIT_COND4:%.*]] = call i1 @exit_cond() +; CHECK-NEXT: br i1 [[EXIT_COND4]], label %[[LOOP1]], label %[[RET_BLOCK]] +; CHECK: [[RET_BLOCK]]: +; CHECK-NEXT: ret void +; %temp = alloca i32, align 4 %arg.load = load i32, ptr %arg, align 4 %arg.cmp = icmp slt i32 %arg.load, 10000 @@ -56,6 +86,10 @@ ret.block: } define i32 @main() { +; CHECK-LABEL: define i32 @main() { +; CHECK-NEXT: call void @recursiveFunc(ptr nonnull @Global) +; CHECK-NEXT: ret i32 0 +; call void @recursiveFunc(ptr @Global) ret i32 0 } diff --git a/llvm/test/Transforms/FunctionSpecialization/specialize-chain.ll b/llvm/test/Transforms/FunctionSpecialization/specialize-chain.ll new file mode 100644 index 0000000000000..e09f33e7fadaa --- /dev/null +++ b/llvm/test/Transforms/FunctionSpecialization/specialize-chain.ll @@ -0,0 +1,176 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 5 +; REQUIRES: asserts +; RUN: opt -passes=ipsccp -S -funcspec-min-function-size=1 -funcspec-min-codesize-savings=20 -debug-only=function-specialization < %s 2>&1 | FileCheck %s + +define i32 @incr(i32 %a) { + %b = add nsw i32 %a, 1 + %c = add nsw i32 %b, 1 + %d = add nsw i32 %c, 1 + %e = add nsw i32 %d, 1 + %f = add nsw i32 %e, 1 + %g = add nsw i32 %f, 1 + ret i32 %g +} + +define i32 @forward_outer(i32 %a) { +entry: + %call = call i32 @forward_inner(i32 %a) + ret i32 %call +} + +define i32 @multi_call(i32 %a) { +entry: + %call = call i32 @incr(i32 %a) + %mul = mul nsw i32 %a, 2 + %mul_call = call i32 @incr(i32 %mul) + ret i32 %call +} + +define i32 @forward_inner(i32 %a) { +entry: + %call = call i32 @incr(i32 %a) + ret i32 %call +} + +define i32 @forward_unfold(i32 %a) { +entry: + %b = mul nsw i32 %a, 10 + %call = call i32 @incr(i32 %b) + %c = mul nsw i32 %call, 20 + ret i32 %c +} + +define dso_local signext i32 @intrinsic(i64 %a) { + %local_dest = alloca [1024 x i32], align 4 + %local_src = alloca [1024 x i32], align 4 + call void @llvm.memcpy.p0.p0.i64(ptr %local_dest, ptr %local_src, i64 %a, i1 false) + ret i32 0 +} + +define i32 @main() { +entry: 
+ %add = call i32 @incr(i32 10) + %int = call i32 @intrinsic(i32 3) + %fwd_unfold = call i32 @forward_unfold(i32 3) + %fwd_inner = call i32 @forward_inner(i32 3) + %fwd_outer = call i32 @forward_outer(i32 3) + %fwd_outer1 = call i32 @forward_outer(i32 3) + %multi_call = call i32 @multi_call(i32 5) + ret i32 %multi_call +} + + + + + +; CHECK-LABEL: define range(i32 -2147483642, -2147483648) i32 @incr( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[A]], 1 +; CHECK-NEXT: [[C:%.*]] = add nsw i32 [[INC]], 1 +; CHECK-NEXT: [[D:%.*]] = add nsw i32 [[C]], 1 +; CHECK-NEXT: [[E:%.*]] = add nsw i32 [[D]], 1 +; CHECK-NEXT: [[F:%.*]] = add nsw i32 [[E]], 1 +; CHECK-NEXT: [[G:%.*]] = add nsw i32 [[F]], 1 +; CHECK-NEXT: ret i32 [[G]] +; +; +; CHECK-LABEL: define range(i32 -2147483642, -2147483648) i32 @forward_outer( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = call i32 @forward_inner(i32 [[A]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +; +; CHECK-LABEL: define range(i32 -2147483642, -2147483648) i32 @multi_call( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = call i32 @incr(i32 [[A]]) +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[A]], 2 +; CHECK-NEXT: [[MUL_CALL:%.*]] = call i32 @incr(i32 [[MUL]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +; +; CHECK-LABEL: define range(i32 -2147483642, -2147483648) i32 @forward_inner( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = call i32 @incr(i32 [[A]]) +; CHECK-NEXT: ret i32 [[CALL]] +; +; +; CHECK-LABEL: define i32 @forward_unfold( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[B:%.*]] = mul nsw i32 [[A]], 10 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @incr(i32 [[B]]) +; CHECK-NEXT: [[C:%.*]] = mul nsw i32 [[CALL]], 20 +; CHECK-NEXT: ret i32 [[C]] +; +; +; CHECK-LABEL: define dso_local signext i32 @intrinsic( +; CHECK-SAME: i64 [[A:%.*]]) { +; CHECK-NEXT: [[LOCAL_DEST:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: [[LOCAL_SRC:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[LOCAL_DEST]], ptr [[LOCAL_SRC]], i64 [[A]], i1 false) +; CHECK-NEXT: ret i32 0 +; +; +; CHECK-LABEL: define i32 @main() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ADD:%.*]] = call i32 @incr.specialized.1(i32 10) +; CHECK-NEXT: [[INT:%.*]] = call i32 @intrinsic(i32 3) +; CHECK-NEXT: [[FWD_UNFOLD:%.*]] = call i32 @forward_unfold.specialized.2(i32 3) +; CHECK-NEXT: [[FWD_INNER:%.*]] = call i32 @forward_inner.specialized.4(i32 3) +; CHECK-NEXT: [[FWD_OUTER:%.*]] = call i32 @forward_outer.specialized.6(i32 3) +; CHECK-NEXT: [[FWD_OUTER1:%.*]] = call i32 @forward_outer.specialized.6(i32 3) +; CHECK-NEXT: [[MULTI_CALL:%.*]] = call i32 @multi_call.specialized.7(i32 5) +; CHECK-NEXT: ret i32 11 +; +; +; CHECK-LABEL: define internal i32 @incr.specialized.1( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: ret i32 poison +; +; +; CHECK-LABEL: define internal i32 @forward_unfold.specialized.2( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = call i32 @incr.specialized.3(i32 30) +; CHECK-NEXT: ret i32 poison +; +; +; CHECK-LABEL: define internal i32 @incr.specialized.3( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: ret i32 poison +; +; +; CHECK-LABEL: define internal i32 @forward_inner.specialized.4( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = call i32 @incr.specialized.5(i32 3) +; CHECK-NEXT: ret i32 
poison +; +; +; CHECK-LABEL: define internal i32 @incr.specialized.5( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: ret i32 poison +; +; +; CHECK-LABEL: define internal i32 @forward_outer.specialized.6( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = call i32 @forward_inner.specialized.4(i32 3) +; CHECK-NEXT: ret i32 poison +; +; +; CHECK-LABEL: define internal i32 @multi_call.specialized.7( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = call i32 @incr.specialized.8(i32 5) +; CHECK-NEXT: [[MUL_CALL:%.*]] = call i32 @incr.specialized.1(i32 10) +; CHECK-NEXT: ret i32 poison +; +; +; CHECK-LABEL: define internal i32 @incr.specialized.8( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: ret i32 poison +; diff --git a/llvm/test/Transforms/FunctionSpecialization/track-ptr-return.ll b/llvm/test/Transforms/FunctionSpecialization/track-ptr-return.ll index f4ba0e72a1b43..ef40bf12ae59d 100644 --- a/llvm/test/Transforms/FunctionSpecialization/track-ptr-return.ll +++ b/llvm/test/Transforms/FunctionSpecialization/track-ptr-return.ll @@ -48,9 +48,8 @@ entry: ; CHECK-NEXT: [[OP1:%.*]] = call ptr @select_op.specialized.1(ptr @global_true) ; CHECK-NEXT: [[OP2:%.*]] = call ptr @select_op.specialized.2(ptr @global_false) ; CHECK-NEXT: [[C1:%.*]] = call i64 @compute.specialized.3(ptr @plus) -; CHECK-NEXT: [[C2:%.*]] = call i64 @compute.specialized.4(ptr @minus) -; CHECK-NEXT: [[ADD:%.*]] = add i64 [[C1]], [[C2]] -; CHECK-NEXT: ret i64 [[ADD]] +; CHECK-NEXT: [[C2:%.*]] = call i64 @compute.specialized.5(ptr @minus) +; CHECK-NEXT: ret i64 2 ; ; ; CHECK-LABEL: define ptr @select_op( @@ -87,15 +86,27 @@ entry: ; CHECK-LABEL: define internal i64 @compute.specialized.3( ; CHECK-SAME: ptr [[OP:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[RES:%.*]] = call i64 @plus(i64 1) -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-NEXT: [[RES:%.*]] = call i64 @plus.specialized.4(i64 1) +; CHECK-NEXT: ret i64 poison ; ; -; CHECK-LABEL: define internal i64 @compute.specialized.4( +; CHECK-LABEL: define internal i64 @plus.specialized.4( +; CHECK-SAME: i64 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret i64 poison +; +; +; CHECK-LABEL: define internal i64 @compute.specialized.5( ; CHECK-SAME: ptr [[OP:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[RES:%.*]] = call i64 @minus(i64 1) -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-NEXT: [[RES:%.*]] = call i64 @minus.specialized.6(i64 1) +; CHECK-NEXT: ret i64 poison +; +; +; CHECK-LABEL: define internal i64 @minus.specialized.6( +; CHECK-SAME: i64 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret i64 poison ; ; ; NOLIT-LABEL: define i64 @main() { diff --git a/llvm/test/Transforms/FunctionSpecialization/track-return.ll b/llvm/test/Transforms/FunctionSpecialization/track-return.ll index aaff6c138bbaa..d03d7c872ed79 100644 --- a/llvm/test/Transforms/FunctionSpecialization/track-return.ll +++ b/llvm/test/Transforms/FunctionSpecialization/track-return.ll @@ -4,7 +4,7 @@ define i64 @main() { ; CHECK: define i64 @main ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C1:%.*]] = call i64 @foo.specialized.1(i1 true, i64 3, i64 1) -; CHECK-NEXT: [[C2:%.*]] = call i64 @foo.specialized.2(i1 false, i64 4, i64 -1) +; CHECK-NEXT: [[C2:%.*]] = call i64 @foo.specialized.3(i1 false, i64 4, i64 -1) ; CHECK-NEXT: ret i64 8 ; entry: @@ -15,27 +15,6 @@ entry: } define internal i64 @foo(i1 %flag, i64 %m, i64 %n) { -; -; CHECK: define internal i64 @foo.specialized.1 -; CHECK-NEXT: entry: -; CHECK-NEXT: br label 
%plus -; CHECK: plus: -; CHECK-NEXT: [[N0:%.*]] = call i64 @binop.specialized.4(i64 3, i64 1) -; CHECK-NEXT: [[RES0:%.*]] = call i64 @bar.specialized.6(i64 4) -; CHECK-NEXT: br label %merge -; CHECK: merge: -; CHECK-NEXT: ret i64 poison -; -; CHECK: define internal i64 @foo.specialized.2 -; CHECK-NEXT: entry: -; CHECK-NEXT: br label %minus -; CHECK: minus: -; CHECK-NEXT: [[N1:%.*]] = call i64 @binop.specialized.3(i64 4, i64 -1) -; CHECK-NEXT: [[RES1:%.*]] = call i64 @bar.specialized.5(i64 3) -; CHECK-NEXT: br label %merge -; CHECK: merge: -; CHECK-NEXT: ret i64 poison -; entry: br i1 %flag, label %plus, label %minus @@ -55,21 +34,61 @@ merge: } define internal i64 @binop(i64 %x, i64 %y) { +entry: + %z = add i64 %x, %y + ret i64 %z +} + +define internal i64 @bar(i64 %n) { +entry: + %cmp = icmp sgt i64 %n, 3 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %res0 = sdiv i64 %n, 2 + br label %if.end + +if.else: + %res1 = mul i64 %n, 2 + br label %if.end + +if.end: + %res = phi i64 [ %res0, %if.then ], [ %res1, %if.else] + ret i64 %res +} + +; +; CHECK: define internal i64 @foo.specialized.1 +; CHECK-NEXT: entry: +; CHECK-NEXT: br label %plus +; CHECK: plus: +; CHECK-NEXT: [[N0:%.*]] = call i64 @binop.specialized.2(i64 3, i64 1) +; CHECK-NEXT: [[RES0:%.*]] = call i64 @bar.specialized.6(i64 4) +; CHECK-NEXT: br label %merge +; CHECK: merge: +; CHECK-NEXT: ret i64 poison +; ; -; CHECK: define internal i64 @binop.specialized.3 +; CHECK: define internal i64 @binop.specialized.2 ; CHECK-NEXT: entry: ; CHECK-NEXT: ret i64 poison ; +; +; CHECK: define internal i64 @foo.specialized.3 +; CHECK-NEXT: entry: +; CHECK-NEXT: br label %minus +; CHECK: minus: +; CHECK-NEXT: [[N1:%.*]] = call i64 @binop.specialized.4(i64 4, i64 -1) +; CHECK-NEXT: [[RES1:%.*]] = call i64 @bar.specialized.5(i64 3) +; CHECK-NEXT: br label %merge +; CHECK: merge: +; CHECK-NEXT: ret i64 poison +; +; ; CHECK: define internal i64 @binop.specialized.4 ; CHECK-NEXT: entry: ; CHECK-NEXT: ret i64 poison ; -entry: - %z = add i64 %x, %y - ret i64 %z -} - -define internal i64 @bar(i64 %n) { ; ; CHECK: define internal i64 @bar.specialized.5 ; CHECK-NEXT: entry: @@ -87,20 +106,3 @@ define internal i64 @bar(i64 %n) { ; CHECK: if.end: ; CHECK-NEXT: ret i64 poison ; -entry: - %cmp = icmp sgt i64 %n, 3 - br i1 %cmp, label %if.then, label %if.else - -if.then: - %res0 = sdiv i64 %n, 2 - br label %if.end - -if.else: - %res1 = mul i64 %n, 2 - br label %if.end - -if.end: - %res = phi i64 [ %res0, %if.then ], [ %res1, %if.else] - ret i64 %res -} -
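
A minimal sketch of the kind of chain this patch targets (hypothetical module, not part of the patch; whether both levels are actually cloned depends on the -funcspec-* thresholds in effect, as exercised by specialize-chain.ll above):

define internal i32 @leaf(i32 %n) {
  %a = add nsw i32 %n, 1
  %b = add nsw i32 %a, 1
  ret i32 %b
}

define internal i32 @wrapper(i32 %n) {
  ; %n is forwarded unchanged, so a constant seen at @wrapper's call site is
  ; recorded in CallUserT and chains into a candidate specialization of @leaf.
  %r = call i32 @leaf(i32 %n)
  ret i32 %r
}

define i32 @main() {
  %r = call i32 @wrapper(i32 7)
  ret i32 %r
}

Previously @leaf saw no constant argument in the round that specialized @wrapper; with the chain walk, @leaf(i32 7) becomes a sub-specialization (SubSpecs) of the @wrapper specialization, and the whole chain is accepted or discarded together by the cumulative profitability check.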