[lld-macho] Implement ICF

ICF = Identical C(ode|OMDAT) Folding This is the LLD ELF/COFF algorithm, adapted for MachO. So far, only `-icf all` is supported. In order to support `-icf safe`, we will need to port address-significance tables (`.addrsig` directives) to MachO, which will come in later diffs. `check-{llvm,clang,lld}` have 0 regressions for `lld -icf all` vs. baseline ld64. We only run ICF on `__TEXT,__text` for reasons explained in the block comment in `ConcatOutputSection.cpp`. Here is the perf impact for linking `chromium_framekwork` on a Mac Pro (16-core Xeon W) for the non-ICF case vs. pre-ICF: ``` N Min Max Median Avg Stddev x 20 4.27 4.44 4.34 4.349 0.043029977 + 20 4.37 4.46 4.405 4.4115 0.025188761 Difference at 95.0% confidence 0.0625 +/- 0.0225658 1.43711% +/- 0.518873% (Student's t, pooled s = 0.0352566) ``` Reviewed By: #lld-macho, int3 Differential Revision: https://reviews.llvm.org/D103292
llvm · Jun 17, 2021 · f27e454 · f27e454
1 parent 734d688
commit f27e454
Show file tree

Hide file tree

Showing 20 changed files with 849 additions and 18 deletions.
diff --git a/lld/MachO/CMakeLists.txt b/lld/MachO/CMakeLists.txt
@@ -15,6 +15,7 @@ add_lld_library(lldMachO2
   DriverUtils.cpp
   Dwarf.cpp
   ExportTrie.cpp
+  ICF.cpp
   InputFiles.cpp
   InputSection.cpp
   LTO.cpp

diff --git a/lld/MachO/ConcatOutputSection.cpp b/lld/MachO/ConcatOutputSection.cpp
@@ -17,8 +17,7 @@
 #include "lld/Common/Memory.h"
 #include "llvm/BinaryFormat/MachO.h"
 #include "llvm/Support/ScopedPrinter.h"
-
-#include <algorithm>
+#include "llvm/Support/TimeProfiler.h"
 
 using namespace llvm;
 using namespace llvm::MachO;
@@ -357,3 +356,12 @@ void ConcatOutputSection::mergeFlags(InputSection *input) {
   flags |= input->flags;
   flags &= pureMask;
 }
+
+void ConcatOutputSection::eraseOmittedInputSections() {
+  // Remove the duplicates from inputs
+  inputs.erase(std::remove_if(inputs.begin(), inputs.end(),
+                              [](const ConcatInputSection *isec) -> bool {
+                                return isec->shouldOmitFromOutput();
+                              }),
+               inputs.end());
+}
diff --git a/lld/MachO/ConcatOutputSection.h b/lld/MachO/ConcatOutputSection.h
@@ -40,6 +40,7 @@ class ConcatOutputSection final : public OutputSection {
   void finalize() override;
   bool needsThunks() const;
   uint64_t estimateStubsInRangeVA(size_t callIdx) const;
+  void eraseOmittedInputSections();
 
   void writeTo(uint8_t *buf) const override;
 

diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h
@@ -57,6 +57,13 @@ enum class UndefinedSymbolTreatment {
   dynamic_lookup,
 };
 
+enum class ICFLevel {
+  unknown,
+  none,
+  safe,
+  all,
+};
+
 struct SectionAlign {
   llvm::StringRef segName;
   llvm::StringRef sectName;
@@ -126,6 +133,7 @@ struct Configuration {
   NamespaceKind namespaceKind = NamespaceKind::twolevel;
   UndefinedSymbolTreatment undefinedSymbolTreatment =
       UndefinedSymbolTreatment::error;
+  ICFLevel icfLevel = ICFLevel::none;
   llvm::MachO::HeaderFileType outputType;
   std::vector<llvm::StringRef> systemLibraryRoots;
   std::vector<llvm::StringRef> librarySearchPaths;

diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
@@ -698,6 +698,29 @@ getUndefinedSymbolTreatment(const ArgList &args) {
   return treatment;
 }
 
+static ICFLevel getICFLevel(const ArgList &args) {
+  bool noDeduplicate = args.hasArg(OPT_no_deduplicate);
+  StringRef icfLevelStr = args.getLastArgValue(OPT_icf);
+  auto icfLevel = StringSwitch<ICFLevel>(icfLevelStr)
+                      .Cases("none", "", ICFLevel::none)
+                      .Case("safe", ICFLevel::safe)
+                      .Case("all", ICFLevel::all)
+                      .Default(ICFLevel::unknown);
+  if (icfLevel == ICFLevel::unknown) {
+    warn(Twine("unknown -icf OPTION `") + icfLevelStr +
+         "', defaulting to `none'");
+    icfLevel = ICFLevel::none;
+  } else if (icfLevel != ICFLevel::none && noDeduplicate) {
+    warn(Twine("`-icf " + icfLevelStr +
+               "' conflicts with -no_deduplicate, setting to `none'"));
+    icfLevel = ICFLevel::none;
+  } else if (icfLevel == ICFLevel::safe) {
+    warn(Twine("`-icf safe' is not yet implemented, reverting to `none'"));
+    icfLevel = ICFLevel::none;
+  }
+  return icfLevel;
+}
+
 static void warnIfDeprecatedOption(const Option &opt) {
   if (!opt.getGroup().isValid())
     return;
@@ -1096,6 +1119,8 @@ bool macho::link(ArrayRef<const char *> argsArr, bool canExitEarly,
 
   config->undefinedSymbolTreatment = getUndefinedSymbolTreatment(args);
 
+  config->icfLevel = getICFLevel(args);
+
   if (config->outputType == MH_EXECUTE)
     config->entry = symtab->addUndefined(args.getLastArgValue(OPT_e, "_main"),
                                          /*file=*/nullptr,

diff --git a/lld/MachO/ICF.cpp b/lld/MachO/ICF.cpp
@@ -0,0 +1,257 @@
+//===- ICF.cpp ------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ICF.h"
+#include "ConcatOutputSection.h"
+#include "InputSection.h"
+#include "Symbols.h"
+#include "llvm/Support/Parallel.h"
+
+#include <atomic>
+
+using namespace llvm;
+using namespace lld;
+using namespace lld::macho;
+
+ICF::ICF(std::vector<ConcatInputSection *> &inputs) {
+  icfInputs.assign(inputs.begin(), inputs.end());
+}
+
+// ICF = Identical Code Folding
+//
+// We only fold __TEXT,__text, so this is really "code" folding, and not
+// "COMDAT" folding. String and scalar constant literals are deduplicated
+// elsewhere.
+//
+// Summary of segments & sections:
+//
+// Since folding never occurs across output-section boundaries,
+// ConcatOutputSection is the natural input for ICF.
+//
+// The __TEXT segment is readonly at the MMU. Some sections are already
+// deduplicated elsewhere (__TEXT,__cstring & __TEXT,__literal*) and some are
+// synthetic and inherently free of duplicates (__TEXT,__stubs &
+// __TEXT,__unwind_info). We only run ICF on __TEXT,__text. One might hope ICF
+// could work on __TEXT,__concat, but doing so induces many test failures.
+//
+// The __LINKEDIT segment is readonly at the MMU, yet entirely synthetic, and
+// thus ineligible for ICF.
+//
+// The __DATA_CONST segment is read/write at the MMU, but is logically const to
+// the application after dyld applies fixups to pointer data. Some sections are
+// deduplicated elsewhere (__DATA_CONST,__cfstring), and some are synthetic
+// (__DATA_CONST,__got). There are no ICF opportunities here.
+//
+// The __DATA segment is read/write at the MMU, and as application-writeable
+// data, none of its sections are eligible for ICF.
+//
+// Please see the large block comment in lld/ELF/ICF.cpp for an explanation
+// of the segregation algorithm.
+//
+// FIXME(gkm): implement keep-unique attributes
+// FIXME(gkm): implement address-significance tables for MachO object files
+
+static unsigned icfPass = 0;
+static std::atomic<bool> icfRepeat{false};
+
+// Compare everything except the relocation referents
+static bool equalsConstant(const ConcatInputSection *ia,
+                           const ConcatInputSection *ib) {
+  if (ia->data.size() != ib->data.size())
+    return false;
+  if (ia->data != ib->data)
+    return false;
+  if (ia->flags != ib->flags)
+    return false;
+  if (ia->relocs.size() != ib->relocs.size())
+    return false;
+  auto f = [&](const Reloc &ra, const Reloc &rb) {
+    if (ra.type != rb.type)
+      return false;
+    if (ra.pcrel != rb.pcrel)
+      return false;
+    if (ra.length != rb.length)
+      return false;
+    if (ra.offset != rb.offset)
+      return false;
+    if (ra.addend != rb.addend)
+      return false;
+    if (ra.referent.is<Symbol *>() != rb.referent.is<Symbol *>())
+      return false; // a nice place to breakpoint
+    return true;
+  };
+  return std::equal(ia->relocs.begin(), ia->relocs.end(), ib->relocs.begin(),
+                    f);
+}
+
+// Compare only the relocation referents
+static bool equalsVariable(const ConcatInputSection *ia,
+                           const ConcatInputSection *ib) {
+  assert(ia->relocs.size() == ib->relocs.size());
+  auto f = [&](const Reloc &ra, const Reloc &rb) {
+    if (ra.referent == rb.referent)
+      return true;
+    if (ra.referent.is<Symbol *>()) {
+      const auto *sa = ra.referent.get<Symbol *>();
+      const auto *sb = rb.referent.get<Symbol *>();
+      if (sa->kind() != sb->kind())
+        return false;
+      if (isa<Defined>(sa)) {
+        const auto *da = dyn_cast<Defined>(sa);
+        const auto *db = dyn_cast<Defined>(sb);
+        if (da->value != db->value)
+          return false;
+        if (da->isAbsolute() != da->isAbsolute())
+          return false;
+        if (da->isec)
+          if (da->isec->icfEqClass[icfPass % 2] !=
+              db->isec->icfEqClass[icfPass % 2])
+            return false;
+      } else if (isa<DylibSymbol>(sa)) {
+        // There is one DylibSymbol per gotIndex and we already checked for
+        // symbol equality, thus we know that these must be different.
+        return false;
+      } else {
+        llvm_unreachable("equalsVariable symbol kind");
+      }
+    } else {
+      const auto *sa = ra.referent.get<InputSection *>();
+      const auto *sb = rb.referent.get<InputSection *>();
+      if (sa->icfEqClass[icfPass % 2] != sb->icfEqClass[icfPass % 2])
+        return false;
+    }
+    return true;
+  };
+  return std::equal(ia->relocs.begin(), ia->relocs.end(), ib->relocs.begin(),
+                    f);
+}
+
+// Find the first InputSection after BEGIN whose equivalence class differs
+size_t ICF::findBoundary(size_t begin, size_t end) {
+  uint64_t beginHash = icfInputs[begin]->icfEqClass[icfPass % 2];
+  for (size_t i = begin + 1; i < end; ++i)
+    if (beginHash != icfInputs[i]->icfEqClass[icfPass % 2])
+      return i;
+  return end;
+}
+
+// Invoke FUNC on subranges with matching equivalence class
+void ICF::forEachClassRange(size_t begin, size_t end,
+                            std::function<void(size_t, size_t)> func) {
+  while (begin < end) {
+    size_t mid = findBoundary(begin, end);
+    func(begin, mid);
+    begin = mid;
+  }
+}
+
+// Split icfInputs into shards, then parallelize invocation of FUNC on subranges
+// with matching equivalence class
+void ICF::forEachClass(std::function<void(size_t, size_t)> func) {
+  // Only use threads when the benefits outweigh the overhead.
+  const size_t threadingThreshold = 1024;
+  if (icfInputs.size() < threadingThreshold) {
+    forEachClassRange(0, icfInputs.size(), func);
+    ++icfPass;
+    return;
+  }
+
+  // Shard into non-overlapping intervals, and call FUNC in parallel.  The
+  // sharding must be completed before any calls to FUNC are made so that FUNC
+  // can modify the InputSection in its shard without causing data races.
+  const size_t shards = 256;
+  size_t step = icfInputs.size() / shards;
+  size_t boundaries[shards + 1];
+  boundaries[0] = 0;
+  boundaries[shards] = icfInputs.size();
+  parallelForEachN(1, shards, [&](size_t i) {
+    boundaries[i] = findBoundary((i - 1) * step, icfInputs.size());
+  });
+  parallelForEachN(1, shards + 1, [&](size_t i) {
+    if (boundaries[i - 1] < boundaries[i]) {
+      forEachClassRange(boundaries[i - 1], boundaries[i], func);
+    }
+  });
+  ++icfPass;
+}
+
+void ICF::run() {
+  // Into each origin-section hash, combine all reloc referent section hashes.
+  for (icfPass = 0; icfPass < 2; ++icfPass) {
+    parallelForEach(icfInputs, [&](InputSection *isec) {
+      uint64_t hash = isec->icfEqClass[icfPass % 2];
+      for (const Reloc &r : isec->relocs) {
+        if (auto *sym = r.referent.dyn_cast<Symbol *>()) {
+          if (auto *dylibSym = dyn_cast<DylibSymbol>(sym))
+            hash += dylibSym->stubsHelperIndex;
+          else if (auto *defined = dyn_cast<Defined>(sym))
+            hash +=
+                defined->value +
+                (defined->isec ? defined->isec->icfEqClass[icfPass % 2] : 0);
+          else
+            llvm_unreachable("foldIdenticalSections symbol kind");
+        }
+      }
+      // Set MSB to 1 to avoid collisions with non-hashed classes.
+      isec->icfEqClass[(icfPass + 1) % 2] = hash | (1ull << 63);
+    });
+  }
+
+  llvm::stable_sort(icfInputs,
+                    [](const InputSection *a, const InputSection *b) {
+                      return a->icfEqClass[0] < b->icfEqClass[0];
+                    });
+  forEachClass(
+      [&](size_t begin, size_t end) { segregate(begin, end, equalsConstant); });
+
+  // Split equivalence groups by comparing relocations until convergence
+  do {
+    icfRepeat = false;
+    forEachClass([&](size_t begin, size_t end) {
+      segregate(begin, end, equalsVariable);
+    });
+  } while (icfRepeat);
+  log("ICF needed " + Twine(icfPass) + " iterations");
+
+  // Fold sections within equivalence classes
+  forEachClass([&](size_t begin, size_t end) {
+    if (end - begin < 2)
+      return;
+    ConcatInputSection *beginIsec = icfInputs[begin];
+    for (size_t i = begin + 1; i < end; ++i)
+      beginIsec->foldIdentical(icfInputs[i]);
+  });
+}
+
+// Split an equivalence class into smaller classes.
+void ICF::segregate(
+    size_t begin, size_t end,
+    std::function<bool(const ConcatInputSection *, const ConcatInputSection *)>
+        equals) {
+  while (begin < end) {
+    // Divide [begin, end) into two. Let mid be the start index of the
+    // second group.
+    auto bound = std::stable_partition(icfInputs.begin() + begin + 1,
+                                       icfInputs.begin() + end,
+                                       [&](ConcatInputSection *isec) {
+                                         return equals(icfInputs[begin], isec);
+                                       });
+    size_t mid = bound - icfInputs.begin();
+
+    // Split [begin, end) into [begin, mid) and [mid, end). We use mid as an
+    // equivalence class ID because every group ends with a unique index.
+    for (size_t i = begin; i < mid; ++i)
+      icfInputs[i]->icfEqClass[(icfPass + 1) % 2] = mid;
+
+    // If we created a group, we need to iterate the main loop again.
+    if (mid != end)
+      icfRepeat = true;
+
+    begin = mid;
+  }
+}