Parallelize string merging.

String merging is one of the most time-consuming functions in lld. This patch parallelizes it to speed it up. On my 2-socket 20-core 40-thread Xeon E5-2680 @ 2.8 GHz machine, this patch shortens the clang debug build link time from 7.11s to 5.16s. It's a 27% improvement and actually pretty noticeable. In this test condition, lld is now 4x faster than gold. Differential Revision: https://reviews.llvm.org/D38266 llvm-svn: 314588
llvm · Sep 30, 2017 · c97a70c · c97a70c
1 parent 4db732a
commit c97a70c
Show file tree

Hide file tree

Showing 6 changed files with 100 additions and 30 deletions.
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
@@ -37,6 +37,7 @@
 #include "llvm/Support/SHA1.h"
 #include "llvm/Support/xxhash.h"
 #include <cstdlib>
+#include <thread>
 
 using namespace llvm;
 using namespace llvm::dwarf;
@@ -48,6 +49,8 @@ using namespace llvm::support::endian;
 using namespace lld;
 using namespace lld::elf;
 
+const size_t MergeNoTailSection::NumShards;
+
 uint64_t SyntheticSection::getVA() const {
   if (OutputSection *Sec = getParent())
     return Sec->Addr + OutSecOff;
@@ -2181,19 +2184,19 @@ template <class ELFT> bool VersionNeedSection<ELFT>::empty() const {
   return getNeedNum() == 0;
 }
 
-MergeSyntheticSection::MergeSyntheticSection(StringRef Name, uint32_t Type,
-                                             uint64_t Flags, uint32_t Alignment)
-    : SyntheticSection(Flags, Type, Alignment, Name),
-      Builder(StringTableBuilder::RAW, Alignment) {}
-
 void MergeSyntheticSection::addSection(MergeInputSection *MS) {
   MS->Parent = this;
   Sections.push_back(MS);
 }
 
-size_t MergeSyntheticSection::getSize() const { return Builder.getSize(); }
+MergeTailSection::MergeTailSection(StringRef Name, uint32_t Type,
+                                   uint64_t Flags, uint32_t Alignment)
+    : MergeSyntheticSection(Name, Type, Flags, Alignment),
+      Builder(StringTableBuilder::RAW, Alignment) {}
+
+size_t MergeTailSection::getSize() const { return Builder.getSize(); }
 
-void MergeSyntheticSection::writeTo(uint8_t *Buf) { Builder.write(Buf); }
+void MergeTailSection::writeTo(uint8_t *Buf) { Builder.write(Buf); }
 
 void MergeTailSection::finalizeContents() {
   // Add all string pieces to the string table builder to create section
@@ -2215,17 +2218,63 @@ void MergeTailSection::finalizeContents() {
         Sec->Pieces[I].OutputOff = Builder.getOffset(Sec->getData(I));
 }
 
+void MergeNoTailSection::writeTo(uint8_t *Buf) {
+  for (size_t I = 0; I < NumShards; ++I)
+    Shards[I].write(Buf + ShardOffsets[I]);
+}
+
+// This function is very hot (i.e. it can take several seconds to finish)
+// because the number of input pieces is sometimes on the order of
+// millions. So, we use multi-threading.
+//
+// For any strings S and T, we know S is not mergeable with T if S's hash
+// value is different from T's. If that's the case, we can safely put S and
+// T into different string builders without worrying about merge misses.
+// We do it in parallel.
 void MergeNoTailSection::finalizeContents() {
-  // Add all string pieces to the string table builder to create section
-  // contents. Because we are not tail-optimizing, offsets of strings are
-  // fixed when they are added to the builder (string table builder contains
-  // a hash table from strings to offsets).
-  for (MergeInputSection *Sec : Sections)
+  // Initializes string table builders.
+  for (size_t I = 0; I < NumShards; ++I)
+    Shards.emplace_back(StringTableBuilder::RAW, Alignment);
+
+  // Concurrency level. Must be a power of 2.
+  size_t Concurrency = 1;
+  if (Config->Threads)
+    if (int N = std::thread::hardware_concurrency())
+      Concurrency = std::min(PowerOf2Floor(N), NumShards);
+
+  // Add section pieces to the builders.
+  parallelForEachN(0, Concurrency, [&](size_t ThreadId) {
+    for (MergeInputSection *Sec : Sections) {
+      for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) {
+        if (!Sec->Pieces[I].Live)
+          continue;
+        CachedHashStringRef Str = Sec->getData(I);
+        size_t ShardId = getShardId(Str.hash());
+        if ((ShardId & (Concurrency - 1)) == ThreadId)
+          Sec->Pieces[I].OutputOff = Shards[ShardId].add(Str);
+      }
+    }
+  });
+
+  // Compute an in-section offset for each shard.
+  size_t Off = 0;
+  for (size_t I = 0; I < NumShards; ++I) {
+    Shards[I].finalizeInOrder();
+    if (Shards[I].getSize() > 0)
+      Off = alignTo(Off, Alignment);
+    ShardOffsets[I] = Off;
+    Off += Shards[I].getSize();
+  }
+  Size = Off;
+
+  // So far, section pieces have offsets from beginning of shards, but
+  // we want offsets from beginning of the whole section. Fix them.
+  parallelForEach(Sections, [&](MergeInputSection *Sec) {
     for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I)
       if (Sec->Pieces[I].Live)
-        Sec->Pieces[I].OutputOff = Builder.add(Sec->getData(I));
-
-  Builder.finalizeInOrder();
+        Sec->Pieces[I].OutputOff +=
+            ShardOffsets[getShardId(Sec->getData(I).hash())];
+  });
 }
 
 static MergeSyntheticSection *createMergeSynthetic(StringRef Name,

diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h
@@ -668,24 +668,26 @@ template <class ELFT> class VersionNeedSection final : public SyntheticSection {
 class MergeSyntheticSection : public SyntheticSection {
 public:
   void addSection(MergeInputSection *MS);
-  size_t getSize() const override;
-  void writeTo(uint8_t *Buf) override;
 
 protected:
   MergeSyntheticSection(StringRef Name, uint32_t Type, uint64_t Flags,
-                        uint32_t Alignment);
+                        uint32_t Alignment)
+      : SyntheticSection(Flags, Type, Alignment, Name) {}
 
   std::vector<MergeInputSection *> Sections;
-  llvm::StringTableBuilder Builder;
 };
 
 class MergeTailSection final : public MergeSyntheticSection {
 public:
   MergeTailSection(StringRef Name, uint32_t Type, uint64_t Flags,
-                   uint32_t Alignment)
-      : MergeSyntheticSection(Name, Type, Flags, Alignment) {}
+                   uint32_t Alignment);
 
+  size_t getSize() const override;
+  void writeTo(uint8_t *Buf) override;
   void finalizeContents() override;
+
+private:
+  llvm::StringTableBuilder Builder;
 };
 
 class MergeNoTailSection final : public MergeSyntheticSection {
@@ -694,7 +696,27 @@ class MergeNoTailSection final : public MergeSyntheticSection {
                      uint32_t Alignment)
       : MergeSyntheticSection(Name, Type, Flags, Alignment) {}
 
+  size_t getSize() const override { return Size; }
+  void writeTo(uint8_t *Buf) override;
   void finalizeContents() override;
+
+private:
+  // We use the most significant bits of a hash as a shard ID.
+  // The reason why we don't want to use the least significant bits is
+  // because DenseMap also uses lower bits to determine a bucket ID.
+  // If we use lower bits, it significantly increases the probability of
+  // hash collisions.
+  size_t getShardId(uint32_t Hash) {
+    return Hash >> (32 - llvm::countTrailingZeros(NumShards));
+  }
+
+  // Section size
+  size_t Size;
+
+  // String table contents
+  constexpr static size_t NumShards = 32;
+  std::vector<llvm::StringTableBuilder> Shards;
+  size_t ShardOffsets[NumShards];
 };
 
 // .MIPS.abiflags section.

diff --git a/lld/test/ELF/comment-gc.s b/lld/test/ELF/comment-gc.s
@@ -5,8 +5,7 @@
 # RUN: llvm-objdump -s %t1 | FileCheck %s
 
 # CHECK:      Contents of section .comment:
-# CHECK-NEXT:  0000 00666f6f 00626172 004c4c44 20312e30 .foo.bar.LLD 1.0
-# CHECK-NEXT:  0010 00 .
+# CHECK-NEXT: foo.LLD 1.0..bar
 
 .ident "foo"
 

diff --git a/lld/test/ELF/compressed-debug-input.s b/lld/test/ELF/compressed-debug-input.s
@@ -61,11 +61,11 @@
 # DATA-NEXT:   AddressAlignment: 1
 # DATA-NEXT:   EntrySize: 0
 # DATA-NEXT:   SectionData (
-# DATA-NEXT:     0000: 73686F72 7420756E 7369676E 65642069  |short unsigned i|
-# DATA-NEXT:     0010: 6E740075 6E736967 6E656420 696E7400  |nt.unsigned int.|
+# DATA-NEXT:     0000: 756E7369 676E6564 20696E74 00636861  |unsigned int.cha|
+# DATA-NEXT:     0010: 7200756E 7369676E 65642063 68617200  |r.unsigned char.|
 # DATA-NEXT:     0020: 6C6F6E67 20756E73 69676E65 6420696E  |long unsigned in|
-# DATA-NEXT:     0030: 74006368 61720075 6E736967 6E656420  |t.char.unsigned |
-# DATA-NEXT:     0040: 63686172 00                          |char.|
+# DATA-NEXT:     0030: 74007368 6F727420 756E7369 676E6564  |t.short unsigned|
+# DATA-NEXT:     0040: 20696E74 00                          | int.|
 # DATA-NEXT:   )
 # DATA-NEXT: }
 

diff --git a/lld/test/ELF/merge-string.s b/lld/test/ELF/merge-string.s
@@ -54,7 +54,7 @@ zed:
 // NOTAIL-NEXT: AddressAlignment: 1
 // NOTAIL-NEXT: EntrySize: 0
 // NOTAIL-NEXT: SectionData (
-// NOTAIL-NEXT:   0000: 61626300 626300                     |abc.bc.|
+// NOTAIL-NEXT:   0000: 62630061 626300                     |bc.abc.|
 // NOTAIL-NEXT: )
 
 // NOMERGE:      Name:    .rodata1

diff --git a/lld/test/ELF/string-gc.s b/lld/test/ELF/string-gc.s
@@ -14,7 +14,7 @@
 // CHECK-NEXT:   }
 // CHECK-NEXT:   Symbol {
 // CHECK-NEXT:     Name: s3
-// CHECK-NEXT:     Value: 0x200125
+// CHECK-NEXT:     Value: 0x200120
 // CHECK-NEXT:     Size: 0
 // CHECK-NEXT:     Binding: Local (0x0)
 // CHECK-NEXT:     Type: Object (0x1)
@@ -23,7 +23,7 @@
 // CHECK-NEXT:   }
 // CHECK-NEXT:   Symbol {
 // CHECK-NEXT:     Name: s1
-// CHECK-NEXT:     Value: 0x200120
+// CHECK-NEXT:     Value: 0x200125
 // CHECK-NEXT:     Size: 0
 // CHECK-NEXT:     Binding: Local (0x0)
 // CHECK-NEXT:     Type: Object (0x1)