diff --git a/loadgen/CMakeLists.txt b/loadgen/CMakeLists.txt
index 624dd8b19c..a48841b1f8 100644
--- a/loadgen/CMakeLists.txt
+++ b/loadgen/CMakeLists.txt
@@ -9,7 +9,7 @@ message("mlperf_loadgen v${mlperf_loadgen_VERSION_MAJOR}.${mlperf_loadgen_VERSIO
 # Set build options. NB: CXX_STANDARD is supported since CMake 3.1.
 if (NOT MSVC)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -W -Wall")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -W -Wall")
 endif()
 message(STATUS "Using C++ compiler flags: ${CMAKE_CXX_FLAGS}")
 set(CMAKE_CXX_STANDARD "14")
diff --git a/loadgen/benchmark/.gitignore b/loadgen/benchmark/.gitignore
new file mode 100644
index 0000000000..e792c8e55f
--- /dev/null
+++ b/loadgen/benchmark/.gitignore
@@ -0,0 +1,2 @@
+loadgen_build
+build
\ No newline at end of file
diff --git a/loadgen/benchmark/README.md b/loadgen/benchmark/README.md
new file mode 100644
index 0000000000..f4e1f2824b
--- /dev/null
+++ b/loadgen/benchmark/README.md
@@ -0,0 +1,9 @@
+Note: please install jemalloc first. See: http://jemalloc.net/
+Command: bash run.sh <target_qps> <0=Basic,1=Queue> <num_complete_threads> <max_size> <server_coalesce_queries>
+
+Experiments:
+- On Intel(R) Xeon(R) CPU E5-1650 v4 @ 3.60GHz
+- Basic SUT: 500-600k i/s
+- Basic SUT + jemalloc: 800-900k i/s (`bash run.sh 800000 0`)
+- Queued SUT (2 complete threads) + jemalloc: 1.2-1.3M i/s (`bash run.sh 1200000 1 2 2048`)
+- Queued SUT (2 complete threads) + jemalloc + server_coalesce_queries: 1.4-1.5M i/s (`bash run.sh 1400000 1 2 512 1`)
diff --git a/loadgen/benchmark/repro.cpp b/loadgen/benchmark/repro.cpp
new file mode 100644
index 0000000000..d5bc93bb2b
--- /dev/null
+++ b/loadgen/benchmark/repro.cpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <algorithm>
+#include <cassert>
+#include <condition_variable>
+#include <deque>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "loadgen.h"
+#include "query_sample_library.h"
+#include "system_under_test.h"
+#include "test_settings.h"
+
+class QSL : public mlperf::QuerySampleLibrary {
+ public:
+  ~QSL() override{};
+  const std::string& Name() const override { return mName; }
+  size_t TotalSampleCount() override { return 1000000; }
+  size_t PerformanceSampleCount() override { return TotalSampleCount(); }
+  void LoadSamplesToRam(
+      const std::vector<mlperf::QuerySampleIndex>& samples) override {}
+  void UnloadSamplesFromRam(
+      const std::vector<mlperf::QuerySampleIndex>& samples) override {}
+
+ private:
+  std::string mName{"Dummy QSL"};
+};
+
+class BasicSUT : public mlperf::SystemUnderTest {
+ public:
+  BasicSUT() {
+    // Start with some large value so that we don't reallocate memory.
+    initResponse(10000);
+  }
+  ~BasicSUT() override {}
+  const std::string& Name() const override { return mName; }
+  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
+    int n = samples.size();
+    if (n > mResponses.size()) {
+      std::cout << "Warning: reallocating response buffer in BasicSUT. Maybe "
+                   "you should initResponse with larger value!?"
+                << std::endl;
+      initResponse(samples.size());
+    }
+    for (int i = 0; i < n; i++) {
+      mResponses[i].id = samples[i].id;
+    }
+    mlperf::QuerySamplesComplete(mResponses.data(), n);
+  }
+  void FlushQueries() override {}
+  void ReportLatencyResults(
+      const std::vector<mlperf::QuerySampleLatency>& latencies_ns) override{};
+
+ private:
+  void initResponse(int size) {
+    mResponses.resize(size,
+                      {0, reinterpret_cast<uintptr_t>(&mBuf), sizeof(int)});
+  }
+  int mBuf{0};
+  std::string mName{"BasicSUT"};
+  std::vector<mlperf::QuerySampleResponse> mResponses;
+};
+
+class QueueSUT : public mlperf::SystemUnderTest {
+ public:
+  QueueSUT(int numCompleteThreads, int maxSize) {
+    // Each thread handles at most maxSize responses at a time.
+    std::cout << "QueueSUT: maxSize = " << maxSize << std::endl;
+    initResponse(numCompleteThreads, maxSize);
+    // Launch complete threads
+    for (int i = 0; i < numCompleteThreads; i++) {
+      mThreads.emplace_back(&QueueSUT::CompleteThread, this, i);
+    }
+  }
+  ~QueueSUT() override {
+    {
+      std::unique_lock<std::mutex> lck(mMtx);
+      mDone = true;
+      mCondVar.notify_all();
+    }
+    for (auto& thread : mThreads) {
+      thread.join();
+    }
+  }
+  const std::string& Name() const override { return mName; }
+  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
+    std::unique_lock<std::mutex> lck(mMtx);
+    for (const auto& sample : samples) {
+      mIdQueue.push_back(sample.id);
+    }
+    // Let some worker thread consume the tasks.
+    mCondVar.notify_one();
+  }
+  void FlushQueries() override {}
+  void ReportLatencyResults(
+      const std::vector<mlperf::QuerySampleLatency>& latencies_ns) override{};
+
+ private:
+  void CompleteThread(int threadIdx) {
+    auto& responses = mResponses[threadIdx];
+    size_t maxSize{responses.size()};
+    size_t actualSize{0};
+    while (true) {
+      {
+        std::unique_lock<std::mutex> lck(mMtx);
+        mCondVar.wait(lck, [&]() { return !mIdQueue.empty() || mDone; });
+
+        if (mDone) {
+          break;
+        }
+
+        actualSize = std::min(maxSize, mIdQueue.size());
+        for (size_t i = 0; i < actualSize; i++) {
+          responses[i].id = mIdQueue.front();
+          mIdQueue.pop_front();
+        }
+        mCondVar.notify_one();
+      }
+      mlperf::QuerySamplesComplete(responses.data(), actualSize);
+    }
+  }
+  void initResponse(int numCompleteThreads, int size) {
+    mResponses.resize(numCompleteThreads);
+    for (auto& responses : mResponses) {
+      responses.resize(size,
+                       {0, reinterpret_cast<uintptr_t>(&mBuf), sizeof(int)});
+    }
+  }
+  int mBuf{0};
+  std::string mName{"QueueSUT"};
+  std::vector<std::vector<mlperf::QuerySampleResponse>> mResponses;
+  std::vector<std::thread> mThreads;
+  std::deque<mlperf::ResponseId> mIdQueue;
+  std::mutex mMtx;
+  std::condition_variable mCondVar;
+  bool mDone{false};
+};
+
+int main(int argc, char** argv) {
+  assert(argc >= 2 && "Need to pass in at least one argument: target_qps");
+  int target_qps = std::stoi(argv[1]);
+  std::cout << "target_qps = " << target_qps << std::endl;
+
+  bool useQueue{false};
+  int numCompleteThreads{4};
+  int maxSize{1};
+  bool server_coalesce_queries{false};
+  if (argc >= 3) {
+    useQueue = std::stoi(argv[2]) != 0;
+  }
+  if (argc >= 4) {
+    numCompleteThreads = std::stoi(argv[3]);
+  }
+  if (argc >= 5) {
+    maxSize = std::stoi(argv[4]);
+  }
+  if (argc >= 6) {
+    server_coalesce_queries = std::stoi(argv[5]) != 0;
+  }
+
+  QSL qsl;
+  std::unique_ptr<mlperf::SystemUnderTest> sut;
+
+  // Configure the test settings
+  mlperf::TestSettings testSettings;
+  testSettings.scenario = mlperf::TestScenario::Server;
+  testSettings.mode = mlperf::TestMode::PerformanceOnly;
+  testSettings.server_target_qps = target_qps;
+  testSettings.server_target_latency_ns = 10000000;  // 10ms
+  testSettings.server_target_latency_percentile = 0.99;
+  testSettings.min_duration_ms = 60000;
+  testSettings.min_query_count = 270000;
+  testSettings.server_coalesce_queries =
+      server_coalesce_queries;
+  std::cout << "testSettings.server_coalesce_queries = "
+            << (server_coalesce_queries ? "True" : "False") << std::endl;
+
+  // Configure the logging settings
+  mlperf::LogSettings logSettings;
+  logSettings.log_output.outdir = "build";
+  logSettings.log_output.prefix = "mlperf_log_";
+  logSettings.log_output.suffix = "";
+  logSettings.log_output.prefix_with_datetime = false;
+  logSettings.log_output.copy_detail_to_stdout = false;
+  logSettings.log_output.copy_summary_to_stdout = true;
+  logSettings.log_mode = mlperf::LoggingMode::AsyncPoll;
+  logSettings.log_mode_async_poll_interval_ms = 1000;
+  logSettings.enable_trace = false;
+
+  // Choose SUT
+  if (useQueue) {
+    std::cout << "Using QueueSUT with " << numCompleteThreads
+              << " complete threads" << std::endl;
+    sut.reset(new QueueSUT(numCompleteThreads, maxSize));
+  } else {
+    std::cout << "Using BasicSUT" << std::endl;
+    sut.reset(new BasicSUT());
+  }
+
+  // Start test
+  std::cout << "Start test..." << std::endl;
+  mlperf::StartTest(sut.get(), &qsl, testSettings, logSettings);
+  std::cout << "Test done. Clean up SUT..." << std::endl;
+  sut.reset();
+  std::cout << "Done!" << std::endl;
+  return 0;
+}
diff --git a/loadgen/benchmark/run.sh b/loadgen/benchmark/run.sh
new file mode 100644
index 0000000000..08a34529e3
--- /dev/null
+++ b/loadgen/benchmark/run.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/bash
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+echo "Building loadgen..."
+if [ ! -e loadgen_build ]; then mkdir loadgen_build; fi;
+cd loadgen_build && cmake ../.. && make -j && cd ..
+echo "Building test program..."
+if [ ! -e build ]; then mkdir build; fi;
+g++ --std=c++11 -O3 -I.. -o build/repro.exe repro.cpp -Lloadgen_build -lmlperf_loadgen -lpthread && \
+LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2 build/repro.exe $1 $2 $3 $4 $5
diff --git a/loadgen/benchmark/run_debug.sh b/loadgen/benchmark/run_debug.sh
new file mode 100644
index 0000000000..341850eee0
--- /dev/null
+++ b/loadgen/benchmark/run_debug.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/bash
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+echo "Building loadgen in Debug mode..."
+if [ ! -e loadgen_build ]; then mkdir loadgen_build; fi;
+cd loadgen_build && cmake -DCMAKE_BUILD_TYPE=Debug ../.. && make -j && cd ..
+echo "Building test program in Debug mode..."
+if [ ! -e build ]; then mkdir build; fi;
+g++ --std=c++11 -O0 -g -I.. -o build/repro.exe repro.cpp -Lloadgen_build -lmlperf_loadgen -lpthread && \
+gdb --args build/repro.exe $1 $2 $3 $4 $5
diff --git a/loadgen/loadgen.cc b/loadgen/loadgen.cc
index a334523b08..fee8a42d1f 100644
--- a/loadgen/loadgen.cc
+++ b/loadgen/loadgen.cc
@@ -151,6 +151,29 @@ class QueryMetadata {
     return all_samples_done_time;
   }
 
+  // When server_coalesce_queries is set to true in Server scenario, we
+  // sometimes coalesce multiple queries into one query. This is done by moving
+  // the other query's sample into current query, while maintaining their
+  // original scheduled_time.
+  void CoalesceQueries(QueryMetadata* queries, size_t first, size_t last) {
+    // Copy sample data over to current query, boldly assuming that each query
+    // only has one sample.
+    auto prev_scheduled_time = scheduled_time;
+    query_to_send.reserve(last - first +
+                          2);  // Extra one for the current query.
+    for (size_t i = first; i <= last; ++i) {
+      auto& q = queries[i];
+      auto& s = q.samples_[0];
+      query_to_send.push_back(
+          {reinterpret_cast<ResponseId>(&s), s.sample_index});
+      q.scheduled_time = prev_scheduled_time + q.scheduled_delta;
+      q.issued_start_time = issued_start_time;
+      prev_scheduled_time = q.scheduled_time;
+    }
+  }
+
+  void Decoalesce() { query_to_send.resize(1); }
+
  public:
   std::vector<QuerySample> query_to_send;
   const std::chrono::nanoseconds scheduled_delta;
@@ -280,7 +303,8 @@ auto SampleDistribution(size_t sample_count, size_t stride, std::mt19937* rng) {
     indices.push_back(i);
   }
   std::shuffle(indices.begin(), indices.end(), *rng);
-  return [indices = std::move(indices), i = size_t(0)](auto& /*gen*/) mutable {
+  return
+      [ indices = std::move(indices), i = size_t(0) ](auto& /*gen*/) mutable {
     return indices.at(i++);
   };
 }
@@ -291,7 +315,9 @@ auto SampleDistribution(size_t sample_count, size_t /*stride*/,
                         std::mt19937* /*rng*/) {
   return [dist = std::uniform_int_distribution<>(0, sample_count - 1)](
-             auto& gen) mutable { return dist(gen); };
+             auto& gen) mutable {
+    return dist(gen);
+  };
 }
@@ -429,8 +455,10 @@ std::vector<QueryMetadata> GenerateQueries(
     }
   }
 
-  LogDetail([count = queries.size(), spq = settings.samples_per_query,
-             duration = timestamp.count()](AsyncDetail& detail) {
+  LogDetail([
+    count = queries.size(), spq = settings.samples_per_query,
+    duration = timestamp.count()
+  ](AsyncDetail & detail) {
     detail("GeneratedQueries: ", "queries", count, "samples per query", spq,
            "duration", duration);
   });
@@ -561,16 +589,18 @@ struct QueryScheduler<TestScenario::Server> {
                  const PerfClock::time_point start)
       : start(start) {}
 
-  // TODO: Coalesce all queries whose scheduled timestamps have passed.
   PerfClock::time_point Wait(QueryMetadata* next_query) {
     auto tracer =
         MakeScopedTracer([](AsyncTrace& trace) { trace("Scheduling"); });
 
     auto scheduled_time = start + next_query->scheduled_delta;
     next_query->scheduled_time = scheduled_time;
-    std::this_thread::sleep_until(scheduled_time);
 
     auto now = PerfClock::now();
+    if (now < scheduled_time) {
+      std::this_thread::sleep_until(scheduled_time);
+      now = PerfClock::now();
+    }
     next_query->issued_start_time = now;
     return now;
   }
@@ -636,6 +666,7 @@ PerformanceResult IssueQueries(SystemUnderTest* sut,
                                       max_latencies_to_record);
 
   size_t queries_issued = 0;
+  size_t queries_count = queries.size();
 
   auto start_for_power = std::chrono::system_clock::now();
   const PerfClock::time_point start = PerfClock::now();
@@ -647,11 +678,34 @@ PerformanceResult IssueQueries(SystemUnderTest* sut,
   // the actual issue time.
   bool ran_out_of_generated_queries = scenario != TestScenario::Server;
   size_t expected_latencies = 0;
-  for (auto& query : queries) {
+  while (queries_issued < queries_count) {
+    auto& query = queries[queries_issued];
     auto tracer1 =
         MakeScopedTracer([](AsyncTrace& trace) { trace("SampleLoop"); });
 
     last_now = query_scheduler.Wait(&query);
 
+    // If in Server scenario and server_coalesce_queries is enabled, multiple
+    // queries are coalesced into one big query if the current time has already
+    // passed the scheduled time of multiple queries.
+    if (scenario == TestScenario::Server &&
+        settings.requested.server_coalesce_queries) {
+      auto current_query_idx = queries_issued;
+      auto scheduled_time = query.scheduled_time;
+      while (queries_issued < queries_count - 1) {
+        auto next_scheduled_time =
+            scheduled_time + queries[queries_issued + 1].scheduled_delta;
+        if (last_now < next_scheduled_time) {
+          break;
+        }
+        scheduled_time = next_scheduled_time;
+        queries_issued++;
+      }
+      if (queries_issued > current_query_idx) {
+        query.CoalesceQueries(queries.data(), current_query_idx + 1,
+                              queries_issued);
+      }
+    }
+
     // Issue the query to the SUT.
     {
       auto tracer3 =
@@ -662,6 +716,12 @@ PerformanceResult IssueQueries(SystemUnderTest* sut,
 
     expected_latencies += query.query_to_send.size();
     queries_issued++;
 
+    if (scenario == TestScenario::Server &&
+        settings.requested.server_coalesce_queries) {
+      // Set the query back to its clean state.
+      query.Decoalesce();
+    }
+
     if (mode == TestMode::AccuracyOnly) {
       // TODO: Rate limit in accuracy mode so accuracy mode works even
       // if the expected/target performance is way off.
@@ -830,9 +890,9 @@ struct PerformanceSummary {
 #if defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64)
   // MSVC complains if there is no explicit constructor.
   // (target_latency_percentile above depends on construction with settings)
-  PerformanceSummary(
-      const std::string& sut_name_arg, const TestSettingsInternal& settings_arg,
-      const PerformanceResult& pr_arg)
+  PerformanceSummary(const std::string& sut_name_arg,
+                     const TestSettingsInternal& settings_arg,
+                     const PerformanceResult& pr_arg)
       : sut_name(sut_name_arg), settings(settings_arg), pr(pr_arg){};
 #endif
   void ProcessLatencies();
@@ -1268,14 +1328,15 @@ std::pair<PerformanceSummary, PerformanceSummary> FindBoundaries(
   TestSettingsInternal u_settings = l_perf_summary.settings;
   find_peak_performance::WidenPerformanceField(&u_settings);
 
-  LogDetail(
-      [l_field = find_peak_performance::ToStringPerformanceField(
-           l_perf_summary.settings),
-       u_field = find_peak_performance::ToStringPerformanceField(
-           u_settings)](AsyncDetail& detail) {
-        detail("FindBoundaries: Checking fields [" + l_field + ", " + u_field +
-               ")");
-      });
+  LogDetail([
+    l_field = find_peak_performance::ToStringPerformanceField(
+        l_perf_summary.settings),
+    u_field =
+        find_peak_performance::ToStringPerformanceField(u_settings)
+  ](AsyncDetail & detail) {
+    detail("FindBoundaries: Checking fields [" + l_field + ", " + u_field +
+           ")");
+  });
 
   std::vector<loadgen::LoadableSampleSet> loadable_sets(
       loadgen::GenerateLoadableSets(qsl, u_settings));
@@ -1313,15 +1374,14 @@ PerformanceSummary FindPeakPerformanceBinarySearch(
       find_peak_performance::MidOfBoundaries(l_perf_summary.settings,
                                              u_perf_summary.settings);
 
-  LogDetail([l_field =
-                 find_peak_performance::ToStringPerformanceField(
-                     l_perf_summary.settings),
-             u_field =
-                 find_peak_performance::ToStringPerformanceField(
-                     u_perf_summary.settings),
-             m_field =
-                 find_peak_performance::ToStringPerformanceField(
-                     m_settings)](AsyncDetail& detail) {
+  LogDetail([
+    l_field = find_peak_performance::ToStringPerformanceField(
+        l_perf_summary.settings),
+    u_field = find_peak_performance::ToStringPerformanceField(
+        u_perf_summary.settings),
+    m_field =
+        find_peak_performance::ToStringPerformanceField(m_settings)
+  ](AsyncDetail & detail) {
     detail(
         "FindPeakPerformanceBinarySearch: Testing the mid value of bounds [" +
         l_field + ", " + u_field + "): " + m_field);
@@ -1409,9 +1469,9 @@ void RunPerformanceMode(SystemUnderTest* sut, QuerySampleLibrary* qsl,
 
   sut->ReportLatencyResults(pr.sample_latencies);
 
-  LogSummary(
-      [perf_summary = PerformanceSummary{sut->Name(), settings, std::move(pr)}](
-          AsyncSummary& summary) mutable { perf_summary.Log(summary); });
+  LogSummary([perf_summary =
+                  PerformanceSummary{sut->Name(), settings, std::move(pr)}](
+      AsyncSummary & summary) mutable { perf_summary.Log(summary); });
 
   qsl->UnloadSamplesFromRam(performance_set.set);
 }
@@ -1436,7 +1496,8 @@ void FindPeakPerformanceMode(SystemUnderTest* sut, QuerySampleLibrary* qsl,
   if (scenario != TestScenario::MultiStream &&
       scenario != TestScenario::MultiStreamFree &&
       scenario != TestScenario::Server) {
-    LogDetail([unsupported_scenario = ToString(scenario)](AsyncDetail& detail) {
+    LogDetail([unsupported_scenario =
+                   ToString(scenario)](AsyncDetail & detail) {
       detail.Error(find_peak_performance::kNotSupportedMsg);
     });
     return;
@@ -1444,7 +1505,7 @@
 
   LogDetail([base_field = find_peak_performance::ToStringPerformanceField(
-                 base_settings)](AsyncDetail& detail) {
+                 base_settings)](AsyncDetail & detail) {
     detail("FindPeakPerformance: Check validity of the base settings field: " +
            base_field);
   });
@@ -1475,10 +1536,10 @@ void FindPeakPerformanceMode(SystemUnderTest* sut, QuerySampleLibrary* qsl,
   sut->ReportLatencyResults(base_perf_summary.pr.sample_latencies);
 
-  LogSummary(
-      [perf_summary = PerformanceSummary{sut->Name(), base_settings,
-                                         std::move(base_perf_summary.pr)}](
-          AsyncSummary& summary) mutable { perf_summary.Log(summary); });
+  LogSummary([perf_summary =
+                  PerformanceSummary{sut->Name(), base_settings,
+                                     std::move(base_perf_summary.pr)}](
+      AsyncSummary & summary) mutable { perf_summary.Log(summary); });
 
   qsl->UnloadSamplesFromRam(base_performance_set.set);
 
@@ -1494,14 +1555,15 @@ void FindPeakPerformanceMode(SystemUnderTest* sut, QuerySampleLibrary* qsl,
   PerformanceSummary l_perf_summary = boundaries.first;
   PerformanceSummary u_perf_summary = boundaries.second;
 
-  LogDetail(
-      [l_field = find_peak_performance::ToStringPerformanceField(
-           l_perf_summary.settings),
-       u_field = find_peak_performance::ToStringPerformanceField(
-           u_perf_summary.settings)](AsyncDetail& detail) {
-        detail("FindPeakPerformance: Found boundaries: [" + l_field + ", " +
-               u_field + ")");
-      });
+  LogDetail([
+    l_field = find_peak_performance::ToStringPerformanceField(
+        l_perf_summary.settings),
+    u_field = find_peak_performance::ToStringPerformanceField(
+        u_perf_summary.settings)
+  ](AsyncDetail & detail) {
+    detail("FindPeakPerformance: Found boundaries: [" + l_field + ", " +
+           u_field + ")");
+  });
 
   // Reuse performance_set, u_perf_summary has the largest 'samples_per_query'.
   std::vector<loadgen::LoadableSampleSet> loadable_sets(
@@ -1515,16 +1577,16 @@ void FindPeakPerformanceMode(SystemUnderTest* sut, QuerySampleLibrary* qsl,
   // Print-out the peak performance test setting.
   LogDetail([field = find_peak_performance::ToStringPerformanceField(
-                 perf_summary.settings)](AsyncDetail& detail) {
+                 perf_summary.settings)](AsyncDetail & detail) {
     detail("FindPeakPerformance: Found peak performance field: " + field);
   });
 
   sut->ReportLatencyResults(perf_summary.pr.sample_latencies);
 
-  LogSummary(
-      [perf_summary = PerformanceSummary{sut->Name(), perf_summary.settings,
-                                         std::move(perf_summary.pr)}](
-          AsyncSummary& summary) mutable { perf_summary.Log(summary); });
+  LogSummary([perf_summary =
+                  PerformanceSummary{sut->Name(), perf_summary.settings,
+                                     std::move(perf_summary.pr)}](
+      AsyncSummary & summary) mutable { perf_summary.Log(summary); });
 
   qsl->UnloadSamplesFromRam(performance_set.set);
 }
@@ -1541,10 +1603,8 @@ void RunAccuracyMode(SystemUnderTest* sut, QuerySampleLibrary* qsl,
 
   for (auto& loadable_set : loadable_sets) {
     {
-      auto tracer = MakeScopedTracer(
-          [count = loadable_set.set.size()](AsyncTrace& trace) {
-            trace("LoadSamples", "count", count);
-          });
+      auto tracer = MakeScopedTracer([count = loadable_set.set.size()](
+          AsyncTrace & trace) { trace("LoadSamples", "count", count); });
       LoadSamplesToRam(qsl, loadable_set.set);
     }
 
@@ -1552,10 +1612,8 @@ void RunAccuracyMode(SystemUnderTest* sut, QuerySampleLibrary* qsl,
         sut, settings, loadable_set, sequence_gen));
 
     {
-      auto tracer = MakeScopedTracer(
-          [count = loadable_set.set.size()](AsyncTrace& trace) {
-            trace("UnloadSampes", "count", count);
-          });
+      auto tracer = MakeScopedTracer([count = loadable_set.set.size()](
+          AsyncTrace & trace) { trace("UnloadSampes", "count", count); });
       qsl->UnloadSamplesFromRam(loadable_set.set);
     }
   }
diff --git a/loadgen/test_settings.h b/loadgen/test_settings.h
index 49cffb922f..c0f01cb0ce 100644
--- a/loadgen/test_settings.h
+++ b/loadgen/test_settings.h
@@ -197,8 +197,9 @@ struct TestSettings {
   /// should be set to 0.97 (97%) in v0.5. (As always, check the policy page
   /// for updated values for the benchmark you are running.)
   double server_target_latency_percentile = 0.99;
-  /// \brief TODO: Implement this. Would combine samples from multiple queries
-  /// into a single query if their scheduled issue times have passed.
+  /// \brief If this flag is set to true, LoadGen will combine samples from
+  /// multiple queries into a single query if their scheduled issue times have
+  /// passed.
   bool server_coalesce_queries = false;
   /// \brief The decimal places of QPS precision used to terminate
   /// FindPeakPerformance mode.
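
---

For readers who want to reason about the coalescing window added to `IssueQueries` without stepping through the whole issue loop, the standalone sketch below mirrors the arithmetic of that loop on plain integers: starting from the query being issued, it keeps absorbing following queries while their scheduled time (previous scheduled time plus the next query's `scheduled_delta`, exactly as written in the loop above) has already passed "now". This is an illustrative sketch only; `CountCoalesced` and the `Ns` alias are made-up names, not part of the LoadGen API.

```cpp
// Illustrative sketch only -- not LoadGen code. It reproduces the coalescing
// decision from IssueQueries(): keep absorbing the following queries while
// their scheduled times have already passed `now`.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

using Ns = int64_t;  // nanoseconds

// Returns the number of *extra* queries that would be coalesced into the
// query at index `first`.
size_t CountCoalesced(const std::vector<Ns>& scheduled_delta, size_t first,
                      Ns first_scheduled_time, Ns now) {
  size_t last = first;
  Ns scheduled_time = first_scheduled_time;
  while (last + 1 < scheduled_delta.size()) {
    Ns next_scheduled_time = scheduled_time + scheduled_delta[last + 1];
    if (now < next_scheduled_time) break;  // next query is not overdue yet
    scheduled_time = next_scheduled_time;
    ++last;
  }
  return last - first;
}

int main() {
  // Query i is treated as scheduled delta[i] after query i-1.
  std::vector<Ns> delta = {0, 100, 100, 100, 100};
  // The first query was scheduled at t=0 but only issued at t=250, so the
  // queries scheduled at t=100 and t=200 are also overdue and get merged.
  std::cout << CountCoalesced(delta, /*first=*/0, /*first_scheduled_time=*/0,
                              /*now=*/250)
            << " extra queries coalesced\n";  // prints 2
  return 0;
}
```

In the real loop, those extra overdue queries are then handed to `QueryMetadata::CoalesceQueries`, issued as one SUT call, and the issuing query is restored with `Decoalesce()` afterwards.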