diff --git a/src/hotspot/share/gc/g1/g1Analytics.cpp b/src/hotspot/share/gc/g1/g1Analytics.cpp
index a36e7fb131e..d57c70a9d52 100644
--- a/src/hotspot/share/gc/g1/g1Analytics.cpp
+++ b/src/hotspot/share/gc/g1/g1Analytics.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, 2020, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -79,7 +79,7 @@ G1Analytics::G1Analytics(const G1Predictions* predictor) :
     _prev_collection_pause_end_ms(0.0),
     _rs_length_diff_seq(new TruncatedSeq(TruncatedSeqLength)),
     _concurrent_refine_rate_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
-    _logged_cards_rate_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
+    _dirtied_cards_rate_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
     _young_card_merge_to_scan_ratio_seq(new TruncatedSeq(TruncatedSeqLength)),
     _mixed_card_merge_to_scan_ratio_seq(new TruncatedSeq(TruncatedSeqLength)),
     _young_cost_per_card_scan_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
@@ -107,7 +107,7 @@ G1Analytics::G1Analytics(const G1Predictions* predictor) :
   // Start with inverse of maximum STW cost.
   _concurrent_refine_rate_ms_seq->add(1/cost_per_logged_card_ms_defaults[0]);
   // Some applications have very low rates for logging cards.
-  _logged_cards_rate_ms_seq->add(0.0);
+  _dirtied_cards_rate_ms_seq->add(0.0);
   _young_card_merge_to_scan_ratio_seq->add(young_card_merge_to_scan_ratio_defaults[index]);
   _young_cost_per_card_scan_ms_seq->add(young_only_cost_per_card_scan_ms_defaults[index]);
 
@@ -168,8 +168,8 @@ void G1Analytics::report_concurrent_refine_rate_ms(double cards_per_ms) {
   _concurrent_refine_rate_ms_seq->add(cards_per_ms);
 }
 
-void G1Analytics::report_logged_cards_rate_ms(double cards_per_ms) {
-  _logged_cards_rate_ms_seq->add(cards_per_ms);
+void G1Analytics::report_dirtied_cards_rate_ms(double cards_per_ms) {
+  _dirtied_cards_rate_ms_seq->add(cards_per_ms);
 }
 
 void G1Analytics::report_cost_per_card_scan_ms(double cost_per_card_ms, bool for_young_gc) {
@@ -236,8 +236,8 @@ double G1Analytics::predict_concurrent_refine_rate_ms() const {
   return predict_zero_bounded(_concurrent_refine_rate_ms_seq);
 }
 
-double G1Analytics::predict_logged_cards_rate_ms() const {
-  return predict_zero_bounded(_logged_cards_rate_ms_seq);
+double G1Analytics::predict_dirtied_cards_rate_ms() const {
+  return predict_zero_bounded(_dirtied_cards_rate_ms_seq);
 }
 
 double G1Analytics::predict_young_card_merge_to_scan_ratio() const {
diff --git a/src/hotspot/share/gc/g1/g1Analytics.hpp b/src/hotspot/share/gc/g1/g1Analytics.hpp
index f3e566155b1..2acba9f9c5b 100644
--- a/src/hotspot/share/gc/g1/g1Analytics.hpp
+++ b/src/hotspot/share/gc/g1/g1Analytics.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, 2020, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -47,7 +47,7 @@ class G1Analytics: public CHeapObj<mtGC> {
 
   TruncatedSeq* _rs_length_diff_seq;
   TruncatedSeq* _concurrent_refine_rate_ms_seq;
-  TruncatedSeq* _logged_cards_rate_ms_seq;
+  TruncatedSeq* _dirtied_cards_rate_ms_seq;
   // The ratio between the number of merged cards and actually scanned cards, for
   // young-only and mixed gcs.
   TruncatedSeq* _young_card_merge_to_scan_ratio_seq;
@@ -115,7 +115,7 @@ class G1Analytics: public CHeapObj<mtGC> {
   void report_concurrent_mark_cleanup_times_ms(double ms);
   void report_alloc_rate_ms(double alloc_rate);
   void report_concurrent_refine_rate_ms(double cards_per_ms);
-  void report_logged_cards_rate_ms(double cards_per_ms);
+  void report_dirtied_cards_rate_ms(double cards_per_ms);
   void report_cost_per_card_scan_ms(double cost_per_remset_card_ms, bool for_young_gc);
   void report_cost_per_card_merge_ms(double cost_per_card_ms, bool for_young_gc);
   void report_card_merge_to_scan_ratio(double cards_per_entry_ratio, bool for_young_gc);
@@ -131,7 +131,7 @@ class G1Analytics: public CHeapObj<mtGC> {
   int num_alloc_rate_ms() const;
 
   double predict_concurrent_refine_rate_ms() const;
-  double predict_logged_cards_rate_ms() const;
+  double predict_dirtied_cards_rate_ms() const;
   double predict_young_card_merge_to_scan_ratio() const;
 
   double predict_mixed_card_merge_to_scan_ratio() const;
diff --git a/src/hotspot/share/gc/g1/g1BarrierSet.cpp b/src/hotspot/share/gc/g1/g1BarrierSet.cpp
index 62dba813d51..9243e1f319f 100644
--- a/src/hotspot/share/gc/g1/g1BarrierSet.cpp
+++ b/src/hotspot/share/gc/g1/g1BarrierSet.cpp
@@ -157,5 +157,5 @@ void G1BarrierSet::on_thread_detach(Thread* thread) {
   // Flush any deferred card marks.
   CardTableBarrierSet::on_thread_detach(thread);
   G1ThreadLocalData::satb_mark_queue(thread).flush();
-  G1ThreadLocalData::dirty_card_queue(thread).flush();
+  G1ThreadLocalData::dirty_card_queue(thread).on_thread_detach();
 }
diff --git a/src/hotspot/share/gc/g1/g1CollectedHeap.cpp b/src/hotspot/share/gc/g1/g1CollectedHeap.cpp
index 08ee547b9b6..45fc8ae046b 100644
--- a/src/hotspot/share/gc/g1/g1CollectedHeap.cpp
+++ b/src/hotspot/share/gc/g1/g1CollectedHeap.cpp
@@ -1054,10 +1054,11 @@ void G1CollectedHeap::abort_refinement() {
     _hot_card_cache->reset_hot_cache();
   }
 
-  // Discard all remembered set updates.
+  // Discard all remembered set updates and reset refinement statistics.
   G1BarrierSet::dirty_card_queue_set().abandon_logs();
   assert(G1BarrierSet::dirty_card_queue_set().num_cards() == 0,
          "DCQS should be empty");
+  concurrent_refine()->get_and_reset_refinement_stats();
 }
 
 void G1CollectedHeap::verify_after_full_collection() {
@@ -2684,9 +2685,22 @@ void G1CollectedHeap::gc_prologue(bool full) {
   }
 
   // Fill TLAB's and such
-  double start = os::elapsedTime();
-  ensure_parsability(true);
-  phase_times()->record_prepare_tlab_time_ms((os::elapsedTime() - start) * 1000.0);
+  {
+    Ticks start = Ticks::now();
+    ensure_parsability(true);
+    Tickspan dt = Ticks::now() - start;
+    phase_times()->record_prepare_tlab_time_ms(dt.seconds() * MILLIUNITS);
+  }
+
+  if (!full) {
+    // Flush dirty card queues to qset, so later phases don't need to account
+    // for partially filled per-thread queues and such.  Not needed for full
+    // collections, which ignore those logs.
+    Ticks start = Ticks::now();
+    G1BarrierSet::dirty_card_queue_set().concatenate_logs();
+    Tickspan dt = Ticks::now() - start;
+    phase_times()->record_concatenate_dirty_card_logs_time_ms(dt.seconds() * MILLIUNITS);
+  }
 }
 
 void G1CollectedHeap::gc_epilogue(bool full) {
@@ -2759,20 +2773,6 @@ void G1CollectedHeap::do_concurrent_mark() {
   }
 }
 
-size_t G1CollectedHeap::pending_card_num() {
-  struct CountCardsClosure : public ThreadClosure {
-    size_t _cards;
-    CountCardsClosure() : _cards(0) {}
-    virtual void do_thread(Thread* t) {
-      _cards += G1ThreadLocalData::dirty_card_queue(t).size();
-    }
-  } count_from_threads;
-  Threads::threads_do(&count_from_threads);
-
-  G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
-  return dcqs.num_cards() + count_from_threads._cards;
-}
-
 bool G1CollectedHeap::is_potential_eager_reclaim_candidate(HeapRegion* r) const {
   // We don't nominate objects with many remembered set entries, on
   // the assumption that such objects are likely still live.
diff --git a/src/hotspot/share/gc/g1/g1CollectedHeap.hpp b/src/hotspot/share/gc/g1/g1CollectedHeap.hpp
index 2e542d668ed..7b2463d4c94 100644
--- a/src/hotspot/share/gc/g1/g1CollectedHeap.hpp
+++ b/src/hotspot/share/gc/g1/g1CollectedHeap.hpp
@@ -1465,8 +1465,6 @@ class G1CollectedHeap : public CollectedHeap {
 
   // Used to print information about locations in the hs_err file.
   virtual bool print_location(outputStream* st, void* addr) const;
-
-  size_t pending_card_num();
 };
 
 class G1ParEvacuateFollowersClosure : public VoidClosure {
diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp b/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp
index ee5817dab03..2607f48a127 100644
--- a/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp
+++ b/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp
@@ -408,20 +408,18 @@ void G1ConcurrentRefine::adjust(double logged_cards_scan_time,
   dcqs.notify_if_necessary();
 }
 
-G1ConcurrentRefine::RefinementStats G1ConcurrentRefine::total_refinement_stats() const {
-  struct CollectData : public ThreadClosure {
-    Tickspan _total_time;
-    size_t _total_cards;
-    CollectData() : _total_time(), _total_cards(0) {}
+G1ConcurrentRefineStats G1ConcurrentRefine::get_and_reset_refinement_stats() {
+  struct CollectStats : public ThreadClosure {
+    G1ConcurrentRefineStats _total_stats;
     virtual void do_thread(Thread* t) {
       G1ConcurrentRefineThread* crt = static_cast<G1ConcurrentRefineThread*>(t);
-      _total_time += crt->total_refinement_time();
-      _total_cards += crt->total_refined_cards();
+      G1ConcurrentRefineStats& stats = *crt->refinement_stats();
+      _total_stats += stats;
+      stats.reset();
     }
   } collector;
-  // Cast away const so we can call non-modifying closure on threads.
-  const_cast<G1ConcurrentRefine*>(this)->threads_do(&collector);
-  return RefinementStats(collector._total_time, collector._total_cards);
+  threads_do(&collector);
+  return collector._total_stats;
 }
 
 size_t G1ConcurrentRefine::activation_threshold(uint worker_id) const {
@@ -445,7 +443,7 @@ void G1ConcurrentRefine::maybe_activate_more_threads(uint worker_id, size_t num_
 }
 
 bool G1ConcurrentRefine::do_refinement_step(uint worker_id,
-                                            size_t* total_refined_cards) {
+                                            G1ConcurrentRefineStats* stats) {
   G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
 
   size_t curr_cards = dcqs.num_cards();
@@ -460,5 +458,5 @@ bool G1ConcurrentRefine::do_refinement_step(uint worker_id,
   // Process the next buffer, if there are enough left.
   return dcqs.refine_completed_buffer_concurrently(worker_id + worker_id_offset(),
                                                    deactivation_threshold(worker_id),
-                                                   total_refined_cards);
+                                                   stats);
 }
diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefine.hpp b/src/hotspot/share/gc/g1/g1ConcurrentRefine.hpp
index 8a9a041578e..3f32a5c3199 100644
--- a/src/hotspot/share/gc/g1/g1ConcurrentRefine.hpp
+++ b/src/hotspot/share/gc/g1/g1ConcurrentRefine.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -25,6 +25,7 @@
 #ifndef SHARE_GC_G1_G1CONCURRENTREFINE_HPP
 #define SHARE_GC_G1_G1CONCURRENTREFINE_HPP
 
+#include "gc/g1/g1ConcurrentRefineStats.hpp"
 #include "memory/allocation.hpp"
 #include "utilities/globalDefinitions.hpp"
 #include "utilities/ticks.hpp"
@@ -119,13 +120,9 @@ class G1ConcurrentRefine : public CHeapObj<mtGC> {
   // Adjust refinement thresholds based on work done during the pause and the goal time.
   void adjust(double logged_cards_scan_time, size_t processed_logged_cards, double goal_ms);
 
-  struct RefinementStats {
-    Tickspan _time;
-    size_t _cards;
-    RefinementStats(Tickspan time, size_t cards) : _time(time), _cards(cards) {}
-  };
-
-  RefinementStats total_refinement_stats() const;
+  // Return total of concurrent refinement stats for the
+  // ConcurrentRefineThreads.  Also reset the stats for the threads.
+  G1ConcurrentRefineStats get_and_reset_refinement_stats();
 
   // Cards in the dirty card queue set.
   size_t activation_threshold(uint worker_id) const;
@@ -133,8 +130,8 @@ class G1ConcurrentRefine : public CHeapObj<mtGC> {
 
   // Perform a single refinement step; called by the refinement
   // threads.  Returns true if there was refinement work available.
-  // Increments *total_refined_cards.
-  bool do_refinement_step(uint worker_id, size_t* total_refined_cards);
+  // Updates stats.
+  bool do_refinement_step(uint worker_id, G1ConcurrentRefineStats* stats);
 
   // Iterate over all concurrent refinement threads applying the given closure.
   void threads_do(ThreadClosure *tc);
diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.cpp b/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.cpp
new file mode 100644
index 00000000000..055fd8b891d
--- /dev/null
+++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "gc/g1/g1ConcurrentRefineStats.hpp"
+
+G1ConcurrentRefineStats::G1ConcurrentRefineStats() :
+  _refinement_time(),
+  _refined_cards(0),
+  _precleaned_cards(0),
+  _dirtied_cards(0)
+{}
+
+G1ConcurrentRefineStats&
+G1ConcurrentRefineStats::operator+=(const G1ConcurrentRefineStats& other) {
+  _refinement_time += other._refinement_time;
+  _refined_cards += other._refined_cards;
+  _precleaned_cards += other._precleaned_cards;
+  _dirtied_cards += other._dirtied_cards;
+  return *this;
+}
+
+template<typename T>
+static T clipped_sub(T x, T y) {
+  return (x < y) ? T() : (x - y);
+}
+
+G1ConcurrentRefineStats&
+G1ConcurrentRefineStats::operator-=(const G1ConcurrentRefineStats& other) {
+  _refinement_time = clipped_sub(_refinement_time, other._refinement_time);
+  _refined_cards = clipped_sub(_refined_cards, other._refined_cards);
+  _precleaned_cards = clipped_sub(_precleaned_cards, other._precleaned_cards);
+  _dirtied_cards = clipped_sub(_dirtied_cards, other._dirtied_cards);
+  return *this;
+}
+
+void G1ConcurrentRefineStats::reset() {
+  *this = G1ConcurrentRefineStats();
+}
diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.hpp b/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.hpp
new file mode 100644
index 00000000000..d0e17301a5f
--- /dev/null
+++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.hpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_GC_G1_G1CONCURRENTREFINESTATS_HPP
+#define SHARE_GC_G1_G1CONCURRENTREFINESTATS_HPP
+
+#include "memory/allocation.hpp"
+#include "utilities/globalDefinitions.hpp"
+#include "utilities/ticks.hpp"
+
+// Collection of statistics for concurrent refinement processing.
+// Used for collecting per-thread statistics and for summaries over a
+// collection of threads.
+class G1ConcurrentRefineStats : public CHeapObj<mtGC> {
+  Tickspan _refinement_time;
+  size_t _refined_cards;
+  size_t _precleaned_cards;
+  size_t _dirtied_cards;
+
+public:
+  G1ConcurrentRefineStats();
+
+  // Time spent performing concurrent refinement.
+  Tickspan refinement_time() const { return _refinement_time; }
+
+  // Number of refined cards.
+  size_t refined_cards() const { return _refined_cards; }
+
+  // Number of cards for which refinement was skipped because some other
+  // thread had already refined them.
+  size_t precleaned_cards() const { return _precleaned_cards; }
+
+  // Number of cards marked dirty and in need of refinement.
+  size_t dirtied_cards() const { return _dirtied_cards; }
+
+  void inc_refinement_time(Tickspan t) { _refinement_time += t; }
+  void inc_refined_cards(size_t cards) { _refined_cards += cards; }
+  void inc_precleaned_cards(size_t cards) { _precleaned_cards += cards; }
+  void inc_dirtied_cards(size_t cards) { _dirtied_cards += cards; }
+
+  G1ConcurrentRefineStats& operator+=(const G1ConcurrentRefineStats& other);
+  G1ConcurrentRefineStats& operator-=(const G1ConcurrentRefineStats& other);
+
+  friend G1ConcurrentRefineStats operator+(G1ConcurrentRefineStats x,
+                                           const G1ConcurrentRefineStats& y) {
+    return x += y;
+  }
+
+  friend G1ConcurrentRefineStats operator-(G1ConcurrentRefineStats x,
+                                           const G1ConcurrentRefineStats& y) {
+    return x -= y;
+  }
+
+  void reset();
+};
+
+#endif // SHARE_GC_G1_G1CONCURRENTREFINESTATS_HPP
diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp b/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp
index 55a274486d7..e57617114d2 100644
--- a/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp
+++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp
@@ -25,6 +25,7 @@
 #include "precompiled.hpp"
 #include "gc/g1/g1BarrierSet.hpp"
 #include "gc/g1/g1ConcurrentRefine.hpp"
+#include "gc/g1/g1ConcurrentRefineStats.hpp"
 #include "gc/g1/g1ConcurrentRefineThread.hpp"
 #include "gc/g1/g1DirtyCardQueue.hpp"
 #include "gc/shared/suspendibleThreadSet.hpp"
@@ -36,8 +37,7 @@ G1ConcurrentRefineThread::G1ConcurrentRefineThread(G1ConcurrentRefine* cr, uint
   ConcurrentGCThread(),
   _vtime_start(0.0),
   _vtime_accum(0.0),
-  _total_refinement_time(),
-  _total_refined_cards(0),
+  _refinement_stats(new G1ConcurrentRefineStats()),
   _worker_id(worker_id),
   _notifier(new Semaphore(0)),
   _should_notify(true),
@@ -48,6 +48,11 @@ G1ConcurrentRefineThread::G1ConcurrentRefineThread(G1ConcurrentRefine* cr, uint
   create_and_start();
 }
 
+G1ConcurrentRefineThread::~G1ConcurrentRefineThread() {
+  delete _refinement_stats;
+  delete _notifier;
+}
+
 void G1ConcurrentRefineThread::wait_for_completed_buffers() {
   assert(this == Thread::current(), "precondition");
   while (Atomic::load_acquire(&_should_notify)) {
@@ -103,32 +108,35 @@ void G1ConcurrentRefineThread::run_service() {
                           _worker_id, _cr->activation_threshold(_worker_id),
                           G1BarrierSet::dirty_card_queue_set().num_cards());
 
-    size_t start_total_refined_cards = _total_refined_cards; // For logging.
+    // For logging.
+    G1ConcurrentRefineStats start_stats = *_refinement_stats;
+    G1ConcurrentRefineStats total_stats; // Accumulate over activation.
 
     {
       SuspendibleThreadSetJoiner sts_join;
 
       while (!should_terminate()) {
         if (sts_join.should_yield()) {
+          // Accumulate changed stats before possible GC that resets stats.
+          total_stats += *_refinement_stats - start_stats;
           sts_join.yield();
+          // Reinitialize baseline stats after safepoint.
+          start_stats = *_refinement_stats;
           continue;             // Re-check for termination after yield delay.
         }
 
-        Ticks start_time = Ticks::now();
-        bool more_work = _cr->do_refinement_step(_worker_id, &_total_refined_cards);
-        _total_refinement_time += (Ticks::now() - start_time);
-
+        bool more_work = _cr->do_refinement_step(_worker_id, _refinement_stats);
         if (maybe_deactivate(more_work)) break;
       }
     }
 
+    total_stats += *_refinement_stats - start_stats;
     log_debug(gc, refine)("Deactivated worker %d, off threshold: " SIZE_FORMAT
-                          ", current: " SIZE_FORMAT ", refined cards: "
-                          SIZE_FORMAT ", total refined cards: " SIZE_FORMAT,
+                          ", current: " SIZE_FORMAT
+                          ", refined cards: " SIZE_FORMAT,
                           _worker_id, _cr->deactivation_threshold(_worker_id),
                           G1BarrierSet::dirty_card_queue_set().num_cards(),
-                          _total_refined_cards - start_total_refined_cards,
-                          _total_refined_cards);
+                          total_stats.refined_cards());
 
     if (os::supports_vtime()) {
       _vtime_accum = (os::elapsedVTime() - _vtime_start);
diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp b/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp
index 428c07c4250..cdcab6b2285 100644
--- a/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp
+++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp
@@ -30,6 +30,7 @@
 
 // Forward Decl.
 class G1ConcurrentRefine;
+class G1ConcurrentRefineStats;
 
 // One or more G1 Concurrent Refinement Threads may be active if concurrent
 // refinement is in progress.
@@ -40,8 +41,7 @@ class G1ConcurrentRefineThread: public ConcurrentGCThread {
   double _vtime_start;  // Initial virtual time.
   double _vtime_accum;  // Accumulated virtual time.
 
-  Tickspan _total_refinement_time;
-  size_t _total_refined_cards;
+  G1ConcurrentRefineStats* _refinement_stats;
 
   uint _worker_id;
 
@@ -71,12 +71,14 @@ class G1ConcurrentRefineThread: public ConcurrentGCThread {
 
 public:
   G1ConcurrentRefineThread(G1ConcurrentRefine* cg1r, uint worker_id);
+  virtual ~G1ConcurrentRefineThread();
 
   // Activate this thread.
   void activate();
 
-  Tickspan total_refinement_time() const { return _total_refinement_time; }
-  size_t total_refined_cards() const { return _total_refined_cards; }
+  G1ConcurrentRefineStats* refinement_stats() const {
+    return _refinement_stats;
+  }
 
   // Total virtual time so far.
   double vtime_accum() { return _vtime_accum; }
diff --git a/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp b/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp
index edd3f4e9c6a..05b51af06a2 100644
--- a/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp
+++ b/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp
@@ -26,6 +26,7 @@
 #include "gc/g1/g1BufferNodeList.hpp"
 #include "gc/g1/g1CardTableEntryClosure.hpp"
 #include "gc/g1/g1CollectedHeap.inline.hpp"
+#include "gc/g1/g1ConcurrentRefineStats.hpp"
 #include "gc/g1/g1ConcurrentRefineThread.hpp"
 #include "gc/g1/g1DirtyCardQueue.hpp"
 #include "gc/g1/g1FreeIdSet.hpp"
@@ -36,6 +37,8 @@
 #include "gc/shared/suspendibleThreadSet.hpp"
 #include "memory/iterator.hpp"
 #include "runtime/atomic.hpp"
+#include "runtime/mutex.hpp"
+#include "runtime/mutexLocker.hpp"
 #include "runtime/os.hpp"
 #include "runtime/safepoint.hpp"
 #include "runtime/thread.inline.hpp"
@@ -43,22 +46,37 @@
 #include "utilities/globalCounter.inline.hpp"
 #include "utilities/macros.hpp"
 #include "utilities/quickSort.hpp"
+#include "utilities/ticks.hpp"
 
 G1DirtyCardQueue::G1DirtyCardQueue(G1DirtyCardQueueSet* qset) :
   // Dirty card queues are always active, so we create them with their
   // active field set to true.
-  PtrQueue(qset, true /* active */)
+  PtrQueue(qset, true /* active */),
+  _refinement_stats(new G1ConcurrentRefineStats())
 { }
 
 G1DirtyCardQueue::~G1DirtyCardQueue() {
   flush();
+  delete _refinement_stats;
+}
+
+void G1DirtyCardQueue::flush() {
+  _refinement_stats->inc_dirtied_cards(size());
+  flush_impl();
+}
+
+void G1DirtyCardQueue::on_thread_detach() {
+  assert(this == &G1ThreadLocalData::dirty_card_queue(Thread::current()), "precondition");
+  flush();
+  dirty_card_qset()->record_detached_refinement_stats(_refinement_stats);
 }
 
 void G1DirtyCardQueue::handle_completed_buffer() {
   assert(!is_empty(), "precondition");
+  _refinement_stats->inc_dirtied_cards(size());
   BufferNode* node = BufferNode::make_node_from_buffer(_buf, index());
   allocate_buffer();
-  dirty_card_qset()->handle_completed_buffer(node);
+  dirty_card_qset()->handle_completed_buffer(node, _refinement_stats);
 }
 
 // Assumed to be zero by concurrent threads.
@@ -74,15 +92,13 @@ G1DirtyCardQueueSet::G1DirtyCardQueueSet(BufferNode::Allocator* allocator) :
   _process_cards_threshold(ProcessCardsThresholdNever),
   _max_cards(MaxCardsUnlimited),
   _padded_max_cards(MaxCardsUnlimited),
-  _mutator_refined_cards_counters(NEW_C_HEAP_ARRAY(size_t, num_par_ids(), mtGC))
+  _detached_refinement_stats()
 {
-  ::memset(_mutator_refined_cards_counters, 0, num_par_ids() * sizeof(size_t));
   _all_active = true;
 }
 
 G1DirtyCardQueueSet::~G1DirtyCardQueueSet() {
   abandon_completed_buffers();
-  FREE_C_HEAP_ARRAY(size_t, _mutator_refined_cards_counters);
 }
 
 // Determines how many mutator threads can process the buffers in parallel.
@@ -90,14 +106,6 @@ uint G1DirtyCardQueueSet::num_par_ids() {
   return (uint)os::initial_active_processor_count();
 }
 
-size_t G1DirtyCardQueueSet::total_mutator_refined_cards() const {
-  size_t sum = 0;
-  for (uint i = 0; i < num_par_ids(); ++i) {
-    sum += _mutator_refined_cards_counters[i];
-  }
-  return sum;
-}
-
 void G1DirtyCardQueueSet::handle_zero_index_for_thread(Thread* t) {
   G1ThreadLocalData::dirty_card_queue(t).handle_zero_index();
 }
@@ -422,7 +430,7 @@ class G1RefineBufferedCards : public StackObj {
   CardTable::CardValue** const _node_buffer;
   const size_t _node_buffer_size;
   const uint _worker_id;
-  size_t* _total_refined_cards;
+  G1ConcurrentRefineStats* _stats;
   G1RemSet* const _g1rs;
 
   static inline int compare_card(const CardTable::CardValue* p1,
@@ -472,7 +480,8 @@ class G1RefineBufferedCards : public StackObj {
     const size_t first_clean = dst - _node_buffer;
     assert(first_clean >= start && first_clean <= _node_buffer_size, "invariant");
     // Discarded cards are considered as refined.
-    *_total_refined_cards += first_clean - start;
+    _stats->inc_refined_cards(first_clean - start);
+    _stats->inc_precleaned_cards(first_clean - start);
     return first_clean;
   }
 
@@ -488,7 +497,7 @@ class G1RefineBufferedCards : public StackObj {
       _g1rs->refine_card_concurrently(_node_buffer[i], _worker_id);
     }
     _node->set_index(i);
-    *_total_refined_cards += i - start_index;
+    _stats->inc_refined_cards(i - start_index);
     return result;
   }
 
@@ -502,12 +511,12 @@ class G1RefineBufferedCards : public StackObj {
   G1RefineBufferedCards(BufferNode* node,
                         size_t node_buffer_size,
                         uint worker_id,
-                        size_t* total_refined_cards) :
+                        G1ConcurrentRefineStats* stats) :
     _node(node),
     _node_buffer(reinterpret_cast<CardTable::CardValue**>(BufferNode::make_buffer_from_node(node))),
     _node_buffer_size(node_buffer_size),
     _worker_id(worker_id),
-    _total_refined_cards(total_refined_cards),
+    _stats(stats),
     _g1rs(G1CollectedHeap::heap()->rem_set()) {}
 
   bool refine() {
@@ -532,12 +541,15 @@ class G1RefineBufferedCards : public StackObj {
 
 bool G1DirtyCardQueueSet::refine_buffer(BufferNode* node,
                                         uint worker_id,
-                                        size_t* total_refined_cards) {
+                                        G1ConcurrentRefineStats* stats) {
+  Ticks start_time = Ticks::now();
   G1RefineBufferedCards buffered_cards(node,
                                        buffer_size(),
                                        worker_id,
-                                       total_refined_cards);
-  return buffered_cards.refine();
+                                       stats);
+  bool result = buffered_cards.refine();
+  stats->inc_refinement_time(Ticks::now() - start_time);
+  return result;
 }
 
 void G1DirtyCardQueueSet::handle_refined_buffer(BufferNode* node,
@@ -555,7 +567,8 @@ void G1DirtyCardQueueSet::handle_refined_buffer(BufferNode* node,
   }
 }
 
-void G1DirtyCardQueueSet::handle_completed_buffer(BufferNode* new_node) {
+void G1DirtyCardQueueSet::handle_completed_buffer(BufferNode* new_node,
+                                                  G1ConcurrentRefineStats* stats) {
   enqueue_completed_buffer(new_node);
 
   // No need for mutator refinement if number of cards is below limit.
@@ -574,9 +587,7 @@ void G1DirtyCardQueueSet::handle_completed_buffer(BufferNode* new_node) {
   // Refine cards in buffer.
 
   uint worker_id = _free_ids.claim_par_id(); // temporarily claim an id
-  uint counter_index = worker_id - par_ids_start();
-  size_t* counter = &_mutator_refined_cards_counters[counter_index];
-  bool fully_processed = refine_buffer(node, worker_id, counter);
+  bool fully_processed = refine_buffer(node, worker_id, stats);
   _free_ids.release_par_id(worker_id); // release the id
 
   // Deal with buffer after releasing id, to let another thread use id.
@@ -585,14 +596,14 @@ void G1DirtyCardQueueSet::handle_completed_buffer(BufferNode* new_node) {
 
 bool G1DirtyCardQueueSet::refine_completed_buffer_concurrently(uint worker_id,
                                                                size_t stop_at,
-                                                               size_t* total_refined_cards) {
+                                                               G1ConcurrentRefineStats* stats) {
   // Not enough cards to trigger processing.
   if (Atomic::load(&_num_cards) <= stop_at) return false;
 
   BufferNode* node = get_completed_buffer();
   if (node == NULL) return false; // Didn't get a buffer to process.
 
-  bool fully_processed = refine_buffer(node, worker_id, total_refined_cards);
+  bool fully_processed = refine_buffer(node, worker_id, stats);
   handle_refined_buffer(node, fully_processed);
   return true;
 }
@@ -600,12 +611,15 @@ bool G1DirtyCardQueueSet::refine_completed_buffer_concurrently(uint worker_id,
 void G1DirtyCardQueueSet::abandon_logs() {
   assert_at_safepoint();
   abandon_completed_buffers();
+  _detached_refinement_stats.reset();
 
   // Since abandon is done only at safepoints, we can safely manipulate
   // these queues.
   struct AbandonThreadLogClosure : public ThreadClosure {
     virtual void do_thread(Thread* t) {
-      G1ThreadLocalData::dirty_card_queue(t).reset();
+      G1DirtyCardQueue& dcq = G1ThreadLocalData::dirty_card_queue(t);
+      dcq.reset();
+      dcq.refinement_stats()->reset();
     }
   } closure;
   Threads::threads_do(&closure);
@@ -637,6 +651,40 @@ void G1DirtyCardQueueSet::concatenate_logs() {
   set_max_cards(old_limit);
 }
 
+G1ConcurrentRefineStats G1DirtyCardQueueSet::get_and_reset_refinement_stats() {
+  assert_at_safepoint();
+
+  // Since we're at a safepoint, there aren't any races with recording of
+  // detached refinement stats.  In particular, there's no risk of double
+  // counting a thread that detaches after we've examined it but before
+  // we've processed the detached stats.
+
+  // Collect and reset stats for attached threads.
+  struct CollectStats : public ThreadClosure {
+    G1ConcurrentRefineStats _total_stats;
+    virtual void do_thread(Thread* t) {
+      G1DirtyCardQueue& dcq = G1ThreadLocalData::dirty_card_queue(t);
+      G1ConcurrentRefineStats& stats = *dcq.refinement_stats();
+      _total_stats += stats;
+      stats.reset();
+    }
+  } closure;
+  Threads::threads_do(&closure);
+
+  // Collect and reset stats from detached threads.
+  MutexLocker ml(G1DetachedRefinementStats_lock, Mutex::_no_safepoint_check_flag);
+  closure._total_stats += _detached_refinement_stats;
+  _detached_refinement_stats.reset();
+
+  return closure._total_stats;
+}
+
+void G1DirtyCardQueueSet::record_detached_refinement_stats(G1ConcurrentRefineStats* stats) {
+  MutexLocker ml(G1DetachedRefinementStats_lock, Mutex::_no_safepoint_check_flag);
+  _detached_refinement_stats += *stats;
+  stats->reset();
+}
+
 size_t G1DirtyCardQueueSet::max_cards() const {
   return _max_cards;
 }
diff --git a/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp b/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp
index 375676cbf53..2b8b445154d 100644
--- a/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp
+++ b/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp
@@ -27,6 +27,7 @@
 
 #include "gc/g1/g1BufferNodeList.hpp"
 #include "gc/g1/g1FreeIdSet.hpp"
+#include "gc/g1/g1ConcurrentRefineStats.hpp"
 #include "gc/shared/ptrQueue.hpp"
 #include "memory/allocation.hpp"
 #include "memory/padded.hpp"
@@ -38,6 +39,8 @@ class Thread;
 
 // A ptrQueue whose elements are "oops", pointers to object heads.
 class G1DirtyCardQueue: public PtrQueue {
+  G1ConcurrentRefineStats* _refinement_stats;
+
 protected:
   virtual void handle_completed_buffer();
 
@@ -49,10 +52,18 @@ class G1DirtyCardQueue: public PtrQueue {
   ~G1DirtyCardQueue();
 
   // Process queue entries and release resources.
-  void flush() { flush_impl(); }
+  void flush();
 
   inline G1DirtyCardQueueSet* dirty_card_qset() const;
 
+  G1ConcurrentRefineStats* refinement_stats() const {
+    return _refinement_stats;
+  }
+
+  // To be called by the barrier set's on_thread_detach, to notify this
+  // object of the corresponding state change of its owning thread.
+  void on_thread_detach();
+
   // Compiler support.
   static ByteSize byte_offset_of_index() {
     return PtrQueue::byte_offset_of_index<G1DirtyCardQueue>();
@@ -215,9 +226,7 @@ class G1DirtyCardQueueSet: public PtrQueueSet {
   volatile size_t _padded_max_cards;
   static const size_t MaxCardsUnlimited = SIZE_MAX;
 
-  // Array of cumulative dirty cards refined by mutator threads.
-  // Array has an entry per id in _free_ids.
-  size_t* _mutator_refined_cards_counters;
+  G1ConcurrentRefineStats _detached_refinement_stats;
 
   // Verify _num_cards == sum of cards in the completed queue.
   void verify_num_cards() const NOT_DEBUG_RETURN;
@@ -241,9 +250,10 @@ class G1DirtyCardQueueSet: public PtrQueueSet {
   // is a pending yield request.  The node's index is updated to exclude
   // the processed elements, e.g. up to the element before processing
   // stopped, or one past the last element if the entire buffer was
-  // processed. Increments *total_refined_cards by the number of cards
-  // processed and removed from the buffer.
-  bool refine_buffer(BufferNode* node, uint worker_id, size_t* total_refined_cards);
+  // processed. Updates stats.
+  bool refine_buffer(BufferNode* node,
+                     uint worker_id,
+                     G1ConcurrentRefineStats* stats);
 
   // Deal with buffer after a call to refine_buffer.  If fully processed,
   // deallocate the buffer.  Otherwise, record it as paused.
@@ -296,26 +306,23 @@ class G1DirtyCardQueueSet: public PtrQueueSet {
   // Enqueue the buffer, and optionally perform refinement by the mutator.
   // Mutator refinement is only done by Java threads, and only if there
   // are more than max_cards (possibly padded) cards in the completed
-  // buffers.
+  // buffers.  Updates stats.
   //
   // Mutator refinement, if performed, stops processing a buffer if
   // SuspendibleThreadSet::should_yield(), recording the incompletely
   // processed buffer for later processing of the remainder.
-  void handle_completed_buffer(BufferNode* node);
+  void handle_completed_buffer(BufferNode* node, G1ConcurrentRefineStats* stats);
 
   // If there are more than stop_at cards in the completed buffers, pop
   // a buffer, refine its contents, and return true.  Otherwise return
-  // false.
+  // false.  Updates stats.
   //
   // Stops processing a buffer if SuspendibleThreadSet::should_yield(),
   // recording the incompletely processed buffer for later processing of
   // the remainder.
-  //
-  // Increments *total_refined_cards by the number of cards processed and
-  // removed from the buffer.
   bool refine_completed_buffer_concurrently(uint worker_id,
                                             size_t stop_at,
-                                            size_t* total_refined_cards);
+                                            G1ConcurrentRefineStats* stats);
 
   // If a full collection is happening, reset partial logs, and release
   // completed ones: the full collection will make them all irrelevant.
@@ -324,6 +331,14 @@ class G1DirtyCardQueueSet: public PtrQueueSet {
   // If any threads have partial logs, add them to the global list of logs.
   void concatenate_logs();
 
+  // Return the total of mutator refinement stats for all threads.
+  // Also resets the stats for the threads.
+  // precondition: at safepoint.
+  G1ConcurrentRefineStats get_and_reset_refinement_stats();
+
+  // Accumulate refinement stats from threads that are detaching.
+  void record_detached_refinement_stats(G1ConcurrentRefineStats* stats);
+
   // Threshold for mutator threads to also do refinement when there
   // are concurrent refinement threads.
   size_t max_cards() const;
@@ -336,9 +351,6 @@ class G1DirtyCardQueueSet: public PtrQueueSet {
 
   // Discard artificial increase of mutator refinement threshold.
   void discard_max_cards_padding();
-
-  // Total dirty cards refined by mutator threads.
-  size_t total_mutator_refined_cards() const;
 };
 
 inline G1DirtyCardQueueSet* G1DirtyCardQueue::dirty_card_qset() const {
diff --git a/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp b/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp
index 88fd9a7a7c8..feb9c033ca5 100644
--- a/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp
+++ b/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2020, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -157,6 +157,7 @@ void G1GCPhaseTimes::reset() {
   _cur_string_deduplication_time_ms = 0.0;
   _cur_prepare_tlab_time_ms = 0.0;
   _cur_resize_tlab_time_ms = 0.0;
+  _cur_concatenate_dirty_card_logs_time_ms = 0.0;
   _cur_derived_pointer_table_update_time_ms = 0.0;
   _cur_clear_ct_time_ms = 0.0;
   _cur_expand_heap_time_ms = 0.0;
@@ -377,6 +378,8 @@ void G1GCPhaseTimes::trace_count(const char* name, size_t value) const {
 
 double G1GCPhaseTimes::print_pre_evacuate_collection_set() const {
   const double sum_ms = _root_region_scan_wait_time_ms +
+                        _cur_prepare_tlab_time_ms +
+                        _cur_concatenate_dirty_card_logs_time_ms +
                         _recorded_young_cset_choice_time_ms +
                         _recorded_non_young_cset_choice_time_ms +
                         _cur_region_register_time +
@@ -389,6 +392,7 @@ double G1GCPhaseTimes::print_pre_evacuate_collection_set() const {
     debug_time("Root Region Scan Waiting", _root_region_scan_wait_time_ms);
   }
   debug_time("Prepare TLABs", _cur_prepare_tlab_time_ms);
+  debug_time("Concatenate Dirty Card Logs", _cur_concatenate_dirty_card_logs_time_ms);
   debug_time("Choose Collection Set", (_recorded_young_cset_choice_time_ms + _recorded_non_young_cset_choice_time_ms));
   debug_time("Region Register", _cur_region_register_time);
   if (G1EagerReclaimHumongousObjects) {
diff --git a/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp b/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp
index 5aa46240732..98680d396f6 100644
--- a/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp
+++ b/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2020, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -143,6 +143,8 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
   double _cur_prepare_tlab_time_ms;
   double _cur_resize_tlab_time_ms;
 
+  double _cur_concatenate_dirty_card_logs_time_ms;
+
   double _cur_derived_pointer_table_update_time_ms;
 
   double _cur_clear_ct_time_ms;
@@ -252,6 +254,10 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
     _cur_resize_tlab_time_ms = ms;
   }
 
+  void record_concatenate_dirty_card_logs_time_ms(double ms) {
+    _cur_concatenate_dirty_card_logs_time_ms = ms;
+  }
+
   void record_derived_pointer_table_update_time(double ms) {
     _cur_derived_pointer_table_update_time_ms = ms;
   }
diff --git a/src/hotspot/share/gc/g1/g1Policy.cpp b/src/hotspot/share/gc/g1/g1Policy.cpp
index 5f18cdfba0d..c046bbc0318 100644
--- a/src/hotspot/share/gc/g1/g1Policy.cpp
+++ b/src/hotspot/share/gc/g1/g1Policy.cpp
@@ -31,6 +31,7 @@
 #include "gc/g1/g1ConcurrentMark.hpp"
 #include "gc/g1/g1ConcurrentMarkThread.inline.hpp"
 #include "gc/g1/g1ConcurrentRefine.hpp"
+#include "gc/g1/g1ConcurrentRefineStats.hpp"
 #include "gc/g1/g1CollectionSetChooser.hpp"
 #include "gc/g1/g1HeterogeneousHeapPolicy.hpp"
 #include "gc/g1/g1HotCardCache.hpp"
@@ -43,7 +44,7 @@
 #include "gc/g1/heapRegionRemSet.hpp"
 #include "gc/shared/concurrentGCBreakpoints.hpp"
 #include "gc/shared/gcPolicyCounters.hpp"
-#include "logging/logStream.hpp"
+#include "logging/log.hpp"
 #include "runtime/arguments.hpp"
 #include "runtime/java.hpp"
 #include "runtime/mutexLocker.hpp"
@@ -72,10 +73,6 @@ G1Policy::G1Policy(STWGCTimer* gc_timer) :
   _rs_length(0),
   _rs_length_prediction(0),
   _pending_cards_at_gc_start(0),
-  _pending_cards_at_prev_gc_end(0),
-  _total_mutator_refined_cards(0),
-  _total_concurrent_refined_cards(0),
-  _total_concurrent_refinement_time(),
   _bytes_allocated_in_old_since_last_gc(0),
   _initial_mark_to_mixed(),
   _collection_set(NULL),
@@ -432,7 +429,7 @@ void G1Policy::record_full_collection_start() {
   collector_state()->set_in_young_only_phase(false);
   collector_state()->set_in_full_gc(true);
   _collection_set->clear_candidates();
-  record_concurrent_refinement_data(true /* is_full_collection */);
+  _pending_cards_at_gc_start = 0;
 }
 
 void G1Policy::record_full_collection_end() {
@@ -462,64 +459,62 @@ void G1Policy::record_full_collection_end() {
   _survivor_surv_rate_group->reset();
   update_young_list_max_and_target_length();
   update_rs_length_prediction();
-  _pending_cards_at_prev_gc_end = _g1h->pending_card_num();
 
   _bytes_allocated_in_old_since_last_gc = 0;
 
   record_pause(FullGC, _full_collection_start_sec, end_sec);
 }
 
-void G1Policy::record_concurrent_refinement_data(bool is_full_collection) {
-  _pending_cards_at_gc_start = _g1h->pending_card_num();
+static void log_refinement_stats(const char* kind, const G1ConcurrentRefineStats& stats) {
+  log_debug(gc, refine, stats)
+           ("%s refinement: %.2fms, refined: " SIZE_FORMAT
+            ", precleaned: " SIZE_FORMAT ", dirtied: " SIZE_FORMAT,
+            kind,
+            stats.refinement_time().seconds() * MILLIUNITS,
+            stats.refined_cards(),
+            stats.precleaned_cards(),
+            stats.dirtied_cards());
+}
+
+void G1Policy::record_concurrent_refinement_stats() {
+  G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
+  _pending_cards_at_gc_start = dcqs.num_cards();
 
-  // Record info about concurrent refinement thread processing.
+  // Collect per-thread stats, mostly from mutator activity.
+  G1ConcurrentRefineStats mut_stats = dcqs.get_and_reset_refinement_stats();
+
+  // Collect specialized concurrent refinement thread stats.
   G1ConcurrentRefine* cr = _g1h->concurrent_refine();
-  G1ConcurrentRefine::RefinementStats cr_stats = cr->total_refinement_stats();
-
-  Tickspan cr_time = cr_stats._time - _total_concurrent_refinement_time;
-  _total_concurrent_refinement_time = cr_stats._time;
-
-  size_t cr_cards = cr_stats._cards - _total_concurrent_refined_cards;
-  _total_concurrent_refined_cards = cr_stats._cards;
-
-  // Don't update rate if full collection.  We could be in an implicit full
-  // collection after a non-full collection failure, in which case there
-  // wasn't any mutator/cr-thread activity since last recording.  And if
-  // we're in an explicit full collection, the time since the last GC can
-  // be arbitrarily short, so not a very good sample.  Similarly, don't
-  // update the rate if the current sample is empty or time is zero.
-  if (!is_full_collection && (cr_cards > 0) && (cr_time > Tickspan())) {
-    double rate = cr_cards / (cr_time.seconds() * MILLIUNITS);
+  G1ConcurrentRefineStats cr_stats = cr->get_and_reset_refinement_stats();
+
+  G1ConcurrentRefineStats total_stats = mut_stats + cr_stats;
+
+  log_refinement_stats("Mutator", mut_stats);
+  log_refinement_stats("Concurrent", cr_stats);
+  log_refinement_stats("Total", total_stats);
+
+  // Record the rate at which cards were refined.
+  // Don't update the rate if the current sample is empty or time is zero.
+  Tickspan refinement_time = total_stats.refinement_time();
+  size_t refined_cards = total_stats.refined_cards();
+  if ((refined_cards > 0) && (refinement_time > Tickspan())) {
+    double rate = refined_cards / (refinement_time.seconds() * MILLIUNITS);
     _analytics->report_concurrent_refine_rate_ms(rate);
+    log_debug(gc, refine, stats)("Concurrent refinement rate: %.2f cards/ms", rate);
   }
 
-  // Record info about mutator thread processing.
-  G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
-  size_t mut_total_cards = dcqs.total_mutator_refined_cards();
-  size_t mut_cards = mut_total_cards - _total_mutator_refined_cards;
-  _total_mutator_refined_cards = mut_total_cards;
-
   // Record mutator's card logging rate.
-  // Don't update if full collection; see above.
-  if (!is_full_collection) {
-    size_t total_cards = _pending_cards_at_gc_start + cr_cards + mut_cards;
-    assert(_pending_cards_at_prev_gc_end <= total_cards,
-           "untracked cards: last pending: " SIZE_FORMAT
-           ", pending: " SIZE_FORMAT ", conc refine: " SIZE_FORMAT
-           ", mut refine:" SIZE_FORMAT,
-           _pending_cards_at_prev_gc_end, _pending_cards_at_gc_start,
-           cr_cards, mut_cards);
-    size_t logged_cards = total_cards - _pending_cards_at_prev_gc_end;
-    double logging_start_time = _analytics->prev_collection_pause_end_ms();
-    double logging_end_time = Ticks::now().seconds() * MILLIUNITS;
-    double logging_time = logging_end_time - logging_start_time;
-    // Unlike above for conc-refine rate, here we should not require a
-    // non-empty sample, since an application could go some time with only
-    // young-gen or filtered out writes.  But we'll ignore unusually short
-    // sample periods, as they may just pollute the predictions.
-    if (logging_time > 1.0) {   // Require > 1ms sample time.
-      _analytics->report_logged_cards_rate_ms(logged_cards / logging_time);
-    }
+  double mut_start_time = _analytics->prev_collection_pause_end_ms();
+  double mut_end_time = phase_times()->cur_collection_start_sec() * MILLIUNITS;
+  double mut_time = mut_end_time - mut_start_time;
+  // Unlike above for conc-refine rate, here we should not require a
+  // non-empty sample, since an application could go some time with only
+  // young-gen or filtered out writes.  But we'll ignore unusually short
+  // sample periods, as they may just pollute the predictions.
+  if (mut_time > 1.0) {   // Require > 1ms sample time.
+    double dirtied_rate = total_stats.dirtied_cards() / mut_time;
+    _analytics->report_dirtied_cards_rate_ms(dirtied_rate);
+    log_debug(gc, refine, stats)("Generate dirty cards rate: %.2f cards/ms", dirtied_rate);
   }
 }
 
@@ -536,7 +531,7 @@ void G1Policy::record_collection_pause_start(double start_time_sec) {
 
   phase_times()->record_cur_collection_start_sec(start_time_sec);
 
-  record_concurrent_refinement_data(false /* is_full_collection */);
+  record_concurrent_refinement_stats();
 
   _collection_set->reset_bytes_used_before();
 
@@ -830,7 +825,6 @@ void G1Policy::record_collection_pause_end(double pause_time_ms) {
     scan_logged_cards_time_goal_ms -= merge_hcc_time_ms;
   }
 
-  _pending_cards_at_prev_gc_end = _g1h->pending_card_num();
   double const logged_cards_time = logged_cards_processing_time();
 
   log_debug(gc, ergo, refine)("Concurrent refinement times: Logged Cards Scan time goal: %1.2fms Logged Cards Scan time: %1.2fms HCC time: %1.2fms",
diff --git a/src/hotspot/share/gc/g1/g1Policy.hpp b/src/hotspot/share/gc/g1/g1Policy.hpp
index 2535635b0f1..b52a9f7013d 100644
--- a/src/hotspot/share/gc/g1/g1Policy.hpp
+++ b/src/hotspot/share/gc/g1/g1Policy.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, 2020, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -101,10 +101,6 @@ class G1Policy: public CHeapObj<mtGC> {
   size_t _rs_length_prediction;
 
   size_t _pending_cards_at_gc_start;
-  size_t _pending_cards_at_prev_gc_end;
-  size_t _total_mutator_refined_cards;
-  size_t _total_concurrent_refined_cards;
-  Tickspan _total_concurrent_refinement_time;
 
   // The amount of allocated bytes in old gen during the last mutator and the following
   // young GC phase.
@@ -287,7 +283,8 @@ class G1Policy: public CHeapObj<mtGC> {
   // Indicate that we aborted marking before doing any mixed GCs.
   void abort_time_to_mixed_tracking();
 
-  void record_concurrent_refinement_data(bool is_full_collection);
+  // Record and log stats before not-full collection.
+  void record_concurrent_refinement_stats();
 
 public:
 
diff --git a/src/hotspot/share/gc/g1/g1RemSet.cpp b/src/hotspot/share/gc/g1/g1RemSet.cpp
index e89919107b7..cc2fbbe7295 100644
--- a/src/hotspot/share/gc/g1/g1RemSet.cpp
+++ b/src/hotspot/share/gc/g1/g1RemSet.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -910,9 +910,6 @@ void G1RemSet::prepare_region_for_scan(HeapRegion* region) {
 }
 
 void G1RemSet::prepare_for_scan_heap_roots() {
-  G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
-  dcqs.concatenate_logs();
-
   _scan_state->prepare();
 }
 
diff --git a/src/hotspot/share/gc/g1/g1RemSetSummary.cpp b/src/hotspot/share/gc/g1/g1RemSetSummary.cpp
index f78971fcc5f..6c06e3d8dff 100644
--- a/src/hotspot/share/gc/g1/g1RemSetSummary.cpp
+++ b/src/hotspot/share/gc/g1/g1RemSetSummary.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2020, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -47,12 +47,10 @@ void G1RemSetSummary::update() {
       G1ConcurrentRefineThread* crt = static_cast<G1ConcurrentRefineThread*>(t);
       _summary->set_rs_thread_vtime(_counter, crt->vtime_accum());
       _counter++;
-      _summary->_total_concurrent_refined_cards += crt->total_refined_cards();
     }
   } collector(this);
   G1CollectedHeap* g1h = G1CollectedHeap::heap();
   g1h->concurrent_refine()->threads_do(&collector);
-  _total_mutator_refined_cards = G1BarrierSet::dirty_card_queue_set().total_mutator_refined_cards();
   _num_coarsenings = HeapRegionRemSet::n_coarsenings();
 
   set_sampling_thread_vtime(g1h->sampling_thread()->vtime_accum());
@@ -71,8 +69,6 @@ double G1RemSetSummary::rs_thread_vtime(uint thread) const {
 }
 
 G1RemSetSummary::G1RemSetSummary(bool should_update) :
-  _total_mutator_refined_cards(0),
-  _total_concurrent_refined_cards(0),
   _num_coarsenings(0),
   _num_vtimes(G1ConcurrentRefine::max_num_threads()),
   _rs_threads_vtimes(NEW_C_HEAP_ARRAY(double, _num_vtimes, mtGC)),
@@ -93,9 +89,6 @@ void G1RemSetSummary::set(G1RemSetSummary* other) {
   assert(other != NULL, "just checking");
   assert(_num_vtimes == other->_num_vtimes, "just checking");
 
-  _total_mutator_refined_cards = other->total_mutator_refined_cards();
-  _total_concurrent_refined_cards = other->total_concurrent_refined_cards();
-
   _num_coarsenings = other->num_coarsenings();
 
   memcpy(_rs_threads_vtimes, other->_rs_threads_vtimes, sizeof(double) * _num_vtimes);
@@ -107,9 +100,6 @@ void G1RemSetSummary::subtract_from(G1RemSetSummary* other) {
   assert(other != NULL, "just checking");
   assert(_num_vtimes == other->_num_vtimes, "just checking");
 
-  _total_mutator_refined_cards = other->total_mutator_refined_cards() - _total_mutator_refined_cards;
-  _total_concurrent_refined_cards = other->total_concurrent_refined_cards() - _total_concurrent_refined_cards;
-
   _num_coarsenings = other->num_coarsenings() - _num_coarsenings;
 
   for (uint i = 0; i < _num_vtimes; i++) {
@@ -330,14 +320,6 @@ class HRRSStatsIter: public HeapRegionClosure {
 };
 
 void G1RemSetSummary::print_on(outputStream* out) {
-  out->print_cr(" Recent concurrent refinement statistics");
-  out->print_cr("  Of " SIZE_FORMAT " refined cards:", total_refined_cards());
-  out->print_cr("     " SIZE_FORMAT_W(8) " (%5.1f%%) by concurrent refinement threads.",
-                total_concurrent_refined_cards(),
-                percent_of(total_concurrent_refined_cards(), total_refined_cards()));
-  out->print_cr("     " SIZE_FORMAT_W(8) " (%5.1f%%) by mutator threads.",
-                total_mutator_refined_cards(),
-                percent_of(total_mutator_refined_cards(), total_refined_cards()));
   out->print_cr("  Did " SIZE_FORMAT " coarsenings.", num_coarsenings());
   out->print_cr("  Concurrent refinement threads times (s)");
   out->print("     ");
diff --git a/src/hotspot/share/gc/g1/g1RemSetSummary.hpp b/src/hotspot/share/gc/g1/g1RemSetSummary.hpp
index fea8d4e489c..4208bd32b5c 100644
--- a/src/hotspot/share/gc/g1/g1RemSetSummary.hpp
+++ b/src/hotspot/share/gc/g1/g1RemSetSummary.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2020, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -33,10 +33,6 @@ class G1RemSet;
 // A G1RemSetSummary manages statistical information about the G1RemSet
 
 class G1RemSetSummary {
-private:
-  size_t _total_mutator_refined_cards;
-  size_t _total_concurrent_refined_cards;
-
   size_t _num_coarsenings;
 
   size_t _num_vtimes;
@@ -70,18 +66,6 @@ class G1RemSetSummary {
     return _sampling_thread_vtime;
   }
 
-  size_t total_mutator_refined_cards() const {
-    return _total_mutator_refined_cards;
-  }
-
-  size_t total_concurrent_refined_cards() const {
-    return _total_concurrent_refined_cards;
-  }
-
-  size_t total_refined_cards() const {
-    return total_mutator_refined_cards() + total_concurrent_refined_cards();
-  }
-
   size_t num_coarsenings() const {
     return _num_coarsenings;
   }
diff --git a/src/hotspot/share/logging/logPrefix.hpp b/src/hotspot/share/logging/logPrefix.hpp
index 8652e7b6361..2027bf6bc29 100644
--- a/src/hotspot/share/logging/logPrefix.hpp
+++ b/src/hotspot/share/logging/logPrefix.hpp
@@ -82,6 +82,7 @@ DEBUG_ONLY(size_t Test_log_prefix_prefixer(char* buf, size_t len);)
   LOG_PREFIX(GCId::print_prefix, LOG_TAGS(gc, remset, tracking)) \
   LOG_PREFIX(GCId::print_prefix, LOG_TAGS(gc, ref)) \
   LOG_PREFIX(GCId::print_prefix, LOG_TAGS(gc, ref, start)) \
+  LOG_PREFIX(GCId::print_prefix, LOG_TAGS(gc, refine, stats)) \
   LOG_PREFIX(GCId::print_prefix, LOG_TAGS(gc, reloc)) \
   LOG_PREFIX(GCId::print_prefix, LOG_TAGS(gc, start)) \
   LOG_PREFIX(GCId::print_prefix, LOG_TAGS(gc, symboltable)) \
diff --git a/src/hotspot/share/runtime/mutexLocker.cpp b/src/hotspot/share/runtime/mutexLocker.cpp
index de8240b6cf0..badb5f7e19b 100644
--- a/src/hotspot/share/runtime/mutexLocker.cpp
+++ b/src/hotspot/share/runtime/mutexLocker.cpp
@@ -74,6 +74,7 @@ Monitor* STS_lock                     = NULL;
 Monitor* FullGCCount_lock             = NULL;
 Monitor* G1OldGCCount_lock            = NULL;
 Mutex*   Shared_DirtyCardQ_lock       = NULL;
+Mutex*   G1DetachedRefinementStats_lock = NULL;
 Mutex*   MarkStackFreeList_lock       = NULL;
 Mutex*   MarkStackChunkList_lock      = NULL;
 Mutex*   MonitoringSupport_lock       = NULL;
@@ -212,6 +213,8 @@ void mutex_init() {
 
     def(Shared_DirtyCardQ_lock     , PaddedMutex  , access + 1,  true,  _safepoint_check_never);
 
+    def(G1DetachedRefinementStats_lock, PaddedMutex, leaf    ,   true, _safepoint_check_never);
+
     def(FreeList_lock              , PaddedMutex  , leaf     ,   true,  _safepoint_check_never);
     def(OldSets_lock               , PaddedMutex  , leaf     ,   true,  _safepoint_check_never);
     def(RootRegionScan_lock        , PaddedMonitor, leaf     ,   true,  _safepoint_check_never);
@@ -269,7 +272,7 @@ void mutex_init() {
   def(PerfDataManager_lock         , PaddedMutex  , leaf,        true,  _safepoint_check_always); // used for synchronized access to PerfDataManager resources
 
   def(Threads_lock                 , PaddedMonitor, barrier,     true,  _safepoint_check_always);  // Used for safepoint protocol.
-  def(NonJavaThreadsList_lock      , PaddedMutex,   leaf,        true,  _safepoint_check_never);
+  def(NonJavaThreadsList_lock      , PaddedMutex,   barrier,     true,  _safepoint_check_never);
   def(NonJavaThreadsListSync_lock  , PaddedMutex,   leaf,        true,  _safepoint_check_never);
 
   def(VMOperationQueue_lock        , PaddedMonitor, nonleaf,     true,  _safepoint_check_never);  // VM_thread allowed to block on these
diff --git a/src/hotspot/share/runtime/mutexLocker.hpp b/src/hotspot/share/runtime/mutexLocker.hpp
index c46b70ffbeb..f618beb9962 100644
--- a/src/hotspot/share/runtime/mutexLocker.hpp
+++ b/src/hotspot/share/runtime/mutexLocker.hpp
@@ -71,6 +71,7 @@ extern Monitor* G1OldGCCount_lock;               // in support of "concurrent" f
 extern Mutex*   Shared_DirtyCardQ_lock;          // Lock protecting dirty card
                                                  // queue shared by
                                                  // non-Java threads.
+extern Mutex*   G1DetachedRefinementStats_lock;  // Lock protecting detached refinement stats
 extern Mutex*   MarkStackFreeList_lock;          // Protects access to the global mark stack free list.
 extern Mutex*   MarkStackChunkList_lock;         // Protects access to the global mark stack chunk list.
 extern Mutex*   MonitoringSupport_lock;          // Protects updates to the serviceability memory pools.
diff --git a/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java b/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java
index fbab8e008b0..00dbafe1764 100644
--- a/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java
+++ b/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java
@@ -143,6 +143,7 @@ public boolean isAvailable() {
         new LogMessageWithLevel("Expand Heap After Collection", Level.DEBUG),
         new LogMessageWithLevel("Region Register", Level.DEBUG),
         new LogMessageWithLevel("Prepare Heap Roots", Level.DEBUG),
+        new LogMessageWithLevel("Concatenate Dirty Card Logs", Level.DEBUG),
         // Free CSet
         new LogMessageWithLevel("Free Collection Set", Level.DEBUG),
         new LogMessageWithLevel("Serial Free Collection Set", Level.TRACE),
@@ -186,6 +187,7 @@ void checkMessagesAtLevel(OutputAnalyzer output, LogMessageWithLevel messages[],
 
     public static void main(String[] args) throws Exception {
         new TestGCLogMessages().testNormalLogs();
+        new TestGCLogMessages().testConcurrentRefinementLogs();
         new TestGCLogMessages().testWithToSpaceExhaustionLogs();
         new TestGCLogMessages().testWithInitialMark();
         new TestGCLogMessages().testExpandHeap();
@@ -221,6 +223,23 @@ private void testNormalLogs() throws Exception {
         output.shouldHaveExitValue(0);
     }
 
+    LogMessageWithLevel concRefineMessages[] = new LogMessageWithLevel[] {
+        new LogMessageWithLevel("Mutator refinement: ", Level.DEBUG),
+        new LogMessageWithLevel("Concurrent refinement: ", Level.DEBUG),
+        new LogMessageWithLevel("Total refinement: ", Level.DEBUG),
+        // "Concurrent refinement rate" optionally printed if any.
+        // "Generate dirty cards rate" optionally printed if any.
+    };
+
+    private void testConcurrentRefinementLogs() throws Exception {
+        ProcessBuilder pb = ProcessTools.createJavaProcessBuilder("-XX:+UseG1GC",
+                                                                  "-Xmx10M",
+                                                                  "-Xlog:gc+refine+stats=debug",
+                                                                  GCTest.class.getName());
+        OutputAnalyzer output = new OutputAnalyzer(pb.start());
+        checkMessagesAtLevel(output, concRefineMessages, Level.DEBUG);
+    }
+
     LogMessageWithLevel exhFailureMessages[] = new LogMessageWithLevel[] {
         new LogMessageWithLevel("Evacuation Failure", Level.DEBUG),
         new LogMessageWithLevel("Recalculate Used", Level.TRACE),
diff --git a/test/hotspot/jtreg/gc/g1/TestRemsetLoggingTools.java b/test/hotspot/jtreg/gc/g1/TestRemsetLoggingTools.java
index 19e11a71804..66114801bff 100644
--- a/test/hotspot/jtreg/gc/g1/TestRemsetLoggingTools.java
+++ b/test/hotspot/jtreg/gc/g1/TestRemsetLoggingTools.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2020, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -110,7 +110,7 @@ public static void expectPerRegionRSetSummaries(String result, int expectedCumul
     }
 
     public static void expectRSetSummaries(String result, int expectedCumulative, int expectedPeriodic) throws Exception {
-        int actualTotal = result.split("concurrent refinement statistics").length - 1;
+        int actualTotal = result.split("Concurrent refinement threads times").length - 1;
         int actualCumulative = result.split("Cumulative RS summary").length - 1;
 
         if (expectedCumulative != actualCumulative) {