From 386ee313903acf663a3d0842e8517ced7ce52158 Mon Sep 17 00:00:00 2001 From: Andrei Koval Date: Wed, 14 Feb 2018 09:57:41 +0100 Subject: [PATCH] Implement new VM pause monitor --- .../java/org/neo4j/util/Preconditions.java | 47 +++++ .../factory/GraphDatabaseSettings.java | 12 +- .../kernel/impl/cache/MeasureDoNothing.java | 124 ------------ ...orGc.java => VmPauseMonitorComponent.java} | 37 ++-- .../kernel/impl/factory/DataSourceModule.java | 7 +- .../kernel/monitoring/VmPauseMonitor.java | 176 ++++++++++++++++++ .../OnDemandDetailsExecutionMonitor.java | 53 ++++-- 7 files changed, 278 insertions(+), 178 deletions(-) create mode 100644 community/common/src/main/java/org/neo4j/util/Preconditions.java delete mode 100644 community/kernel/src/main/java/org/neo4j/kernel/impl/cache/MeasureDoNothing.java rename community/kernel/src/main/java/org/neo4j/kernel/impl/cache/{MonitorGc.java => VmPauseMonitorComponent.java} (53%) create mode 100644 community/kernel/src/main/java/org/neo4j/kernel/monitoring/VmPauseMonitor.java diff --git a/community/common/src/main/java/org/neo4j/util/Preconditions.java b/community/common/src/main/java/org/neo4j/util/Preconditions.java new file mode 100644 index 0000000000000..fa1c8b3ceb93f --- /dev/null +++ b/community/common/src/main/java/org/neo4j/util/Preconditions.java @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2002-2018 "Neo Technology," + * Network Engine for Objects in Lund AB [http://neotechnology.com] + * + * This file is part of Neo4j. + * + * Neo4j is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +package org.neo4j.util; + +public final class Preconditions +{ + private Preconditions() + { + // util class + } + + public static long requireNonNegative( long value ) + { + if ( value < 0 ) + { + throw new IllegalArgumentException( "Expected non-negative integer value, got " + value ); + } + return value; + } + + public static long requirePositive( long value ) + { + if ( value < 1 ) + { + throw new IllegalArgumentException( "Expected positive integer value, got " + value ); + } + return value; + } +} diff --git a/community/kernel/src/main/java/org/neo4j/graphdb/factory/GraphDatabaseSettings.java b/community/kernel/src/main/java/org/neo4j/graphdb/factory/GraphDatabaseSettings.java index c92b6f859aaa0..bb3601d570ba6 100644 --- a/community/kernel/src/main/java/org/neo4j/graphdb/factory/GraphDatabaseSettings.java +++ b/community/kernel/src/main/java/org/neo4j/graphdb/factory/GraphDatabaseSettings.java @@ -43,7 +43,6 @@ import org.neo4j.kernel.configuration.Settings; import org.neo4j.kernel.configuration.Title; import org.neo4j.kernel.configuration.ssl.SslPolicyConfigValidator; -import org.neo4j.kernel.impl.cache.MonitorGc; import org.neo4j.logging.Level; import org.neo4j.logging.LogTimeZone; @@ -617,16 +616,13 @@ public class GraphDatabaseSettings implements LoadableConfig public static final Setting forced_kernel_id = buildSetting( "unsupported.dbms.kernel_id", STRING, NO_DEFAULT ).constraint( illegalValueMessage( "has to be a valid kernel identifier", matches( "[a-zA-Z0-9]*" ) ) ).build(); - @SuppressWarnings( "unused" ) - @Description( "Amount of time in ms the GC monitor thread will wait before taking another measurement." ) @Internal - public static final Setting gc_monitor_interval = MonitorGc.Configuration.gc_monitor_wait_time; + public static final Setting vm_pause_monitor_measurement_duration = + setting( "unsupported.vm_pause_monitor.measurement_duration", DURATION, "100ms" ); - @SuppressWarnings( "unused" ) - @Description( "The amount of time in ms the monitor thread has to be blocked before logging a message it was " + - "blocked." ) @Internal - public static final Setting gc_monitor_block_threshold = MonitorGc.Configuration.gc_monitor_threshold; + public static final Setting vm_pause_monitor_stall_alert_threshold = + setting( "unsupported.vm_pause_monitor.stall_alert_threshold", DURATION, "100ms" ); @Description( "Relationship count threshold for considering a node to be dense" ) public static final Setting dense_node_threshold = diff --git a/community/kernel/src/main/java/org/neo4j/kernel/impl/cache/MeasureDoNothing.java b/community/kernel/src/main/java/org/neo4j/kernel/impl/cache/MeasureDoNothing.java deleted file mode 100644 index 68b3b92d446fd..0000000000000 --- a/community/kernel/src/main/java/org/neo4j/kernel/impl/cache/MeasureDoNothing.java +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2002-2018 "Neo Technology," - * Network Engine for Objects in Lund AB [http://neotechnology.com] - * - * This file is part of Neo4j. - * - * Neo4j is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ -package org.neo4j.kernel.impl.cache; - -import org.neo4j.logging.Log; - -public class MeasureDoNothing extends Thread -{ - public interface Monitor - { - default void started() - { - } - - void blocked( long millis ); - - default void stopped() - { - } - } - - public static Monitor loggingMonitor( Log log ) - { - return new Monitor() - { - @Override - public void started() - { - log.debug( "GC Monitor started. " ); - } - - @Override - public void stopped() - { - log.debug( "GC Monitor stopped. " ); - } - - @Override - public void blocked( long millis ) - { - log.warn( String.format( "GC Monitor: Application threads blocked for %dms.", millis ) ); - } - }; - } - - public static class CollectingMonitor implements Monitor - { - private long gcBlockTime; - - @Override - public void blocked( long millis ) - { - gcBlockTime += millis; - } - - public long getGcBlockTime() - { - return gcBlockTime; - } - } - - private volatile boolean measure = true; - - private final Monitor monitor; - private final long TIME_TO_WAIT; - private final long NOTIFICATION_THRESHOLD; - - public MeasureDoNothing( String threadName, Monitor monitor, long timeToWait, long pauseNotificationThreshold ) - { - super( threadName ); - this.monitor = monitor; - this.TIME_TO_WAIT = timeToWait; - this.NOTIFICATION_THRESHOLD = pauseNotificationThreshold + timeToWait; - setDaemon( true ); - } - - @Override - public void run() - { - monitor.started(); - while ( measure ) - { - long start = System.nanoTime(); - try - { - Thread.sleep( TIME_TO_WAIT ); - } - catch ( InterruptedException e ) - { - Thread.interrupted(); - } - long time = (System.nanoTime() - start) / 1_000_000; - if ( time > NOTIFICATION_THRESHOLD ) - { - long blockTime = time - TIME_TO_WAIT; - monitor.blocked( blockTime ); - } - } - monitor.stopped(); - } - - public void stopMeasuring() - { - measure = false; - this.interrupt(); - } -} diff --git a/community/kernel/src/main/java/org/neo4j/kernel/impl/cache/MonitorGc.java b/community/kernel/src/main/java/org/neo4j/kernel/impl/cache/VmPauseMonitorComponent.java similarity index 53% rename from community/kernel/src/main/java/org/neo4j/kernel/impl/cache/MonitorGc.java rename to community/kernel/src/main/java/org/neo4j/kernel/impl/cache/VmPauseMonitorComponent.java index 62c539f43f992..10fb052c52c2c 100644 --- a/community/kernel/src/main/java/org/neo4j/kernel/impl/cache/MonitorGc.java +++ b/community/kernel/src/main/java/org/neo4j/kernel/impl/cache/VmPauseMonitorComponent.java @@ -19,32 +19,19 @@ */ package org.neo4j.kernel.impl.cache; -import java.time.Duration; - -import org.neo4j.graphdb.config.Setting; +import org.neo4j.graphdb.factory.GraphDatabaseSettings; import org.neo4j.kernel.configuration.Config; import org.neo4j.kernel.lifecycle.Lifecycle; +import org.neo4j.kernel.monitoring.VmPauseMonitor; import org.neo4j.logging.Log; -import static org.neo4j.kernel.configuration.Settings.DURATION; -import static org.neo4j.kernel.configuration.Settings.setting; -import static org.neo4j.kernel.impl.cache.MeasureDoNothing.loggingMonitor; - -public class MonitorGc implements Lifecycle +public class VmPauseMonitorComponent implements Lifecycle { - public static class Configuration - { - public static final Setting gc_monitor_wait_time = setting( "unsupported.dbms.gc_monitor_wait_time", - DURATION, "100ms" ); - public static final Setting gc_monitor_threshold = setting("unsupported.dbms.gc_monitor_threshold", - DURATION, "200ms" ); - } - private final Config config; private final Log log; - private volatile MeasureDoNothing monitorGc; + private volatile VmPauseMonitor vmPauseMonitor; - public MonitorGc( Config config, Log log ) + public VmPauseMonitorComponent( Config config, Log log ) { this.config = config; this.log = log; @@ -58,17 +45,19 @@ public void init() @Override public void start() { - monitorGc = new MeasureDoNothing( "neo4j.PauseMonitor", loggingMonitor( log ), - config.get( Configuration.gc_monitor_wait_time ).toMillis(), - config.get( Configuration.gc_monitor_threshold ).toMillis() ); - monitorGc.start(); + vmPauseMonitor = new VmPauseMonitor( + config.get( GraphDatabaseSettings.vm_pause_monitor_measurement_duration ).toMillis(), + config.get( GraphDatabaseSettings.vm_pause_monitor_stall_alert_threshold ).toMillis(), + log, vmPauseInfo -> log.debug( "Detected VM stop-the-world pause: {}", vmPauseInfo ) + ); + vmPauseMonitor.start(); } @Override public void stop() { - monitorGc.stopMeasuring(); - monitorGc = null; + vmPauseMonitor.stop(); + vmPauseMonitor = null; } @Override diff --git a/community/kernel/src/main/java/org/neo4j/kernel/impl/factory/DataSourceModule.java b/community/kernel/src/main/java/org/neo4j/kernel/impl/factory/DataSourceModule.java index 7201c6ab4f075..63408884e3603 100644 --- a/community/kernel/src/main/java/org/neo4j/kernel/impl/factory/DataSourceModule.java +++ b/community/kernel/src/main/java/org/neo4j/kernel/impl/factory/DataSourceModule.java @@ -50,11 +50,9 @@ import org.neo4j.kernel.impl.api.dbms.NonTransactionalDbmsOperations; import org.neo4j.kernel.impl.api.explicitindex.InternalAutoIndexing; import org.neo4j.kernel.impl.api.index.IndexingService; -import org.neo4j.kernel.impl.cache.MonitorGc; +import org.neo4j.kernel.impl.cache.VmPauseMonitorComponent; import org.neo4j.kernel.impl.core.DatabasePanicEventGenerator; import org.neo4j.kernel.impl.core.EmbeddedProxySPI; -import org.neo4j.kernel.impl.core.NodeProxy; -import org.neo4j.kernel.impl.core.RelationshipProxy; import org.neo4j.kernel.impl.core.RelationshipTypeTokenHolder; import org.neo4j.kernel.impl.core.StartupStatisticsProvider; import org.neo4j.kernel.impl.core.ThreadToStatementContextBridge; @@ -65,7 +63,6 @@ import org.neo4j.kernel.impl.proc.ProcedureTransactionProvider; import org.neo4j.kernel.impl.proc.Procedures; import org.neo4j.kernel.impl.proc.TerminationGuardProvider; -import org.neo4j.kernel.impl.proc.TypeMappers.DefaultValueConverter; import org.neo4j.kernel.impl.query.QueryExecutionEngine; import org.neo4j.kernel.impl.store.StoreId; import org.neo4j.kernel.impl.transaction.log.files.LogFileCreationMonitor; @@ -209,7 +206,7 @@ public DataSourceModule( final PlatformModule platformModule, EditionModule edit dataSourceManager.register( neoStoreDataSource ); - life.add( new MonitorGc( config, logging.getInternalLog( MonitorGc.class ) ) ); + life.add( new VmPauseMonitorComponent( config, logging.getInternalLog( VmPauseMonitorComponent.class ) ) ); life.add( new PublishPageCacheTracerMetricsAfterStart( platformModule.tracers.pageCursorTracerSupplier ) ); diff --git a/community/kernel/src/main/java/org/neo4j/kernel/monitoring/VmPauseMonitor.java b/community/kernel/src/main/java/org/neo4j/kernel/monitoring/VmPauseMonitor.java new file mode 100644 index 0000000000000..eb72acf7ed1c2 --- /dev/null +++ b/community/kernel/src/main/java/org/neo4j/kernel/monitoring/VmPauseMonitor.java @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2002-2018 "Neo Technology," + * Network Engine for Objects in Lund AB [http://neotechnology.com] + * + * This file is part of Neo4j. + * + * Neo4j is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +package org.neo4j.kernel.monitoring; + +import java.lang.management.GarbageCollectorMXBean; +import java.lang.management.ManagementFactory; +import java.util.function.Consumer; + +import org.neo4j.logging.Log; + +import static java.lang.String.format; +import static java.lang.System.nanoTime; +import static java.lang.Thread.currentThread; +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.TimeUnit.MILLISECONDS; +import static java.util.concurrent.TimeUnit.NANOSECONDS; +import static org.neo4j.util.Preconditions.requirePositive; + +public final class VmPauseMonitor +{ + private final long measurementDurationnNs; + private final long stallAlertThresholdNs; + private final Log log; + private final Consumer listener; + private final Thread worker; + + public VmPauseMonitor( long measureIntervalMs, long stallAlertThresholdMs, Log log, Consumer listener ) + { + this.measurementDurationnNs = MILLISECONDS.toNanos( requirePositive( measureIntervalMs ) ); + this.stallAlertThresholdNs = MILLISECONDS.toNanos( requirePositive( stallAlertThresholdMs ) ); + this.log = requireNonNull( log ); + this.listener = requireNonNull( listener ); + this.worker = new Thread( this::run ); + worker.setName( "VmPauseMonitor" ); + worker.setDaemon( true ); + } + + public void start() + { + log.debug( "Starting VM pause monitor" ); + worker.start(); + } + + public void stop() + { + log.debug( "Stopping VM pause monitor" ); + worker.interrupt(); + try + { + worker.join(); + } + catch ( InterruptedException ignore ) + { + currentThread().interrupt(); + } + } + + private void run() + { + try + { + monitor(); + } + catch ( InterruptedException ignore ) + { + log.debug( "VM pause monitor stopped" ); + } + catch ( RuntimeException e ) + { + log.debug( "VM pause monitor failed", e ); + } + } + + private void monitor() throws InterruptedException + { + GcStats lastGcStats = getGcStats(); + long nextCheckPoint = nanoTime() + measurementDurationnNs; + + while ( !currentThread().isInterrupted() ) + { + NANOSECONDS.sleep( measurementDurationnNs ); + final long now = nanoTime(); + final long pauseNs = now - nextCheckPoint; + nextCheckPoint = now + measurementDurationnNs; + + final GcStats gcStats = getGcStats(); + if ( pauseNs >= stallAlertThresholdNs ) + { + final VmPauseInfo pauseInfo = new VmPauseInfo( + NANOSECONDS.toMillis( pauseNs ), + gcStats.time - lastGcStats.time, + gcStats.count - lastGcStats.count + ); + listener.accept( pauseInfo ); + } + lastGcStats = gcStats; + } + } + + public static class VmPauseInfo + { + private final long pauseTime; + private final long gcTime; + private final long gcCount; + + public VmPauseInfo( long pauseTime, long gcTime, long gcCount ) + { + this.pauseTime = pauseTime; + this.gcTime = gcTime; + this.gcCount = gcCount; + } + + public long getPauseTime() + { + return pauseTime; + } + + public long getGcTime() + { + return gcTime; + } + + public long getGcCount() + { + return gcCount; + } + + @Override + public String toString() + { + return format( "{pauseTime=%d, gcTime=%d, gcCount=%d}", pauseTime, gcTime, gcCount ); + } + } + + private static GcStats getGcStats() + { + long time = 0; + long count = 0; + for ( GarbageCollectorMXBean gcBean : ManagementFactory.getGarbageCollectorMXBeans() ) + { + time += gcBean.getCollectionTime(); + count += gcBean.getCollectionCount(); + } + return new GcStats( time, count ); + } + + private static class GcStats + { + private final long time; + private final long count; + + private GcStats( long time, long count ) + { + this.time = time; + this.count = count; + } + } +} diff --git a/community/kernel/src/main/java/org/neo4j/unsafe/impl/batchimport/staging/OnDemandDetailsExecutionMonitor.java b/community/kernel/src/main/java/org/neo4j/unsafe/impl/batchimport/staging/OnDemandDetailsExecutionMonitor.java index 8141371846abe..60dc20bcaa4de 100644 --- a/community/kernel/src/main/java/org/neo4j/unsafe/impl/batchimport/staging/OnDemandDetailsExecutionMonitor.java +++ b/community/kernel/src/main/java/org/neo4j/unsafe/impl/batchimport/staging/OnDemandDetailsExecutionMonitor.java @@ -28,14 +28,17 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.Consumer; import java.util.function.LongFunction; import org.neo4j.graphdb.DependencyResolver; import org.neo4j.helpers.Format; import org.neo4j.helpers.collection.Pair; -import org.neo4j.kernel.impl.cache.MeasureDoNothing; -import org.neo4j.kernel.impl.cache.MeasureDoNothing.CollectingMonitor; import org.neo4j.io.os.OsBeanUtil; +import org.neo4j.kernel.monitoring.VmPauseMonitor; +import org.neo4j.kernel.monitoring.VmPauseMonitor.VmPauseInfo; +import org.neo4j.logging.NullLog; import org.neo4j.unsafe.impl.batchimport.stats.DetailLevel; import org.neo4j.unsafe.impl.batchimport.stats.Keys; import org.neo4j.unsafe.impl.batchimport.stats.Stat; @@ -74,8 +77,8 @@ interface Monitor private final PrintStream out; private final InputStream in; private final Map> actions = new HashMap<>(); - private final CollectingMonitor gcBlockTime = new CollectingMonitor(); - private final MeasureDoNothing gcMonitor; + private final VmPauseTimeAccumulator vmPauseTimeAccumulator = new VmPauseTimeAccumulator(); + private final VmPauseMonitor vmPauseMonitor; private final Monitor monitor; private StageDetails current; @@ -88,7 +91,7 @@ public OnDemandDetailsExecutionMonitor( PrintStream out, InputStream in, Monitor this.monitor = monitor; this.actions.put( "i", Pair.of( "Print more detailed information", this::printDetails ) ); this.actions.put( "c", Pair.of( "Print more detailed information about current stage", this::printDetailsForCurrentStage ) ); - this.gcMonitor = new MeasureDoNothing( "Importer GC monitor", gcBlockTime, 100, 200 ); + this.vmPauseMonitor = new VmPauseMonitor( 100L, 100L, NullLog.getInstance(), vmPauseTimeAccumulator ); } @Override @@ -97,13 +100,13 @@ public void initialize( DependencyResolver dependencyResolver ) out.println( "InteractiveReporterInteractions command list (end with ENTER):" ); actions.forEach( ( key, action ) -> out.println( " " + key + ": " + action.first() ) ); out.println(); - gcMonitor.start(); + vmPauseMonitor.start(); } @Override public void start( StageExecution execution ) { - details.add( current = new StageDetails( execution, gcBlockTime ) ); + details.add( current = new StageDetails( execution, vmPauseTimeAccumulator ) ); } @Override @@ -119,7 +122,7 @@ public void done( long totalTimeMillis, String additionalInformation ) { printDetails(); } - gcMonitor.stopMeasuring(); + vmPauseMonitor.stop(); } @Override @@ -149,7 +152,7 @@ private void printDetails() printIndented( out, " Free physical memory: " + bytes( OsBeanUtil.getFreePhysicalMemory() ) ); printIndented( out, " Max VM memory: " + bytes( Runtime.getRuntime().maxMemory() ) ); printIndented( out, " Free VM memory: " + bytes( Runtime.getRuntime().freeMemory() ) ); - printIndented( out, " GC block time: " + duration( gcBlockTime.getGcBlockTime() ) ); + printIndented( out, " VM stop-the-world time: " + duration( vmPauseTimeAccumulator.getPauseTime() ) ); printIndented( out, " Duration: " + duration( totalTime ) ); out.println(); } @@ -206,21 +209,21 @@ private static class StageDetails { private final StageExecution execution; private final long startTime; - private final CollectingMonitor gcBlockTime; - private final long baseGcBlockTime; + private final VmPauseTimeAccumulator vmPauseTimeAccumulator; + private final long baseVmPauseTime; private long memoryUsage; private long ioThroughput; private long totalTimeMillis; - private long stageGcBlockTime; + private long stageVmPauseTime; private long doneBatches; - StageDetails( StageExecution execution, CollectingMonitor gcBlockTime ) + StageDetails( StageExecution execution, VmPauseTimeAccumulator vmPauseTimeAccumulator ) { this.execution = execution; - this.gcBlockTime = gcBlockTime; + this.vmPauseTimeAccumulator = vmPauseTimeAccumulator; + this.baseVmPauseTime = vmPauseTimeAccumulator.getPauseTime(); this.startTime = currentTimeMillis(); - this.baseGcBlockTime = gcBlockTime.getGcBlockTime(); } void print( PrintStream out ) @@ -231,7 +234,7 @@ void print( PrintStream out ) printIndented( out, builder.toString() ); printValue( out, memoryUsage, "Memory usage", Format::bytes ); printValue( out, ioThroughput, "I/O throughput", value -> bytes( value ) + "/s" ); - printValue( out, stageGcBlockTime, "GC block time", Format::duration ); + printValue( out, stageVmPauseTime, "VM stop-the-world time", Format::duration ); printValue( out, totalTimeMillis, "Duration", Format::duration ); printValue( out, doneBatches, "Done batches", String::valueOf ); @@ -249,7 +252,7 @@ private static void printValue( PrintStream out, long value, String description, void collect() { totalTimeMillis = currentTimeMillis() - startTime; - stageGcBlockTime = gcBlockTime.getGcBlockTime() - baseGcBlockTime; + stageVmPauseTime = vmPauseTimeAccumulator.getPauseTime() - baseVmPauseTime; long lastDoneBatches = doneBatches; for ( Step step : execution.steps() ) { @@ -269,4 +272,20 @@ void collect() doneBatches = lastDoneBatches; } } + + private static class VmPauseTimeAccumulator implements Consumer + { + private final AtomicLong totalPauseTime = new AtomicLong(); + + @Override + public void accept( VmPauseInfo pauseInfo ) + { + totalPauseTime.addAndGet( pauseInfo.getPauseTime() ); + } + + long getPauseTime() + { + return totalPauseTime.get(); + } + } }