Skip to content
Permalink
Browse files
1042: Watchdog causing multiple restarts for mlbridge
Reviewed-by: kcr, ehelin
  • Loading branch information
erikj79 committed May 21, 2021
1 parent 3111cc4 commit c1f869791e71a25c9b7e9f057a11a961b76ef37f
Showing 5 changed files with 25 additions and 5 deletions.
@@ -211,6 +211,7 @@ private void drain(Duration timeout) throws TimeoutException {
// Bots handled by this runner. NOTE(review): presumably the list handed to the constructor - confirm.
private final List<Bot> bots;
// Scheduled pool created with config.concurrency() as its core size; itemWatchdog() also synchronizes on it.
private final ScheduledThreadPoolExecutor executor;
// Global watchdog; itemWatchdog() pings it to signal that the scheduler is still executing items.
private final BotWatchdog botWatchdog;
// Active-time threshold above which itemWatchdog() logs an item as a possible error.
private final Duration watchdogWarnTimeout;
// Set to false by the constructor; volatile so that writes are visible to other threads.
private volatile boolean isReady;

private static final Logger log = Logger.getLogger("org.openjdk.skara.bot");
@@ -229,7 +230,8 @@ public BotRunner(BotRunnerConfiguration config, List<Bot> bots) {
}

executor = new ScheduledThreadPoolExecutor(config.concurrency());
botWatchdog = new BotWatchdog(Duration.ofMinutes(10));
botWatchdog = new BotWatchdog(config.watchdogTimeout());
watchdogWarnTimeout = config.watchdogWarnTimeout();
isReady = false;
}

@@ -263,14 +265,15 @@ private void itemWatchdog() {
synchronized (executor) {
for (var activeItem : active.entrySet()) {
var activeDuration = Duration.between(activeItem.getValue(), Instant.now());
if (activeDuration.compareTo(config.watchdogTimeout()) > 0) {
if (activeDuration.compareTo(watchdogWarnTimeout) > 0) {
log.severe("Item " + activeItem.getKey() + " has been active more than " + activeDuration +
" - this may be an error!");
// Reset the counter to avoid continuous reporting - once every watchdogWarnTimeout is enough
activeItem.setValue(Instant.now());
}
}
// Inform the global watchdog that the scheduler is still executing items
log.fine("Pinging Watchdog");
botWatchdog.ping();
}
}
@@ -415,9 +415,18 @@ Optional<HttpServerConfiguration> httpServer(BotRunner runner) {
/**
 * Returns the WorkItem watchdog timeout.
 *
 * The value is read from the "watchdog" entry of the "runner" section of the
 * configuration and parsed as an ISO-8601 duration string. If either the
 * section or the entry is missing, a default of 30 minutes is returned and
 * the fallback is logged at INFO level.
 *
 * @return the configured watchdog timeout, or 30 minutes when none is configured
 * @throws java.time.format.DateTimeParseException if the configured value is not a valid duration
 */
Duration watchdogTimeout() {
    if (!config.contains("runner") || !config.get("runner").contains("watchdog")) {
        log.info("No WorkItem watchdog timeout defined, using default value");
        // Single default only: a second return statement here would be unreachable code.
        return Duration.ofMinutes(30);
    } else {
        return Duration.parse(config.get("runner").get("watchdog").asString());
    }
}

/**
 * Returns the timeout after which a long-running WorkItem is reported.
 *
 * Uses the "watchdog_warn" entry of the "runner" configuration section when
 * present (parsed as an ISO-8601 duration string); otherwise logs the
 * fallback and returns the value of {@link #watchdogTimeout()}.
 *
 * @return the configured warning timeout, or the watchdog timeout if none is configured
 */
Duration watchdogWarnTimeout() {
    var hasWarnSetting = config.contains("runner") && config.get("runner").contains("watchdog_warn");
    if (hasWarnSetting) {
        var rawValue = config.get("runner").get("watchdog_warn").asString();
        return Duration.parse(rawValue);
    }

    // No dedicated warning timeout configured - fall back to the general watchdog timeout.
    log.info("No WorkItem watchdog_warn timeout defined, using watchdog value");
    return watchdogTimeout();
}
}
@@ -27,14 +27,15 @@
public class BotWatchdog {
private final Thread watchThread;
private final long maxWaitMillis;
private final Duration maxWait;
private volatile boolean hasBeenPinged = false;

private void threadMain() {
while (true) {
try {
Thread.sleep(maxWaitMillis);
if (!hasBeenPinged) {
System.out.println("No watchdog ping detected - exiting...");
System.out.println("No watchdog ping detected for " + maxWait + " - exiting...");
System.exit(1);
}
hasBeenPinged = false;
@@ -44,6 +45,7 @@ private void threadMain() {
}

BotWatchdog(Duration maxWait) {
this.maxWait = maxWait;
maxWaitMillis = maxWait.toMillis();
watchThread = new Thread(this::threadMain);
watchThread.setName("BotWatchdog");
@@ -313,7 +313,7 @@ void watchdogTrigger() throws TimeoutException {
var countdownLatch = new CountDownLatch(1);
var item = new TestBlockedWorkItem(countdownLatch);
var bot = new TestBot(item);
var runner = new BotRunner(config("{ \"runner\": { \"watchdog\": \"PT0.01S\", \"interval\": \"PT0.001S\" } }"), List.of(bot));
var runner = new BotRunner(config("{ \"runner\": { \"watchdog_warn\": \"PT0.01S\", \"interval\": \"PT0.001S\" } }"), List.of(bot));

var errors = new ArrayList<String>();
var log = Logger.getLogger("org.openjdk.skara.bot");
@@ -163,11 +163,15 @@ public static void main(String... args) {

applyLogging(jsonConfig);
var log = Logger.getLogger("org.openjdk.skara.bots.cli");
log.info("Starting BotLauncher");

BotRunnerConfiguration runnerConfig = null;
try {
runnerConfig = BotRunnerConfiguration.parse(jsonConfig, jsonFile.getParent());
} catch (ConfigurationError configurationError) {
log.severe("Failed to parse configuration file: " + jsonFile
+ " error message: " + configurationError.getMessage());
// Also print directly as logging may not be setup
System.out.println("Failed to parse configuration file: " + jsonFile);
System.out.println("Error message: " + configurationError.getMessage());
System.exit(1);
@@ -176,6 +180,8 @@ public static void main(String... args) {
var botFactories = BotFactory.getBotFactories().stream()
.collect(Collectors.toMap(BotFactory::name, Function.identity()));
if (botFactories.size() == 0) {
log.severe("Error: no bot factories found. Make sure the module path is correct. Exiting...");
// Also print directly as logging may not be setup
System.out.println("Error: no bot factories found. Make sure the module path is correct. Exiting...");
System.exit(1);
}

1 comment on commit c1f8697

@openjdk-notifier
Copy link

@openjdk-notifier openjdk-notifier bot commented on c1f8697 May 21, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.