Skip to content

Commit

Permalink
[fix] Fix lost state (instanceIdsToTerminate) on configuration change
Browse files Browse the repository at this point in the history
[fix] Fix maxtotaluses decrement logic

add logs in post job action to expose tasks terminated with problems

jenkinsci#322

add and fix tests
  • Loading branch information
pdk27 committed Jun 27, 2023
1 parent 9450fa8 commit 9aa7327
Show file tree
Hide file tree
Showing 5 changed files with 228 additions and 177 deletions.
93 changes: 61 additions & 32 deletions src/main/java/com/amazon/jenkins/ec2fleet/EC2RetentionStrategy.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
*/
public class EC2RetentionStrategy extends RetentionStrategy<SlaveComputer> implements ExecutorListener {

private static final int RE_CHECK_IN_MINUTE = 1;
private static final int RE_CHECK_IN_A_MINUTE = 1;

private static final Logger LOGGER = Logger.getLogger(EC2RetentionStrategy.class.getName());

Expand All @@ -42,52 +42,62 @@ public long check(final SlaveComputer computer) {
if (cloud == null) {
LOGGER.warning("Cloud is null for computer " + fc.getDisplayName()
+ ". This should be autofixed in a few minutes, if not please create an issue for the plugin");
return RE_CHECK_IN_MINUTE;
return RE_CHECK_IN_A_MINUTE;
}

// Ensure that the EC2FleetCloud cannot be mutated from under us while
// we're doing this check
// Ensure nobody provisions onto this node until we've done
// checking
boolean shouldAcceptTasks = fc.isAcceptingTasks();
boolean justTerminated = false;
boolean markedForTermination = false;
fc.setAcceptingTasks(false);
try {
if(fc.isIdle()) {
final EC2AgentTerminationReason reason;
if (isIdleForTooLong(cloud, fc)) {
reason = EC2AgentTerminationReason.IDLE_FOR_TOO_LONG;
Node node = fc.getNode();
if (node == null) {
return RE_CHECK_IN_A_MINUTE;
}

EC2AgentTerminationReason reason;
// Determine the reason for termination from specific to generic use cases.
// Reasoning for checking all cases of termination initiated by the plugin:
// A user-initiated change to cloud configuration creates a new EC2FleetCloud object, erasing class fields containing data like instance IDs to terminate.
// Hence, determine the reasons for termination here using persisted fields for accurate handling of termination.
if (fc.isMarkedForDeletion()) {
reason = EC2AgentTerminationReason.AGENT_DELETED;
} else if (cloud.hasExcessCapacity()) {
reason = EC2AgentTerminationReason.EXCESS_CAPACITY;
} else if (cloud instanceof EC2FleetCloud && !((EC2FleetCloud) cloud).hasUnlimitedUsesForNodes()
&& ((EC2FleetNode)node).getUsesRemaining() <= 0) {
reason = EC2AgentTerminationReason.MAX_TOTAL_USES_EXHAUSTED;
} else if (isIdleForTooLong(cloud, fc)) {
reason = EC2AgentTerminationReason.IDLE_FOR_TOO_LONG;
} else {
return 0;
}

// Find instance ID
Node compNode = fc.getNode();
if (compNode == null) {
return 0;
return RE_CHECK_IN_A_MINUTE;
}

final String instanceId = compNode.getNodeName();
if (cloud.scheduleToTerminate(instanceId, false, reason)) {
final String instanceId = node.getNodeName();
final boolean ignoreMinConstraints = reason.equals(EC2AgentTerminationReason.MAX_TOTAL_USES_EXHAUSTED);
if (cloud.scheduleToTerminate(instanceId, ignoreMinConstraints, reason)) {
// Instance successfully scheduled for termination, so no longer accept tasks (i.e. suspended)
shouldAcceptTasks = false;
LOGGER.fine(String.format("Suspended node %s after scheduling instance for termination, reason: %s.",
compNode.getDisplayName(), instanceId, reason));
justTerminated = true;
node.getDisplayName(), instanceId, reason));
markedForTermination = true;
}
}

if (cloud.isAlwaysReconnect() && !justTerminated && fc.isOffline() && !fc.isConnecting() && fc.isLaunchSupported()) {
// if connection to the computer is lost for some reason, try to reconnect if configured to do so.
if (cloud.isAlwaysReconnect() && !markedForTermination && fc.isOffline() && !fc.isConnecting() && fc.isLaunchSupported()) {
LOGGER.log(Level.INFO, "Reconnecting to instance: " + fc.getDisplayName());
fc.tryReconnect();
}
} finally {
fc.setAcceptingTasks(shouldAcceptTasks);
}

return RE_CHECK_IN_MINUTE;
return RE_CHECK_IN_A_MINUTE;
}

@Override
Expand Down Expand Up @@ -121,37 +131,56 @@ public void taskAccepted(Executor executor, Queue.Task task) {
final EC2FleetNode ec2FleetNode = computer.getNode();
if (ec2FleetNode != null) {
final int maxTotalUses = ec2FleetNode.getMaxTotalUses();
if (maxTotalUses <= -1) {
LOGGER.fine("maxTotalUses set to unlimited (" + ec2FleetNode.getMaxTotalUses() + ") for agent " + computer.getName());
} else if (maxTotalUses <= 1) {
LOGGER.info("maxTotalUses drained - suspending agent after current build " + computer.getName());
computer.setAcceptingTasks(false);
} else {
ec2FleetNode.setMaxTotalUses(ec2FleetNode.getMaxTotalUses() - 1);
LOGGER.info("Agent " + computer.getName() + " has " + ec2FleetNode.getMaxTotalUses() + " builds left");
if (maxTotalUses <= -1) { // unlimited uses
LOGGER.fine("maxTotalUses set to unlimited (" + maxTotalUses + ") for agent " + computer.getName());
} else { // limited uses
if (ec2FleetNode.getUsesRemaining() > 1) {
ec2FleetNode.decrementUsesRemaining();
LOGGER.info("Agent " + computer.getName() + " has " + ec2FleetNode.getUsesRemaining() + " builds left");
} else if (ec2FleetNode.getUsesRemaining() == 1) { // current task should be the last task for this agent
LOGGER.info(String.format("maxTotalUses drained - suspending agent %s after current build", computer.getName()));
computer.setAcceptingTasks(false);
ec2FleetNode.decrementUsesRemaining();
} else {
// don't decrement when usesRemaining=0, as -1 has a special meaning.
LOGGER.warning(String.format("Agent %s accepted a task after being suspended!!! MaxTotalUses: %d, uses remaining: %d",
computer.getName(), ec2FleetNode.getMaxTotalUses(), ec2FleetNode.getUsesRemaining()));
}
}
}
}
}

@Override
public void taskCompleted(Executor executor, Queue.Task task, long l) {
postJobAction(executor);
postJobAction(executor, null);
}

@Override
public void taskCompletedWithProblems(Executor executor, Queue.Task task, long l, Throwable throwable) {
postJobAction(executor);
postJobAction(executor, throwable);
}

private void postJobAction(Executor executor) {
private void postJobAction(final Executor executor, final Throwable throwable) {
if (throwable != null) {
LOGGER.warning(String.format("Build %s completed with problems on agent %s. TimeSpentInQueue: %ds, duration: %ds, problems: %s",
executor.getCurrentExecutable(), executor.getOwner().getName(),
TimeUnit.MILLISECONDS.toSeconds(executor.getTimeSpentInQueue()),
TimeUnit.MILLISECONDS.toSeconds(executor.getElapsedTime()), throwable.getMessage()));
} else {
LOGGER.info(String.format("Build %s completed successfully on agent %s. TimeSpentInQueue: %ds, duration: %ds.",
executor.getCurrentExecutable(), executor.getOwner().getName(),
TimeUnit.MILLISECONDS.toSeconds(executor.getTimeSpentInQueue()),
TimeUnit.MILLISECONDS.toSeconds(executor.getElapsedTime())));
}

final EC2FleetNodeComputer computer = (EC2FleetNodeComputer) executor.getOwner();
if(computer != null) {
if (computer != null) {
final EC2FleetNode ec2FleetNode = computer.getNode();
if (ec2FleetNode != null) {
final AbstractEC2FleetCloud cloud = ec2FleetNode.getCloud();
if (computer.countBusy() <= 1 && !computer.isAcceptingTasks()) {
LOGGER.info("Calling scheduleToTerminate for node " + ec2FleetNode.getNodeName() + " due to maxTotalUses (" + ec2FleetNode.getMaxTotalUses() + ")");
LOGGER.info("Calling scheduleToTerminate for node " + ec2FleetNode.getNodeName() + " due to exhausted maxTotalUses.");
// Schedule instance for termination even if it breaches minSize and minSpareSize constraints
cloud.scheduleToTerminate(ec2FleetNode.getNodeName(), true, EC2AgentTerminationReason.MAX_TOTAL_USES_EXHAUSTED);
}
Expand Down

This file was deleted.

Loading

0 comments on commit 9aa7327

Please sign in to comment.