Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import oracle.weblogic.kubernetes.annotations.IntegrationTest;
import oracle.weblogic.kubernetes.annotations.Namespaces;
import oracle.weblogic.kubernetes.logging.LoggingFacade;
import org.awaitility.core.ConditionFactory;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.DisplayName;
Expand All @@ -48,6 +49,8 @@
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

import static java.util.concurrent.TimeUnit.MINUTES;
import static java.util.concurrent.TimeUnit.SECONDS;
import static oracle.weblogic.kubernetes.TestConstants.ADMIN_PASSWORD_DEFAULT;
import static oracle.weblogic.kubernetes.TestConstants.ADMIN_USERNAME_DEFAULT;
import static oracle.weblogic.kubernetes.TestConstants.BASE_IMAGES_REPO_SECRET;
Expand Down Expand Up @@ -109,6 +112,7 @@
import static oracle.weblogic.kubernetes.utils.SecretUtils.createSecretWithUsernamePassword;
import static oracle.weblogic.kubernetes.utils.ThreadSafeLogger.getLogger;
import static oracle.weblogic.kubernetes.utils.WLSTUtils.executeWLSTScript;
import static org.awaitility.Awaitility.with;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
Expand Down Expand Up @@ -155,6 +159,10 @@ class ItKubernetesEvents {

private static LoggingFacade logger = null;

public static ConditionFactory withLongRetryPolicy = with().pollDelay(2, SECONDS)
.and().with().pollInterval(10, SECONDS)
.atMost(10, MINUTES).await();

/**
* Assigns unique namespaces for operator and domains.
* Pull WebLogic image if running tests in Kind cluster.
Expand Down Expand Up @@ -188,7 +196,7 @@ public static void initAll(@Namespaces(6) List<String> namespaces) {
externalRestHttpsPort = getServiceNodePort(opNamespace, "external-weblogic-operator-svc");

// This test uses the operator restAPI to scale the domain. To do this in OKD cluster,
// we need to expose the external service as route and set tls termination to passthrough
// we need to expose the external service as route and set tls termination to passthrough
logger.info("Create a route for the operator external service - only for OKD");
String opExternalSvc = createRouteForOKD("external-weblogic-operator-svc", opNamespace);
// Patch the route just created to set tls termination to passthrough
Expand Down Expand Up @@ -292,54 +300,82 @@ void testDomainK8sEventsNonExistingCluster() {
}

/**
* Test the following domain events are logged when domain resource goes through various life cycle stages.
* Patch the domain resource to remove the webLogicCredentialsSecret and verify DomainChanged is
* logged when operator processes the domain resource changes.
* Verifies DomainProcessingRetrying is logged when operator retries the failed domain resource
* changes since webLogicCredentialsSecret is still missing.
* Test the following domain events are logged when domain resource goes through introspector failure.
* Patch the domain resource to shutdown servers.
* Patch the domain resource to point to a bad DOMAIN_HOME and update serverStartPolicy to IF_NEEDED.
* Verifies DomainProcessingFailed event is logged.
* Verifies DomainProcessingAborted is logged when operator exceeds the maximum retries and gives
* up processing the domain resource.
* Cleanup by patching the domain resource to a valid location and introspectVersion to bring up all servers again.
*/
@Order(4)
@Test
@DisplayName("Test domain events for failed/retried domain life cycle changes")
void testDomainK8SEventsFailed() {
V1Patch patch;
String patchStr;
Domain domain = assertDoesNotThrow(() -> getDomainCustomResource(domainUid, domainNamespace1));
String originalDomainHome = domain.getSpec().getDomainHome();

OffsetDateTime timestamp = now();
try {
logger.info("remove the webLogicCredentialsSecret to verify the following events"
+ " DomainChanged, DomainProcessingRetrying and DomainProcessingAborted are logged");
patchStr = "[{\"op\": \"remove\", \"path\": \"/spec/webLogicCredentialsSecret\"}]";
logger.info("PatchStr for webLogicCredentialsSecret: {0}", patchStr);
logger.info("Shutting down all servers in domain with serverStartPolicy : NEVER");
patchStr = "[{\"op\": \"replace\", \"path\": \"/spec/serverStartPolicy\", \"value\": \"NEVER\"}]";
patch = new V1Patch(patchStr);
assertTrue(patchDomainCustomResource(domainUid, domainNamespace1, patch, V1Patch.PATCH_FORMAT_JSON_PATCH),
"patchDomainCustomResource failed");

logger.info("Checking if the admin server {0} is shutdown in namespace {1}",
adminServerPodName, domainNamespace1);
checkPodDoesNotExist(adminServerPodName, domainUid, domainNamespace1);

for (int i = 1; i <= replicaCount; i++) {
logger.info("Checking if the managed server {0} is shutdown in namespace {1}",
managedServerPodNamePrefix + i, domainNamespace1);
checkPodDoesNotExist(managedServerPodNamePrefix + i, domainUid, domainNamespace1);
}

logger.info("Replace the domainHome to a nonexisting location to verify the following events"
+ " DomainChanged, DomainProcessingRetrying and DomainProcessingAborted are logged");
patchStr = "[{\"op\": \"replace\", "
+ "\"path\": \"/spec/domainHome\", \"value\": \"" + originalDomainHome + "bad\"},"
+ "{\"op\": \"replace\", \"path\": \"/spec/serverStartPolicy\", \"value\": \"IF_NEEDED\"}]";
logger.info("PatchStr for domainHome: {0}", patchStr);

patch = new V1Patch(patchStr);
assertTrue(patchDomainCustomResource(domainUid, domainNamespace1, patch, V1Patch.PATCH_FORMAT_JSON_PATCH),
"patchDomainCustomResource failed");
"patchDomainCustomResource failed");

logger.info("verify domain changed event is logged");
checkEvent(opNamespace, domainNamespace1, domainUid, DOMAIN_CHANGED, "Normal", timestamp);

// logger.info("verify domain processing retrying event");
// checkEvent(opNamespace, domainNamespace1, domainUid, DOMAIN_PROCESSING_RETRYING, "Normal", timestamp);

logger.info("verify domain processing aborted event");
checkEvent(opNamespace, domainNamespace1, domainUid, DOMAIN_PROCESSING_ABORTED, "Warning", timestamp);
} finally {
logger.info("Restoring the domain with valid location and bringing up all servers");
timestamp = now();
// add back the webLogicCredentialsSecret
patchStr = "[{\"op\": \"add\", \"path\": \"/spec/webLogicCredentialsSecret\", "
+ "\"value\" : {\"name\":\"" + wlSecretName + "\" , \"namespace\":\"" + domainNamespace1 + "\"}"
+ "}]";
logger.info("PatchStr for webLogicCredentialsSecret: {0}", patchStr);
String introspectVersion = assertDoesNotThrow(() -> getNextIntrospectVersion(domainUid, domainNamespace1));
// add back the original domain home
patchStr = "["
+ "{\"op\": \"replace\", \"path\": \"/spec/domainHome\", \"value\": \"" + originalDomainHome + "\"},"
+ "{\"op\": \"add\", \"path\": \"/spec/introspectVersion\", \"value\": \"" + introspectVersion + "\"}"
+ "]";
logger.info("PatchStr for domainHome: {0}", patchStr);

patch = new V1Patch(patchStr);
assertTrue(patchDomainCustomResource(domainUid, domainNamespace1, patch, V1Patch.PATCH_FORMAT_JSON_PATCH),
"patchDomainCustomResource failed");
"patchDomainCustomResource failed");

logger.info("verify domain changed event is logged");
checkEvent(opNamespace, domainNamespace1, domainUid, DOMAIN_CHANGED, "Normal", timestamp);
logger.info("verifying the admin server is created and started");
checkPodReadyAndServiceExists(adminServerPodName, domainUid, domainNamespace1);

// verify managed server services created
for (int i = 1; i <= replicaCount; i++) {
logger.info("Checking managed server service/pod {0} is created in namespace {1}",
managedServerPodNamePrefix + i, domainNamespace1);
checkPodReadyAndServiceExists(managedServerPodNamePrefix + i, domainUid, domainNamespace1);
}
}
}

Expand Down Expand Up @@ -889,7 +925,7 @@ public static void tearDown() {
private static void checkEvent(
String opNamespace, String domainNamespace, String domainUid,
String reason, String type, OffsetDateTime timestamp) {
testUntil(
testUntil(withLongRetryPolicy,
checkDomainEvent(opNamespace, domainNamespace, domainUid, reason, type, timestamp),
logger,
"domain event {0} to be logged in namespace {1}",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -841,7 +841,7 @@ private boolean shouldContinue() {
return true;
} else if (shouldReportAbortedEvent()) {
return true;
} else if (hasExceededRetryCount() && !isImgRestartIntrospectVerChanged(liveInfo, cachedInfo)) {
} else if (hasExceededRetryCount(liveInfo) && !isImgRestartIntrospectVerChanged(liveInfo, cachedInfo)) {
LOGGER.severe(ProcessingConstants.EXCEEDED_INTROSPECTOR_MAX_RETRY_COUNT_ERROR_MSG);
return false;
} else if (isFatalIntrospectorError()) {
Expand All @@ -852,10 +852,7 @@ private boolean shouldContinue() {
return false; // we have already cached this
} else if (shouldRecheck(cachedInfo)) {

if (hasExceededRetryCount()) {
resetIntrospectorJobFailureCount();
}
if (getCurrentIntrospectFailureRetryCount() > 0) {
if (getCurrentIntrospectFailureRetryCount(liveInfo) > 0) {
logRetryCount(cachedInfo);
}
LOGGER.fine("Continue the make-right domain presence, explicitRecheck -> " + explicitRecheck);
Expand All @@ -869,29 +866,9 @@ private boolean shouldReportAbortedEvent() {
return Optional.ofNullable(eventData).map(EventData::getItem).orElse(null) == DOMAIN_PROCESSING_ABORTED;
}

private void resetIntrospectorJobFailureCount() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Main already has functionality for resetting the failure count. Why discard it in favor of the approach used in 3.3.3?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method is resetting the failure count in the domain status. I believe this approach is same in latest 3.3.3 release. Previously we had 2 failure/retry counts (1) in the DomainStatus and (2) in-memory count in DomainPresenceInfo. After the fix for OWLS-90180, we can no longer rely on the in-memory state of the operator as the introspector job might have been created before the operator started. I have removed the in-memory retry count in DomainPresenceInfo and made changes to use the failure count in domain status instead.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I hear the reasoning; I'm just surprised by the size of the change needed to do that.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok. I have used the same approach in PR 2580 which is merged into latest 3.3.3 release. Please let me know if you have other suggestions for this approach. Thanks.

Optional.ofNullable(liveInfo)
.map(DomainPresenceInfo::getDomain)
.map(Domain::getStatus)
.map(DomainStatus::resetIntrospectJobFailureCount);
}

private boolean hasExceededRetryCount() {
return getCurrentIntrospectFailureRetryCount()
>= DomainPresence.getDomainPresenceFailureRetryMaxCount();
}

private Integer getCurrentIntrospectFailureRetryCount() {
return Optional.ofNullable(liveInfo)
.map(DomainPresenceInfo::getDomain)
.map(Domain::getStatus)
.map(DomainStatus::getIntrospectJobFailureCount)
.orElse(0);
}

private void logRetryCount(DomainPresenceInfo cachedInfo) {
LOGGER.info(MessageKeys.INTROSPECT_JOB_FAILED_RETRY_COUNT, cachedInfo.getDomain().getDomainUid(),
getCurrentIntrospectFailureRetryCount(),
getCurrentIntrospectFailureRetryCount(liveInfo),
DomainPresence.getDomainPresenceFailureRetryMaxCount());
}

Expand Down Expand Up @@ -1020,6 +997,19 @@ private static String getIntrospectVersion(DomainPresenceInfo info) {
.orElse(null);
}

private Integer getCurrentIntrospectFailureRetryCount(DomainPresenceInfo info) {
return Optional.ofNullable(info)
.map(DomainPresenceInfo::getDomain)
.map(Domain::getStatus)
.map(DomainStatus::getIntrospectJobFailureCount)
.orElse(0);
}

private boolean hasExceededRetryCount(DomainPresenceInfo info) {
return getCurrentIntrospectFailureRetryCount(info)
>= DomainPresence.getDomainPresenceFailureRetryMaxCount();
}

private static boolean isCachedInfoNewer(DomainPresenceInfo liveInfo, DomainPresenceInfo cachedInfo) {
return liveInfo.getDomain() != null
&& KubernetesUtils.isFirstNewer(cachedInfo.getDomain().getMetadata(), liveInfo.getDomain().getMetadata());
Expand Down Expand Up @@ -1048,7 +1038,8 @@ public void onThrowable(Packet packet, Throwable throwable) {
gate.startFiberIfLastFiberMatches(
domainUid,
Fiber.getCurrentIfSet(),
DomainStatusUpdater.createFailureRelatedSteps(throwable),
Step.chain(DomainStatusUpdater.createFailureCountStep(null),
DomainStatusUpdater.createFailureRelatedSteps(throwable)),
plan.packet,
new CompletionCallback() {
@Override
Expand All @@ -1071,7 +1062,7 @@ public void onThrowable(Packet packet, Throwable throwable) {
LoggingContext.setThreadContext().namespace(ns).domainUid(domainUid)) {
existing.setPopulated(false);
// proceed only if we have not already retried max number of times
int retryCount = existing.incrementAndGetFailureCount();
int retryCount = getCurrentIntrospectFailureRetryCount(existing);
LOGGER.fine(
"Failure count for DomainPresenceInfo: "
+ existing
Expand Down Expand Up @@ -1128,6 +1119,11 @@ Step createDomainUpPlan(DomainPresenceInfo info) {
bringAdminServerUp(info, delegate.getPodAwaiterStepFactory(info.getNamespace())),
managedServerStrategy);

if (hasExceededRetryCount(info) && isImgRestartIntrospectVerChanged(info,
getExistingDomainPresenceInfo(info.getNamespace(), info.getDomainUid()))) {
domainUpStrategy = Step.chain(DomainStatusUpdater.createResetFailureCountStep(), domainUpStrategy);
}

return Step.chain(
createDomainUpInitialStep(info),
ConfigMapHelper.readExistingIntrospectorConfigMap(info.getNamespace(), info.getDomainUid()),
Expand Down Expand Up @@ -1174,8 +1170,7 @@ private static class TailStep extends Step {

@Override
public NextAction apply(Packet packet) {
packet.getSpi(DomainPresenceInfo.class).complete();
return doNext(packet);
return doNext(DomainStatusUpdater.createResetFailureCountStep(), packet);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ static class FailureCountStep extends DomainStatusUpdaterStep {

private final V1Job domainIntrospectorJob;

public FailureCountStep(@Nonnull V1Job domainIntrospectorJob) {
public FailureCountStep(V1Job domainIntrospectorJob) {
super(null);
this.domainIntrospectorJob = domainIntrospectorJob;
}
Expand All @@ -177,10 +177,22 @@ void modifyStatus(DomainStatus domainStatus) {

@Nullable
private String getJobUid() {
return Optional.of(domainIntrospectorJob).map(V1Job::getMetadata).map(V1ObjectMeta::getUid).orElse(null);
return Optional.ofNullable(domainIntrospectorJob).map(V1Job::getMetadata).map(V1ObjectMeta::getUid).orElse(null);
}
}

static class ResetFailureCountStep extends DomainStatusUpdaterStep {

@Override
void modifyStatus(DomainStatus domainStatus) {
domainStatus.resetIntrospectJobFailureCount();
}
}

public static Step createResetFailureCountStep() {
return new ResetFailureCountStep();
}

private static String getEventMessage(@Nonnull DomainFailureReason reason, String message) {
return !StringUtils.isBlank(message) ? message : reason.toString();
}
Expand Down Expand Up @@ -736,7 +748,6 @@ private boolean isDeleting(String serverName) {
return Optional.ofNullable(getInfo().getServerPod(serverName)).map(PodHelper::isDeleting).orElse(false);
}


private String getDesiredState(String serverName, String clusterName, boolean isAdminServer) {
return isAdminServer | expectedRunningServers.contains(serverName)
? getDomain().getServer(serverName, clusterName).getDesiredState()
Expand Down
20 changes: 16 additions & 4 deletions operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,11 @@ public static boolean isComplete(V1Job job) {
return false;
}

static boolean isFailed(V1Job job) {
/**
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"Test if" is never a good Javadoc comment. This method is better described as "Returns true if the specified job has a failed status or condition." Then you don't need the @return. Also, it is bad style to describe a parameter simply by repeating its name. I usually say something like, "the job to be tested."

Copy link
Member Author

@ankedia ankedia Nov 8, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in eaf7651.

* Returns true if the specified job has a failed status or condition.
* @param job job to be tested
*/
public static boolean isFailed(V1Job job) {
if (job == null) {
return false;
}
Expand Down Expand Up @@ -173,8 +177,12 @@ private static String getStatus(V1JobCondition jobCondition) {
return Optional.ofNullable(jobCondition).map(V1JobCondition::getStatus).orElse("");
}


static String getFailedReason(V1Job job) {
/**
* Get the reason for job failure.
* @param job job
* @return Job failure reason.
*/
public static String getFailedReason(V1Job job) {
V1JobStatus status = job.getStatus();
if (status != null && status.getConditions() != null) {
for (V1JobCondition cond : status.getConditions()) {
Expand Down Expand Up @@ -298,7 +306,7 @@ void updatePacket(Packet packet, V1Job job) {
// be available for reading
@Override
boolean shouldTerminateFiber(V1Job job) {
return isFailed(job) && ("DeadlineExceeded".equals(getFailedReason(job)));
return isJobTimedOut(job);
}

// create an exception to terminate the fiber
Expand Down Expand Up @@ -328,6 +336,10 @@ public NextAction onSuccess(Packet packet, CallResponse<V1Job> callResponse) {
}
}

public static boolean isJobTimedOut(V1Job job) {
return isFailed(job) && ("DeadlineExceeded".equals(getFailedReason(job)));
}

static class DeadlineExceededException extends Exception {
final V1Job job;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
Expand Down Expand Up @@ -62,7 +61,6 @@ public class DomainPresenceInfo implements PacketComponent {
private final AtomicReference<Domain> domain;
private final AtomicBoolean isDeleting = new AtomicBoolean(false);
private final AtomicBoolean isPopulated = new AtomicBoolean(false);
private final AtomicInteger retryCount = new AtomicInteger(0);
private final AtomicReference<Collection<ServerStartupInfo>> serverStartupInfo;
private final AtomicReference<Collection<ServerShutdownInfo>> serverShutdownInfo;

Expand Down Expand Up @@ -570,23 +568,6 @@ public void setPopulated(boolean populated) {
isPopulated.set(populated);
}

private void resetFailureCount() {
retryCount.set(0);
}

public int incrementAndGetFailureCount() {
return retryCount.incrementAndGet();
}

int getRetryCount() {
return retryCount.get();
}

/** Sets the last completion time to now. */
public void complete() {
resetFailureCount();
}

/**
* Gets the domain. Except the instance to change frequently based on status updates.
*
Expand Down
Loading