diff --git a/integration-tests/src/test/java/oracle/weblogic/kubernetes/ItKubernetesEvents.java b/integration-tests/src/test/java/oracle/weblogic/kubernetes/ItKubernetesEvents.java index 0048f5e7d3a..8e0081f3a09 100644 --- a/integration-tests/src/test/java/oracle/weblogic/kubernetes/ItKubernetesEvents.java +++ b/integration-tests/src/test/java/oracle/weblogic/kubernetes/ItKubernetesEvents.java @@ -38,6 +38,7 @@ import oracle.weblogic.kubernetes.annotations.IntegrationTest; import oracle.weblogic.kubernetes.annotations.Namespaces; import oracle.weblogic.kubernetes.logging.LoggingFacade; +import org.awaitility.core.ConditionFactory; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.DisplayName; @@ -48,6 +49,8 @@ import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.ValueSource; +import static java.util.concurrent.TimeUnit.MINUTES; +import static java.util.concurrent.TimeUnit.SECONDS; import static oracle.weblogic.kubernetes.TestConstants.ADMIN_PASSWORD_DEFAULT; import static oracle.weblogic.kubernetes.TestConstants.ADMIN_USERNAME_DEFAULT; import static oracle.weblogic.kubernetes.TestConstants.BASE_IMAGES_REPO_SECRET; @@ -109,6 +112,7 @@ import static oracle.weblogic.kubernetes.utils.SecretUtils.createSecretWithUsernamePassword; import static oracle.weblogic.kubernetes.utils.ThreadSafeLogger.getLogger; import static oracle.weblogic.kubernetes.utils.WLSTUtils.executeWLSTScript; +import static org.awaitility.Awaitility.with; import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -155,6 +159,10 @@ class ItKubernetesEvents { private static LoggingFacade logger = null; + public static ConditionFactory withLongRetryPolicy = with().pollDelay(2, SECONDS) + .and().with().pollInterval(10, SECONDS) + .atMost(10, MINUTES).await(); + /** * Assigns unique namespaces for operator and domains. * Pull WebLogic image if running tests in Kind cluster. @@ -188,7 +196,7 @@ public static void initAll(@Namespaces(6) List namespaces) { externalRestHttpsPort = getServiceNodePort(opNamespace, "external-weblogic-operator-svc"); // This test uses the operator restAPI to scale the domain. To do this in OKD cluster, - // we need to expose the external service as route and set tls termination to passthrough + // we need to expose the external service as route and set tls termination to passthrough logger.info("Create a route for the operator external service - only for OKD"); String opExternalSvc = createRouteForOKD("external-weblogic-operator-svc", opNamespace); // Patch the route just created to set tls termination to passthrough @@ -292,13 +300,13 @@ void testDomainK8sEventsNonExistingCluster() { } /** - * Test the following domain events are logged when domain resource goes through various life cycle stages. - * Patch the domain resource to remove the webLogicCredentialsSecret and verify DomainChanged is - * logged when operator processes the domain resource changes. - * Verifies DomainProcessingRetrying is logged when operator retries the failed domain resource - * changes since webLogicCredentialsSecret is still missing. + * Test the following domain events are logged when domain resource goes through introspector failure. + * Patch the domain resource to shutdown servers. + * Patch the domain resource to point to a bad DOMAIN_HOME and update serverStartPolicy to IF_NEEDED. + * Verifies DomainProcessingFailed event is logged. * Verifies DomainProcessingAborted is logged when operator exceeds the maximum retries and gives * up processing the domain resource. + * Cleanup by patching the domain resource to a valid location and introspectVersion to bring up all servers again. */ @Order(4) @Test @@ -306,40 +314,68 @@ void testDomainK8sEventsNonExistingCluster() { void testDomainK8SEventsFailed() { V1Patch patch; String patchStr; + Domain domain = assertDoesNotThrow(() -> getDomainCustomResource(domainUid, domainNamespace1)); + String originalDomainHome = domain.getSpec().getDomainHome(); OffsetDateTime timestamp = now(); try { - logger.info("remove the webLogicCredentialsSecret to verify the following events" - + " DomainChanged, DomainProcessingRetrying and DomainProcessingAborted are logged"); - patchStr = "[{\"op\": \"remove\", \"path\": \"/spec/webLogicCredentialsSecret\"}]"; - logger.info("PatchStr for webLogicCredentialsSecret: {0}", patchStr); + logger.info("Shutting down all servers in domain with serverStartPolicy : NEVER"); + patchStr = "[{\"op\": \"replace\", \"path\": \"/spec/serverStartPolicy\", \"value\": \"NEVER\"}]"; + patch = new V1Patch(patchStr); + assertTrue(patchDomainCustomResource(domainUid, domainNamespace1, patch, V1Patch.PATCH_FORMAT_JSON_PATCH), + "patchDomainCustomResource failed"); + + logger.info("Checking if the admin server {0} is shutdown in namespace {1}", + adminServerPodName, domainNamespace1); + checkPodDoesNotExist(adminServerPodName, domainUid, domainNamespace1); + + for (int i = 1; i <= replicaCount; i++) { + logger.info("Checking if the managed server {0} is shutdown in namespace {1}", + managedServerPodNamePrefix + i, domainNamespace1); + checkPodDoesNotExist(managedServerPodNamePrefix + i, domainUid, domainNamespace1); + } + + logger.info("Replace the domainHome to a nonexisting location to verify the following events" + + " DomainChanged, DomainProcessingRetrying and DomainProcessingAborted are logged"); + patchStr = "[{\"op\": \"replace\", " + + "\"path\": \"/spec/domainHome\", \"value\": \"" + originalDomainHome + "bad\"}," + + "{\"op\": \"replace\", \"path\": \"/spec/serverStartPolicy\", \"value\": \"IF_NEEDED\"}]"; + logger.info("PatchStr for domainHome: {0}", patchStr); patch = new V1Patch(patchStr); assertTrue(patchDomainCustomResource(domainUid, domainNamespace1, patch, V1Patch.PATCH_FORMAT_JSON_PATCH), - "patchDomainCustomResource failed"); + "patchDomainCustomResource failed"); logger.info("verify domain changed event is logged"); checkEvent(opNamespace, domainNamespace1, domainUid, DOMAIN_CHANGED, "Normal", timestamp); - - // logger.info("verify domain processing retrying event"); - // checkEvent(opNamespace, domainNamespace1, domainUid, DOMAIN_PROCESSING_RETRYING, "Normal", timestamp); - logger.info("verify domain processing aborted event"); checkEvent(opNamespace, domainNamespace1, domainUid, DOMAIN_PROCESSING_ABORTED, "Warning", timestamp); } finally { + logger.info("Restoring the domain with valid location and bringing up all servers"); timestamp = now(); - // add back the webLogicCredentialsSecret - patchStr = "[{\"op\": \"add\", \"path\": \"/spec/webLogicCredentialsSecret\", " - + "\"value\" : {\"name\":\"" + wlSecretName + "\" , \"namespace\":\"" + domainNamespace1 + "\"}" - + "}]"; - logger.info("PatchStr for webLogicCredentialsSecret: {0}", patchStr); + String introspectVersion = assertDoesNotThrow(() -> getNextIntrospectVersion(domainUid, domainNamespace1)); + // add back the original domain home + patchStr = "[" + + "{\"op\": \"replace\", \"path\": \"/spec/domainHome\", \"value\": \"" + originalDomainHome + "\"}," + + "{\"op\": \"add\", \"path\": \"/spec/introspectVersion\", \"value\": \"" + introspectVersion + "\"}" + + "]"; + logger.info("PatchStr for domainHome: {0}", patchStr); patch = new V1Patch(patchStr); assertTrue(patchDomainCustomResource(domainUid, domainNamespace1, patch, V1Patch.PATCH_FORMAT_JSON_PATCH), - "patchDomainCustomResource failed"); + "patchDomainCustomResource failed"); logger.info("verify domain changed event is logged"); checkEvent(opNamespace, domainNamespace1, domainUid, DOMAIN_CHANGED, "Normal", timestamp); + logger.info("verifying the admin server is created and started"); + checkPodReadyAndServiceExists(adminServerPodName, domainUid, domainNamespace1); + + // verify managed server services created + for (int i = 1; i <= replicaCount; i++) { + logger.info("Checking managed server service/pod {0} is created in namespace {1}", + managedServerPodNamePrefix + i, domainNamespace1); + checkPodReadyAndServiceExists(managedServerPodNamePrefix + i, domainUid, domainNamespace1); + } } } @@ -889,7 +925,7 @@ public static void tearDown() { private static void checkEvent( String opNamespace, String domainNamespace, String domainUid, String reason, String type, OffsetDateTime timestamp) { - testUntil( + testUntil(withLongRetryPolicy, checkDomainEvent(opNamespace, domainNamespace, domainUid, reason, type, timestamp), logger, "domain event {0} to be logged in namespace {1}", diff --git a/operator/src/main/java/oracle/kubernetes/operator/DomainProcessorImpl.java b/operator/src/main/java/oracle/kubernetes/operator/DomainProcessorImpl.java index 60fc8ab09d7..9b40b0b3795 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/DomainProcessorImpl.java +++ b/operator/src/main/java/oracle/kubernetes/operator/DomainProcessorImpl.java @@ -841,7 +841,7 @@ private boolean shouldContinue() { return true; } else if (shouldReportAbortedEvent()) { return true; - } else if (hasExceededRetryCount() && !isImgRestartIntrospectVerChanged(liveInfo, cachedInfo)) { + } else if (hasExceededRetryCount(liveInfo) && !isImgRestartIntrospectVerChanged(liveInfo, cachedInfo)) { LOGGER.severe(ProcessingConstants.EXCEEDED_INTROSPECTOR_MAX_RETRY_COUNT_ERROR_MSG); return false; } else if (isFatalIntrospectorError()) { @@ -852,10 +852,7 @@ private boolean shouldContinue() { return false; // we have already cached this } else if (shouldRecheck(cachedInfo)) { - if (hasExceededRetryCount()) { - resetIntrospectorJobFailureCount(); - } - if (getCurrentIntrospectFailureRetryCount() > 0) { + if (getCurrentIntrospectFailureRetryCount(liveInfo) > 0) { logRetryCount(cachedInfo); } LOGGER.fine("Continue the make-right domain presence, explicitRecheck -> " + explicitRecheck); @@ -869,29 +866,9 @@ private boolean shouldReportAbortedEvent() { return Optional.ofNullable(eventData).map(EventData::getItem).orElse(null) == DOMAIN_PROCESSING_ABORTED; } - private void resetIntrospectorJobFailureCount() { - Optional.ofNullable(liveInfo) - .map(DomainPresenceInfo::getDomain) - .map(Domain::getStatus) - .map(DomainStatus::resetIntrospectJobFailureCount); - } - - private boolean hasExceededRetryCount() { - return getCurrentIntrospectFailureRetryCount() - >= DomainPresence.getDomainPresenceFailureRetryMaxCount(); - } - - private Integer getCurrentIntrospectFailureRetryCount() { - return Optional.ofNullable(liveInfo) - .map(DomainPresenceInfo::getDomain) - .map(Domain::getStatus) - .map(DomainStatus::getIntrospectJobFailureCount) - .orElse(0); - } - private void logRetryCount(DomainPresenceInfo cachedInfo) { LOGGER.info(MessageKeys.INTROSPECT_JOB_FAILED_RETRY_COUNT, cachedInfo.getDomain().getDomainUid(), - getCurrentIntrospectFailureRetryCount(), + getCurrentIntrospectFailureRetryCount(liveInfo), DomainPresence.getDomainPresenceFailureRetryMaxCount()); } @@ -1020,6 +997,19 @@ private static String getIntrospectVersion(DomainPresenceInfo info) { .orElse(null); } + private Integer getCurrentIntrospectFailureRetryCount(DomainPresenceInfo info) { + return Optional.ofNullable(info) + .map(DomainPresenceInfo::getDomain) + .map(Domain::getStatus) + .map(DomainStatus::getIntrospectJobFailureCount) + .orElse(0); + } + + private boolean hasExceededRetryCount(DomainPresenceInfo info) { + return getCurrentIntrospectFailureRetryCount(info) + >= DomainPresence.getDomainPresenceFailureRetryMaxCount(); + } + private static boolean isCachedInfoNewer(DomainPresenceInfo liveInfo, DomainPresenceInfo cachedInfo) { return liveInfo.getDomain() != null && KubernetesUtils.isFirstNewer(cachedInfo.getDomain().getMetadata(), liveInfo.getDomain().getMetadata()); @@ -1048,7 +1038,8 @@ public void onThrowable(Packet packet, Throwable throwable) { gate.startFiberIfLastFiberMatches( domainUid, Fiber.getCurrentIfSet(), - DomainStatusUpdater.createFailureRelatedSteps(throwable), + Step.chain(DomainStatusUpdater.createFailureCountStep(null), + DomainStatusUpdater.createFailureRelatedSteps(throwable)), plan.packet, new CompletionCallback() { @Override @@ -1071,7 +1062,7 @@ public void onThrowable(Packet packet, Throwable throwable) { LoggingContext.setThreadContext().namespace(ns).domainUid(domainUid)) { existing.setPopulated(false); // proceed only if we have not already retried max number of times - int retryCount = existing.incrementAndGetFailureCount(); + int retryCount = getCurrentIntrospectFailureRetryCount(existing); LOGGER.fine( "Failure count for DomainPresenceInfo: " + existing @@ -1128,6 +1119,11 @@ Step createDomainUpPlan(DomainPresenceInfo info) { bringAdminServerUp(info, delegate.getPodAwaiterStepFactory(info.getNamespace())), managedServerStrategy); + if (hasExceededRetryCount(info) && isImgRestartIntrospectVerChanged(info, + getExistingDomainPresenceInfo(info.getNamespace(), info.getDomainUid()))) { + domainUpStrategy = Step.chain(DomainStatusUpdater.createResetFailureCountStep(), domainUpStrategy); + } + return Step.chain( createDomainUpInitialStep(info), ConfigMapHelper.readExistingIntrospectorConfigMap(info.getNamespace(), info.getDomainUid()), @@ -1174,8 +1170,7 @@ private static class TailStep extends Step { @Override public NextAction apply(Packet packet) { - packet.getSpi(DomainPresenceInfo.class).complete(); - return doNext(packet); + return doNext(DomainStatusUpdater.createResetFailureCountStep(), packet); } } diff --git a/operator/src/main/java/oracle/kubernetes/operator/DomainStatusUpdater.java b/operator/src/main/java/oracle/kubernetes/operator/DomainStatusUpdater.java index f79799b7302..1fedd09e698 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/DomainStatusUpdater.java +++ b/operator/src/main/java/oracle/kubernetes/operator/DomainStatusUpdater.java @@ -165,7 +165,7 @@ static class FailureCountStep extends DomainStatusUpdaterStep { private final V1Job domainIntrospectorJob; - public FailureCountStep(@Nonnull V1Job domainIntrospectorJob) { + public FailureCountStep(V1Job domainIntrospectorJob) { super(null); this.domainIntrospectorJob = domainIntrospectorJob; } @@ -177,10 +177,22 @@ void modifyStatus(DomainStatus domainStatus) { @Nullable private String getJobUid() { - return Optional.of(domainIntrospectorJob).map(V1Job::getMetadata).map(V1ObjectMeta::getUid).orElse(null); + return Optional.ofNullable(domainIntrospectorJob).map(V1Job::getMetadata).map(V1ObjectMeta::getUid).orElse(null); } } + static class ResetFailureCountStep extends DomainStatusUpdaterStep { + + @Override + void modifyStatus(DomainStatus domainStatus) { + domainStatus.resetIntrospectJobFailureCount(); + } + } + + public static Step createResetFailureCountStep() { + return new ResetFailureCountStep(); + } + private static String getEventMessage(@Nonnull DomainFailureReason reason, String message) { return !StringUtils.isBlank(message) ? message : reason.toString(); } @@ -736,7 +748,6 @@ private boolean isDeleting(String serverName) { return Optional.ofNullable(getInfo().getServerPod(serverName)).map(PodHelper::isDeleting).orElse(false); } - private String getDesiredState(String serverName, String clusterName, boolean isAdminServer) { return isAdminServer | expectedRunningServers.contains(serverName) ? getDomain().getServer(serverName, clusterName).getDesiredState() diff --git a/operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java b/operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java index 31ef8b72902..7d03f05330b 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java +++ b/operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java @@ -137,7 +137,11 @@ public static boolean isComplete(V1Job job) { return false; } - static boolean isFailed(V1Job job) { + /** + * Returns true if the specified job has a failed status or condition. + * @param job job to be tested + */ + public static boolean isFailed(V1Job job) { if (job == null) { return false; } @@ -173,8 +177,12 @@ private static String getStatus(V1JobCondition jobCondition) { return Optional.ofNullable(jobCondition).map(V1JobCondition::getStatus).orElse(""); } - - static String getFailedReason(V1Job job) { + /** + * Get the reason for job failure. + * @param job job + * @return Job failure reason. + */ + public static String getFailedReason(V1Job job) { V1JobStatus status = job.getStatus(); if (status != null && status.getConditions() != null) { for (V1JobCondition cond : status.getConditions()) { @@ -298,7 +306,7 @@ void updatePacket(Packet packet, V1Job job) { // be available for reading @Override boolean shouldTerminateFiber(V1Job job) { - return isFailed(job) && ("DeadlineExceeded".equals(getFailedReason(job))); + return isJobTimedOut(job); } // create an exception to terminate the fiber @@ -328,6 +336,10 @@ public NextAction onSuccess(Packet packet, CallResponse callResponse) { } } + public static boolean isJobTimedOut(V1Job job) { + return isFailed(job) && ("DeadlineExceeded".equals(getFailedReason(job))); + } + static class DeadlineExceededException extends Exception { final V1Job job; diff --git a/operator/src/main/java/oracle/kubernetes/operator/helpers/DomainPresenceInfo.java b/operator/src/main/java/oracle/kubernetes/operator/helpers/DomainPresenceInfo.java index 373a4389220..565332117b1 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/helpers/DomainPresenceInfo.java +++ b/operator/src/main/java/oracle/kubernetes/operator/helpers/DomainPresenceInfo.java @@ -16,7 +16,6 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; @@ -62,7 +61,6 @@ public class DomainPresenceInfo implements PacketComponent { private final AtomicReference domain; private final AtomicBoolean isDeleting = new AtomicBoolean(false); private final AtomicBoolean isPopulated = new AtomicBoolean(false); - private final AtomicInteger retryCount = new AtomicInteger(0); private final AtomicReference> serverStartupInfo; private final AtomicReference> serverShutdownInfo; @@ -570,23 +568,6 @@ public void setPopulated(boolean populated) { isPopulated.set(populated); } - private void resetFailureCount() { - retryCount.set(0); - } - - public int incrementAndGetFailureCount() { - return retryCount.incrementAndGet(); - } - - int getRetryCount() { - return retryCount.get(); - } - - /** Sets the last completion time to now. */ - public void complete() { - resetFailureCount(); - } - /** * Gets the domain. Except the instance to change frequently based on status updates. * diff --git a/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java b/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java index 4cc6248ca77..ee7268247eb 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java +++ b/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java @@ -13,12 +13,16 @@ import javax.annotation.Nonnull; import javax.annotation.Nullable; +import io.kubernetes.client.openapi.models.V1ContainerState; +import io.kubernetes.client.openapi.models.V1ContainerStateWaiting; +import io.kubernetes.client.openapi.models.V1ContainerStatus; import io.kubernetes.client.openapi.models.V1DeleteOptions; import io.kubernetes.client.openapi.models.V1Job; import io.kubernetes.client.openapi.models.V1JobSpec; import io.kubernetes.client.openapi.models.V1ObjectMeta; import io.kubernetes.client.openapi.models.V1Pod; import io.kubernetes.client.openapi.models.V1PodList; +import io.kubernetes.client.openapi.models.V1PodStatus; import oracle.kubernetes.operator.IntrospectorConfigMapConstants; import oracle.kubernetes.operator.JobWatcher; import oracle.kubernetes.operator.LabelConstants; @@ -49,6 +53,7 @@ import static oracle.kubernetes.operator.DomainStatusUpdater.createFailureRelatedSteps; import static oracle.kubernetes.operator.LabelConstants.INTROSPECTION_DOMAIN_SPEC_GENERATION; import static oracle.kubernetes.operator.LabelConstants.INTROSPECTION_STATE_LABEL; +import static oracle.kubernetes.operator.ProcessingConstants.DOMAIN_INTROSPECTOR_JOB; import static oracle.kubernetes.operator.ProcessingConstants.DOMAIN_INTROSPECT_REQUESTED; import static oracle.kubernetes.operator.helpers.ConfigMapHelper.readExistingIntrospectorConfigMap; import static oracle.kubernetes.operator.logging.MessageKeys.INTROSPECTOR_JOB_FAILED; @@ -210,16 +215,16 @@ private class VerifyIntrospectorJobResponseStep extends DefaultResponseStep callResponse) { V1Job job = (V1Job) callResponse.getResult(); - if ((job != null) && (packet.get(ProcessingConstants.DOMAIN_INTROSPECTOR_JOB) == null)) { - packet.put(ProcessingConstants.DOMAIN_INTROSPECTOR_JOB, job); + if ((job != null) && (packet.get(DOMAIN_INTROSPECTOR_JOB) == null)) { + packet.put(DOMAIN_INTROSPECTOR_JOB, job); } - if (isKnownFailedJob(job)) { - return doNext(cleanUpAndReintrospect(), packet); + if (isKnownFailedJob(job) || JobWatcher.isJobTimedOut(job)) { + return doNext(cleanUpAndReintrospect(getNext()), packet); } else if (job != null) { - return doNext(processIntrospectionResults(), packet); + return doNext(processIntrospectionResults(getNext()), packet); } else if (isIntrospectionNeeded(packet)) { - return doNext(createIntrospectionSteps(), packet); + return doNext(createIntrospectionSteps(getNext()), packet); } else { return doNext(packet); } @@ -239,21 +244,6 @@ private String getLastFailedUid() { return getDomain().getOrCreateStatus().getFailedIntrospectionUid(); } - private Step cleanUpAndReintrospect() { - return Step.chain(deleteIntrospectorJob(), createIntrospectionSteps()); - } - - private Step createIntrospectionSteps() { - return Step.chain( - readExistingIntrospectorConfigMap(getNamespace(), getDomainUid()), - startNewIntrospection(getNext())); - } - - @Nonnull - private Step startNewIntrospection(Step next) { - return Step.chain(createNewJob(), processIntrospectionResults(), next); - } - private boolean isIntrospectionNeeded(Packet packet) { return getDomainTopology() == null || isBringingUpNewDomain(packet) @@ -315,22 +305,31 @@ private String getIntrospectVersion() { } } - // Returns a chain of steps which attempt to read the introspection result into a config map. - private Step processIntrospectionResults() { + private Step cleanUpAndReintrospect(Step next) { + return Step.chain(deleteIntrospectorJob(), createIntrospectionSteps(next)); + } + + private Step createIntrospectionSteps(Step next) { return Step.chain( - waitForIntrospectionToComplete(), - findIntrospectorPodName(), - readNamedPodLog(), - deleteIntrospectorJob(), - createIntrospectorConfigMap() - ); + readExistingIntrospectorConfigMap(getNamespace(), getDomainUid()), + startNewIntrospection(next)); + } + + @Nonnull + private Step startNewIntrospection(Step next) { + return Step.chain(createNewJob(), processIntrospectionResults(next)); + } + + // Returns a chain of steps which read the job pod and decide how to handle it. + private Step processIntrospectionResults(Step next) { + return Step.chain(waitForIntrospectionToComplete(), verifyJobPod(), next); } private Step waitForIntrospectionToComplete() { return new WatchDomainIntrospectorJobReadyStep(); } - private Step findIntrospectorPodName() { + private Step verifyJobPod() { return new ReadDomainIntrospectorPodStep(); } @@ -390,7 +389,7 @@ private class ReadPodLogResponseStep extends ResponseStep { public NextAction onSuccess(Packet packet, CallResponse callResponse) { Optional.ofNullable(callResponse.getResult()).ifPresent(result -> processIntrospectionResult(packet, result)); - final V1Job domainIntrospectorJob = packet.getValue(ProcessingConstants.DOMAIN_INTROSPECTOR_JOB); + final V1Job domainIntrospectorJob = packet.getValue(DOMAIN_INTROSPECTOR_JOB); if (JobWatcher.isComplete(domainIntrospectorJob)) { return doNext(packet); } else { @@ -559,30 +558,62 @@ private class PodListResponseStep extends ResponseStep { @Override public NextAction onSuccess(Packet packet, CallResponse callResponse) { - Optional.ofNullable(callResponse.getResult()) + final V1Pod jobPod + = Optional.ofNullable(callResponse.getResult()) .map(V1PodList::getItems) .orElseGet(Collections::emptyList) .stream() - .map(this::getName) - .filter(this::isJobPodName) + .filter(this::isJobPod) .findFirst() - .ifPresent(name -> recordJobPodName(packet, name)); + .orElse(null); - return doContinueListOrNext(callResponse, packet); + if (jobPod == null) { + return doContinueListOrNext(callResponse, packet, processIntrospectorPodLog(getNext())); + } else if (hasImagePullFailure(jobPod) || isJobPodTimedOut(jobPod)) { + return doNext(cleanUpAndReintrospect(getNext()), packet); + } else { + recordJobPodName(packet, getName(jobPod)); + return doNext(processIntrospectorPodLog(getNext()), packet); + } + } + + private boolean isJobPodTimedOut(V1Pod jobPod) { + return "DeadlineExceeded".equals(getJobPodStatusReason(jobPod)); + } + + private String getJobPodStatusReason(V1Pod jobPod) { + return Optional.ofNullable(jobPod.getStatus()).map(V1PodStatus::getReason).orElse(null); + } + + // Returns a chain of steps which read the pod log and create a config map. + private Step processIntrospectorPodLog(Step next) { + return Step.chain(readNamedPodLog(), deleteIntrospectorJob(), createIntrospectorConfigMap(), next); } private String getName(V1Pod pod) { return Optional.of(pod).map(V1Pod::getMetadata).map(V1ObjectMeta::getName).orElse(""); } - private boolean isJobPodName(String podName) { - return podName.startsWith(getJobName()); + private boolean isJobPod(V1Pod pod) { + return getName(pod).startsWith(getJobName()); + } + + private boolean hasImagePullFailure(V1Pod pod) { + return Optional.ofNullable(getJobPodContainerWaitingReason(pod)) + .map(s -> s.contains("ErrImagePull") || s.contains("ImagePullBackOff")) + .orElse(false); + } + + private String getJobPodContainerWaitingReason(V1Pod pod) { + return Optional.ofNullable(pod).map(V1Pod::getStatus) + .map(V1PodStatus::getContainerStatuses).map(statuses -> statuses.get(0)) + .map(V1ContainerStatus::getState).map(V1ContainerState::getWaiting) + .map(V1ContainerStateWaiting::getReason).orElse(null); } private void recordJobPodName(Packet packet, String podName) { packet.put(ProcessingConstants.JOB_POD_NAME, podName); } - } } @@ -605,7 +636,7 @@ private static void logIntrospectorFailure(Packet packet, V1Job domainIntrospect static void logJobDeleted(String domainUid, String namespace, String jobName, Packet packet) { V1Job domainIntrospectorJob = - (V1Job) packet.remove(ProcessingConstants.DOMAIN_INTROSPECTOR_JOB); + (V1Job) packet.remove(DOMAIN_INTROSPECTOR_JOB); packet.remove(ProcessingConstants.INTROSPECTOR_JOB_FAILURE_LOGGED); if (domainIntrospectorJob != null diff --git a/operator/src/main/java/oracle/kubernetes/operator/helpers/JobStepContext.java b/operator/src/main/java/oracle/kubernetes/operator/helpers/JobStepContext.java index 39a4cc70148..bdc9939cef6 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/helpers/JobStepContext.java +++ b/operator/src/main/java/oracle/kubernetes/operator/helpers/JobStepContext.java @@ -311,7 +311,12 @@ private V1ObjectMeta createMetadata() { private long getActiveDeadlineSeconds(TuningParameters.PodTuning podTuning) { return getIntrospectorJobActiveDeadlineSeconds(podTuning) - + (DEFAULT_ACTIVE_DEADLINE_INCREMENT_SECONDS * info.getRetryCount()); + + (DEFAULT_ACTIVE_DEADLINE_INCREMENT_SECONDS * getIntrospectJobFailureCount()); + } + + private Integer getIntrospectJobFailureCount() { + return Optional.ofNullable(info.getDomain().getStatus()) + .map(s -> s.getIntrospectJobFailureCount()).orElse(0); } V1JobSpec createJobSpec(TuningParameters tuningParameters) { diff --git a/operator/src/main/java/oracle/kubernetes/operator/helpers/SecretHelper.java b/operator/src/main/java/oracle/kubernetes/operator/helpers/SecretHelper.java index b2c08b78603..eefb9ea2ea7 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/helpers/SecretHelper.java +++ b/operator/src/main/java/oracle/kubernetes/operator/helpers/SecretHelper.java @@ -62,10 +62,13 @@ public NextAction apply(Packet packet) { secretName = dpi.getDomain().getWebLogicCredentialsSecretName(); namespace = dpi.getNamespace(); - LOGGER.fine(MessageKeys.RETRIEVING_SECRET, secretName); - Step read = new CallBuilder().readSecretAsync(secretName, namespace, new SecretResponseStep(getNext())); - - return doNext(read, packet); + if (secretName == null) { + return doNext(packet); + } else { + LOGGER.fine(MessageKeys.RETRIEVING_SECRET, secretName); + final Step read = new CallBuilder().readSecretAsync(secretName, namespace, new SecretResponseStep(getNext())); + return doNext(read, packet); + } } } diff --git a/operator/src/main/java/oracle/kubernetes/operator/steps/HttpRequestProcessing.java b/operator/src/main/java/oracle/kubernetes/operator/steps/HttpRequestProcessing.java index 823475b25f4..01696c60e13 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/steps/HttpRequestProcessing.java +++ b/operator/src/main/java/oracle/kubernetes/operator/steps/HttpRequestProcessing.java @@ -58,12 +58,14 @@ AuthorizationSource getAuthorizationSource() { } final HttpRequest.Builder createRequestBuilder(String url) { - return HttpRequest.newBuilder() + HttpRequest.Builder builder = HttpRequest.newBuilder() .uri(URI.create(url)) - .header("Authorization", getAuthorizationSource().createBasicAuthorizationString()) .header("Accept", "application/json") .header("Content-Type", "application/json") .header("X-Requested-By", "WebLogic Operator"); + Optional.ofNullable(getAuthorizationSource()) + .ifPresent(source -> builder.header("Authorization", source.createBasicAuthorizationString())); + return builder; } /** diff --git a/operator/src/main/java/oracle/kubernetes/operator/work/FiberGate.java b/operator/src/main/java/oracle/kubernetes/operator/work/FiberGate.java index 3867c7e1703..76465c16348 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/work/FiberGate.java +++ b/operator/src/main/java/oracle/kubernetes/operator/work/FiberGate.java @@ -109,14 +109,20 @@ public synchronized Fiber startFiberIfLastFiberMatches( new CompletionCallback() { @Override public void onCompletion(Packet packet) { - gateMap.remove(key, f); - callback.onCompletion(packet); + try { + callback.onCompletion(packet); + } finally { + gateMap.remove(key, f); + } } @Override public void onThrowable(Packet packet, Throwable throwable) { - gateMap.remove(key, f); - callback.onThrowable(packet, throwable); + try { + callback.onThrowable(packet, throwable); + } finally { + gateMap.remove(key, f); + } } }); return f; diff --git a/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/Domain.java b/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/Domain.java index 17b66a3ce41..0da34436800 100644 --- a/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/Domain.java +++ b/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/Domain.java @@ -458,7 +458,7 @@ public Domain withStatus(DomainStatus status) { * @return the secret name */ public String getWebLogicCredentialsSecretName() { - return spec.getWebLogicCredentialsSecret().getName(); + return Optional.ofNullable(spec.getWebLogicCredentialsSecret()).map(V1SecretReference::getName).orElse(null); } /** diff --git a/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/DomainStatus.java b/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/DomainStatus.java index 43d6116165c..11b001352cf 100644 --- a/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/DomainStatus.java +++ b/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/DomainStatus.java @@ -295,12 +295,20 @@ public Integer getIntrospectJobFailureCount() { * @param uid the Kubernetes-assigned UID of the job which discovered the introspection failure */ public void incrementIntrospectJobFailureCount(String uid) { - if (!Objects.equals(uid, failedIntrospectionUid)) { + if (fiberException(uid) || failedIntrospectionNotRecorded(uid)) { introspectJobFailureCount = introspectJobFailureCount + 1; } failedIntrospectionUid = uid; } + private boolean fiberException(String uid) { + return uid == null; + } + + private boolean failedIntrospectionNotRecorded(String uid) { + return !uid.equals(failedIntrospectionUid); + } + /** * Reset the number of introspect job failure to default. * @@ -520,4 +528,9 @@ public boolean equals(Object other) { public void createPatchFrom(JsonPatchBuilder builder, @Nullable DomainStatus oldStatus) { statusPatch.createPatch(builder, "/status", oldStatus, this); } + + public DomainStatus withIntrospectJobFailureCount(int failureCount) { + this.introspectJobFailureCount = failureCount; + return this; + } } diff --git a/operator/src/main/resources/Operator.properties b/operator/src/main/resources/Operator.properties index a1a23ef2f63..33b2cf60cf5 100644 --- a/operator/src/main/resources/Operator.properties +++ b/operator/src/main/resources/Operator.properties @@ -107,7 +107,9 @@ WLSKO-0154=Job {0} failed due to reason: DeadlineExceeded. \ Use kubectl describe for the job and its pod for more job failure information. \ The job may be retried by the operator up to {3} \ times with longer ActiveDeadlineSeconds value in each subsequent retry. \ - Use tuning parameter 'domainPresenceFailureRetryMaxCount' to configure max retries. + Use tuning parameter 'domainPresenceFailureRetryMaxCount' to configure max retries. \ + Use 'spec.configuration.introspectorJobActiveDeadlineSeconds' to increase the job \ + timeout interval if the job still fails after the retries are exhausted. WLSKO-0156=Access denied for operator service account for operation {0} on resource {1} in namespace {2}. WLSKO-0157=Domain {0} is not valid: {1} WLSKO-162=Unable to read internal certificate at path {0} diff --git a/operator/src/main/resources/scripts/modelInImage.sh b/operator/src/main/resources/scripts/modelInImage.sh index a72265bf038..15a2cdc4f6d 100755 --- a/operator/src/main/resources/scripts/modelInImage.sh +++ b/operator/src/main/resources/scripts/modelInImage.sh @@ -33,7 +33,12 @@ IMG_MODELS_ROOTDIR="${IMG_MODELS_HOME}" IMG_ARCHIVES_ROOTDIR="${IMG_MODELS_HOME}" IMG_VARIABLE_FILES_ROOTDIR="${IMG_MODELS_HOME}" WDT_ROOT="${WDT_INSTALL_HOME:-/u01/wdt/weblogic-deploy}" -WDT_OUTPUT="/tmp/wdt_output.log" +WDT_OUTPUT_DIR="${LOG_HOME:-/tmp}" +WDT_OUTPUT="${WDT_OUTPUT_DIR}/wdt_output.log" +WDT_CREATE_DOMAIN_LOG=createDomain.log +WDT_UPDATE_DOMAIN_LOG=updateDomain.log +WDT_VALIDATE_MODEL_LOG=validateModel.log +WDT_COMPARE_MODEL_LOG=compareModel.log WDT_BINDIR="${WDT_ROOT}/bin" WDT_FILTER_JSON="/weblogic-operator/scripts/model-filters.json" WDT_CREATE_FILTER="/weblogic-operator/scripts/model-wdt-create-filter.py" @@ -71,6 +76,10 @@ export WDT_MODEL_SECRETS_DIRS="/weblogic-operator/config-overrides-secrets" #For now: export WDT_MODEL_SECRETS_NAME_DIR_PAIRS="__weblogic-credentials__=/weblogic-operator/secrets,__WEBLOGIC-CREDENTIALS__=/weblogic-operator/secrets" +if [ ! -d "${WDT_OUTPUT_DIR}" ]; then + trace "Creating WDT standard output directory: '${WDT_OUTPUT_DIR}'" + createFolder "${WDT_OUTPUT_DIR}" +fi # sort_files sort the files according to the names and naming conventions and write the result to stdout # $1 directory @@ -667,6 +676,8 @@ function diff_model() { fi fi + wdtRotateAndCopyLogFile "${WDT_COMPARE_MODEL_LOG}" + trace "Exiting diff_model" } @@ -836,6 +847,8 @@ function generateMergedModel() { exitOrLoop fi + wdtRotateAndCopyLogFile "${WDT_VALIDATE_MODEL_LOG}" + # restore trap start_trap trace "Exiting generateMergedModel" @@ -933,6 +946,8 @@ function wdtCreatePrimordialDomain() { fi fi + wdtRotateAndCopyLogFile "${WDT_CREATE_DOMAIN_LOG}" + # restore trap start_trap trace "Exiting wdtCreatePrimordialDomain" @@ -1006,6 +1021,8 @@ function wdtUpdateModelDomain() { encrypt_decrypt_model "encrypt" ${DOMAIN_HOME}/wlsdeploy/domain_model.json.b64 ${MII_PASSPHRASE} \ ${DOMAIN_HOME}/wlsdeploy/domain_model.json + wdtRotateAndCopyLogFile "${WDT_UPDATE_DOMAIN_LOG}" + # restore trap start_trap trace "Exiting wdtUpdateModelDomain" @@ -1098,6 +1115,8 @@ function wdtHandleOnlineUpdate() { cp /tmp/encrypted_merge_model.json ${DOMAIN_HOME}/wlsdeploy/domain_model.json + wdtRotateAndCopyLogFile ${WDT_UPDATE_DOMAIN_LOG} + trace "wrote updateResult" start_trap @@ -1326,3 +1345,16 @@ function logSevereAndExit() { trace SEVERE "cp '$1' failed" exitOrLoop } + +# Function to rotate WDT script log file and copy the file to WDT output dir. +# parameter: +# 1 - Name of the log file to rotate and copy to WDT output directory. +function wdtRotateAndCopyLogFile() { + local logFileName=$1 + testLogFileRotate "${WDT_OUTPUT_DIR}/${logFileName}" + [ $? -ne 0 ] && trace SEVERE "Error accessing '${WDT_OUTPUT_DIR}'. See previous log messages." && exit 1 + + logFileRotate ${WDT_OUTPUT_DIR}/${logFileName} ${WDT_LOG_FILE_MAX:-11} + + cp ${WDT_ROOT}/logs/${logFileName} ${WDT_OUTPUT_DIR}/ +} diff --git a/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorDelegateStub.java b/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorDelegateStub.java index 8ac46e2f407..53698649bd1 100644 --- a/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorDelegateStub.java +++ b/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorDelegateStub.java @@ -17,6 +17,8 @@ import oracle.kubernetes.operator.work.Step; import static com.meterware.simplestub.Stub.createStrictStub; +import static oracle.kubernetes.operator.JobWatcher.getFailedReason; +import static oracle.kubernetes.operator.JobWatcher.isFailed; /** A test stub for processing domains in unit tests. */ public abstract class DomainProcessorDelegateStub implements DomainProcessorDelegate { @@ -50,7 +52,7 @@ public PodAwaiterStepFactory getPodAwaiterStepFactory(String namespace) { @Override public JobAwaiterStepFactory getJobAwaiterStepFactory(String namespace) { - return new PassthroughJobAwaiterStepFactory(); + return new TestJobAwaiterStepFactory(); } @Override @@ -96,9 +98,12 @@ public Step waitForDelete(V1Pod pod, Step next) { } } - private class PassthroughJobAwaiterStepFactory implements JobAwaiterStepFactory { + private class TestJobAwaiterStepFactory implements JobAwaiterStepFactory { @Override public Step waitForReady(V1Job job, Step next) { + if (isFailed(job) && "DeadlineExceeded".equals(getFailedReason(job))) { + throw new RuntimeException("DeadlineExceeded"); + } waitedForIntrospection = true; return next; } diff --git a/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java b/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java index 515765cd38b..a3d07d0dc7a 100644 --- a/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java +++ b/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java @@ -56,6 +56,7 @@ import oracle.kubernetes.operator.helpers.UnitTestHash; import oracle.kubernetes.operator.http.HttpAsyncTestSupport; import oracle.kubernetes.operator.http.HttpResponseStub; +import oracle.kubernetes.operator.logging.MessageKeys; import oracle.kubernetes.operator.rest.ScanCacheStub; import oracle.kubernetes.operator.utils.InMemoryCertificates; import oracle.kubernetes.operator.wlsconfig.WlsClusterConfig; @@ -73,6 +74,7 @@ import oracle.kubernetes.weblogic.domain.model.DomainStatus; import oracle.kubernetes.weblogic.domain.model.ManagedServer; import oracle.kubernetes.weblogic.domain.model.ServerStatus; +import org.jetbrains.annotations.NotNull; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; @@ -96,10 +98,12 @@ import static oracle.kubernetes.operator.LabelConstants.DOMAINUID_LABEL; import static oracle.kubernetes.operator.LabelConstants.INTROSPECTION_STATE_LABEL; import static oracle.kubernetes.operator.LabelConstants.SERVERNAME_LABEL; +import static oracle.kubernetes.operator.ProcessingConstants.DOMAIN_INTROSPECTOR_JOB; import static oracle.kubernetes.operator.WebLogicConstants.RUNNING_STATE; import static oracle.kubernetes.operator.WebLogicConstants.SHUTDOWN_STATE; import static oracle.kubernetes.operator.helpers.KubernetesTestSupport.CONFIG_MAP; import static oracle.kubernetes.operator.helpers.KubernetesTestSupport.DOMAIN; +import static oracle.kubernetes.operator.helpers.KubernetesTestSupport.JOB; import static oracle.kubernetes.operator.helpers.KubernetesTestSupport.POD; import static oracle.kubernetes.operator.helpers.KubernetesTestSupport.SERVICE; import static oracle.kubernetes.operator.helpers.SecretHelper.PASSWORD_KEY; @@ -838,6 +842,113 @@ void whenIntrospectionJobNotComplete_waitForIt() throws Exception { assertThat(processorDelegate.waitedForIntrospection(), is(true)); } + + @Test + void whenIntrospectionJobTimedOut_failureCountIncremented() throws Exception { + consoleHandlerMemento.ignoringLoggedExceptions(RuntimeException.class); + consoleHandlerMemento.ignoreMessage(MessageKeys.NOT_STARTING_DOMAINUID_THREAD); + establishPreviousIntrospection(null); + jobStatus = createTimedOutStatus(); + domainConfigurator.withIntrospectVersion(NEW_INTROSPECTION_STATE); + processor.createMakeRightOperation(new DomainPresenceInfo(newDomain)).interrupt().execute(); + + assertThat(newDomain.getStatus().getIntrospectJobFailureCount(), is(1)); + } + + @Test + void whenIntrospectionJobTimedOut_activeDeadlineIncremented() throws Exception { + consoleHandlerMemento.ignoringLoggedExceptions(RuntimeException.class); + consoleHandlerMemento.ignoreMessage(MessageKeys.NOT_STARTING_DOMAINUID_THREAD); + establishPreviousIntrospection(null); + jobStatus = createTimedOutStatus(); + domainConfigurator.withIntrospectVersion(NEW_INTROSPECTION_STATE); + processor.createMakeRightOperation(new DomainPresenceInfo(newDomain)).interrupt().execute(); + + executeScheduledRetry(); + + assertThat(getJob().getSpec().getActiveDeadlineSeconds(), is(240L)); + } + + private void executeScheduledRetry() { + testSupport.setTime(10, TimeUnit.SECONDS); + } + + + @NotNull + private V1Job getJob() { + return (V1Job) testSupport.getResources(JOB).stream().findFirst().get(); + } + + V1JobStatus createTimedOutStatus() { + return new V1JobStatus().addConditionsItem(new V1JobCondition().status("True").type("Failed") + .reason("DeadlineExceeded")); + } + + @Test + void whenIntrospectionJobPodTimedOut_jobRecreatedAndFailedConditionCleared() throws Exception { + consoleHandlerMemento.ignoringLoggedExceptions(RuntimeException.class); + consoleHandlerMemento.ignoreMessage(MessageKeys.NOT_STARTING_DOMAINUID_THREAD); + jobStatus = createBackoffStatus(); + establishPreviousIntrospection(null); + defineTimedoutIntrospection(); + testSupport.doOnDelete(JOB, j -> deletePod()); + testSupport.doOnCreate(JOB, j -> createJobPodAndSetCompletedStatus(job)); + domainConfigurator.withIntrospectVersion(NEW_INTROSPECTION_STATE); + processor.createMakeRightOperation(new DomainPresenceInfo(newDomain)).interrupt().execute(); + + assertThat(isDomainConditionFailed(), is(false)); + } + + private boolean isDomainConditionFailed() { + return newDomain.getStatus().getConditions().stream().anyMatch(c -> c.getType() == Failed); + } + + V1JobStatus createBackoffStatus() { + return new V1JobStatus().addConditionsItem(new V1JobCondition().status("True").type("Failed") + .reason("BackoffLimitExceeded")); + } + + private void deletePod() { + testSupport.deleteResources(new V1Pod().metadata(new V1ObjectMeta().name(getJobName()).namespace(NS))); + } + + private static String getJobName() { + return LegalNames.toJobIntrospectorName(UID); + } + + private void createJobPodAndSetCompletedStatus(V1Job job) { + Map labels = new HashMap<>(); + labels.put(LabelConstants.JOBNAME_LABEL, getJobName()); + testSupport.defineResources(POD, + new V1Pod().metadata(new V1ObjectMeta().name(getJobName()).labels(labels).namespace(NS))); + job.setStatus(createCompletedStatus()); + } + + private void defineTimedoutIntrospection() { + V1Job job = asTimedoutJob(createIntrospectorJob("TIMEDOUT_JOB")); + testSupport.defineResources(job); + testSupport.addToPacket(DOMAIN_INTROSPECTOR_JOB, job); + setJobPodStatusReasonDeadlineExceeded(); + } + + private void setJobPodStatusReasonDeadlineExceeded() { + testSupport.getResourceWithName(POD, getJobName()).status(new V1PodStatus().reason("DeadlineExceeded")); + } + + private V1Job asTimedoutJob(V1Job job) { + job.setStatus(new V1JobStatus().addConditionsItem(new V1JobCondition().status("True").type("Failed") + .reason("BackoffLimitExceeded"))); + return job; + } + + private V1Job createIntrospectorJob(String uid) { + return new V1Job().metadata(createJobMetadata(uid)).status(new V1JobStatus()); + } + + private V1ObjectMeta createJobMetadata(String uid) { + return new V1ObjectMeta().name(getJobName()).namespace(NS).creationTimestamp(SystemClock.now()).uid(uid); + } + private void establishPreviousIntrospection(Consumer domainSetup) throws JsonProcessingException { establishPreviousIntrospection(domainSetup, Arrays.asList(1,2)); } @@ -884,6 +995,8 @@ private String defineTopology() throws JsonProcessingException { return IntrospectionTestUtils.createTopologyYaml(createDomainConfig()); } + // case 1: job was able to pull, time out during introspection: pod will have DEADLINE_EXCEEDED + @Test void afterIntrospection_introspectorConfigMapHasUpToDateLabel() throws Exception { establishPreviousIntrospection(null); @@ -1367,10 +1480,6 @@ private String getDesiredState(Domain domain, String serverName) { return Optional.ofNullable(getServerStatus(domain, serverName)).map(ServerStatus::getDesiredState).orElse(""); } - private String getActualState(Domain domain, String serverName) { - return Optional.ofNullable(getServerStatus(domain, serverName)).map(ServerStatus::getState).orElse(""); - } - private ServerStatus getServerStatus(Domain domain, String serverName) { for (ServerStatus status : domain.getStatus().getServers()) { if (status.getServerName().equals(serverName)) { @@ -1404,9 +1513,10 @@ private void forceExceptionDuringProcessing() { } @Test - void whenExceptionDuringProcessing_sendAbortedEvent() { + void whenWebLogicCredentialsSecretRemoved_NullPointerExceptionAndAbortedEventNotGenerated() { + consoleHandlerMemento.ignoreMessage(NOT_STARTING_DOMAINUID_THREAD); DomainProcessorImpl.registerDomainPresenceInfo(new DomainPresenceInfo(domain)); - forceExceptionDuringProcessing(); + domain.getSpec().withWebLogicCredentialsSecret(null); int time = 0; for (int numRetries = 0; numRetries < DomainPresence.getDomainPresenceFailureRetryMaxCount(); numRetries++) { @@ -1415,7 +1525,7 @@ void whenExceptionDuringProcessing_sendAbortedEvent() { testSupport.setTime(time, TimeUnit.SECONDS); } - assertThat(getEvents().stream().anyMatch(this::isDomainProcessingAbortedEvent), is(true)); + assertThat(getEvents().stream().anyMatch(this::isDomainProcessingAbortedEvent), is(false)); } private List getEvents() { diff --git a/operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java b/operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java index 78628cb7e3d..aaac25fc064 100644 --- a/operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java +++ b/operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java @@ -6,7 +6,9 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.logging.Level; import java.util.logging.LogRecord; @@ -18,16 +20,22 @@ import io.kubernetes.client.openapi.ApiException; import io.kubernetes.client.openapi.models.V1ConfigMap; import io.kubernetes.client.openapi.models.V1Container; +import io.kubernetes.client.openapi.models.V1ContainerState; +import io.kubernetes.client.openapi.models.V1ContainerStateWaiting; +import io.kubernetes.client.openapi.models.V1ContainerStatus; import io.kubernetes.client.openapi.models.V1EmptyDirVolumeSource; import io.kubernetes.client.openapi.models.V1Job; import io.kubernetes.client.openapi.models.V1JobCondition; import io.kubernetes.client.openapi.models.V1JobStatus; import io.kubernetes.client.openapi.models.V1ObjectMeta; +import io.kubernetes.client.openapi.models.V1Pod; import io.kubernetes.client.openapi.models.V1PodSpec; +import io.kubernetes.client.openapi.models.V1PodStatus; import io.kubernetes.client.openapi.models.V1SecretReference; import io.kubernetes.client.openapi.models.V1Volume; import io.kubernetes.client.openapi.models.V1VolumeMount; import oracle.kubernetes.operator.JobAwaiterStepFactory; +import oracle.kubernetes.operator.LabelConstants; import oracle.kubernetes.operator.TuningParameters; import oracle.kubernetes.operator.calls.unprocessable.UnrecoverableErrorBuilderImpl; import oracle.kubernetes.operator.rest.ScanCacheStub; @@ -120,7 +128,7 @@ class DomainIntrospectorJobTest { private static final String INFO_MESSAGE = "@[INFO] just letting you know"; public static final String TEST_VOLUME_NAME = "test"; private static final String JOB_UID = "FAILED_JOB"; - + private static final String JOB_NAME = UID + "-introspector"; private final TerminalStep terminalStep = new TerminalStep(); private final Domain domain = createDomain(); private final DomainPresenceInfo domainPresenceInfo = createDomainPresenceInfo(domain); @@ -591,6 +599,66 @@ void whenPreviousFailedJobExists_createNewJob() { assertThat(affectedJob, notNullValue()); } + @Test + void whenPreviousFailedJobWithImagePullErrorExistsAndMakeRightContinued_createNewJob() { + ignoreIntrospectorFailureLogs(); + ignoreJobCreatedAndDeletedLogs(); + testSupport.addToPacket(DOMAIN_TOPOLOGY, createDomainConfig("cluster-1")); + defineFailedIntrospectionWithImagePullError("ErrImagePull"); + testSupport.doOnCreate(JOB, this::recordJob); + testSupport.doAfterCall(JOB, "deleteJob", this::replaceFailedJobPodWithSuccess); + + testSupport.runSteps(JobHelper.createIntrospectionStartStep(null)); + + assertThat(affectedJob, notNullValue()); + } + + private void replaceFailedJobPodWithSuccess() { + testSupport.deleteResources(createIntrospectorJobPod()); + testSupport.defineResources(createIntrospectorJobPod()); + } + + private void defineFailedIntrospectionWithImagePullError(String imagePullError) { + testSupport.defineResources(asFailedJobWithBackoffLimitExceeded(createIntrospectorJob())); + testSupport.defineResources(asFailedJobPodWithImagePullError(createIntrospectorJobPod(), imagePullError)); + } + + private V1Pod asFailedJobPodWithImagePullError(V1Pod introspectorJobPod, String imagePullError) { + List statuses = List.of(createImagePullContainerStatus(imagePullError)); + return introspectorJobPod.status(new V1PodStatus().containerStatuses(statuses)); + } + + private V1ContainerStatus createImagePullContainerStatus(String imagePullError) { + return new V1ContainerStatus().state( + new V1ContainerState().waiting(new V1ContainerStateWaiting().reason(imagePullError))); + } + + private V1Pod createIntrospectorJobPod() { + Map labels = new HashMap<>(); + labels.put(LabelConstants.JOBNAME_LABEL, JOB_NAME); + return new V1Pod().metadata(new V1ObjectMeta().name(JOB_NAME).labels(labels).namespace(NS)); + } + + private V1Job asFailedJobWithBackoffLimitExceeded(V1Job job) { + job.setStatus(new V1JobStatus().addConditionsItem(new V1JobCondition().status("True").type("Failed") + .reason("BackoffLimitExceeded"))); + return job; + } + + @Test + void whenPreviousFailedJobWithImagePullBackoffErrorExistsAndMakeRightContinued_createNewJob() { + ignoreIntrospectorFailureLogs(); + ignoreJobCreatedAndDeletedLogs(); + testSupport.addToPacket(DOMAIN_TOPOLOGY, createDomainConfig("cluster-1")); + defineFailedIntrospectionWithImagePullError("ImagePullBackOff"); + testSupport.doOnCreate(JOB, this::recordJob); + testSupport.doAfterCall(JOB, "deleteJob", this::replaceFailedJobPodWithSuccess); + + testSupport.runSteps(JobHelper.createIntrospectionStartStep(null)); + + assertThat(affectedJob, notNullValue()); + } + private V1Job affectedJob; private void recordJob(Object job) { @@ -677,11 +745,15 @@ void whenJobLogContainsSevereError_logJobInfosOnDelete() { } private V1Job createIntrospectorJob() { - return new V1Job().metadata(createJobMetadata()).status(new V1JobStatus()); + return createIntrospectorJob(JOB_UID); + } + + private V1Job createIntrospectorJob(String uid) { + return new V1Job().metadata(createJobMetadata(uid)).status(new V1JobStatus()); } - private V1ObjectMeta createJobMetadata() { - return new V1ObjectMeta().name(getJobName()).namespace(NS).creationTimestamp(SystemClock.now()).uid(JOB_UID); + private V1ObjectMeta createJobMetadata(String uid) { + return new V1ObjectMeta().name(getJobName()).namespace(NS).creationTimestamp(SystemClock.now()).uid(uid); } @Test @@ -785,7 +857,6 @@ void whenJobLogContainsSevereErrorAndStatusHasDifferentJobUID_incrementFailureCo assertThat(getUpdatedDomain().getStatus().getIntrospectJobFailureCount(), equalTo(2)); } - // create job // add job uid to status // do NOT create pod log diff --git a/operator/src/test/java/oracle/kubernetes/operator/helpers/JobHelperTest.java b/operator/src/test/java/oracle/kubernetes/operator/helpers/JobHelperTest.java index a968beb32d8..396ff1e8bd6 100644 --- a/operator/src/test/java/oracle/kubernetes/operator/helpers/JobHelperTest.java +++ b/operator/src/test/java/oracle/kubernetes/operator/helpers/JobHelperTest.java @@ -50,6 +50,7 @@ import oracle.kubernetes.weblogic.domain.model.ConfigurationConstants; import oracle.kubernetes.weblogic.domain.model.Domain; import oracle.kubernetes.weblogic.domain.model.DomainSpec; +import oracle.kubernetes.weblogic.domain.model.DomainStatus; import oracle.kubernetes.weblogic.domain.model.DomainValidationBaseTest; import oracle.kubernetes.weblogic.domain.model.Istio; import oracle.kubernetes.weblogic.domain.model.ServerEnvVars; @@ -677,7 +678,9 @@ private static V1PodSpec getTemplateSpec(V1JobSpec jobSpec) { @Test void verify_introspectorPodSpec_activeDeadlineSeconds_retry_values() { - int failureCount = domainPresenceInfo.incrementAndGetFailureCount(); + domainPresenceInfo.getDomain() + .setStatus(new DomainStatus().withIntrospectJobFailureCount(1)); + int failureCount = domainPresenceInfo.getDomain().getStatus().getIntrospectJobFailureCount(); V1JobSpec jobSpec = createJobSpec(); @@ -1218,7 +1221,6 @@ void whenDomainIsIstioEnabled_istioLocalhostBindingsDisabledEnv_hasIstioUseLocal ); } - private V1Job job; private void recordJob(V1Job job) {