From e3430100909f23f8c09f8815c44bbfedae9d8202 Mon Sep 17 00:00:00 2001 From: Anil Kedia Date: Fri, 5 Nov 2021 17:54:41 +0000 Subject: [PATCH 1/8] Changes to merge fix for OWLS-91212 (PR 2580) in main and fix a NPE when domain resource has missing secret. --- .../kubernetes/ItKubernetesEvents.java | 68 ++++++++---- .../operator/DomainProcessorImpl.java | 55 +++++----- .../operator/DomainStatusUpdater.java | 22 +++- .../kubernetes/operator/JobWatcher.java | 15 ++- .../operator/helpers/DomainPresenceInfo.java | 19 ---- .../operator/helpers/JobHelper.java | 97 +++++++++++++++-- .../operator/helpers/JobStepContext.java | 7 +- .../operator/helpers/SecretHelper.java | 11 +- .../operator/steps/HttpRequestProcessing.java | 6 +- .../kubernetes/operator/work/FiberGate.java | 14 ++- .../weblogic/domain/model/Domain.java | 2 +- .../weblogic/domain/model/DomainStatus.java | 7 +- .../src/main/resources/Operator.properties | 4 +- .../main/resources/scripts/modelInImage.sh | 34 +++++- .../operator/DomainProcessorDelegateStub.java | 5 + .../operator/DomainProcessorTest.java | 21 ++++ .../helpers/DomainIntrospectorJobTest.java | 100 +++++++++++++++++- .../operator/helpers/JobHelperTest.java | 6 +- 18 files changed, 389 insertions(+), 104 deletions(-) diff --git a/integration-tests/src/test/java/oracle/weblogic/kubernetes/ItKubernetesEvents.java b/integration-tests/src/test/java/oracle/weblogic/kubernetes/ItKubernetesEvents.java index 0048f5e7d3a..f189bf3094b 100644 --- a/integration-tests/src/test/java/oracle/weblogic/kubernetes/ItKubernetesEvents.java +++ b/integration-tests/src/test/java/oracle/weblogic/kubernetes/ItKubernetesEvents.java @@ -292,13 +292,13 @@ void testDomainK8sEventsNonExistingCluster() { } /** - * Test the following domain events are logged when domain resource goes through various life cycle stages. - * Patch the domain resource to remove the webLogicCredentialsSecret and verify DomainChanged is - * logged when operator processes the domain resource changes. - * Verifies DomainProcessingRetrying is logged when operator retries the failed domain resource - * changes since webLogicCredentialsSecret is still missing. + * Test the following domain events are logged when domain resource goes through introspector failure. + * Patch the domain resource to shutdown servers. + * Patch the domain resource to point to a bad DOMAIN_HOME and update serverStartPolicy to IF_NEEDED. + * Verifies DomainProcessingFailed event is logged. * Verifies DomainProcessingAborted is logged when operator exceeds the maximum retries and gives * up processing the domain resource. + * Cleanup by patching the domain resource to a valid location and introspectVersion to bring up all servers again. */ @Order(4) @Test @@ -306,40 +306,68 @@ void testDomainK8sEventsNonExistingCluster() { void testDomainK8SEventsFailed() { V1Patch patch; String patchStr; + Domain domain = assertDoesNotThrow(() -> getDomainCustomResource(domainUid, domainNamespace1)); + String originalDomainHome = domain.getSpec().getDomainHome(); OffsetDateTime timestamp = now(); try { - logger.info("remove the webLogicCredentialsSecret to verify the following events" - + " DomainChanged, DomainProcessingRetrying and DomainProcessingAborted are logged"); - patchStr = "[{\"op\": \"remove\", \"path\": \"/spec/webLogicCredentialsSecret\"}]"; - logger.info("PatchStr for webLogicCredentialsSecret: {0}", patchStr); + logger.info("Shutting down all servers in domain with serverStartPolicy : NEVER"); + patchStr = "[{\"op\": \"replace\", \"path\": \"/spec/serverStartPolicy\", \"value\": \"NEVER\"}]"; + patch = new V1Patch(patchStr); + assertTrue(patchDomainCustomResource(domainUid, domainNamespace1, patch, V1Patch.PATCH_FORMAT_JSON_PATCH), + "patchDomainCustomResource failed"); + + logger.info("Checking if the admin server {0} is shutdown in namespace {1}", + adminServerPodName, domainNamespace1); + checkPodDoesNotExist(adminServerPodName, domainUid, domainNamespace1); + + for (int i = 1; i <= replicaCount; i++) { + logger.info("Checking if the managed server {0} is shutdown in namespace {1}", + managedServerPodNamePrefix + i, domainNamespace1); + checkPodDoesNotExist(managedServerPodNamePrefix + i, domainUid, domainNamespace1); + } + + logger.info("Replace the domainHome to a nonexisting location to verify the following events" + + " DomainChanged, DomainProcessingRetrying and DomainProcessingAborted are logged"); + patchStr = "[{\"op\": \"replace\", " + + "\"path\": \"/spec/domainHome\", \"value\": \"" + originalDomainHome + "bad\"}," + + "{\"op\": \"replace\", \"path\": \"/spec/serverStartPolicy\", \"value\": \"IF_NEEDED\"}]"; + logger.info("PatchStr for domainHome: {0}", patchStr); patch = new V1Patch(patchStr); assertTrue(patchDomainCustomResource(domainUid, domainNamespace1, patch, V1Patch.PATCH_FORMAT_JSON_PATCH), - "patchDomainCustomResource failed"); + "patchDomainCustomResource failed"); logger.info("verify domain changed event is logged"); checkEvent(opNamespace, domainNamespace1, domainUid, DOMAIN_CHANGED, "Normal", timestamp); - - // logger.info("verify domain processing retrying event"); - // checkEvent(opNamespace, domainNamespace1, domainUid, DOMAIN_PROCESSING_RETRYING, "Normal", timestamp); - logger.info("verify domain processing aborted event"); checkEvent(opNamespace, domainNamespace1, domainUid, DOMAIN_PROCESSING_ABORTED, "Warning", timestamp); } finally { + logger.info("Restoring the domain with valid location and bringing up all servers"); timestamp = now(); - // add back the webLogicCredentialsSecret - patchStr = "[{\"op\": \"add\", \"path\": \"/spec/webLogicCredentialsSecret\", " - + "\"value\" : {\"name\":\"" + wlSecretName + "\" , \"namespace\":\"" + domainNamespace1 + "\"}" - + "}]"; - logger.info("PatchStr for webLogicCredentialsSecret: {0}", patchStr); + String introspectVersion = assertDoesNotThrow(() -> getNextIntrospectVersion(domainUid, domainNamespace1)); + // add back the original domain home + patchStr = "[" + + "{\"op\": \"replace\", \"path\": \"/spec/domainHome\", \"value\": \"" + originalDomainHome + "\"}," + + "{\"op\": \"add\", \"path\": \"/spec/introspectVersion\", \"value\": \"" + introspectVersion + "\"}" + + "]"; + logger.info("PatchStr for domainHome: {0}", patchStr); patch = new V1Patch(patchStr); assertTrue(patchDomainCustomResource(domainUid, domainNamespace1, patch, V1Patch.PATCH_FORMAT_JSON_PATCH), - "patchDomainCustomResource failed"); + "patchDomainCustomResource failed"); logger.info("verify domain changed event is logged"); checkEvent(opNamespace, domainNamespace1, domainUid, DOMAIN_CHANGED, "Normal", timestamp); + logger.info("verifying the admin server is created and started"); + checkPodReadyAndServiceExists(adminServerPodName, domainUid, domainNamespace1); + + // verify managed server services created + for (int i = 1; i <= replicaCount; i++) { + logger.info("Checking managed server service/pod {0} is created in namespace {1}", + managedServerPodNamePrefix + i, domainNamespace1); + checkPodReadyAndServiceExists(managedServerPodNamePrefix + i, domainUid, domainNamespace1); + } } } diff --git a/operator/src/main/java/oracle/kubernetes/operator/DomainProcessorImpl.java b/operator/src/main/java/oracle/kubernetes/operator/DomainProcessorImpl.java index 60fc8ab09d7..9b40b0b3795 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/DomainProcessorImpl.java +++ b/operator/src/main/java/oracle/kubernetes/operator/DomainProcessorImpl.java @@ -841,7 +841,7 @@ private boolean shouldContinue() { return true; } else if (shouldReportAbortedEvent()) { return true; - } else if (hasExceededRetryCount() && !isImgRestartIntrospectVerChanged(liveInfo, cachedInfo)) { + } else if (hasExceededRetryCount(liveInfo) && !isImgRestartIntrospectVerChanged(liveInfo, cachedInfo)) { LOGGER.severe(ProcessingConstants.EXCEEDED_INTROSPECTOR_MAX_RETRY_COUNT_ERROR_MSG); return false; } else if (isFatalIntrospectorError()) { @@ -852,10 +852,7 @@ private boolean shouldContinue() { return false; // we have already cached this } else if (shouldRecheck(cachedInfo)) { - if (hasExceededRetryCount()) { - resetIntrospectorJobFailureCount(); - } - if (getCurrentIntrospectFailureRetryCount() > 0) { + if (getCurrentIntrospectFailureRetryCount(liveInfo) > 0) { logRetryCount(cachedInfo); } LOGGER.fine("Continue the make-right domain presence, explicitRecheck -> " + explicitRecheck); @@ -869,29 +866,9 @@ private boolean shouldReportAbortedEvent() { return Optional.ofNullable(eventData).map(EventData::getItem).orElse(null) == DOMAIN_PROCESSING_ABORTED; } - private void resetIntrospectorJobFailureCount() { - Optional.ofNullable(liveInfo) - .map(DomainPresenceInfo::getDomain) - .map(Domain::getStatus) - .map(DomainStatus::resetIntrospectJobFailureCount); - } - - private boolean hasExceededRetryCount() { - return getCurrentIntrospectFailureRetryCount() - >= DomainPresence.getDomainPresenceFailureRetryMaxCount(); - } - - private Integer getCurrentIntrospectFailureRetryCount() { - return Optional.ofNullable(liveInfo) - .map(DomainPresenceInfo::getDomain) - .map(Domain::getStatus) - .map(DomainStatus::getIntrospectJobFailureCount) - .orElse(0); - } - private void logRetryCount(DomainPresenceInfo cachedInfo) { LOGGER.info(MessageKeys.INTROSPECT_JOB_FAILED_RETRY_COUNT, cachedInfo.getDomain().getDomainUid(), - getCurrentIntrospectFailureRetryCount(), + getCurrentIntrospectFailureRetryCount(liveInfo), DomainPresence.getDomainPresenceFailureRetryMaxCount()); } @@ -1020,6 +997,19 @@ private static String getIntrospectVersion(DomainPresenceInfo info) { .orElse(null); } + private Integer getCurrentIntrospectFailureRetryCount(DomainPresenceInfo info) { + return Optional.ofNullable(info) + .map(DomainPresenceInfo::getDomain) + .map(Domain::getStatus) + .map(DomainStatus::getIntrospectJobFailureCount) + .orElse(0); + } + + private boolean hasExceededRetryCount(DomainPresenceInfo info) { + return getCurrentIntrospectFailureRetryCount(info) + >= DomainPresence.getDomainPresenceFailureRetryMaxCount(); + } + private static boolean isCachedInfoNewer(DomainPresenceInfo liveInfo, DomainPresenceInfo cachedInfo) { return liveInfo.getDomain() != null && KubernetesUtils.isFirstNewer(cachedInfo.getDomain().getMetadata(), liveInfo.getDomain().getMetadata()); @@ -1048,7 +1038,8 @@ public void onThrowable(Packet packet, Throwable throwable) { gate.startFiberIfLastFiberMatches( domainUid, Fiber.getCurrentIfSet(), - DomainStatusUpdater.createFailureRelatedSteps(throwable), + Step.chain(DomainStatusUpdater.createFailureCountStep(null), + DomainStatusUpdater.createFailureRelatedSteps(throwable)), plan.packet, new CompletionCallback() { @Override @@ -1071,7 +1062,7 @@ public void onThrowable(Packet packet, Throwable throwable) { LoggingContext.setThreadContext().namespace(ns).domainUid(domainUid)) { existing.setPopulated(false); // proceed only if we have not already retried max number of times - int retryCount = existing.incrementAndGetFailureCount(); + int retryCount = getCurrentIntrospectFailureRetryCount(existing); LOGGER.fine( "Failure count for DomainPresenceInfo: " + existing @@ -1128,6 +1119,11 @@ Step createDomainUpPlan(DomainPresenceInfo info) { bringAdminServerUp(info, delegate.getPodAwaiterStepFactory(info.getNamespace())), managedServerStrategy); + if (hasExceededRetryCount(info) && isImgRestartIntrospectVerChanged(info, + getExistingDomainPresenceInfo(info.getNamespace(), info.getDomainUid()))) { + domainUpStrategy = Step.chain(DomainStatusUpdater.createResetFailureCountStep(), domainUpStrategy); + } + return Step.chain( createDomainUpInitialStep(info), ConfigMapHelper.readExistingIntrospectorConfigMap(info.getNamespace(), info.getDomainUid()), @@ -1174,8 +1170,7 @@ private static class TailStep extends Step { @Override public NextAction apply(Packet packet) { - packet.getSpi(DomainPresenceInfo.class).complete(); - return doNext(packet); + return doNext(DomainStatusUpdater.createResetFailureCountStep(), packet); } } diff --git a/operator/src/main/java/oracle/kubernetes/operator/DomainStatusUpdater.java b/operator/src/main/java/oracle/kubernetes/operator/DomainStatusUpdater.java index f79799b7302..b3c150054f1 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/DomainStatusUpdater.java +++ b/operator/src/main/java/oracle/kubernetes/operator/DomainStatusUpdater.java @@ -164,23 +164,38 @@ public static Step createFailureCountStep(V1Job domainIntrospectorJob) { static class FailureCountStep extends DomainStatusUpdaterStep { private final V1Job domainIntrospectorJob; + private final boolean resetFailureCount; - public FailureCountStep(@Nonnull V1Job domainIntrospectorJob) { + public FailureCountStep(V1Job domainIntrospectorJob) { + this(domainIntrospectorJob, false); + } + + public FailureCountStep(V1Job domainIntrospectorJob, boolean resetFailureCount) { super(null); this.domainIntrospectorJob = domainIntrospectorJob; + this.resetFailureCount = resetFailureCount; } @Override void modifyStatus(DomainStatus domainStatus) { - domainStatus.incrementIntrospectJobFailureCount(getJobUid()); + if (resetFailureCount) { + domainStatus.resetIntrospectJobFailureCount(); + } else { + domainStatus.incrementIntrospectJobFailureCount(getJobUid()); + } } @Nullable private String getJobUid() { - return Optional.of(domainIntrospectorJob).map(V1Job::getMetadata).map(V1ObjectMeta::getUid).orElse(null); + return Optional.ofNullable(domainIntrospectorJob).map(V1Job::getMetadata) + .map(V1ObjectMeta::getUid).orElse(null); } } + public static Step createResetFailureCountStep() { + return new FailureCountStep(null, true); + } + private static String getEventMessage(@Nonnull DomainFailureReason reason, String message) { return !StringUtils.isBlank(message) ? message : reason.toString(); } @@ -736,7 +751,6 @@ private boolean isDeleting(String serverName) { return Optional.ofNullable(getInfo().getServerPod(serverName)).map(PodHelper::isDeleting).orElse(false); } - private String getDesiredState(String serverName, String clusterName, boolean isAdminServer) { return isAdminServer | expectedRunningServers.contains(serverName) ? getDomain().getServer(serverName, clusterName).getDesiredState() diff --git a/operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java b/operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java index 31ef8b72902..3ac0b957bfa 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java +++ b/operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java @@ -137,7 +137,12 @@ public static boolean isComplete(V1Job job) { return false; } - static boolean isFailed(V1Job job) { + /** + * Test if job is failed. + * @param job job + * @return true, if failed + */ + public static boolean isFailed(V1Job job) { if (job == null) { return false; } @@ -173,8 +178,12 @@ private static String getStatus(V1JobCondition jobCondition) { return Optional.ofNullable(jobCondition).map(V1JobCondition::getStatus).orElse(""); } - - static String getFailedReason(V1Job job) { + /** + * Get the reason for job failure. + * @param job job + * @return Job failure reason. + */ + public static String getFailedReason(V1Job job) { V1JobStatus status = job.getStatus(); if (status != null && status.getConditions() != null) { for (V1JobCondition cond : status.getConditions()) { diff --git a/operator/src/main/java/oracle/kubernetes/operator/helpers/DomainPresenceInfo.java b/operator/src/main/java/oracle/kubernetes/operator/helpers/DomainPresenceInfo.java index 373a4389220..565332117b1 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/helpers/DomainPresenceInfo.java +++ b/operator/src/main/java/oracle/kubernetes/operator/helpers/DomainPresenceInfo.java @@ -16,7 +16,6 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; @@ -62,7 +61,6 @@ public class DomainPresenceInfo implements PacketComponent { private final AtomicReference domain; private final AtomicBoolean isDeleting = new AtomicBoolean(false); private final AtomicBoolean isPopulated = new AtomicBoolean(false); - private final AtomicInteger retryCount = new AtomicInteger(0); private final AtomicReference> serverStartupInfo; private final AtomicReference> serverShutdownInfo; @@ -570,23 +568,6 @@ public void setPopulated(boolean populated) { isPopulated.set(populated); } - private void resetFailureCount() { - retryCount.set(0); - } - - public int incrementAndGetFailureCount() { - return retryCount.incrementAndGet(); - } - - int getRetryCount() { - return retryCount.get(); - } - - /** Sets the last completion time to now. */ - public void complete() { - resetFailureCount(); - } - /** * Gets the domain. Except the instance to change frequently based on status updates. * diff --git a/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java b/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java index 4cc6248ca77..3c339738ec3 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java +++ b/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java @@ -13,12 +13,16 @@ import javax.annotation.Nonnull; import javax.annotation.Nullable; +import io.kubernetes.client.openapi.models.V1ContainerState; +import io.kubernetes.client.openapi.models.V1ContainerStateWaiting; +import io.kubernetes.client.openapi.models.V1ContainerStatus; import io.kubernetes.client.openapi.models.V1DeleteOptions; import io.kubernetes.client.openapi.models.V1Job; import io.kubernetes.client.openapi.models.V1JobSpec; import io.kubernetes.client.openapi.models.V1ObjectMeta; import io.kubernetes.client.openapi.models.V1Pod; import io.kubernetes.client.openapi.models.V1PodList; +import io.kubernetes.client.openapi.models.V1PodStatus; import oracle.kubernetes.operator.IntrospectorConfigMapConstants; import oracle.kubernetes.operator.JobWatcher; import oracle.kubernetes.operator.LabelConstants; @@ -47,6 +51,8 @@ import static oracle.kubernetes.operator.DomainSourceType.FromModel; import static oracle.kubernetes.operator.DomainStatusUpdater.createFailureCountStep; import static oracle.kubernetes.operator.DomainStatusUpdater.createFailureRelatedSteps; +import static oracle.kubernetes.operator.JobWatcher.getFailedReason; +import static oracle.kubernetes.operator.JobWatcher.isFailed; import static oracle.kubernetes.operator.LabelConstants.INTROSPECTION_DOMAIN_SPEC_GENERATION; import static oracle.kubernetes.operator.LabelConstants.INTROSPECTION_STATE_LABEL; import static oracle.kubernetes.operator.ProcessingConstants.DOMAIN_INTROSPECT_REQUESTED; @@ -214,14 +220,69 @@ public NextAction onSuccess(Packet packet, CallResponse callResponse) { packet.put(ProcessingConstants.DOMAIN_INTROSPECTOR_JOB, job); } - if (isKnownFailedJob(job)) { - return doNext(cleanUpAndReintrospect(), packet); - } else if (job != null) { - return doNext(processIntrospectionResults(), packet); - } else if (isIntrospectionNeeded(packet)) { - return doNext(createIntrospectionSteps(), packet); - } else { - return doNext(packet); + return doNext(verifyIntrospectorJobPod(info.getNamespace(), getNext()), packet); + } + + private Step verifyIntrospectorJobPod(String namespace, Step next) { + return new CallBuilder() + .withLabelSelectors(LabelConstants.JOBNAME_LABEL) + .listPodAsync(namespace, createReadJobPodResponse(next)); + } + + @Nonnull + private VerifyIntrospectorJobPodResponseStep createReadJobPodResponse(Step next) { + return new VerifyIntrospectorJobPodResponseStep<>(next); + } + + private class VerifyIntrospectorJobPodResponseStep extends ResponseStep { + VerifyIntrospectorJobPodResponseStep(Step next) { + super(next); + } + + @Override + public NextAction onSuccess(Packet packet, CallResponse callResponse) { + V1Pod pod = getJobPod(callResponse); + V1Job job = (V1Job) packet.get(ProcessingConstants.DOMAIN_INTROSPECTOR_JOB); + boolean hasImagePullError = checkJobPodForImagePullError(pod); + if (hasImagePullError) { + packet.put(DOMAIN_INTROSPECT_REQUESTED, ReadPodLogResponseStep.INTROSPECTION_FAILED); + } + + if (isKnownFailedJob(job) || hasImagePullError) { + return doContinueListOrNext(callResponse, packet, cleanUpAndReintrospect()); + } else if (isJobTimedout(job, pod)) { + return doContinueListOrNext(callResponse, packet, deleteJobAndStartNewIntrospection()); + } else if (job != null) { + return doContinueListOrNext(callResponse, packet, processIntrospectionResults()); + } else if (isIntrospectionNeeded(packet)) { + return doContinueListOrNext(callResponse, packet, createIntrospectionSteps()); + } else { + return doContinueListOrNext(callResponse, packet); + } + } + + private V1Pod getJobPod(CallResponse callResponse) { + return Optional.ofNullable( + callResponse.getResult()).map(V1PodList::getItems).orElseGet(Collections::emptyList).stream() + .filter(pod -> isIntrospectionJobPod(pod)).findAny().orElse(null); + } + + private boolean checkJobPodForImagePullError(V1Pod pod) { + return Optional.ofNullable(getJobPodContainerWaitingReason(pod)) + .map(s -> s.contains("ErrImagePull") || s.contains("ImagePullBackOff")) + .orElse(false); + } + + private String getJobPodContainerWaitingReason(V1Pod pod) { + return Optional.ofNullable(pod).map(V1Pod::getStatus) + .map(V1PodStatus::getContainerStatuses).map(statuses -> statuses.get(0)) + .map(V1ContainerStatus::getState).map(V1ContainerState::getWaiting) + .map(V1ContainerStateWaiting::getReason).orElse(null); + } + + private boolean isIntrospectionJobPod(V1Pod pod) { + return Optional.ofNullable(pod).map(V1Pod::getMetadata) + .map(meta -> meta.getName().startsWith(getJobName())).orElse(false); } } @@ -229,6 +290,20 @@ private boolean isKnownFailedJob(V1Job job) { return getUid(job).equals(getLastFailedUid()); } + private boolean isJobTimedout(V1Job introspectorJob, V1Pod pod) { + return Optional.ofNullable(introspectorJob) + .map(job -> isFailed(job) && isDeadlineExceeded(job, pod)).orElse(false); + } + + private boolean isDeadlineExceeded(V1Job job, V1Pod pod) { + return "DeadlineExceeded".equals(getFailedReason(job)) + || "DeadlineExceeded".equals(getPodStatusReason(pod)); + } + + private String getPodStatusReason(V1Pod pod) { + return Optional.ofNullable(pod).map(V1Pod::getStatus).map(V1PodStatus::getReason).orElse(null); + } + @Nonnull private String getUid(V1Job job) { return Optional.ofNullable(job).map(V1Job::getMetadata).map(V1ObjectMeta::getUid).orElse(""); @@ -249,6 +324,12 @@ private Step createIntrospectionSteps() { startNewIntrospection(getNext())); } + private Step deleteJobAndStartNewIntrospection() { + return Step.chain( + deleteDomainIntrospectorJobStep(null), + startNewIntrospection(getNext())); + } + @Nonnull private Step startNewIntrospection(Step next) { return Step.chain(createNewJob(), processIntrospectionResults(), next); diff --git a/operator/src/main/java/oracle/kubernetes/operator/helpers/JobStepContext.java b/operator/src/main/java/oracle/kubernetes/operator/helpers/JobStepContext.java index 39a4cc70148..bdc9939cef6 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/helpers/JobStepContext.java +++ b/operator/src/main/java/oracle/kubernetes/operator/helpers/JobStepContext.java @@ -311,7 +311,12 @@ private V1ObjectMeta createMetadata() { private long getActiveDeadlineSeconds(TuningParameters.PodTuning podTuning) { return getIntrospectorJobActiveDeadlineSeconds(podTuning) - + (DEFAULT_ACTIVE_DEADLINE_INCREMENT_SECONDS * info.getRetryCount()); + + (DEFAULT_ACTIVE_DEADLINE_INCREMENT_SECONDS * getIntrospectJobFailureCount()); + } + + private Integer getIntrospectJobFailureCount() { + return Optional.ofNullable(info.getDomain().getStatus()) + .map(s -> s.getIntrospectJobFailureCount()).orElse(0); } V1JobSpec createJobSpec(TuningParameters tuningParameters) { diff --git a/operator/src/main/java/oracle/kubernetes/operator/helpers/SecretHelper.java b/operator/src/main/java/oracle/kubernetes/operator/helpers/SecretHelper.java index b2c08b78603..319f68d2eb5 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/helpers/SecretHelper.java +++ b/operator/src/main/java/oracle/kubernetes/operator/helpers/SecretHelper.java @@ -62,10 +62,13 @@ public NextAction apply(Packet packet) { secretName = dpi.getDomain().getWebLogicCredentialsSecretName(); namespace = dpi.getNamespace(); - LOGGER.fine(MessageKeys.RETRIEVING_SECRET, secretName); - Step read = new CallBuilder().readSecretAsync(secretName, namespace, new SecretResponseStep(getNext())); - - return doNext(read, packet); + if (secretName != null) { + LOGGER.fine(MessageKeys.RETRIEVING_SECRET, secretName); + Step read = new CallBuilder().readSecretAsync(secretName, namespace, new SecretResponseStep(getNext())); + return doNext(read, packet); + } else { + return doNext(packet); + } } } diff --git a/operator/src/main/java/oracle/kubernetes/operator/steps/HttpRequestProcessing.java b/operator/src/main/java/oracle/kubernetes/operator/steps/HttpRequestProcessing.java index 823475b25f4..01696c60e13 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/steps/HttpRequestProcessing.java +++ b/operator/src/main/java/oracle/kubernetes/operator/steps/HttpRequestProcessing.java @@ -58,12 +58,14 @@ AuthorizationSource getAuthorizationSource() { } final HttpRequest.Builder createRequestBuilder(String url) { - return HttpRequest.newBuilder() + HttpRequest.Builder builder = HttpRequest.newBuilder() .uri(URI.create(url)) - .header("Authorization", getAuthorizationSource().createBasicAuthorizationString()) .header("Accept", "application/json") .header("Content-Type", "application/json") .header("X-Requested-By", "WebLogic Operator"); + Optional.ofNullable(getAuthorizationSource()) + .ifPresent(source -> builder.header("Authorization", source.createBasicAuthorizationString())); + return builder; } /** diff --git a/operator/src/main/java/oracle/kubernetes/operator/work/FiberGate.java b/operator/src/main/java/oracle/kubernetes/operator/work/FiberGate.java index 3867c7e1703..76465c16348 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/work/FiberGate.java +++ b/operator/src/main/java/oracle/kubernetes/operator/work/FiberGate.java @@ -109,14 +109,20 @@ public synchronized Fiber startFiberIfLastFiberMatches( new CompletionCallback() { @Override public void onCompletion(Packet packet) { - gateMap.remove(key, f); - callback.onCompletion(packet); + try { + callback.onCompletion(packet); + } finally { + gateMap.remove(key, f); + } } @Override public void onThrowable(Packet packet, Throwable throwable) { - gateMap.remove(key, f); - callback.onThrowable(packet, throwable); + try { + callback.onThrowable(packet, throwable); + } finally { + gateMap.remove(key, f); + } } }); return f; diff --git a/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/Domain.java b/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/Domain.java index 17b66a3ce41..8df3a1909b8 100644 --- a/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/Domain.java +++ b/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/Domain.java @@ -458,7 +458,7 @@ public Domain withStatus(DomainStatus status) { * @return the secret name */ public String getWebLogicCredentialsSecretName() { - return spec.getWebLogicCredentialsSecret().getName(); + return Optional.ofNullable(spec.getWebLogicCredentialsSecret()).map(s -> s.getName()).orElse(null); } /** diff --git a/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/DomainStatus.java b/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/DomainStatus.java index 43d6116165c..62a9744e685 100644 --- a/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/DomainStatus.java +++ b/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/DomainStatus.java @@ -295,7 +295,7 @@ public Integer getIntrospectJobFailureCount() { * @param uid the Kubernetes-assigned UID of the job which discovered the introspection failure */ public void incrementIntrospectJobFailureCount(String uid) { - if (!Objects.equals(uid, failedIntrospectionUid)) { + if ((uid == null) || (!Objects.equals(uid, failedIntrospectionUid))) { introspectJobFailureCount = introspectJobFailureCount + 1; } failedIntrospectionUid = uid; @@ -520,4 +520,9 @@ public boolean equals(Object other) { public void createPatchFrom(JsonPatchBuilder builder, @Nullable DomainStatus oldStatus) { statusPatch.createPatch(builder, "/status", oldStatus, this); } + + public DomainStatus withIntrospectJobFailureCount(int failureCount) { + this.introspectJobFailureCount = failureCount; + return this; + } } diff --git a/operator/src/main/resources/Operator.properties b/operator/src/main/resources/Operator.properties index a1a23ef2f63..33b2cf60cf5 100644 --- a/operator/src/main/resources/Operator.properties +++ b/operator/src/main/resources/Operator.properties @@ -107,7 +107,9 @@ WLSKO-0154=Job {0} failed due to reason: DeadlineExceeded. \ Use kubectl describe for the job and its pod for more job failure information. \ The job may be retried by the operator up to {3} \ times with longer ActiveDeadlineSeconds value in each subsequent retry. \ - Use tuning parameter 'domainPresenceFailureRetryMaxCount' to configure max retries. + Use tuning parameter 'domainPresenceFailureRetryMaxCount' to configure max retries. \ + Use 'spec.configuration.introspectorJobActiveDeadlineSeconds' to increase the job \ + timeout interval if the job still fails after the retries are exhausted. WLSKO-0156=Access denied for operator service account for operation {0} on resource {1} in namespace {2}. WLSKO-0157=Domain {0} is not valid: {1} WLSKO-162=Unable to read internal certificate at path {0} diff --git a/operator/src/main/resources/scripts/modelInImage.sh b/operator/src/main/resources/scripts/modelInImage.sh index a72265bf038..15a2cdc4f6d 100755 --- a/operator/src/main/resources/scripts/modelInImage.sh +++ b/operator/src/main/resources/scripts/modelInImage.sh @@ -33,7 +33,12 @@ IMG_MODELS_ROOTDIR="${IMG_MODELS_HOME}" IMG_ARCHIVES_ROOTDIR="${IMG_MODELS_HOME}" IMG_VARIABLE_FILES_ROOTDIR="${IMG_MODELS_HOME}" WDT_ROOT="${WDT_INSTALL_HOME:-/u01/wdt/weblogic-deploy}" -WDT_OUTPUT="/tmp/wdt_output.log" +WDT_OUTPUT_DIR="${LOG_HOME:-/tmp}" +WDT_OUTPUT="${WDT_OUTPUT_DIR}/wdt_output.log" +WDT_CREATE_DOMAIN_LOG=createDomain.log +WDT_UPDATE_DOMAIN_LOG=updateDomain.log +WDT_VALIDATE_MODEL_LOG=validateModel.log +WDT_COMPARE_MODEL_LOG=compareModel.log WDT_BINDIR="${WDT_ROOT}/bin" WDT_FILTER_JSON="/weblogic-operator/scripts/model-filters.json" WDT_CREATE_FILTER="/weblogic-operator/scripts/model-wdt-create-filter.py" @@ -71,6 +76,10 @@ export WDT_MODEL_SECRETS_DIRS="/weblogic-operator/config-overrides-secrets" #For now: export WDT_MODEL_SECRETS_NAME_DIR_PAIRS="__weblogic-credentials__=/weblogic-operator/secrets,__WEBLOGIC-CREDENTIALS__=/weblogic-operator/secrets" +if [ ! -d "${WDT_OUTPUT_DIR}" ]; then + trace "Creating WDT standard output directory: '${WDT_OUTPUT_DIR}'" + createFolder "${WDT_OUTPUT_DIR}" +fi # sort_files sort the files according to the names and naming conventions and write the result to stdout # $1 directory @@ -667,6 +676,8 @@ function diff_model() { fi fi + wdtRotateAndCopyLogFile "${WDT_COMPARE_MODEL_LOG}" + trace "Exiting diff_model" } @@ -836,6 +847,8 @@ function generateMergedModel() { exitOrLoop fi + wdtRotateAndCopyLogFile "${WDT_VALIDATE_MODEL_LOG}" + # restore trap start_trap trace "Exiting generateMergedModel" @@ -933,6 +946,8 @@ function wdtCreatePrimordialDomain() { fi fi + wdtRotateAndCopyLogFile "${WDT_CREATE_DOMAIN_LOG}" + # restore trap start_trap trace "Exiting wdtCreatePrimordialDomain" @@ -1006,6 +1021,8 @@ function wdtUpdateModelDomain() { encrypt_decrypt_model "encrypt" ${DOMAIN_HOME}/wlsdeploy/domain_model.json.b64 ${MII_PASSPHRASE} \ ${DOMAIN_HOME}/wlsdeploy/domain_model.json + wdtRotateAndCopyLogFile "${WDT_UPDATE_DOMAIN_LOG}" + # restore trap start_trap trace "Exiting wdtUpdateModelDomain" @@ -1098,6 +1115,8 @@ function wdtHandleOnlineUpdate() { cp /tmp/encrypted_merge_model.json ${DOMAIN_HOME}/wlsdeploy/domain_model.json + wdtRotateAndCopyLogFile ${WDT_UPDATE_DOMAIN_LOG} + trace "wrote updateResult" start_trap @@ -1326,3 +1345,16 @@ function logSevereAndExit() { trace SEVERE "cp '$1' failed" exitOrLoop } + +# Function to rotate WDT script log file and copy the file to WDT output dir. +# parameter: +# 1 - Name of the log file to rotate and copy to WDT output directory. +function wdtRotateAndCopyLogFile() { + local logFileName=$1 + testLogFileRotate "${WDT_OUTPUT_DIR}/${logFileName}" + [ $? -ne 0 ] && trace SEVERE "Error accessing '${WDT_OUTPUT_DIR}'. See previous log messages." && exit 1 + + logFileRotate ${WDT_OUTPUT_DIR}/${logFileName} ${WDT_LOG_FILE_MAX:-11} + + cp ${WDT_ROOT}/logs/${logFileName} ${WDT_OUTPUT_DIR}/ +} diff --git a/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorDelegateStub.java b/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorDelegateStub.java index 8ac46e2f407..42c04217890 100644 --- a/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorDelegateStub.java +++ b/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorDelegateStub.java @@ -17,6 +17,8 @@ import oracle.kubernetes.operator.work.Step; import static com.meterware.simplestub.Stub.createStrictStub; +import static oracle.kubernetes.operator.JobWatcher.getFailedReason; +import static oracle.kubernetes.operator.JobWatcher.isFailed; /** A test stub for processing domains in unit tests. */ public abstract class DomainProcessorDelegateStub implements DomainProcessorDelegate { @@ -99,6 +101,9 @@ public Step waitForDelete(V1Pod pod, Step next) { private class PassthroughJobAwaiterStepFactory implements JobAwaiterStepFactory { @Override public Step waitForReady(V1Job job, Step next) { + if (isFailed(job) && "DeadlineExceeded".equals(getFailedReason(job))) { + throw new RuntimeException("DeadlineExceeded"); + } waitedForIntrospection = true; return next; } diff --git a/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java b/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java index 515765cd38b..2d2bcab62cc 100644 --- a/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java +++ b/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java @@ -56,6 +56,7 @@ import oracle.kubernetes.operator.helpers.UnitTestHash; import oracle.kubernetes.operator.http.HttpAsyncTestSupport; import oracle.kubernetes.operator.http.HttpResponseStub; +import oracle.kubernetes.operator.logging.MessageKeys; import oracle.kubernetes.operator.rest.ScanCacheStub; import oracle.kubernetes.operator.utils.InMemoryCertificates; import oracle.kubernetes.operator.wlsconfig.WlsClusterConfig; @@ -838,6 +839,25 @@ void whenIntrospectionJobNotComplete_waitForIt() throws Exception { assertThat(processorDelegate.waitedForIntrospection(), is(true)); } + @Test + void whenIntrospectionJobTimedout_failureCountIncremented() throws Exception { + consoleHandlerMemento.ignoringLoggedExceptions(RuntimeException.class); + consoleHandlerMemento.ignoreMessage(MessageKeys.NOT_STARTING_DOMAINUID_THREAD); + establishPreviousIntrospection(null); + jobStatus = createTimedoutStatus(); + + domainConfigurator.withIntrospectVersion(NEW_INTROSPECTION_STATE); + MakeRightDomainOperation makeRight = this.processor.createMakeRightOperation( + new DomainPresenceInfo(newDomain)).interrupt(); + makeRight.execute(); + assertThat(newDomain.getStatus().getIntrospectJobFailureCount(), is(1)); + } + + V1JobStatus createTimedoutStatus() { + return new V1JobStatus().addConditionsItem(new V1JobCondition().status("True").type("Failed") + .reason("DeadlineExceeded")); + } + private void establishPreviousIntrospection(Consumer domainSetup) throws JsonProcessingException { establishPreviousIntrospection(domainSetup, Arrays.asList(1,2)); } @@ -1404,6 +1424,7 @@ private void forceExceptionDuringProcessing() { } @Test + @Disabled void whenExceptionDuringProcessing_sendAbortedEvent() { DomainProcessorImpl.registerDomainPresenceInfo(new DomainPresenceInfo(domain)); forceExceptionDuringProcessing(); diff --git a/operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java b/operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java index 78628cb7e3d..a64c7bb7512 100644 --- a/operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java +++ b/operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java @@ -6,7 +6,9 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.logging.Level; import java.util.logging.LogRecord; @@ -18,19 +20,26 @@ import io.kubernetes.client.openapi.ApiException; import io.kubernetes.client.openapi.models.V1ConfigMap; import io.kubernetes.client.openapi.models.V1Container; +import io.kubernetes.client.openapi.models.V1ContainerState; +import io.kubernetes.client.openapi.models.V1ContainerStateWaiting; +import io.kubernetes.client.openapi.models.V1ContainerStatus; import io.kubernetes.client.openapi.models.V1EmptyDirVolumeSource; import io.kubernetes.client.openapi.models.V1Job; import io.kubernetes.client.openapi.models.V1JobCondition; import io.kubernetes.client.openapi.models.V1JobStatus; import io.kubernetes.client.openapi.models.V1ObjectMeta; +import io.kubernetes.client.openapi.models.V1Pod; import io.kubernetes.client.openapi.models.V1PodSpec; +import io.kubernetes.client.openapi.models.V1PodStatus; import io.kubernetes.client.openapi.models.V1SecretReference; import io.kubernetes.client.openapi.models.V1Volume; import io.kubernetes.client.openapi.models.V1VolumeMount; import oracle.kubernetes.operator.JobAwaiterStepFactory; +import oracle.kubernetes.operator.LabelConstants; import oracle.kubernetes.operator.TuningParameters; import oracle.kubernetes.operator.calls.unprocessable.UnrecoverableErrorBuilderImpl; import oracle.kubernetes.operator.rest.ScanCacheStub; +import oracle.kubernetes.operator.steps.WatchDomainIntrospectorJobReadyStep; import oracle.kubernetes.operator.wlsconfig.WlsClusterConfig; import oracle.kubernetes.operator.wlsconfig.WlsDomainConfig; import oracle.kubernetes.operator.wlsconfig.WlsServerConfig; @@ -591,6 +600,87 @@ void whenPreviousFailedJobExists_createNewJob() { assertThat(affectedJob, notNullValue()); } + @Test + void whenPreviousTimedoutJobExists_createNewJob() { + ignoreIntrospectorFailureLogs(); + ignoreJobCreatedAndDeletedLogs(); + definePreviousTimedoutIntrospection(); + testSupport.doOnCreate(JOB, this::recordJob); + + testSupport.runSteps(Step.chain(new WatchDomainIntrospectorJobReadyStep(), + JobHelper.createIntrospectionStartStep(null))); + + assertThat(affectedJob, notNullValue()); + } + + private void definePreviousTimedoutIntrospection() { + defineTimedoutIntrospection(); + } + + private void defineTimedoutIntrospection() { + testSupport.defineResources(asTimedoutJob(createIntrospectorJob("TIMEDOUT_JOB"))); + } + + private V1Job asTimedoutJob(V1Job job) { + job.setStatus(new V1JobStatus().addConditionsItem(new V1JobCondition().status("True").type("Failed") + .reason("DeadlineExceeded"))); + return job; + } + + private long getIntrospectorJobActiveDeadlineSeconds() { + return TuningParameters.getInstance().getPodTuning().introspectorJobActiveDeadlineSeconds; + } + + @Test + void whenPreviousFailedJobWithImagePullErrorExistsAndMakeRightContinued_createNewJob() { + ignoreIntrospectorFailureLogs(); + ignoreJobCreatedAndDeletedLogs(); + testSupport.addToPacket(DOMAIN_TOPOLOGY, createDomainConfig("cluster-1")); + defineFailedIntrospectionWithImagePullError("ErrImagePull"); + testSupport.doOnCreate(JOB, this::recordJob); + + testSupport.runSteps(JobHelper.createIntrospectionStartStep(null)); + + assertThat(affectedJob, notNullValue()); + } + + private void defineFailedIntrospectionWithImagePullError(String imagePullError) { + testSupport.defineResources(asFailedJobWithBackoffLimitExceeded(createIntrospectorJob())); + testSupport.defineResources(asFailedJobPodWithImagePullError(createIntrospectorJobPod(), imagePullError)); + } + + private V1Pod asFailedJobPodWithImagePullError(V1Pod introspectorJobPod, String imagePullError) { + List statuses = Arrays.asList(new V1ContainerStatus().state(new V1ContainerState() + .waiting(new V1ContainerStateWaiting().reason(imagePullError)))); + return introspectorJobPod.status(new V1PodStatus().containerStatuses(statuses)); + } + + private V1Pod createIntrospectorJobPod() { + String jobName = UID + "-introspector"; + Map labels = new HashMap<>(); + labels.put(LabelConstants.JOBNAME_LABEL, jobName); + return new V1Pod().metadata(new V1ObjectMeta().name(jobName).labels(labels).namespace(NS)); + } + + private V1Job asFailedJobWithBackoffLimitExceeded(V1Job job) { + job.setStatus(new V1JobStatus().addConditionsItem(new V1JobCondition().status("True").type("Failed") + .reason("BackoffLimitExceeded"))); + return job; + } + + @Test + void whenPreviousFailedJobWithImagePullBackoffErrorExistsAndMakeRightContinued_createNewJob() { + ignoreIntrospectorFailureLogs(); + ignoreJobCreatedAndDeletedLogs(); + testSupport.addToPacket(DOMAIN_TOPOLOGY, createDomainConfig("cluster-1")); + defineFailedIntrospectionWithImagePullError("ImagePullBackOff"); + testSupport.doOnCreate(JOB, this::recordJob); + + testSupport.runSteps(JobHelper.createIntrospectionStartStep(null)); + + assertThat(affectedJob, notNullValue()); + } + private V1Job affectedJob; private void recordJob(Object job) { @@ -677,11 +767,15 @@ void whenJobLogContainsSevereError_logJobInfosOnDelete() { } private V1Job createIntrospectorJob() { - return new V1Job().metadata(createJobMetadata()).status(new V1JobStatus()); + return createIntrospectorJob(JOB_UID); + } + + private V1Job createIntrospectorJob(String uid) { + return new V1Job().metadata(createJobMetadata(uid)).status(new V1JobStatus()); } - private V1ObjectMeta createJobMetadata() { - return new V1ObjectMeta().name(getJobName()).namespace(NS).creationTimestamp(SystemClock.now()).uid(JOB_UID); + private V1ObjectMeta createJobMetadata(String uid) { + return new V1ObjectMeta().name(getJobName()).namespace(NS).creationTimestamp(SystemClock.now()).uid(uid); } @Test diff --git a/operator/src/test/java/oracle/kubernetes/operator/helpers/JobHelperTest.java b/operator/src/test/java/oracle/kubernetes/operator/helpers/JobHelperTest.java index a968beb32d8..396ff1e8bd6 100644 --- a/operator/src/test/java/oracle/kubernetes/operator/helpers/JobHelperTest.java +++ b/operator/src/test/java/oracle/kubernetes/operator/helpers/JobHelperTest.java @@ -50,6 +50,7 @@ import oracle.kubernetes.weblogic.domain.model.ConfigurationConstants; import oracle.kubernetes.weblogic.domain.model.Domain; import oracle.kubernetes.weblogic.domain.model.DomainSpec; +import oracle.kubernetes.weblogic.domain.model.DomainStatus; import oracle.kubernetes.weblogic.domain.model.DomainValidationBaseTest; import oracle.kubernetes.weblogic.domain.model.Istio; import oracle.kubernetes.weblogic.domain.model.ServerEnvVars; @@ -677,7 +678,9 @@ private static V1PodSpec getTemplateSpec(V1JobSpec jobSpec) { @Test void verify_introspectorPodSpec_activeDeadlineSeconds_retry_values() { - int failureCount = domainPresenceInfo.incrementAndGetFailureCount(); + domainPresenceInfo.getDomain() + .setStatus(new DomainStatus().withIntrospectJobFailureCount(1)); + int failureCount = domainPresenceInfo.getDomain().getStatus().getIntrospectJobFailureCount(); V1JobSpec jobSpec = createJobSpec(); @@ -1218,7 +1221,6 @@ void whenDomainIsIstioEnabled_istioLocalhostBindingsDisabledEnv_hasIstioUseLocal ); } - private V1Job job; private void recordJob(V1Job job) { From 4ba0b7ab2556a9619e494ac67bab25f8939bdef3 Mon Sep 17 00:00:00 2001 From: sankar Date: Fri, 5 Nov 2021 13:14:27 -0700 Subject: [PATCH 2/8] check domain event with long retry policy --- .../weblogic/kubernetes/ItKubernetesEvents.java | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/integration-tests/src/test/java/oracle/weblogic/kubernetes/ItKubernetesEvents.java b/integration-tests/src/test/java/oracle/weblogic/kubernetes/ItKubernetesEvents.java index f189bf3094b..8e0081f3a09 100644 --- a/integration-tests/src/test/java/oracle/weblogic/kubernetes/ItKubernetesEvents.java +++ b/integration-tests/src/test/java/oracle/weblogic/kubernetes/ItKubernetesEvents.java @@ -38,6 +38,7 @@ import oracle.weblogic.kubernetes.annotations.IntegrationTest; import oracle.weblogic.kubernetes.annotations.Namespaces; import oracle.weblogic.kubernetes.logging.LoggingFacade; +import org.awaitility.core.ConditionFactory; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.DisplayName; @@ -48,6 +49,8 @@ import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.ValueSource; +import static java.util.concurrent.TimeUnit.MINUTES; +import static java.util.concurrent.TimeUnit.SECONDS; import static oracle.weblogic.kubernetes.TestConstants.ADMIN_PASSWORD_DEFAULT; import static oracle.weblogic.kubernetes.TestConstants.ADMIN_USERNAME_DEFAULT; import static oracle.weblogic.kubernetes.TestConstants.BASE_IMAGES_REPO_SECRET; @@ -109,6 +112,7 @@ import static oracle.weblogic.kubernetes.utils.SecretUtils.createSecretWithUsernamePassword; import static oracle.weblogic.kubernetes.utils.ThreadSafeLogger.getLogger; import static oracle.weblogic.kubernetes.utils.WLSTUtils.executeWLSTScript; +import static org.awaitility.Awaitility.with; import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -155,6 +159,10 @@ class ItKubernetesEvents { private static LoggingFacade logger = null; + public static ConditionFactory withLongRetryPolicy = with().pollDelay(2, SECONDS) + .and().with().pollInterval(10, SECONDS) + .atMost(10, MINUTES).await(); + /** * Assigns unique namespaces for operator and domains. * Pull WebLogic image if running tests in Kind cluster. @@ -188,7 +196,7 @@ public static void initAll(@Namespaces(6) List namespaces) { externalRestHttpsPort = getServiceNodePort(opNamespace, "external-weblogic-operator-svc"); // This test uses the operator restAPI to scale the domain. To do this in OKD cluster, - // we need to expose the external service as route and set tls termination to passthrough + // we need to expose the external service as route and set tls termination to passthrough logger.info("Create a route for the operator external service - only for OKD"); String opExternalSvc = createRouteForOKD("external-weblogic-operator-svc", opNamespace); // Patch the route just created to set tls termination to passthrough @@ -917,7 +925,7 @@ public static void tearDown() { private static void checkEvent( String opNamespace, String domainNamespace, String domainUid, String reason, String type, OffsetDateTime timestamp) { - testUntil( + testUntil(withLongRetryPolicy, checkDomainEvent(opNamespace, domainNamespace, domainUid, reason, type, timestamp), logger, "domain event {0} to be logged in namespace {1}", From eaf7651ca823711568673b3c29fab8267957ea7e Mon Sep 17 00:00:00 2001 From: Anil Kedia Date: Mon, 8 Nov 2021 18:10:06 +0000 Subject: [PATCH 3/8] Changes to address review comments. --- .../operator/DomainStatusUpdater.java | 25 ++++++++----------- .../kubernetes/operator/JobWatcher.java | 5 ++-- .../operator/helpers/SecretHelper.java | 8 +++--- .../weblogic/domain/model/Domain.java | 2 +- .../weblogic/domain/model/DomainStatus.java | 10 +++++++- .../operator/DomainProcessorDelegateStub.java | 4 +-- .../operator/DomainProcessorTest.java | 8 +++--- .../helpers/DomainIntrospectorJobTest.java | 6 ----- 8 files changed, 33 insertions(+), 35 deletions(-) diff --git a/operator/src/main/java/oracle/kubernetes/operator/DomainStatusUpdater.java b/operator/src/main/java/oracle/kubernetes/operator/DomainStatusUpdater.java index b3c150054f1..1fedd09e698 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/DomainStatusUpdater.java +++ b/operator/src/main/java/oracle/kubernetes/operator/DomainStatusUpdater.java @@ -164,36 +164,33 @@ public static Step createFailureCountStep(V1Job domainIntrospectorJob) { static class FailureCountStep extends DomainStatusUpdaterStep { private final V1Job domainIntrospectorJob; - private final boolean resetFailureCount; public FailureCountStep(V1Job domainIntrospectorJob) { - this(domainIntrospectorJob, false); - } - - public FailureCountStep(V1Job domainIntrospectorJob, boolean resetFailureCount) { super(null); this.domainIntrospectorJob = domainIntrospectorJob; - this.resetFailureCount = resetFailureCount; } @Override void modifyStatus(DomainStatus domainStatus) { - if (resetFailureCount) { - domainStatus.resetIntrospectJobFailureCount(); - } else { - domainStatus.incrementIntrospectJobFailureCount(getJobUid()); - } + domainStatus.incrementIntrospectJobFailureCount(getJobUid()); } @Nullable private String getJobUid() { - return Optional.ofNullable(domainIntrospectorJob).map(V1Job::getMetadata) - .map(V1ObjectMeta::getUid).orElse(null); + return Optional.ofNullable(domainIntrospectorJob).map(V1Job::getMetadata).map(V1ObjectMeta::getUid).orElse(null); + } + } + + static class ResetFailureCountStep extends DomainStatusUpdaterStep { + + @Override + void modifyStatus(DomainStatus domainStatus) { + domainStatus.resetIntrospectJobFailureCount(); } } public static Step createResetFailureCountStep() { - return new FailureCountStep(null, true); + return new ResetFailureCountStep(); } private static String getEventMessage(@Nonnull DomainFailureReason reason, String message) { diff --git a/operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java b/operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java index 3ac0b957bfa..569b15372f5 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java +++ b/operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java @@ -138,9 +138,8 @@ public static boolean isComplete(V1Job job) { } /** - * Test if job is failed. - * @param job job - * @return true, if failed + * Returns true if the specified job has a failed status or condition. + * @param job job to be tested */ public static boolean isFailed(V1Job job) { if (job == null) { diff --git a/operator/src/main/java/oracle/kubernetes/operator/helpers/SecretHelper.java b/operator/src/main/java/oracle/kubernetes/operator/helpers/SecretHelper.java index 319f68d2eb5..eefb9ea2ea7 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/helpers/SecretHelper.java +++ b/operator/src/main/java/oracle/kubernetes/operator/helpers/SecretHelper.java @@ -62,12 +62,12 @@ public NextAction apply(Packet packet) { secretName = dpi.getDomain().getWebLogicCredentialsSecretName(); namespace = dpi.getNamespace(); - if (secretName != null) { + if (secretName == null) { + return doNext(packet); + } else { LOGGER.fine(MessageKeys.RETRIEVING_SECRET, secretName); - Step read = new CallBuilder().readSecretAsync(secretName, namespace, new SecretResponseStep(getNext())); + final Step read = new CallBuilder().readSecretAsync(secretName, namespace, new SecretResponseStep(getNext())); return doNext(read, packet); - } else { - return doNext(packet); } } } diff --git a/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/Domain.java b/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/Domain.java index 8df3a1909b8..0da34436800 100644 --- a/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/Domain.java +++ b/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/Domain.java @@ -458,7 +458,7 @@ public Domain withStatus(DomainStatus status) { * @return the secret name */ public String getWebLogicCredentialsSecretName() { - return Optional.ofNullable(spec.getWebLogicCredentialsSecret()).map(s -> s.getName()).orElse(null); + return Optional.ofNullable(spec.getWebLogicCredentialsSecret()).map(V1SecretReference::getName).orElse(null); } /** diff --git a/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/DomainStatus.java b/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/DomainStatus.java index 62a9744e685..11b001352cf 100644 --- a/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/DomainStatus.java +++ b/operator/src/main/java/oracle/kubernetes/weblogic/domain/model/DomainStatus.java @@ -295,12 +295,20 @@ public Integer getIntrospectJobFailureCount() { * @param uid the Kubernetes-assigned UID of the job which discovered the introspection failure */ public void incrementIntrospectJobFailureCount(String uid) { - if ((uid == null) || (!Objects.equals(uid, failedIntrospectionUid))) { + if (fiberException(uid) || failedIntrospectionNotRecorded(uid)) { introspectJobFailureCount = introspectJobFailureCount + 1; } failedIntrospectionUid = uid; } + private boolean fiberException(String uid) { + return uid == null; + } + + private boolean failedIntrospectionNotRecorded(String uid) { + return !uid.equals(failedIntrospectionUid); + } + /** * Reset the number of introspect job failure to default. * diff --git a/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorDelegateStub.java b/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorDelegateStub.java index 42c04217890..53698649bd1 100644 --- a/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorDelegateStub.java +++ b/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorDelegateStub.java @@ -52,7 +52,7 @@ public PodAwaiterStepFactory getPodAwaiterStepFactory(String namespace) { @Override public JobAwaiterStepFactory getJobAwaiterStepFactory(String namespace) { - return new PassthroughJobAwaiterStepFactory(); + return new TestJobAwaiterStepFactory(); } @Override @@ -98,7 +98,7 @@ public Step waitForDelete(V1Pod pod, Step next) { } } - private class PassthroughJobAwaiterStepFactory implements JobAwaiterStepFactory { + private class TestJobAwaiterStepFactory implements JobAwaiterStepFactory { @Override public Step waitForReady(V1Job job, Step next) { if (isFailed(job) && "DeadlineExceeded".equals(getFailedReason(job))) { diff --git a/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java b/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java index 2d2bcab62cc..e401704826b 100644 --- a/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java +++ b/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java @@ -1424,10 +1424,10 @@ private void forceExceptionDuringProcessing() { } @Test - @Disabled - void whenExceptionDuringProcessing_sendAbortedEvent() { + void whenWebLogicCredentialsSecretRemoved_NullPointerExceptionAndAbortedEventNotGenerated() { + consoleHandlerMemento.ignoreMessage(NOT_STARTING_DOMAINUID_THREAD); DomainProcessorImpl.registerDomainPresenceInfo(new DomainPresenceInfo(domain)); - forceExceptionDuringProcessing(); + domain.getSpec().withWebLogicCredentialsSecret(null); int time = 0; for (int numRetries = 0; numRetries < DomainPresence.getDomainPresenceFailureRetryMaxCount(); numRetries++) { @@ -1436,7 +1436,7 @@ void whenExceptionDuringProcessing_sendAbortedEvent() { testSupport.setTime(time, TimeUnit.SECONDS); } - assertThat(getEvents().stream().anyMatch(this::isDomainProcessingAbortedEvent), is(true)); + assertThat(getEvents().stream().anyMatch(this::isDomainProcessingAbortedEvent), is(false)); } private List getEvents() { diff --git a/operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java b/operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java index a64c7bb7512..769313f50e6 100644 --- a/operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java +++ b/operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java @@ -129,7 +129,6 @@ class DomainIntrospectorJobTest { private static final String INFO_MESSAGE = "@[INFO] just letting you know"; public static final String TEST_VOLUME_NAME = "test"; private static final String JOB_UID = "FAILED_JOB"; - private final TerminalStep terminalStep = new TerminalStep(); private final Domain domain = createDomain(); private final DomainPresenceInfo domainPresenceInfo = createDomainPresenceInfo(domain); @@ -627,10 +626,6 @@ private V1Job asTimedoutJob(V1Job job) { return job; } - private long getIntrospectorJobActiveDeadlineSeconds() { - return TuningParameters.getInstance().getPodTuning().introspectorJobActiveDeadlineSeconds; - } - @Test void whenPreviousFailedJobWithImagePullErrorExistsAndMakeRightContinued_createNewJob() { ignoreIntrospectorFailureLogs(); @@ -879,7 +874,6 @@ void whenJobLogContainsSevereErrorAndStatusHasDifferentJobUID_incrementFailureCo assertThat(getUpdatedDomain().getStatus().getIntrospectJobFailureCount(), equalTo(2)); } - // create job // add job uid to status // do NOT create pod log From 2b4f1c657d8c7c81a4b96e8631a3900e7837e4a9 Mon Sep 17 00:00:00 2001 From: Russell Gold Date: Tue, 9 Nov 2021 11:54:48 -0500 Subject: [PATCH 4/8] cleanup --- .../operator/helpers/JobHelper.java | 184 ++++++------------ .../helpers/DomainIntrospectorJobTest.java | 49 ++--- 2 files changed, 79 insertions(+), 154 deletions(-) diff --git a/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java b/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java index 3c339738ec3..ff2a0c1538c 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java +++ b/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java @@ -51,10 +51,9 @@ import static oracle.kubernetes.operator.DomainSourceType.FromModel; import static oracle.kubernetes.operator.DomainStatusUpdater.createFailureCountStep; import static oracle.kubernetes.operator.DomainStatusUpdater.createFailureRelatedSteps; -import static oracle.kubernetes.operator.JobWatcher.getFailedReason; -import static oracle.kubernetes.operator.JobWatcher.isFailed; import static oracle.kubernetes.operator.LabelConstants.INTROSPECTION_DOMAIN_SPEC_GENERATION; import static oracle.kubernetes.operator.LabelConstants.INTROSPECTION_STATE_LABEL; +import static oracle.kubernetes.operator.ProcessingConstants.DOMAIN_INTROSPECTOR_JOB; import static oracle.kubernetes.operator.ProcessingConstants.DOMAIN_INTROSPECT_REQUESTED; import static oracle.kubernetes.operator.helpers.ConfigMapHelper.readExistingIntrospectorConfigMap; import static oracle.kubernetes.operator.logging.MessageKeys.INTROSPECTOR_JOB_FAILED; @@ -216,73 +215,18 @@ private class VerifyIntrospectorJobResponseStep extends DefaultResponseStep callResponse) { V1Job job = (V1Job) callResponse.getResult(); - if ((job != null) && (packet.get(ProcessingConstants.DOMAIN_INTROSPECTOR_JOB) == null)) { - packet.put(ProcessingConstants.DOMAIN_INTROSPECTOR_JOB, job); + if ((job != null) && (packet.get(DOMAIN_INTROSPECTOR_JOB) == null)) { + packet.put(DOMAIN_INTROSPECTOR_JOB, job); } - return doNext(verifyIntrospectorJobPod(info.getNamespace(), getNext()), packet); - } - - private Step verifyIntrospectorJobPod(String namespace, Step next) { - return new CallBuilder() - .withLabelSelectors(LabelConstants.JOBNAME_LABEL) - .listPodAsync(namespace, createReadJobPodResponse(next)); - } - - @Nonnull - private VerifyIntrospectorJobPodResponseStep createReadJobPodResponse(Step next) { - return new VerifyIntrospectorJobPodResponseStep<>(next); - } - - private class VerifyIntrospectorJobPodResponseStep extends ResponseStep { - VerifyIntrospectorJobPodResponseStep(Step next) { - super(next); - } - - @Override - public NextAction onSuccess(Packet packet, CallResponse callResponse) { - V1Pod pod = getJobPod(callResponse); - V1Job job = (V1Job) packet.get(ProcessingConstants.DOMAIN_INTROSPECTOR_JOB); - boolean hasImagePullError = checkJobPodForImagePullError(pod); - if (hasImagePullError) { - packet.put(DOMAIN_INTROSPECT_REQUESTED, ReadPodLogResponseStep.INTROSPECTION_FAILED); - } - - if (isKnownFailedJob(job) || hasImagePullError) { - return doContinueListOrNext(callResponse, packet, cleanUpAndReintrospect()); - } else if (isJobTimedout(job, pod)) { - return doContinueListOrNext(callResponse, packet, deleteJobAndStartNewIntrospection()); - } else if (job != null) { - return doContinueListOrNext(callResponse, packet, processIntrospectionResults()); - } else if (isIntrospectionNeeded(packet)) { - return doContinueListOrNext(callResponse, packet, createIntrospectionSteps()); - } else { - return doContinueListOrNext(callResponse, packet); - } - } - - private V1Pod getJobPod(CallResponse callResponse) { - return Optional.ofNullable( - callResponse.getResult()).map(V1PodList::getItems).orElseGet(Collections::emptyList).stream() - .filter(pod -> isIntrospectionJobPod(pod)).findAny().orElse(null); - } - - private boolean checkJobPodForImagePullError(V1Pod pod) { - return Optional.ofNullable(getJobPodContainerWaitingReason(pod)) - .map(s -> s.contains("ErrImagePull") || s.contains("ImagePullBackOff")) - .orElse(false); - } - - private String getJobPodContainerWaitingReason(V1Pod pod) { - return Optional.ofNullable(pod).map(V1Pod::getStatus) - .map(V1PodStatus::getContainerStatuses).map(statuses -> statuses.get(0)) - .map(V1ContainerStatus::getState).map(V1ContainerState::getWaiting) - .map(V1ContainerStateWaiting::getReason).orElse(null); - } - - private boolean isIntrospectionJobPod(V1Pod pod) { - return Optional.ofNullable(pod).map(V1Pod::getMetadata) - .map(meta -> meta.getName().startsWith(getJobName())).orElse(false); + if (isKnownFailedJob(job)) { + return doNext(cleanUpAndReintrospect(getNext()), packet); + } else if (job != null) { + return doNext(processIntrospectionResults(getNext()), packet); + } else if (isIntrospectionNeeded(packet)) { + return doNext(createIntrospectionSteps(getNext()), packet); + } else { + return doNext(packet); } } @@ -290,20 +234,6 @@ private boolean isKnownFailedJob(V1Job job) { return getUid(job).equals(getLastFailedUid()); } - private boolean isJobTimedout(V1Job introspectorJob, V1Pod pod) { - return Optional.ofNullable(introspectorJob) - .map(job -> isFailed(job) && isDeadlineExceeded(job, pod)).orElse(false); - } - - private boolean isDeadlineExceeded(V1Job job, V1Pod pod) { - return "DeadlineExceeded".equals(getFailedReason(job)) - || "DeadlineExceeded".equals(getPodStatusReason(pod)); - } - - private String getPodStatusReason(V1Pod pod) { - return Optional.ofNullable(pod).map(V1Pod::getStatus).map(V1PodStatus::getReason).orElse(null); - } - @Nonnull private String getUid(V1Job job) { return Optional.ofNullable(job).map(V1Job::getMetadata).map(V1ObjectMeta::getUid).orElse(""); @@ -314,27 +244,6 @@ private String getLastFailedUid() { return getDomain().getOrCreateStatus().getFailedIntrospectionUid(); } - private Step cleanUpAndReintrospect() { - return Step.chain(deleteIntrospectorJob(), createIntrospectionSteps()); - } - - private Step createIntrospectionSteps() { - return Step.chain( - readExistingIntrospectorConfigMap(getNamespace(), getDomainUid()), - startNewIntrospection(getNext())); - } - - private Step deleteJobAndStartNewIntrospection() { - return Step.chain( - deleteDomainIntrospectorJobStep(null), - startNewIntrospection(getNext())); - } - - @Nonnull - private Step startNewIntrospection(Step next) { - return Step.chain(createNewJob(), processIntrospectionResults(), next); - } - private boolean isIntrospectionNeeded(Packet packet) { return getDomainTopology() == null || isBringingUpNewDomain(packet) @@ -396,22 +305,31 @@ private String getIntrospectVersion() { } } - // Returns a chain of steps which attempt to read the introspection result into a config map. - private Step processIntrospectionResults() { + private Step cleanUpAndReintrospect(Step next) { + return Step.chain(deleteIntrospectorJob(), createIntrospectionSteps(next)); + } + + private Step createIntrospectionSteps(Step next) { return Step.chain( - waitForIntrospectionToComplete(), - findIntrospectorPodName(), - readNamedPodLog(), - deleteIntrospectorJob(), - createIntrospectorConfigMap() - ); + readExistingIntrospectorConfigMap(getNamespace(), getDomainUid()), + startNewIntrospection(next)); + } + + @Nonnull + private Step startNewIntrospection(Step next) { + return Step.chain(createNewJob(), processIntrospectionResults(next)); + } + + // Returns a chain of steps which read the job pod and decide how to handle it. + private Step processIntrospectionResults(Step next) { + return Step.chain(waitForIntrospectionToComplete(), verifyJobPod(), next); } private Step waitForIntrospectionToComplete() { return new WatchDomainIntrospectorJobReadyStep(); } - private Step findIntrospectorPodName() { + private Step verifyJobPod() { return new ReadDomainIntrospectorPodStep(); } @@ -471,7 +389,7 @@ private class ReadPodLogResponseStep extends ResponseStep { public NextAction onSuccess(Packet packet, CallResponse callResponse) { Optional.ofNullable(callResponse.getResult()).ifPresent(result -> processIntrospectionResult(packet, result)); - final V1Job domainIntrospectorJob = packet.getValue(ProcessingConstants.DOMAIN_INTROSPECTOR_JOB); + final V1Job domainIntrospectorJob = packet.getValue(DOMAIN_INTROSPECTOR_JOB); if (JobWatcher.isComplete(domainIntrospectorJob)) { return doNext(packet); } else { @@ -640,30 +558,54 @@ private class PodListResponseStep extends ResponseStep { @Override public NextAction onSuccess(Packet packet, CallResponse callResponse) { - Optional.ofNullable(callResponse.getResult()) + final V1Pod jobPod + = Optional.ofNullable(callResponse.getResult()) .map(V1PodList::getItems) .orElseGet(Collections::emptyList) .stream() - .map(this::getName) - .filter(this::isJobPodName) + .filter(this::isJobPod) .findFirst() - .ifPresent(name -> recordJobPodName(packet, name)); + .orElse(null); - return doContinueListOrNext(callResponse, packet); + if (jobPod == null) { + return doContinueListOrNext(callResponse, packet, processIntrospectorPodLog(getNext())); + } else if (hasImagePullFailure(jobPod)) { + return doNext(cleanUpAndReintrospect(getNext()), packet); + } else { + recordJobPodName(packet, getName(jobPod)); + return doNext(processIntrospectorPodLog(getNext()), packet); + } + } + + // Returns a chain of steps which read the pod log and create a config map. + private Step processIntrospectorPodLog(Step next) { + return Step.chain(readNamedPodLog(), deleteIntrospectorJob(), createIntrospectorConfigMap(), next); } private String getName(V1Pod pod) { return Optional.of(pod).map(V1Pod::getMetadata).map(V1ObjectMeta::getName).orElse(""); } - private boolean isJobPodName(String podName) { - return podName.startsWith(getJobName()); + private boolean isJobPod(V1Pod pod) { + return getName(pod).startsWith(getJobName()); + } + + private boolean hasImagePullFailure(V1Pod pod) { + return Optional.ofNullable(getJobPodContainerWaitingReason(pod)) + .map(s -> s.contains("ErrImagePull") || s.contains("ImagePullBackOff")) + .orElse(false); + } + + private String getJobPodContainerWaitingReason(V1Pod pod) { + return Optional.ofNullable(pod).map(V1Pod::getStatus) + .map(V1PodStatus::getContainerStatuses).map(statuses -> statuses.get(0)) + .map(V1ContainerStatus::getState).map(V1ContainerState::getWaiting) + .map(V1ContainerStateWaiting::getReason).orElse(null); } private void recordJobPodName(Packet packet, String podName) { packet.put(ProcessingConstants.JOB_POD_NAME, podName); } - } } @@ -686,7 +628,7 @@ private static void logIntrospectorFailure(Packet packet, V1Job domainIntrospect static void logJobDeleted(String domainUid, String namespace, String jobName, Packet packet) { V1Job domainIntrospectorJob = - (V1Job) packet.remove(ProcessingConstants.DOMAIN_INTROSPECTOR_JOB); + (V1Job) packet.remove(DOMAIN_INTROSPECTOR_JOB); packet.remove(ProcessingConstants.INTROSPECTOR_JOB_FAILURE_LOGGED); if (domainIntrospectorJob != null diff --git a/operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java b/operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java index 769313f50e6..aaac25fc064 100644 --- a/operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java +++ b/operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java @@ -39,7 +39,6 @@ import oracle.kubernetes.operator.TuningParameters; import oracle.kubernetes.operator.calls.unprocessable.UnrecoverableErrorBuilderImpl; import oracle.kubernetes.operator.rest.ScanCacheStub; -import oracle.kubernetes.operator.steps.WatchDomainIntrospectorJobReadyStep; import oracle.kubernetes.operator.wlsconfig.WlsClusterConfig; import oracle.kubernetes.operator.wlsconfig.WlsDomainConfig; import oracle.kubernetes.operator.wlsconfig.WlsServerConfig; @@ -129,6 +128,7 @@ class DomainIntrospectorJobTest { private static final String INFO_MESSAGE = "@[INFO] just letting you know"; public static final String TEST_VOLUME_NAME = "test"; private static final String JOB_UID = "FAILED_JOB"; + private static final String JOB_NAME = UID + "-introspector"; private final TerminalStep terminalStep = new TerminalStep(); private final Domain domain = createDomain(); private final DomainPresenceInfo domainPresenceInfo = createDomainPresenceInfo(domain); @@ -599,33 +599,6 @@ void whenPreviousFailedJobExists_createNewJob() { assertThat(affectedJob, notNullValue()); } - @Test - void whenPreviousTimedoutJobExists_createNewJob() { - ignoreIntrospectorFailureLogs(); - ignoreJobCreatedAndDeletedLogs(); - definePreviousTimedoutIntrospection(); - testSupport.doOnCreate(JOB, this::recordJob); - - testSupport.runSteps(Step.chain(new WatchDomainIntrospectorJobReadyStep(), - JobHelper.createIntrospectionStartStep(null))); - - assertThat(affectedJob, notNullValue()); - } - - private void definePreviousTimedoutIntrospection() { - defineTimedoutIntrospection(); - } - - private void defineTimedoutIntrospection() { - testSupport.defineResources(asTimedoutJob(createIntrospectorJob("TIMEDOUT_JOB"))); - } - - private V1Job asTimedoutJob(V1Job job) { - job.setStatus(new V1JobStatus().addConditionsItem(new V1JobCondition().status("True").type("Failed") - .reason("DeadlineExceeded"))); - return job; - } - @Test void whenPreviousFailedJobWithImagePullErrorExistsAndMakeRightContinued_createNewJob() { ignoreIntrospectorFailureLogs(); @@ -633,28 +606,37 @@ void whenPreviousFailedJobWithImagePullErrorExistsAndMakeRightContinued_createNe testSupport.addToPacket(DOMAIN_TOPOLOGY, createDomainConfig("cluster-1")); defineFailedIntrospectionWithImagePullError("ErrImagePull"); testSupport.doOnCreate(JOB, this::recordJob); + testSupport.doAfterCall(JOB, "deleteJob", this::replaceFailedJobPodWithSuccess); testSupport.runSteps(JobHelper.createIntrospectionStartStep(null)); assertThat(affectedJob, notNullValue()); } + private void replaceFailedJobPodWithSuccess() { + testSupport.deleteResources(createIntrospectorJobPod()); + testSupport.defineResources(createIntrospectorJobPod()); + } + private void defineFailedIntrospectionWithImagePullError(String imagePullError) { testSupport.defineResources(asFailedJobWithBackoffLimitExceeded(createIntrospectorJob())); testSupport.defineResources(asFailedJobPodWithImagePullError(createIntrospectorJobPod(), imagePullError)); } private V1Pod asFailedJobPodWithImagePullError(V1Pod introspectorJobPod, String imagePullError) { - List statuses = Arrays.asList(new V1ContainerStatus().state(new V1ContainerState() - .waiting(new V1ContainerStateWaiting().reason(imagePullError)))); + List statuses = List.of(createImagePullContainerStatus(imagePullError)); return introspectorJobPod.status(new V1PodStatus().containerStatuses(statuses)); } + private V1ContainerStatus createImagePullContainerStatus(String imagePullError) { + return new V1ContainerStatus().state( + new V1ContainerState().waiting(new V1ContainerStateWaiting().reason(imagePullError))); + } + private V1Pod createIntrospectorJobPod() { - String jobName = UID + "-introspector"; Map labels = new HashMap<>(); - labels.put(LabelConstants.JOBNAME_LABEL, jobName); - return new V1Pod().metadata(new V1ObjectMeta().name(jobName).labels(labels).namespace(NS)); + labels.put(LabelConstants.JOBNAME_LABEL, JOB_NAME); + return new V1Pod().metadata(new V1ObjectMeta().name(JOB_NAME).labels(labels).namespace(NS)); } private V1Job asFailedJobWithBackoffLimitExceeded(V1Job job) { @@ -670,6 +652,7 @@ void whenPreviousFailedJobWithImagePullBackoffErrorExistsAndMakeRightContinued_c testSupport.addToPacket(DOMAIN_TOPOLOGY, createDomainConfig("cluster-1")); defineFailedIntrospectionWithImagePullError("ImagePullBackOff"); testSupport.doOnCreate(JOB, this::recordJob); + testSupport.doAfterCall(JOB, "deleteJob", this::replaceFailedJobPodWithSuccess); testSupport.runSteps(JobHelper.createIntrospectionStartStep(null)); From 14be2e6dc5bad8d29b6df1ed1ec2acf1960b58da Mon Sep 17 00:00:00 2001 From: Anil Kedia Date: Tue, 9 Nov 2021 18:03:19 +0000 Subject: [PATCH 5/8] Revert "cleanup". This reverts commit 2b4f1c657d8c7c81a4b96e8631a3900e7837e4a9. --- .../operator/helpers/JobHelper.java | 184 ++++++++++++------ .../helpers/DomainIntrospectorJobTest.java | 49 +++-- 2 files changed, 154 insertions(+), 79 deletions(-) diff --git a/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java b/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java index ff2a0c1538c..3c339738ec3 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java +++ b/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java @@ -51,9 +51,10 @@ import static oracle.kubernetes.operator.DomainSourceType.FromModel; import static oracle.kubernetes.operator.DomainStatusUpdater.createFailureCountStep; import static oracle.kubernetes.operator.DomainStatusUpdater.createFailureRelatedSteps; +import static oracle.kubernetes.operator.JobWatcher.getFailedReason; +import static oracle.kubernetes.operator.JobWatcher.isFailed; import static oracle.kubernetes.operator.LabelConstants.INTROSPECTION_DOMAIN_SPEC_GENERATION; import static oracle.kubernetes.operator.LabelConstants.INTROSPECTION_STATE_LABEL; -import static oracle.kubernetes.operator.ProcessingConstants.DOMAIN_INTROSPECTOR_JOB; import static oracle.kubernetes.operator.ProcessingConstants.DOMAIN_INTROSPECT_REQUESTED; import static oracle.kubernetes.operator.helpers.ConfigMapHelper.readExistingIntrospectorConfigMap; import static oracle.kubernetes.operator.logging.MessageKeys.INTROSPECTOR_JOB_FAILED; @@ -215,18 +216,73 @@ private class VerifyIntrospectorJobResponseStep extends DefaultResponseStep callResponse) { V1Job job = (V1Job) callResponse.getResult(); - if ((job != null) && (packet.get(DOMAIN_INTROSPECTOR_JOB) == null)) { - packet.put(DOMAIN_INTROSPECTOR_JOB, job); + if ((job != null) && (packet.get(ProcessingConstants.DOMAIN_INTROSPECTOR_JOB) == null)) { + packet.put(ProcessingConstants.DOMAIN_INTROSPECTOR_JOB, job); } - if (isKnownFailedJob(job)) { - return doNext(cleanUpAndReintrospect(getNext()), packet); - } else if (job != null) { - return doNext(processIntrospectionResults(getNext()), packet); - } else if (isIntrospectionNeeded(packet)) { - return doNext(createIntrospectionSteps(getNext()), packet); - } else { - return doNext(packet); + return doNext(verifyIntrospectorJobPod(info.getNamespace(), getNext()), packet); + } + + private Step verifyIntrospectorJobPod(String namespace, Step next) { + return new CallBuilder() + .withLabelSelectors(LabelConstants.JOBNAME_LABEL) + .listPodAsync(namespace, createReadJobPodResponse(next)); + } + + @Nonnull + private VerifyIntrospectorJobPodResponseStep createReadJobPodResponse(Step next) { + return new VerifyIntrospectorJobPodResponseStep<>(next); + } + + private class VerifyIntrospectorJobPodResponseStep extends ResponseStep { + VerifyIntrospectorJobPodResponseStep(Step next) { + super(next); + } + + @Override + public NextAction onSuccess(Packet packet, CallResponse callResponse) { + V1Pod pod = getJobPod(callResponse); + V1Job job = (V1Job) packet.get(ProcessingConstants.DOMAIN_INTROSPECTOR_JOB); + boolean hasImagePullError = checkJobPodForImagePullError(pod); + if (hasImagePullError) { + packet.put(DOMAIN_INTROSPECT_REQUESTED, ReadPodLogResponseStep.INTROSPECTION_FAILED); + } + + if (isKnownFailedJob(job) || hasImagePullError) { + return doContinueListOrNext(callResponse, packet, cleanUpAndReintrospect()); + } else if (isJobTimedout(job, pod)) { + return doContinueListOrNext(callResponse, packet, deleteJobAndStartNewIntrospection()); + } else if (job != null) { + return doContinueListOrNext(callResponse, packet, processIntrospectionResults()); + } else if (isIntrospectionNeeded(packet)) { + return doContinueListOrNext(callResponse, packet, createIntrospectionSteps()); + } else { + return doContinueListOrNext(callResponse, packet); + } + } + + private V1Pod getJobPod(CallResponse callResponse) { + return Optional.ofNullable( + callResponse.getResult()).map(V1PodList::getItems).orElseGet(Collections::emptyList).stream() + .filter(pod -> isIntrospectionJobPod(pod)).findAny().orElse(null); + } + + private boolean checkJobPodForImagePullError(V1Pod pod) { + return Optional.ofNullable(getJobPodContainerWaitingReason(pod)) + .map(s -> s.contains("ErrImagePull") || s.contains("ImagePullBackOff")) + .orElse(false); + } + + private String getJobPodContainerWaitingReason(V1Pod pod) { + return Optional.ofNullable(pod).map(V1Pod::getStatus) + .map(V1PodStatus::getContainerStatuses).map(statuses -> statuses.get(0)) + .map(V1ContainerStatus::getState).map(V1ContainerState::getWaiting) + .map(V1ContainerStateWaiting::getReason).orElse(null); + } + + private boolean isIntrospectionJobPod(V1Pod pod) { + return Optional.ofNullable(pod).map(V1Pod::getMetadata) + .map(meta -> meta.getName().startsWith(getJobName())).orElse(false); } } @@ -234,6 +290,20 @@ private boolean isKnownFailedJob(V1Job job) { return getUid(job).equals(getLastFailedUid()); } + private boolean isJobTimedout(V1Job introspectorJob, V1Pod pod) { + return Optional.ofNullable(introspectorJob) + .map(job -> isFailed(job) && isDeadlineExceeded(job, pod)).orElse(false); + } + + private boolean isDeadlineExceeded(V1Job job, V1Pod pod) { + return "DeadlineExceeded".equals(getFailedReason(job)) + || "DeadlineExceeded".equals(getPodStatusReason(pod)); + } + + private String getPodStatusReason(V1Pod pod) { + return Optional.ofNullable(pod).map(V1Pod::getStatus).map(V1PodStatus::getReason).orElse(null); + } + @Nonnull private String getUid(V1Job job) { return Optional.ofNullable(job).map(V1Job::getMetadata).map(V1ObjectMeta::getUid).orElse(""); @@ -244,6 +314,27 @@ private String getLastFailedUid() { return getDomain().getOrCreateStatus().getFailedIntrospectionUid(); } + private Step cleanUpAndReintrospect() { + return Step.chain(deleteIntrospectorJob(), createIntrospectionSteps()); + } + + private Step createIntrospectionSteps() { + return Step.chain( + readExistingIntrospectorConfigMap(getNamespace(), getDomainUid()), + startNewIntrospection(getNext())); + } + + private Step deleteJobAndStartNewIntrospection() { + return Step.chain( + deleteDomainIntrospectorJobStep(null), + startNewIntrospection(getNext())); + } + + @Nonnull + private Step startNewIntrospection(Step next) { + return Step.chain(createNewJob(), processIntrospectionResults(), next); + } + private boolean isIntrospectionNeeded(Packet packet) { return getDomainTopology() == null || isBringingUpNewDomain(packet) @@ -305,31 +396,22 @@ private String getIntrospectVersion() { } } - private Step cleanUpAndReintrospect(Step next) { - return Step.chain(deleteIntrospectorJob(), createIntrospectionSteps(next)); - } - - private Step createIntrospectionSteps(Step next) { + // Returns a chain of steps which attempt to read the introspection result into a config map. + private Step processIntrospectionResults() { return Step.chain( - readExistingIntrospectorConfigMap(getNamespace(), getDomainUid()), - startNewIntrospection(next)); - } - - @Nonnull - private Step startNewIntrospection(Step next) { - return Step.chain(createNewJob(), processIntrospectionResults(next)); - } - - // Returns a chain of steps which read the job pod and decide how to handle it. - private Step processIntrospectionResults(Step next) { - return Step.chain(waitForIntrospectionToComplete(), verifyJobPod(), next); + waitForIntrospectionToComplete(), + findIntrospectorPodName(), + readNamedPodLog(), + deleteIntrospectorJob(), + createIntrospectorConfigMap() + ); } private Step waitForIntrospectionToComplete() { return new WatchDomainIntrospectorJobReadyStep(); } - private Step verifyJobPod() { + private Step findIntrospectorPodName() { return new ReadDomainIntrospectorPodStep(); } @@ -389,7 +471,7 @@ private class ReadPodLogResponseStep extends ResponseStep { public NextAction onSuccess(Packet packet, CallResponse callResponse) { Optional.ofNullable(callResponse.getResult()).ifPresent(result -> processIntrospectionResult(packet, result)); - final V1Job domainIntrospectorJob = packet.getValue(DOMAIN_INTROSPECTOR_JOB); + final V1Job domainIntrospectorJob = packet.getValue(ProcessingConstants.DOMAIN_INTROSPECTOR_JOB); if (JobWatcher.isComplete(domainIntrospectorJob)) { return doNext(packet); } else { @@ -558,54 +640,30 @@ private class PodListResponseStep extends ResponseStep { @Override public NextAction onSuccess(Packet packet, CallResponse callResponse) { - final V1Pod jobPod - = Optional.ofNullable(callResponse.getResult()) + Optional.ofNullable(callResponse.getResult()) .map(V1PodList::getItems) .orElseGet(Collections::emptyList) .stream() - .filter(this::isJobPod) + .map(this::getName) + .filter(this::isJobPodName) .findFirst() - .orElse(null); + .ifPresent(name -> recordJobPodName(packet, name)); - if (jobPod == null) { - return doContinueListOrNext(callResponse, packet, processIntrospectorPodLog(getNext())); - } else if (hasImagePullFailure(jobPod)) { - return doNext(cleanUpAndReintrospect(getNext()), packet); - } else { - recordJobPodName(packet, getName(jobPod)); - return doNext(processIntrospectorPodLog(getNext()), packet); - } - } - - // Returns a chain of steps which read the pod log and create a config map. - private Step processIntrospectorPodLog(Step next) { - return Step.chain(readNamedPodLog(), deleteIntrospectorJob(), createIntrospectorConfigMap(), next); + return doContinueListOrNext(callResponse, packet); } private String getName(V1Pod pod) { return Optional.of(pod).map(V1Pod::getMetadata).map(V1ObjectMeta::getName).orElse(""); } - private boolean isJobPod(V1Pod pod) { - return getName(pod).startsWith(getJobName()); - } - - private boolean hasImagePullFailure(V1Pod pod) { - return Optional.ofNullable(getJobPodContainerWaitingReason(pod)) - .map(s -> s.contains("ErrImagePull") || s.contains("ImagePullBackOff")) - .orElse(false); - } - - private String getJobPodContainerWaitingReason(V1Pod pod) { - return Optional.ofNullable(pod).map(V1Pod::getStatus) - .map(V1PodStatus::getContainerStatuses).map(statuses -> statuses.get(0)) - .map(V1ContainerStatus::getState).map(V1ContainerState::getWaiting) - .map(V1ContainerStateWaiting::getReason).orElse(null); + private boolean isJobPodName(String podName) { + return podName.startsWith(getJobName()); } private void recordJobPodName(Packet packet, String podName) { packet.put(ProcessingConstants.JOB_POD_NAME, podName); } + } } @@ -628,7 +686,7 @@ private static void logIntrospectorFailure(Packet packet, V1Job domainIntrospect static void logJobDeleted(String domainUid, String namespace, String jobName, Packet packet) { V1Job domainIntrospectorJob = - (V1Job) packet.remove(DOMAIN_INTROSPECTOR_JOB); + (V1Job) packet.remove(ProcessingConstants.DOMAIN_INTROSPECTOR_JOB); packet.remove(ProcessingConstants.INTROSPECTOR_JOB_FAILURE_LOGGED); if (domainIntrospectorJob != null diff --git a/operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java b/operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java index aaac25fc064..769313f50e6 100644 --- a/operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java +++ b/operator/src/test/java/oracle/kubernetes/operator/helpers/DomainIntrospectorJobTest.java @@ -39,6 +39,7 @@ import oracle.kubernetes.operator.TuningParameters; import oracle.kubernetes.operator.calls.unprocessable.UnrecoverableErrorBuilderImpl; import oracle.kubernetes.operator.rest.ScanCacheStub; +import oracle.kubernetes.operator.steps.WatchDomainIntrospectorJobReadyStep; import oracle.kubernetes.operator.wlsconfig.WlsClusterConfig; import oracle.kubernetes.operator.wlsconfig.WlsDomainConfig; import oracle.kubernetes.operator.wlsconfig.WlsServerConfig; @@ -128,7 +129,6 @@ class DomainIntrospectorJobTest { private static final String INFO_MESSAGE = "@[INFO] just letting you know"; public static final String TEST_VOLUME_NAME = "test"; private static final String JOB_UID = "FAILED_JOB"; - private static final String JOB_NAME = UID + "-introspector"; private final TerminalStep terminalStep = new TerminalStep(); private final Domain domain = createDomain(); private final DomainPresenceInfo domainPresenceInfo = createDomainPresenceInfo(domain); @@ -599,6 +599,33 @@ void whenPreviousFailedJobExists_createNewJob() { assertThat(affectedJob, notNullValue()); } + @Test + void whenPreviousTimedoutJobExists_createNewJob() { + ignoreIntrospectorFailureLogs(); + ignoreJobCreatedAndDeletedLogs(); + definePreviousTimedoutIntrospection(); + testSupport.doOnCreate(JOB, this::recordJob); + + testSupport.runSteps(Step.chain(new WatchDomainIntrospectorJobReadyStep(), + JobHelper.createIntrospectionStartStep(null))); + + assertThat(affectedJob, notNullValue()); + } + + private void definePreviousTimedoutIntrospection() { + defineTimedoutIntrospection(); + } + + private void defineTimedoutIntrospection() { + testSupport.defineResources(asTimedoutJob(createIntrospectorJob("TIMEDOUT_JOB"))); + } + + private V1Job asTimedoutJob(V1Job job) { + job.setStatus(new V1JobStatus().addConditionsItem(new V1JobCondition().status("True").type("Failed") + .reason("DeadlineExceeded"))); + return job; + } + @Test void whenPreviousFailedJobWithImagePullErrorExistsAndMakeRightContinued_createNewJob() { ignoreIntrospectorFailureLogs(); @@ -606,37 +633,28 @@ void whenPreviousFailedJobWithImagePullErrorExistsAndMakeRightContinued_createNe testSupport.addToPacket(DOMAIN_TOPOLOGY, createDomainConfig("cluster-1")); defineFailedIntrospectionWithImagePullError("ErrImagePull"); testSupport.doOnCreate(JOB, this::recordJob); - testSupport.doAfterCall(JOB, "deleteJob", this::replaceFailedJobPodWithSuccess); testSupport.runSteps(JobHelper.createIntrospectionStartStep(null)); assertThat(affectedJob, notNullValue()); } - private void replaceFailedJobPodWithSuccess() { - testSupport.deleteResources(createIntrospectorJobPod()); - testSupport.defineResources(createIntrospectorJobPod()); - } - private void defineFailedIntrospectionWithImagePullError(String imagePullError) { testSupport.defineResources(asFailedJobWithBackoffLimitExceeded(createIntrospectorJob())); testSupport.defineResources(asFailedJobPodWithImagePullError(createIntrospectorJobPod(), imagePullError)); } private V1Pod asFailedJobPodWithImagePullError(V1Pod introspectorJobPod, String imagePullError) { - List statuses = List.of(createImagePullContainerStatus(imagePullError)); + List statuses = Arrays.asList(new V1ContainerStatus().state(new V1ContainerState() + .waiting(new V1ContainerStateWaiting().reason(imagePullError)))); return introspectorJobPod.status(new V1PodStatus().containerStatuses(statuses)); } - private V1ContainerStatus createImagePullContainerStatus(String imagePullError) { - return new V1ContainerStatus().state( - new V1ContainerState().waiting(new V1ContainerStateWaiting().reason(imagePullError))); - } - private V1Pod createIntrospectorJobPod() { + String jobName = UID + "-introspector"; Map labels = new HashMap<>(); - labels.put(LabelConstants.JOBNAME_LABEL, JOB_NAME); - return new V1Pod().metadata(new V1ObjectMeta().name(JOB_NAME).labels(labels).namespace(NS)); + labels.put(LabelConstants.JOBNAME_LABEL, jobName); + return new V1Pod().metadata(new V1ObjectMeta().name(jobName).labels(labels).namespace(NS)); } private V1Job asFailedJobWithBackoffLimitExceeded(V1Job job) { @@ -652,7 +670,6 @@ void whenPreviousFailedJobWithImagePullBackoffErrorExistsAndMakeRightContinued_c testSupport.addToPacket(DOMAIN_TOPOLOGY, createDomainConfig("cluster-1")); defineFailedIntrospectionWithImagePullError("ImagePullBackOff"); testSupport.doOnCreate(JOB, this::recordJob); - testSupport.doAfterCall(JOB, "deleteJob", this::replaceFailedJobPodWithSuccess); testSupport.runSteps(JobHelper.createIntrospectionStartStep(null)); From 47826d04486a4380e0da459fc3e0dca9ce7e21fd Mon Sep 17 00:00:00 2001 From: Anil Kedia Date: Tue, 9 Nov 2021 18:28:48 +0000 Subject: [PATCH 6/8] Modified the test for introspector timeout and changed JobHelper to not use unitTestAdaptor method. --- .../oracle/kubernetes/operator/helpers/JobHelper.java | 4 +--- .../kubernetes/operator/DomainProcessorTest.java | 10 ++++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java b/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java index 3c339738ec3..3130847e191 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java +++ b/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java @@ -248,10 +248,8 @@ public NextAction onSuccess(Packet packet, CallResponse callResponse) packet.put(DOMAIN_INTROSPECT_REQUESTED, ReadPodLogResponseStep.INTROSPECTION_FAILED); } - if (isKnownFailedJob(job) || hasImagePullError) { + if (isKnownFailedJob(job) || hasImagePullError || isJobTimedout(job, pod)) { return doContinueListOrNext(callResponse, packet, cleanUpAndReintrospect()); - } else if (isJobTimedout(job, pod)) { - return doContinueListOrNext(callResponse, packet, deleteJobAndStartNewIntrospection()); } else if (job != null) { return doContinueListOrNext(callResponse, packet, processIntrospectionResults()); } else if (isIntrospectionNeeded(packet)) { diff --git a/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java b/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java index e401704826b..c3be9ad3a27 100644 --- a/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java +++ b/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java @@ -74,6 +74,7 @@ import oracle.kubernetes.weblogic.domain.model.DomainStatus; import oracle.kubernetes.weblogic.domain.model.ManagedServer; import oracle.kubernetes.weblogic.domain.model.ServerStatus; +import org.jetbrains.annotations.NotNull; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; @@ -101,6 +102,7 @@ import static oracle.kubernetes.operator.WebLogicConstants.SHUTDOWN_STATE; import static oracle.kubernetes.operator.helpers.KubernetesTestSupport.CONFIG_MAP; import static oracle.kubernetes.operator.helpers.KubernetesTestSupport.DOMAIN; +import static oracle.kubernetes.operator.helpers.KubernetesTestSupport.JOB; import static oracle.kubernetes.operator.helpers.KubernetesTestSupport.POD; import static oracle.kubernetes.operator.helpers.KubernetesTestSupport.SERVICE; import static oracle.kubernetes.operator.helpers.SecretHelper.PASSWORD_KEY; @@ -851,6 +853,14 @@ void whenIntrospectionJobTimedout_failureCountIncremented() throws Exception { new DomainPresenceInfo(newDomain)).interrupt(); makeRight.execute(); assertThat(newDomain.getStatus().getIntrospectJobFailureCount(), is(1)); + testSupport.setTime(10, TimeUnit.SECONDS); + makeRight.execute(); + assertThat(getJob().getSpec().getActiveDeadlineSeconds(), is(240L)); + } + + @NotNull + private V1Job getJob() { + return (V1Job) testSupport.getResources(JOB).stream().findFirst().get(); } V1JobStatus createTimedoutStatus() { From 993dc36e93e1758e0d95ceb0922528d12823b618 Mon Sep 17 00:00:00 2001 From: Russell Gold Date: Tue, 9 Nov 2021 15:16:57 -0500 Subject: [PATCH 7/8] fix job timeout --- .../kubernetes/operator/JobWatcher.java | 6 ++- .../operator/helpers/JobHelper.java | 2 +- .../operator/DomainProcessorTest.java | 41 +++++++++++++++---- 3 files changed, 40 insertions(+), 9 deletions(-) diff --git a/operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java b/operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java index 569b15372f5..7d03f05330b 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java +++ b/operator/src/main/java/oracle/kubernetes/operator/JobWatcher.java @@ -306,7 +306,7 @@ void updatePacket(Packet packet, V1Job job) { // be available for reading @Override boolean shouldTerminateFiber(V1Job job) { - return isFailed(job) && ("DeadlineExceeded".equals(getFailedReason(job))); + return isJobTimedOut(job); } // create an exception to terminate the fiber @@ -336,6 +336,10 @@ public NextAction onSuccess(Packet packet, CallResponse callResponse) { } } + public static boolean isJobTimedOut(V1Job job) { + return isFailed(job) && ("DeadlineExceeded".equals(getFailedReason(job))); + } + static class DeadlineExceededException extends Exception { final V1Job job; diff --git a/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java b/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java index ff2a0c1538c..1e725335538 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java +++ b/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java @@ -219,7 +219,7 @@ public NextAction onSuccess(Packet packet, CallResponse callResponse) { packet.put(DOMAIN_INTROSPECTOR_JOB, job); } - if (isKnownFailedJob(job)) { + if (isKnownFailedJob(job) || JobWatcher.isJobTimedOut(job)) { return doNext(cleanUpAndReintrospect(getNext()), packet); } else if (job != null) { return doNext(processIntrospectionResults(getNext()), packet); diff --git a/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java b/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java index e401704826b..bd4babdca0c 100644 --- a/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java +++ b/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java @@ -74,6 +74,7 @@ import oracle.kubernetes.weblogic.domain.model.DomainStatus; import oracle.kubernetes.weblogic.domain.model.ManagedServer; import oracle.kubernetes.weblogic.domain.model.ServerStatus; +import org.jetbrains.annotations.NotNull; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; @@ -101,6 +102,7 @@ import static oracle.kubernetes.operator.WebLogicConstants.SHUTDOWN_STATE; import static oracle.kubernetes.operator.helpers.KubernetesTestSupport.CONFIG_MAP; import static oracle.kubernetes.operator.helpers.KubernetesTestSupport.DOMAIN; +import static oracle.kubernetes.operator.helpers.KubernetesTestSupport.JOB; import static oracle.kubernetes.operator.helpers.KubernetesTestSupport.POD; import static oracle.kubernetes.operator.helpers.KubernetesTestSupport.SERVICE; import static oracle.kubernetes.operator.helpers.SecretHelper.PASSWORD_KEY; @@ -839,21 +841,44 @@ void whenIntrospectionJobNotComplete_waitForIt() throws Exception { assertThat(processorDelegate.waitedForIntrospection(), is(true)); } + @Test - void whenIntrospectionJobTimedout_failureCountIncremented() throws Exception { + void whenIntrospectionJobTimedOut_failureCountIncremented() throws Exception { consoleHandlerMemento.ignoringLoggedExceptions(RuntimeException.class); consoleHandlerMemento.ignoreMessage(MessageKeys.NOT_STARTING_DOMAINUID_THREAD); establishPreviousIntrospection(null); - jobStatus = createTimedoutStatus(); - + jobStatus = createTimedOutStatus(); domainConfigurator.withIntrospectVersion(NEW_INTROSPECTION_STATE); - MakeRightDomainOperation makeRight = this.processor.createMakeRightOperation( - new DomainPresenceInfo(newDomain)).interrupt(); - makeRight.execute(); + processor.createMakeRightOperation(new DomainPresenceInfo(newDomain)).interrupt().execute(); + assertThat(newDomain.getStatus().getIntrospectJobFailureCount(), is(1)); } - V1JobStatus createTimedoutStatus() { + @Test + void whenIntrospectionJobTimedOut_activeDeadlineIncremented() throws Exception { + consoleHandlerMemento.ignoringLoggedExceptions(RuntimeException.class); + consoleHandlerMemento.ignoreMessage(MessageKeys.NOT_STARTING_DOMAINUID_THREAD); + establishPreviousIntrospection(null); + jobStatus = createTimedOutStatus(); + domainConfigurator.withIntrospectVersion(NEW_INTROSPECTION_STATE); + processor.createMakeRightOperation(new DomainPresenceInfo(newDomain)).interrupt().execute(); + + executeScheduledRetry(); + + assertThat(getJob().getSpec().getActiveDeadlineSeconds(), is(240L)); + } + + private void executeScheduledRetry() { + testSupport.setTime(10, TimeUnit.SECONDS); + } + + + @NotNull + private V1Job getJob() { + return (V1Job) testSupport.getResources(JOB).stream().findFirst().get(); + } + + V1JobStatus createTimedOutStatus() { return new V1JobStatus().addConditionsItem(new V1JobCondition().status("True").type("Failed") .reason("DeadlineExceeded")); } @@ -904,6 +929,8 @@ private String defineTopology() throws JsonProcessingException { return IntrospectionTestUtils.createTopologyYaml(createDomainConfig()); } + // case 1: job was able to pull, time out during introspection: pod will have DEADLINE_EXCEEDED + @Test void afterIntrospection_introspectorConfigMapHasUpToDateLabel() throws Exception { establishPreviousIntrospection(null); From bd39be80f3dbc011b5c8150abf266a0d51bec801 Mon Sep 17 00:00:00 2001 From: Anil Kedia Date: Wed, 10 Nov 2021 00:12:33 +0000 Subject: [PATCH 8/8] Added unit test to check for introspector job pod timed out status and change to JobHelper to make the test pass. --- .../operator/helpers/JobHelper.java | 10 ++- .../operator/DomainProcessorTest.java | 70 +++++++++++++++++-- 2 files changed, 75 insertions(+), 5 deletions(-) diff --git a/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java b/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java index 1e725335538..ee7268247eb 100644 --- a/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java +++ b/operator/src/main/java/oracle/kubernetes/operator/helpers/JobHelper.java @@ -569,7 +569,7 @@ public NextAction onSuccess(Packet packet, CallResponse callResponse) if (jobPod == null) { return doContinueListOrNext(callResponse, packet, processIntrospectorPodLog(getNext())); - } else if (hasImagePullFailure(jobPod)) { + } else if (hasImagePullFailure(jobPod) || isJobPodTimedOut(jobPod)) { return doNext(cleanUpAndReintrospect(getNext()), packet); } else { recordJobPodName(packet, getName(jobPod)); @@ -577,6 +577,14 @@ public NextAction onSuccess(Packet packet, CallResponse callResponse) } } + private boolean isJobPodTimedOut(V1Pod jobPod) { + return "DeadlineExceeded".equals(getJobPodStatusReason(jobPod)); + } + + private String getJobPodStatusReason(V1Pod jobPod) { + return Optional.ofNullable(jobPod.getStatus()).map(V1PodStatus::getReason).orElse(null); + } + // Returns a chain of steps which read the pod log and create a config map. private Step processIntrospectorPodLog(Step next) { return Step.chain(readNamedPodLog(), deleteIntrospectorJob(), createIntrospectorConfigMap(), next); diff --git a/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java b/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java index bd4babdca0c..a3d07d0dc7a 100644 --- a/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java +++ b/operator/src/test/java/oracle/kubernetes/operator/DomainProcessorTest.java @@ -98,6 +98,7 @@ import static oracle.kubernetes.operator.LabelConstants.DOMAINUID_LABEL; import static oracle.kubernetes.operator.LabelConstants.INTROSPECTION_STATE_LABEL; import static oracle.kubernetes.operator.LabelConstants.SERVERNAME_LABEL; +import static oracle.kubernetes.operator.ProcessingConstants.DOMAIN_INTROSPECTOR_JOB; import static oracle.kubernetes.operator.WebLogicConstants.RUNNING_STATE; import static oracle.kubernetes.operator.WebLogicConstants.SHUTDOWN_STATE; import static oracle.kubernetes.operator.helpers.KubernetesTestSupport.CONFIG_MAP; @@ -883,6 +884,71 @@ V1JobStatus createTimedOutStatus() { .reason("DeadlineExceeded")); } + @Test + void whenIntrospectionJobPodTimedOut_jobRecreatedAndFailedConditionCleared() throws Exception { + consoleHandlerMemento.ignoringLoggedExceptions(RuntimeException.class); + consoleHandlerMemento.ignoreMessage(MessageKeys.NOT_STARTING_DOMAINUID_THREAD); + jobStatus = createBackoffStatus(); + establishPreviousIntrospection(null); + defineTimedoutIntrospection(); + testSupport.doOnDelete(JOB, j -> deletePod()); + testSupport.doOnCreate(JOB, j -> createJobPodAndSetCompletedStatus(job)); + domainConfigurator.withIntrospectVersion(NEW_INTROSPECTION_STATE); + processor.createMakeRightOperation(new DomainPresenceInfo(newDomain)).interrupt().execute(); + + assertThat(isDomainConditionFailed(), is(false)); + } + + private boolean isDomainConditionFailed() { + return newDomain.getStatus().getConditions().stream().anyMatch(c -> c.getType() == Failed); + } + + V1JobStatus createBackoffStatus() { + return new V1JobStatus().addConditionsItem(new V1JobCondition().status("True").type("Failed") + .reason("BackoffLimitExceeded")); + } + + private void deletePod() { + testSupport.deleteResources(new V1Pod().metadata(new V1ObjectMeta().name(getJobName()).namespace(NS))); + } + + private static String getJobName() { + return LegalNames.toJobIntrospectorName(UID); + } + + private void createJobPodAndSetCompletedStatus(V1Job job) { + Map labels = new HashMap<>(); + labels.put(LabelConstants.JOBNAME_LABEL, getJobName()); + testSupport.defineResources(POD, + new V1Pod().metadata(new V1ObjectMeta().name(getJobName()).labels(labels).namespace(NS))); + job.setStatus(createCompletedStatus()); + } + + private void defineTimedoutIntrospection() { + V1Job job = asTimedoutJob(createIntrospectorJob("TIMEDOUT_JOB")); + testSupport.defineResources(job); + testSupport.addToPacket(DOMAIN_INTROSPECTOR_JOB, job); + setJobPodStatusReasonDeadlineExceeded(); + } + + private void setJobPodStatusReasonDeadlineExceeded() { + testSupport.getResourceWithName(POD, getJobName()).status(new V1PodStatus().reason("DeadlineExceeded")); + } + + private V1Job asTimedoutJob(V1Job job) { + job.setStatus(new V1JobStatus().addConditionsItem(new V1JobCondition().status("True").type("Failed") + .reason("BackoffLimitExceeded"))); + return job; + } + + private V1Job createIntrospectorJob(String uid) { + return new V1Job().metadata(createJobMetadata(uid)).status(new V1JobStatus()); + } + + private V1ObjectMeta createJobMetadata(String uid) { + return new V1ObjectMeta().name(getJobName()).namespace(NS).creationTimestamp(SystemClock.now()).uid(uid); + } + private void establishPreviousIntrospection(Consumer domainSetup) throws JsonProcessingException { establishPreviousIntrospection(domainSetup, Arrays.asList(1,2)); } @@ -1414,10 +1480,6 @@ private String getDesiredState(Domain domain, String serverName) { return Optional.ofNullable(getServerStatus(domain, serverName)).map(ServerStatus::getDesiredState).orElse(""); } - private String getActualState(Domain domain, String serverName) { - return Optional.ofNullable(getServerStatus(domain, serverName)).map(ServerStatus::getState).orElse(""); - } - private ServerStatus getServerStatus(Domain domain, String serverName) { for (ServerStatus status : domain.getStatus().getServers()) { if (status.getServerName().equals(serverName)) {