From 2031b022ef9ce356cf0829bbc75fe5f0cfbef74a Mon Sep 17 00:00:00 2001
From: MING KANG
Date: Mon, 6 May 2024 15:24:24 -0400
Subject: [PATCH] Odsc-56003/fix eval delete (#818)

---
 ads/aqua/evaluation.py                          | 65 ++++++++++++-------
 .../with_extras/aqua/test_evaluation.py         | 39 +++++------
 2 files changed, 59 insertions(+), 45 deletions(-)

diff --git a/ads/aqua/evaluation.py b/ads/aqua/evaluation.py
index 39ad21cca..f6d36d9ae 100644
--- a/ads/aqua/evaluation.py
+++ b/ads/aqua/evaluation.py
@@ -367,6 +367,9 @@ class AquaEvaluationApp(AquaApp):
     _report_cache = TTLCache(maxsize=10, ttl=timedelta(hours=5), timer=datetime.now)
     _metrics_cache = TTLCache(maxsize=10, ttl=timedelta(hours=5), timer=datetime.now)
     _eval_cache = TTLCache(maxsize=200, ttl=timedelta(hours=10), timer=datetime.now)
+    _deletion_cache = TTLCache(
+        maxsize=10, ttl=timedelta(minutes=10), timer=datetime.now
+    )
     _cache_lock = Lock()
 
     @telemetry(entry_point="plugin=evaluation&action=create", name="aqua")
@@ -1371,6 +1374,7 @@ def _cancel_job_run(run, model):
     @telemetry(entry_point="plugin=evaluation&action=delete", name="aqua")
     def delete(self, eval_id):
         """Deletes the job and the associated model for the given evaluation id.
+
         Parameters
         ----------
         eval_id: str
@@ -1383,9 +1387,9 @@ def delete(self, eval_id):
         Raises
         ------
         AquaRuntimeError:
-            if a model doesn't exist for the given eval_id
+            if a model doesn't exist for the given eval_id.
         AquaMissingKeyError:
-            if training_id is missing the job run id
+            if job/jobrun id is missing.
         """
         model = DataScienceModel.from_id(eval_id)
 
@@ -1407,9 +1411,21 @@ def delete(self, eval_id):
 
         self._delete_job_and_model(job, model)
 
+        try:
+            jobrun_id = model.custom_metadata_list.get(
+                EvaluationCustomMetadata.EVALUATION_JOB_RUN_ID.value
+            ).value
+            jobrun = utils.query_resource(jobrun_id, return_all=False)
+        except Exception:
+            logger.debug("Associated Job Run OCID is missing.")
+            jobrun = None
+
+        self._eval_cache.pop(key=eval_id, default=None)
+        self._deletion_cache.__setitem__(key=eval_id, value="")
+
         status = dict(
             id=eval_id,
-            lifecycle_state="DELETING",
+            lifecycle_state=jobrun.lifecycle_state if jobrun else "DELETING",
             time_accepted=datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f%z"),
         )
         return status
@@ -1697,30 +1713,33 @@ def _get_status(
         ] = None,
     ) -> dict:
         """Builds evaluation status based on the model status and job run status.
-        When detect `aqua_evaluation_error` in custom metadata, the jobrun is failed.
-        However, if jobrun failed before saving this meta, we need to check the existance
-        of the evaluation artifact.
+        When jobrun information is missing, the status is decided as follows:
+
+        * If the evaluation has just been deleted, the jobrun status is reported as deleted.
+        * If `aqua_evaluation_error` is present in custom metadata, the jobrun is treated as failed.
+        * If the jobrun failed before saving this metadata, we fall back to checking the existence
+          of the evaluation artifact.
 
         """
-        # TODO: revisit for CANCELED evaluation
-        job_run_status = (
-            JobRun.LIFECYCLE_STATE_FAILED
-            if self._get_attribute_from_model_metadata(
+        model_status = model.lifecycle_state
+        job_run_status = None
+
+        if jobrun:
+            job_run_status = jobrun.lifecycle_state
+
+        if jobrun is None:
+            if model.identifier in self._deletion_cache.keys():
+                job_run_status = JobRun.LIFECYCLE_STATE_DELETED
+
+            elif self._get_attribute_from_model_metadata(
                 model, EvaluationCustomMetadata.EVALUATION_ERROR.value
-            )
-            else None
-        )
+            ):
+                job_run_status = JobRun.LIFECYCLE_STATE_FAILED
 
-        model_status = model.lifecycle_state
-        job_run_status = job_run_status or (
-            jobrun.lifecycle_state
-            if jobrun and not jobrun.lifecycle_state == JobRun.LIFECYCLE_STATE_DELETED
-            else (
-                JobRun.LIFECYCLE_STATE_SUCCEEDED
-                if self._if_eval_artifact_exist(model)
-                else JobRun.LIFECYCLE_STATE_FAILED
-            )
-        )
+            elif self._if_eval_artifact_exist(model):
+                job_run_status = JobRun.LIFECYCLE_STATE_SUCCEEDED
+            else:
+                job_run_status = JobRun.LIFECYCLE_STATE_FAILED
 
         lifecycle_state = utils.LifecycleStatus.get_status(
             evaluation_status=model_status, job_run_status=job_run_status
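
The hunk above replaces the nested conditional expressions in `_get_status` with an explicit fallback chain. A condensed paraphrase of that order, in a hypothetical helper (it reuses `_get_attribute_from_model_metadata`, `_if_eval_artifact_exist`, and `_deletion_cache` from ads/aqua/evaluation.py via `app`, and abbreviates the JobRun lifecycle constants to their string values; it is not code from the patch):

    def resolve_job_run_status(app, model, jobrun):
        """Illustrative paraphrase of the fallback order introduced in _get_status."""
        if jobrun:                                    # a resolvable job run always wins
            return jobrun.lifecycle_state
        if model.identifier in app._deletion_cache:   # delete() was just called for this id
            return "DELETED"
        if app._get_attribute_from_model_metadata(    # the run recorded aqua_evaluation_error
            model, "aqua_evaluation_error"
        ):
            return "FAILED"
        # Last resort: infer the outcome from whether the evaluation report artifact exists.
        return "SUCCEEDED" if app._if_eval_artifact_exist(model) else "FAILED"
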
""" - # TODO: revisit for CANCELED evaluation - job_run_status = ( - JobRun.LIFECYCLE_STATE_FAILED - if self._get_attribute_from_model_metadata( + model_status = model.lifecycle_state + job_run_status = None + + if jobrun: + job_run_status = jobrun.lifecycle_state + + if jobrun is None: + if model.identifier in self._deletion_cache.keys(): + job_run_status = JobRun.LIFECYCLE_STATE_DELETED + + elif self._get_attribute_from_model_metadata( model, EvaluationCustomMetadata.EVALUATION_ERROR.value - ) - else None - ) + ): + job_run_status = JobRun.LIFECYCLE_STATE_FAILED - model_status = model.lifecycle_state - job_run_status = job_run_status or ( - jobrun.lifecycle_state - if jobrun and not jobrun.lifecycle_state == JobRun.LIFECYCLE_STATE_DELETED - else ( - JobRun.LIFECYCLE_STATE_SUCCEEDED - if self._if_eval_artifact_exist(model) - else JobRun.LIFECYCLE_STATE_FAILED - ) - ) + elif self._if_eval_artifact_exist(model): + job_run_status = JobRun.LIFECYCLE_STATE_SUCCEEDED + else: + job_run_status = JobRun.LIFECYCLE_STATE_FAILED lifecycle_state = utils.LifecycleStatus.get_status( evaluation_status=model_status, job_run_status=job_run_status diff --git a/tests/unitary/with_extras/aqua/test_evaluation.py b/tests/unitary/with_extras/aqua/test_evaluation.py index 5ea5b6e41..fde28d837 100644 --- a/tests/unitary/with_extras/aqua/test_evaluation.py +++ b/tests/unitary/with_extras/aqua/test_evaluation.py @@ -527,36 +527,28 @@ def test_create_evaluation( def test_get_service_model_name(self): # get service model name from fine tuned model deployment - source = ( - ModelDeployment() - .with_freeform_tags( - **{ - Tags.AQUA_TAG.value: UNKNOWN, - Tags.AQUA_FINE_TUNED_MODEL_TAG.value: "test_service_model_id#test_service_model_name", - Tags.AQUA_MODEL_NAME_TAG.value: "test_fine_tuned_model_name" - } - ) + source = ModelDeployment().with_freeform_tags( + **{ + Tags.AQUA_TAG.value: UNKNOWN, + Tags.AQUA_FINE_TUNED_MODEL_TAG.value: "test_service_model_id#test_service_model_name", + Tags.AQUA_MODEL_NAME_TAG.value: "test_fine_tuned_model_name", + } ) service_model_name = self.app._get_service_model_name(source) assert service_model_name == "test_service_model_name" # get service model name from model deployment - source = ( - ModelDeployment() - .with_freeform_tags( - **{ - Tags.AQUA_TAG.value: "active", - Tags.AQUA_MODEL_NAME_TAG.value: "test_service_model_name" - } - ) + source = ModelDeployment().with_freeform_tags( + **{ + Tags.AQUA_TAG.value: "active", + Tags.AQUA_MODEL_NAME_TAG.value: "test_service_model_name", + } ) service_model_name = self.app._get_service_model_name(source) assert service_model_name == "test_service_model_name" # get service model name from service model - source = DataScienceModel( - display_name="test_service_model_name" - ) + source = DataScienceModel(display_name="test_service_model_name") service_model_name = self.app._get_service_model_name(source) assert service_model_name == "test_service_model_name" @@ -807,6 +799,7 @@ def test_get_status(self): @parameterized.expand( [ ( + "artifact_exist", dict( return_value=oci.response.Response( status=200, request=MagicMock(), headers=MagicMock(), data=None @@ -815,6 +808,7 @@ def test_get_status(self): "SUCCEEDED", ), ( + "artifact_missing", dict( side_effect=oci.exceptions.ServiceError( status=404, code=None, message="error test msg", headers={} @@ -825,7 +819,7 @@ def test_get_status(self): ] ) def test_get_status_when_missing_jobrun( - self, mock_head_model_artifact_response, expected_output + self, name, mock_head_model_artifact_response, 
@@ -807,6 +799,7 @@ def test_get_status(self):
     @parameterized.expand(
         [
             (
+                "artifact_exist",
                 dict(
                     return_value=oci.response.Response(
                         status=200, request=MagicMock(), headers=MagicMock(), data=None
                     )
                 ),
                 "SUCCEEDED",
             ),
             (
+                "artifact_missing",
                 dict(
                     side_effect=oci.exceptions.ServiceError(
                         status=404, code=None, message="error test msg", headers={}
                     )
                 ),
                 "FAILED",
             ),
         ]
     )
     def test_get_status_when_missing_jobrun(
-        self, mock_head_model_artifact_response, expected_output
+        self, name, mock_head_model_artifact_response, expected_output
     ):
         """Tests getting evaluation status correctly when missing jobrun association."""
         self.app.ds_client.get_model_provenance = MagicMock(
             return_value=oci.data_science.models.ModelProvenance(
                 training_id=TestDataset.INVALID_EVAL_ID
             )
         )
         self.app._fetch_jobrun = MagicMock(return_value=None)
-
+        self.app._deletion_cache.clear()
         self.app.ds_client.head_model_artifact = MagicMock(
             side_effect=mock_head_model_artifact_response.get("side_effect", None),
             return_value=mock_head_model_artifact_response.get("return_value", None),
         )
         response = self.app.get_status(TestDataset.EVAL_ID)
+
         self.app.ds_client.head_model_artifact.assert_called_with(
             model_id=TestDataset.EVAL_ID
         )
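
Taken together, the two sides of this patch make deletion visible to status polling: `delete()` drops the entry from `_eval_cache`, records the id in `_deletion_cache`, and echoes the job run's lifecycle state when the run can still be resolved, while `_get_status` reports the evaluation as deleted until the backend catches up. A rough usage sketch (illustrative only; it assumes default OCI auth is configured and uses a placeholder OCID):

    from ads.aqua.evaluation import AquaEvaluationApp

    app = AquaEvaluationApp()  # assumes default OCI auth/config is available
    eval_id = "ocid1.datasciencemodel.oc1..<placeholder>"  # hypothetical evaluation OCID

    # delete() evicts the entry from _eval_cache, records the id in _deletion_cache,
    # and reports the job run's lifecycle state when it can still be resolved,
    # falling back to "DELETING" otherwise.
    print(app.delete(eval_id))

    # While the deletion propagates (up to the 10-minute _deletion_cache TTL),
    # get_status() treats the missing job run as DELETED instead of FAILED.
    print(app.get_status(eval_id))
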