Skip to content

Commit

Permalink
[Alerts] Update entity id to take specific id & fix slack job overview
Browse files Browse the repository at this point in the history
  • Loading branch information
Yacouby committed May 22, 2024
1 parent 49415c7 commit 1198ed4
Show file tree
Hide file tree
Showing 10 changed files with 52 additions and 27 deletions.
2 changes: 1 addition & 1 deletion mlrun/alerts/alert.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def _apply_template(self, template):
template = db.get_alert_template(template)

# Extract parameters from the template and apply them to the AlertConfig object
self.description = template.description
self.summary = template.summary
self.severity = template.severity
self.criteria = template.criteria
self.trigger = template.trigger
Expand Down
4 changes: 2 additions & 2 deletions mlrun/common/schemas/alert.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ class AlertTemplate(
system_generated: bool = False

# AlertConfig fields that are pre-defined
description: Optional[str] = (
summary: Optional[str] = (
"String to be sent in the generated notifications e.g. 'Model {{project}}/{{entity}} is drifting.'"
"See AlertConfig.summary description"
)
Expand All @@ -175,7 +175,7 @@ class AlertTemplate(
def templates_differ(self, other):
return (
self.template_description != other.template_description
or self.description != other.description
or self.summary != other.summary
or self.severity != other.severity
or self.trigger != other.trigger
or self.reset_policy != other.reset_policy
Expand Down
9 changes: 9 additions & 0 deletions mlrun/utils/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -973,6 +973,15 @@ def get_ui_url(project, uid=None):
return url


def get_model_endpoint_url(project, model_name=None, model_endpoint_id=None):
    """Resolve the UI URL for a project's model endpoints page.

    :param project:            name of the project
    :param model_name:         optional model name; when given together with
                               ``model_endpoint_id`` the URL points at that
                               endpoint's overview page instead of the
                               generic models list
    :param model_endpoint_id:  optional ID of the model endpoint to link to

    :returns: the UI URL string, or "" when no UI URL is configured
    """
    url = ""
    # resolve_ui_url() is falsy when no UI is configured; in that case we
    # return an empty string rather than emit a broken relative link
    if mlrun.mlconf.resolve_ui_url():
        url = f"{mlrun.mlconf.resolve_ui_url()}/{mlrun.mlconf.ui.projects_prefix}/{project}/models"
        # require BOTH parts: with only model_name set, the original code
        # produced a URL containing the literal string "None"
        if model_name and model_endpoint_id:
            url += f"/model-endpoints/{model_name}/{model_endpoint_id}/overview"
    return url


def get_workflow_url(project, id=None):
url = ""
if mlrun.mlconf.resolve_ui_url():
Expand Down
21 changes: 16 additions & 5 deletions mlrun/utils/notifications/notification/slack.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def _get_alert_fields(
line = [
self._get_slack_row(f":bell: {alert.name} alert has occurred"),
self._get_slack_row(f"*Project:*\n{alert.project}"),
self._get_slack_row(f"*UID:*\n{event_data.entity.ids[0]}"),
self._get_slack_row(f"*ID:*\n{event_data.entity.ids[0]}"),
]

if alert.summary:
Expand All @@ -153,10 +153,21 @@ def _get_alert_fields(
data_text = "\n".join(data_lines)
line.append(self._get_slack_row(f"*Event data:*\n{data_text}"))

if url := mlrun.utils.helpers.get_ui_url(
alert.project, event_data.entity.ids[0]
):
line.append(self._get_slack_row(f"*Overview:*\n<{url}|*Job overview*>"))
if (
event_data.entity.kind == mlrun.common.schemas.alert.EventEntityKind.JOB
): # JOB entity
uid = event_data.value_dict.get("uid")
url = mlrun.utils.helpers.get_ui_url(alert.project, uid)
overview_type = "Job overview"
else: # MODEL entity
model_name = event_data.value_dict.get("model")
model_endpoint_id = event_data.value_dict.get("model_endpoint_id")
url = mlrun.utils.helpers.get_model_endpoint_url(
alert.project, model_name, model_endpoint_id
)
overview_type = "Model endpoint"

line.append(self._get_slack_row(f"*Overview:*\n<{url}|*{overview_type}*>"))

return line

Expand Down
6 changes: 3 additions & 3 deletions server/api/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class MaskOperations(StrEnum):
template_name="JobFailed",
template_description="Generic template for job failure alerts",
system_generated=True,
description="A job has failed",
summary="A job has failed",
severity=mlrun.common.schemas.alert.AlertSeverity.MEDIUM,
trigger={"events": [mlrun.common.schemas.alert.EventKind.FAILED]},
reset_policy=mlrun.common.schemas.alert.ResetPolicy.MANUAL,
Expand All @@ -48,7 +48,7 @@ class MaskOperations(StrEnum):
template_name="DriftDetected",
template_description="Generic template for drift detected alerts",
system_generated=True,
description="Model drift has been detected",
summary="Model drift has been detected",
severity=mlrun.common.schemas.alert.AlertSeverity.HIGH,
trigger={"events": [mlrun.common.schemas.alert.EventKind.DRIFT_DETECTED]},
reset_policy=mlrun.common.schemas.alert.ResetPolicy.MANUAL,
Expand All @@ -57,7 +57,7 @@ class MaskOperations(StrEnum):
template_name="DriftSuspected",
template_description="Generic template for drift suspected alerts",
system_generated=True,
description="Model drift is suspected",
summary="Model drift is suspected",
severity=mlrun.common.schemas.alert.AlertSeverity.MEDIUM,
trigger={"events": [mlrun.common.schemas.alert.EventKind.DRIFT_SUSPECTED]},
reset_policy=mlrun.common.schemas.alert.ResetPolicy.MANUAL,
Expand Down
2 changes: 2 additions & 0 deletions server/api/crud/alerts.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ def process_event(
obj=state_obj,
active=active,
)
else:
logger.debug("The entity of the alert does not match the one in event")

@staticmethod
def _event_entity_matches(alert_entity, event_entity):
Expand Down
16 changes: 9 additions & 7 deletions server/api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -777,14 +777,16 @@ def _generate_event_on_failed_runs(

for run in runs:
project = run["metadata"]["project"]
uid = run["metadata"]["uid"]
entity = {
"kind": alert_objects.EventEntityKind.JOB,
"project": project,
"ids": [uid],
}
run_uid = run["metadata"]["uid"]
run_name = run["metadata"]["name"]
entity = mlrun.common.schemas.alert.EventEntities(
kind=alert_objects.EventEntityKind.JOB,
project=project,
ids=[run_name],
)
event_value = {"uid": run_uid}
event_data = mlrun.common.schemas.Event(
kind=alert_objects.EventKind.FAILED, entity=entity
kind=alert_objects.EventKind.FAILED, entity=entity, value_dict=event_value
)
mlrun.get_run_db().generate_event(alert_objects.EventKind.FAILED, event_data)

Expand Down
2 changes: 1 addition & 1 deletion tests/api/api/test_projects.py
Original file line number Diff line number Diff line change
Expand Up @@ -1251,7 +1251,7 @@ def _create_resources_of_all_kinds(
entities={
"kind": mlrun.common.schemas.alert.EventEntityKind.MODEL,
"project": project,
"ids": ["*"],
"ids": [1234],
},
trigger={"events": [mlrun.common.schemas.alert.EventKind.DRIFT_DETECTED]},
notifications=[{"notification": notification.to_dict()}],
Expand Down
9 changes: 3 additions & 6 deletions tests/integration/sdk_api/alerts/test_alerts.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,17 +193,15 @@ def test_alert_templates(self):

# generate an alert from a template
alert_name = "new_alert"
alert_summary = "Model is drifting"
alert_from_template = mlrun.alerts.alert.AlertConfig(
project=project_name,
name=alert_name,
summary=alert_summary,
template=drift_template,
)

# test modifiers on the alert config
entities = alert_objects.EventEntities(
kind=alert_objects.EventEntityKind.MODEL, project=project_name, ids=["*"]
kind=alert_objects.EventEntityKind.MODEL, project=project_name, ids=[1234]
)
alert_from_template.with_entities(entities=entities)

Expand All @@ -226,8 +224,7 @@ def test_alert_templates(self):
alert_from_template,
project_name=project_name,
alert_name=alert_name,
alert_summary=alert_summary,
alert_description=drift_template.description,
alert_summary=drift_template.summary,
alert_severity=drift_template.severity,
alert_trigger=drift_template.trigger,
alert_reset_policy=drift_template.reset_policy,
Expand Down Expand Up @@ -641,7 +638,7 @@ def _generate_alert_create_request(
name=name,
summary=summary,
severity=severity,
entities={"kind": entity_kind, "project": entity_project, "ids": ["*"]},
entities={"kind": entity_kind, "project": entity_project, "ids": [1234]},
trigger={"events": [event_name]},
criteria=criteria,
notifications=notifications,
Expand Down
8 changes: 6 additions & 2 deletions tests/system/alerts/test_alerts.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,13 @@ def test_job_failure_alert(self):
# create an alert with webhook notification
alert_name = "failure_webhook"
alert_summary = "Job failed"
run_id = "test-func-handler"
notifications = self._generate_failure_notifications(nuclio_function_url)
self._create_alert_config(
self.project_name,
alert_name,
alert_objects.EventEntityKind.JOB,
run_id,
alert_summary,
alert_objects.EventKind.FAILED,
notifications,
Expand Down Expand Up @@ -92,11 +94,13 @@ def test_drift_detection_alert(self):
# create an alert with two webhook notifications
alert_name = "drift_webhook"
alert_summary = "Model is drifting"
endpoint_id = "demo-endpoint"
notifications = self._generate_drift_notifications(nuclio_function_url)
self._create_alert_config(
self.project_name,
alert_name,
alert_objects.EventEntityKind.MODEL,
endpoint_id,
alert_summary,
alert_objects.EventKind.DRIFT_DETECTED,
notifications,
Expand All @@ -108,7 +112,6 @@ def test_drift_detection_alert(self):
)
writer._wait_for_function_deployment(db=writer._get_db())

endpoint_id = "demo-endpoint"
mlrun.model_monitoring.api.get_or_create_model_endpoint(
project=self.project.metadata.name,
endpoint_id=endpoint_id,
Expand Down Expand Up @@ -220,6 +223,7 @@ def _create_alert_config(
project,
name,
entity_kind,
entity_id,
summary,
event_name,
notifications,
Expand All @@ -231,7 +235,7 @@ def _create_alert_config(
summary=summary,
severity=alert_objects.AlertSeverity.LOW,
entities=alert_objects.EventEntities(
kind=entity_kind, project=project, ids=["*"]
kind=entity_kind, project=project, ids=[entity_id]
),
trigger=alert_objects.AlertTrigger(events=[event_name]),
criteria=criteria,
Expand Down

0 comments on commit 1198ed4

Please sign in to comment.