From 2ff5db4813c0759f2c040e4c0429eb587a9f185a Mon Sep 17 00:00:00 2001 From: Zack Cao Date: Fri, 23 Aug 2024 12:30:28 -0700 Subject: [PATCH] Add appdef metadata to torchx event (#947) Summary: Pull Request resolved: https://github.com/pytorch/torchx/pull/947 Add appdef metadata to torchx event class. Scuba logs does not get modified yet. The motivation of adding this is that so far we need to log distributed_ai_stack which is part of the App Metadata into AI Instrumentation framework which can be used for post data process for training jobs of various training stacks, such as MVAI, Pyper, Conda_On_MAST etc. Reviewed By: andywag Differential Revision: D61632621 --- torchx/runner/api.py | 5 ++++- torchx/runner/events/__init__.py | 6 +++++- torchx/runner/events/api.py | 6 +++++- torchx/runner/events/test/lib_test.py | 17 +++++++++++++++-- 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/torchx/runner/api.py b/torchx/runner/api.py index ea7968db2..c33e70fc7 100644 --- a/torchx/runner/api.py +++ b/torchx/runner/api.py @@ -198,10 +198,12 @@ def run_component( parent_run_id=parent_run_id, ) handle = self.schedule(dryrun_info) + app = none_throws(dryrun_info._app) ctx._torchx_event.workspace = workspace ctx._torchx_event.scheduler = none_throws(dryrun_info._scheduler) - ctx._torchx_event.app_image = none_throws(dryrun_info._app).roles[0].image + ctx._torchx_event.app_image = app.roles[0].image ctx._torchx_event.app_id = parse_app_handle(handle)[2] + ctx._torchx_event.app_metadata = app.metadata return handle def dryrun_component( @@ -263,6 +265,7 @@ def run( ctx._torchx_event.scheduler = none_throws(dryrun_info._scheduler) ctx._torchx_event.app_image = none_throws(dryrun_info._app).roles[0].image ctx._torchx_event.app_id = parse_app_handle(handle)[2] + ctx._torchx_event.app_metadata = app.metadata return handle def schedule(self, dryrun_info: AppDryRunInfo) -> AppHandle: diff --git a/torchx/runner/events/__init__.py b/torchx/runner/events/__init__.py index cedba10d6..360cb3e7c 100644 --- a/torchx/runner/events/__init__.py +++ b/torchx/runner/events/__init__.py @@ -24,7 +24,7 @@ import time import traceback from types import TracebackType -from typing import Optional, Type +from typing import Dict, Optional, Type from torchx.runner.events.handlers import get_logging_handler @@ -84,6 +84,7 @@ def __init__( scheduler: Optional[str] = None, app_id: Optional[str] = None, app_image: Optional[str] = None, + app_metadata: Optional[Dict[str, str]] = None, runcfg: Optional[str] = None, workspace: Optional[str] = None, ) -> None: @@ -92,6 +93,7 @@ def __init__( scheduler or "", app_id, app_image=app_image, + app_metadata=app_metadata, runcfg=runcfg, workspace=workspace, ) @@ -128,6 +130,7 @@ def _generate_torchx_event( scheduler: str, app_id: Optional[str] = None, app_image: Optional[str] = None, + app_metadata: Optional[Dict[str, str]] = None, runcfg: Optional[str] = None, source: SourceType = SourceType.UNKNOWN, workspace: Optional[str] = None, @@ -138,6 +141,7 @@ def _generate_torchx_event( api=api, app_id=app_id, app_image=app_image, + app_metadata=app_metadata, runcfg=runcfg, source=source, workspace=workspace, diff --git a/torchx/runner/events/api.py b/torchx/runner/events/api.py index 5cb5f11ab..ce5bc8998 100644 --- a/torchx/runner/events/api.py +++ b/torchx/runner/events/api.py @@ -10,7 +10,7 @@ import json from dataclasses import asdict, dataclass from enum import Enum -from typing import Optional, Union +from typing import Dict, Optional, Union class SourceType(str, Enum): @@ -30,10 +30,13 @@ class TorchxEvent: api: Api name app_id: Unique id that is set by the underlying scheduler image: Image/container bundle that is used to execute request. + app_metadata: metadata to the app (treatment of metadata is scheduler dependent) runcfg: Run config that was used to schedule app. source: Type of source the event is generated. cpu_time_usec: CPU time spent in usec wall_time_usec: Wall time spent in usec + start_epoch_time_usec: Epoch time in usec when runner event starts + Workspace: Track how different workspaces/no workspace affects build and scheduler """ session: str @@ -41,6 +44,7 @@ class TorchxEvent: api: str app_id: Optional[str] = None app_image: Optional[str] = None + app_metadata: Optional[Dict[str, str]] = None runcfg: Optional[str] = None raw_exception: Optional[str] = None source: SourceType = SourceType.UNKNOWN diff --git a/torchx/runner/events/test/lib_test.py b/torchx/runner/events/test/lib_test.py index 14c025ad7..92bb3c828 100644 --- a/torchx/runner/events/test/lib_test.py +++ b/torchx/runner/events/test/lib_test.py @@ -31,6 +31,7 @@ def assert_event( self.assertEqual(actual_event.app_image, expected_event.app_image) self.assertEqual(actual_event.runcfg, expected_event.runcfg) self.assertEqual(actual_event.source, expected_event.source) + self.assertEqual(actual_event.app_metadata, expected_event.app_metadata) @patch("torchx.runner.events.get_logging_handler") def test_get_or_create_logger(self, logging_handler_mock: MagicMock) -> None: @@ -41,11 +42,13 @@ def test_get_or_create_logger(self, logging_handler_mock: MagicMock) -> None: self.assertIsInstance(logger.handlers[0], logging.NullHandler) def test_event_created(self) -> None: + test_metadata = {"test_key": "test_value"} event = TorchxEvent( session="test_session", scheduler="test_scheduler", api="test_api", app_image="test_app_image", + app_metadata=test_metadata, workspace="test_workspace", ) self.assertEqual("test_session", event.session) @@ -54,13 +57,16 @@ def test_event_created(self) -> None: self.assertEqual("test_app_image", event.app_image) self.assertEqual(SourceType.UNKNOWN, event.source) self.assertEqual("test_workspace", event.workspace) + self.assertEqual(test_metadata, event.app_metadata) def test_event_deser(self) -> None: + test_metadata = {"test_key": "test_value"} event = TorchxEvent( session="test_session", scheduler="test_scheduler", api="test_api", app_image="test_app_image", + app_metadata=test_metadata, workspace="test_workspace", source=SourceType.EXTERNAL, ) @@ -78,14 +84,17 @@ def assert_torchx_event(self, expected: TorchxEvent, actual: TorchxEvent) -> Non self.assertEqual(expected.app_image, actual.app_image) self.assertEqual(expected.source, actual.source) self.assertEqual(expected.workspace, actual.workspace) + self.assertEqual(expected.app_metadata, actual.app_metadata) def test_create_context(self, _) -> None: - cfg = json.dumps({"test_key": "test_value"}) + test_dict = {"test_key": "test_value"} + cfg = json.dumps(test_dict) context = log_event( "test_call", "local", "test_app_id", app_image="test_app_image_id", + app_metadata=test_dict, runcfg=cfg, workspace="test_workspace", ) @@ -95,6 +104,7 @@ def test_create_context(self, _) -> None: "test_call", "test_app_id", app_image="test_app_image_id", + app_metadata=test_dict, runcfg=cfg, workspace="test_workspace", ) @@ -102,12 +112,14 @@ def test_create_context(self, _) -> None: self.assert_torchx_event(expected_torchx_event, context._torchx_event) def test_record_event(self, record_mock: MagicMock) -> None: - cfg = json.dumps({"test_key": "test_value"}) + test_dict = {"test_key": "test_value"} + cfg = json.dumps(test_dict) with log_event( "test_call", "local", "test_app_id", app_image="test_app_image_id", + app_metadata=test_dict, runcfg=cfg, workspace="test_workspace", ) as ctx: @@ -119,6 +131,7 @@ def test_record_event(self, record_mock: MagicMock) -> None: "test_call", "test_app_id", app_image="test_app_image_id", + app_metadata=test_dict, runcfg=cfg, workspace="test_workspace", cpu_time_usec=ctx._torchx_event.cpu_time_usec,