diff --git a/torchx/components/dist.py b/torchx/components/dist.py
index 817534a25..55718474d 100644
--- a/torchx/components/dist.py
+++ b/torchx/components/dist.py
@@ -248,6 +248,7 @@ def ddp(
     )
 
     env["TORCHX_TRACKING_EXPERIMENT_NAME"] = argname.experiment_name
+    env["TORCHX_TRACKING_RUN_NAME"] = argname.run_name
 
     env.setdefault("LOGLEVEL", os.getenv("LOGLEVEL", "WARNING"))
     if debug:
diff --git a/torchx/components/test/dist_test.py b/torchx/components/test/dist_test.py
index 669f1a7fc..ac57a1bf0 100644
--- a/torchx/components/test/dist_test.py
+++ b/torchx/components/test/dist_test.py
@@ -56,6 +56,7 @@ def test_validate_spmd(self) -> None:
     def test_spmd_call_by_module_or_script_no_name(self) -> None:
         appdef = spmd(script="foo/bar.py")
         self.assertEqual("bar", appdef.name)
+        self.assertEqual("bar", appdef.roles[0].env["TORCHX_TRACKING_RUN_NAME"])
         self.assertEqual(
             "default-experiment",
             appdef.roles[0].env["TORCHX_TRACKING_EXPERIMENT_NAME"],
@@ -63,6 +64,7 @@ def test_spmd_call_by_module_or_script_no_name(self) -> None:
 
         appdef = spmd("-a", "b", script="foo/bar.py")
         self.assertEqual("bar", appdef.name)
+        self.assertEqual("bar", appdef.roles[0].env["TORCHX_TRACKING_RUN_NAME"])
         self.assertEqual(
             "default-experiment",
             appdef.roles[0].env["TORCHX_TRACKING_EXPERIMENT_NAME"],
@@ -70,6 +72,7 @@ def test_spmd_call_by_module_or_script_no_name(self) -> None:
 
         appdef = spmd(m="foo.bar")
         self.assertEqual("bar", appdef.name)
+        self.assertEqual("bar", appdef.roles[0].env["TORCHX_TRACKING_RUN_NAME"])
         self.assertEqual(
             "default-experiment",
             appdef.roles[0].env["TORCHX_TRACKING_EXPERIMENT_NAME"],
@@ -77,6 +80,7 @@ def test_spmd_call_by_module_or_script_no_name(self) -> None:
 
         appdef = spmd("-a", "b", m="foo.bar")
         self.assertEqual("bar", appdef.name)
+        self.assertEqual("bar", appdef.roles[0].env["TORCHX_TRACKING_RUN_NAME"])
         self.assertEqual(
             "default-experiment",
             appdef.roles[0].env["TORCHX_TRACKING_EXPERIMENT_NAME"],
@@ -91,35 +95,43 @@ def test_spmd_call_by_module_or_script_no_name(self) -> None:
     def test_spmd_call_by_module_or_script_with_name(self) -> None:
         appdef = spmd(script="foo/bar.py", name="baz/trial_1")
         self.assertEqual("trial_1", appdef.name)
+        self.assertEqual("trial_1", appdef.roles[0].env["TORCHX_TRACKING_RUN_NAME"])
         self.assertEqual("baz", appdef.roles[0].env["TORCHX_TRACKING_EXPERIMENT_NAME"])
 
         appdef = spmd("-a", "b", script="foo/bar.py", name="baz/trial_1")
         self.assertEqual("trial_1", appdef.name)
         self.assertEqual("baz", appdef.roles[0].env["TORCHX_TRACKING_EXPERIMENT_NAME"])
+        self.assertEqual("trial_1", appdef.roles[0].env["TORCHX_TRACKING_RUN_NAME"])
 
         appdef = spmd(m="foo.bar", name="baz/trial_1")
         self.assertEqual("trial_1", appdef.name)
+        self.assertEqual("trial_1", appdef.roles[0].env["TORCHX_TRACKING_RUN_NAME"])
         self.assertEqual("baz", appdef.roles[0].env["TORCHX_TRACKING_EXPERIMENT_NAME"])
 
         appdef = spmd("-a", "b", m="foo.bar", name="baz/trial_1")
         self.assertEqual("trial_1", appdef.name)
+        self.assertEqual("trial_1", appdef.roles[0].env["TORCHX_TRACKING_RUN_NAME"])
         self.assertEqual("baz", appdef.roles[0].env["TORCHX_TRACKING_EXPERIMENT_NAME"])
 
     def test_spmd_call_by_module_or_script_with_experiment_name(self) -> None:
         appdef = spmd(script="foo/bar.py", name="baz/")
         self.assertEqual("bar", appdef.name)
+        self.assertEqual("bar", appdef.roles[0].env["TORCHX_TRACKING_RUN_NAME"])
         self.assertEqual("baz", appdef.roles[0].env["TORCHX_TRACKING_EXPERIMENT_NAME"])
 
         appdef = spmd("-a", "b", script="foo/bar.py", name="baz/")
         self.assertEqual("bar", appdef.name)
+        self.assertEqual("bar", appdef.roles[0].env["TORCHX_TRACKING_RUN_NAME"])
         self.assertEqual("baz", appdef.roles[0].env["TORCHX_TRACKING_EXPERIMENT_NAME"])
 
         appdef = spmd(m="foo.bar", name="baz/")
         self.assertEqual("bar", appdef.name)
+        self.assertEqual("bar", appdef.roles[0].env["TORCHX_TRACKING_RUN_NAME"])
         self.assertEqual("baz", appdef.roles[0].env["TORCHX_TRACKING_EXPERIMENT_NAME"])
 
         appdef = spmd("-a", "b", m="foo.bar", name="baz/")
         self.assertEqual("bar", appdef.name)
+        self.assertEqual("bar", appdef.roles[0].env["TORCHX_TRACKING_RUN_NAME"])
         self.assertEqual("baz", appdef.roles[0].env["TORCHX_TRACKING_EXPERIMENT_NAME"])
 
     def test_spmd_call_by_module_or_script_with_run_name(self) -> None:
@@ -129,6 +141,10 @@ def test_spmd_call_by_module_or_script_with_run_name(self) -> None:
             "default-experiment",
             appdef.roles[0].env["TORCHX_TRACKING_EXPERIMENT_NAME"],
         )
+        self.assertEqual(
+            "trial_1",
+            appdef.roles[0].env["TORCHX_TRACKING_RUN_NAME"],
+        )
 
         appdef = spmd("-a", "b", script="foo/bar.py", name="/trial_1")
         self.assertEqual("trial_1", appdef.name)
@@ -136,6 +152,10 @@ def test_spmd_call_by_module_or_script_with_run_name(self) -> None:
             "default-experiment",
             appdef.roles[0].env["TORCHX_TRACKING_EXPERIMENT_NAME"],
         )
+        self.assertEqual(
+            "trial_1",
+            appdef.roles[0].env["TORCHX_TRACKING_RUN_NAME"],
+        )
 
         appdef = spmd(m="foo.bar", name="/trial_1")
         self.assertEqual("trial_1", appdef.name)
@@ -143,6 +163,10 @@ def test_spmd_call_by_module_or_script_with_run_name(self) -> None:
             "default-experiment",
             appdef.roles[0].env["TORCHX_TRACKING_EXPERIMENT_NAME"],
         )
+        self.assertEqual(
+            "trial_1",
+            appdef.roles[0].env["TORCHX_TRACKING_RUN_NAME"],
+        )
 
         appdef = spmd("-a", "b", m="foo.bar", name="/trial_1")
         self.assertEqual("trial_1", appdef.name)
@@ -150,3 +174,7 @@ def test_spmd_call_by_module_or_script_with_run_name(self) -> None:
             "default-experiment",
             appdef.roles[0].env["TORCHX_TRACKING_EXPERIMENT_NAME"],
         )
+        self.assertEqual(
+            "trial_1",
+            appdef.roles[0].env["TORCHX_TRACKING_RUN_NAME"],
+        )