diff --git a/mast/.torchxconfig b/mast/.torchxconfig index 67379f2..ee01fbe 100644 --- a/mast/.torchxconfig +++ b/mast/.torchxconfig @@ -3,13 +3,14 @@ conda_path_in_fbpkg = conda activate_conda = False fbpkg_ids = fb-py-spy:prod hpcIdentity = pytorch_distributed -rmAttribution = pytorch4all_clients_approved +rmAttribution = msl_infra_pytorch_dev workspace_fbpkg_name = torchtitan_workspace conda_pack_ignore_missing_files = True git = False hpcJobOncall = meta_conda modelTypeName = gen_ai_conda -hpcClusterUuid = MastProdCluster +hpcClusterUuid = MastGenAICluster +localityConstraints = region;gtn forceSingleRegion = False use_caf = False diff --git a/mast/run_torchtitan.sh b/mast/run_torchtitan.sh index bd10e24..ad341f8 100755 --- a/mast/run_torchtitan.sh +++ b/mast/run_torchtitan.sh @@ -78,5 +78,4 @@ python torchtitan/train.py \ --validation.dataset_path "${dataset_path}" \ --metrics.save_tb_folder "${save_tb_folder}" \ --metrics.disable_color_printing \ ---job.print_args \ $overrides diff --git a/mast/sweep.py b/mast/sweep.py index e867bd4..3ab22ed 100644 --- a/mast/sweep.py +++ b/mast/sweep.py @@ -104,12 +104,12 @@ def maybe_find_pulp(maybe_path: Optional[str] = None) -> Optional[str]: ], "llama3_autop_1d_compile": llama3_1d_common_opts + [ - "--model.name=llama3_auto_parallel", + "--model.name=auto_parallel.llama3", "--compile.enable", ], "llama3_autop_1d_compile_bucket_reorder": llama3_1d_common_opts + [ - "--model.name=llama3_auto_parallel", + "--model.name=auto_parallel.llama3", "--compile.enable", "--experimental.bucket_all_gathers_fx=fsdp", "--experimental.bucket_reduce_scatters_fx=fsdp", @@ -125,12 +125,12 @@ def maybe_find_pulp(maybe_path: Optional[str] = None) -> Optional[str]: ], "llama3_autop_2d_compile": llama3_2d_common_opts + [ - "--model.name=llama3_auto_parallel", + "--model.name=auto_parallel.llama3", "--compile.enable", ], "llama3_autop_2d_compile_bucket_reorder": llama3_2d_common_opts + [ - "--model.name=llama3_auto_parallel", + "--model.name=auto_parallel.llama3", "--compile.enable", "--experimental.bucket_all_gathers_fx=fsdp", "--experimental.bucket_reduce_scatters_fx=fsdp", @@ -153,13 +153,13 @@ def maybe_find_pulp(maybe_path: Optional[str] = None) -> Optional[str]: | { "llama3_autop_1d_compile_ruisi_bucket_reorder": llama3_1d_common_opts + [ - "--model.name=llama3_auto_parallel", + "--model.name=auto_parallel.llama3", "--compile.enable", "--experimental.enable_simplefsdp_passes", ], "llama3_autop_2d_compile_ruisi_bucket_reorder": llama3_2d_common_opts + [ - "--model.name=llama3_auto_parallel", + "--model.name=auto_parallel.llama3", "--compile.enable", "--experimental.enable_simplefsdp_passes", ],