[ci/train] Add Ray Train storage refactor CI tests (ray-project#38457)

This PR adds CI runners for the Ray Train and Tune tests with the new storage context path enabled (`RAY_AIR_NEW_PERSISTENCE_MODE=1`). Many tests are excluded at first via the `no_new_storage` tag. We will iteratively work on enabling them to avoid having to fix a bunch of issues in one giant PR.

Signed-off-by: Kai Fricke <kai@anyscale.com>
Signed-off-by: e428265 <arvind.chandramouli@lmco.com>
lmco · Aug 31, 2023 · 817bb0a · 817bb0a
1 parent 49fe543
commit 817bb0a
Show file tree

Hide file tree

Showing 5 changed files with 155 additions and 87 deletions.
diff --git a/.buildkite/pipeline.ml.yml b/.buildkite/pipeline.ml.yml
@@ -53,6 +53,7 @@
     - ./ci/env/env_info.sh
     - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=tune,-gpu_only,-ray_air,-gpu,-doctest python/ray/train/...
 
+
 - label: ":brain: RLlib: Benchmarks (Torch 2.x)"
   conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_RLLIB_AFFECTED"]
   instance_size: medium
@@ -314,6 +315,7 @@
       --test_env=AIR_VERBOSITY=1
       python/ray/tune/...
 
+
 - label: ":octopus: :brain: Tune tests and examples {using RLlib}"
   conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED", "RAY_CI_RLLIB_AFFECTED"]
   instance_size: large
@@ -335,6 +337,71 @@
     - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only python/ray/tests/horovod/...
 
 
+
+##### STORAGE REFACTOR
+
+- label: ":steam_locomotive: :floppy_disk: New persistence mode: Train tests and examples"
+  conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"]
+  instance_size: large
+  parallelism: 4
+  commands:
+    - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
+    # Todo (krfricke): Move mosaicml to train-test-requirements.txt
+    - pip install "mosaicml==0.12.1"
+    - TRAIN_TESTING=1 DATA_PROCESSING_TESTING=1 INSTALL_HOROVOD=1 ./ci/env/install-dependencies.sh
+    - ./ci/env/env_info.sh
+    - ./ci/run/run_bazel_test_with_sharding.sh
+      --config=ci $(./ci/run/bazel_export_options)
+      --test_tag_filters=-gpu_only,-gpu,-minimal,-tune,-needs_credentials,-doctest,-no_new_storage
+      --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1
+      python/ray/train/...
+
+- label: ":steam_locomotive: :octopus: :floppy_disk: New persistence mode: Train + Tune tests and examples"
+  conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"]
+  instance_size: medium
+  commands:
+    - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
+    - TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh
+    - ./ci/env/env_info.sh
+    - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only 
+      --test_tag_filters=tune,-gpu_only,-ray_air,-gpu,-doctest,-no_new_storage
+      --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1
+      python/ray/train/...
+
+
+- label: ":octopus: :floppy_disk: New persistence mode: Tune tests and examples (small)"
+  conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED"]
+  instance_size: small
+  parallelism: 3
+  commands:
+    - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
+    - TUNE_TESTING=1 ./ci/env/install-dependencies.sh
+    - ./ci/env/env_info.sh
+    - ./ci/run/run_bazel_test_with_sharding.sh
+      --config=ci $(./ci/run/bazel_export_options) --build_tests_only
+      --test_tag_filters=-medium_instance,-soft_imports,-gpu_only,-rllib,-multinode,-no_new_storage
+      --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1
+      python/ray/tune/...
+
+- label: ":octopus: :floppy_disk: New persistence mode: Tune tests and examples (medium)"
+  conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED"]
+  instance_size: medium
+  commands:
+    - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
+    - TUNE_TESTING=1 DATA_PROCESSING_TESTING=1 ./ci/env/install-dependencies.sh
+    - ./ci/env/env_info.sh
+    - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only
+      --test_tag_filters=medium_instance,-soft_imports,-gpu_only,-rllib,-multinode,-no_new_storage
+      --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1
+      python/ray/tune/...
+
+
+###### END STORAGE REFACTOR
+
+
+
+
+
 # TODO(amogkam): Re-enable Ludwig tests after Ludwig supports Ray 2.0
 #- label: ":octopus: Ludwig tests and examples. Python 3.7"
 #  conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED"]

diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD
@@ -104,7 +104,7 @@ py_test(
     size = "medium",
     main = "examples/pytorch/tune_cifar_torch_pbt_example.py",
     srcs = ["examples/pytorch/tune_cifar_torch_pbt_example.py"],
-    tags = ["team:ml", "exclusive", "pytorch", "tune"],
+    tags = ["team:ml", "exclusive", "pytorch", "tune", "no_new_storage"],
     deps = [":train_lib"],
     args = ["--smoke-test"]
 )
@@ -114,7 +114,7 @@ py_test(
     size = "small",
     main = "examples/pytorch/tune_torch_regression_example.py",
     srcs = ["examples/pytorch/tune_torch_regression_example.py"],
-    tags = ["team:ml", "exclusive", "tune"],
+    tags = ["team:ml", "exclusive", "tune", "no_new_storage"],
     deps = [":train_lib"],
     args = ["--smoke-test"]
 )
@@ -135,7 +135,7 @@ py_test(
     name = "horovod_cifar_pbt_example",
     size = "small",
     srcs = ["examples/horovod/horovod_cifar_pbt_example.py"],
-    tags = ["team:ml", "exlusive"],
+    tags = ["team:ml", "exlusive", "no_new_storage"],
     deps = [":train_lib"],
     args = ["--smoke-test"]
 )
@@ -144,7 +144,7 @@ py_test(
     name = "horovod_pytorch_example",
     size = "small",
     srcs = ["examples/horovod/horovod_pytorch_example.py"],
-    tags = ["team:ml", "exclusive"],
+    tags = ["team:ml", "exclusive", "no_new_storage"],
     deps = [":train_lib"],
     args = ["--num-epochs=1"]
 )
@@ -163,7 +163,7 @@ py_test (
     size = "medium",
     srcs = ["examples/huggingface/huggingface_basic_language_modeling_example.py"],
     args = ["--smoke-test", "--num-epochs 3"],
-    tags = ["team:ml", "exclusive"],
+    tags = ["team:ml", "exclusive", "no_new_storage"],
     deps = [":train_lib"]
 )
 
@@ -172,7 +172,7 @@ py_test(
     size = "medium",
     main = "examples/tf/tensorflow_regression_example.py",
     srcs = ["examples/tf/tensorflow_regression_example.py"],
-    tags = ["team:ml", "exclusive"],
+    tags = ["team:ml", "exclusive", "no_new_storage"],
     deps = [":train_lib"],
     args = ["--smoke-test"]
 )
@@ -215,7 +215,7 @@ py_test(
     size = "medium",
     main = "examples/pytorch/torch_regression_example.py",
     srcs = ["examples/pytorch/torch_regression_example.py"],
-    tags = ["team:ml", "exclusive"],
+    tags = ["team:ml", "exclusive", "no_new_storage"],
     deps = [":train_lib"],
     args = ["--smoke-test"]
 )
@@ -236,7 +236,7 @@ py_test(
     size = "medium",
     main = "examples/tf/tune_tensorflow_mnist_example.py",
     srcs = ["examples/tf/tune_tensorflow_mnist_example.py"],
-    tags = ["team:ml", "exclusive"],
+    tags = ["team:ml", "exclusive", "no_new_storage"],
     deps = [":train_lib"],
     args = ["--smoke-test"]
 )
@@ -258,15 +258,15 @@ py_test(
     name = "test_backend",
     size = "large",
     srcs = ["tests/test_backend.py"],
-    tags = ["team:ml", "exclusive"],
+    tags = ["team:ml", "exclusive", "no_new_storage"],
     deps = [":train_lib",  ":conftest"]
 )
 
 py_test(
     name = "test_base_trainer",
     size = "medium",
     srcs = ["tests/test_base_trainer.py"],
-    tags = ["team:ml", "exclusive", "ray_air"],
+    tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
     deps = [":train_lib", ":conftest"]
 )
 
@@ -298,23 +298,23 @@ py_test(
     name = "test_data_parallel_trainer",
     size = "medium",
     srcs = ["tests/test_data_parallel_trainer.py"],
-    tags = ["team:ml", "exclusive", "ray_air"],
+    tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
     deps = [":train_lib"]
 )
 
 py_test(
     name = "test_data_parallel_trainer_checkpointing",
     size = "medium",
     srcs = ["tests/test_data_parallel_trainer_checkpointing.py"],
-    tags = ["team:ml", "exclusive", "ray_air"],
+    tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
     deps = [":train_lib"]
 )
 
 py_test(
     name = "test_examples",
     size = "large",
     srcs = ["tests/test_examples.py"],
-    tags = ["team:ml", "exclusive"],
+    tags = ["team:ml", "exclusive", "no_new_storage"],
     deps = [":train_lib", ":conftest"]
 )
 
@@ -378,7 +378,7 @@ py_test(
     name = "test_horovod_trainer",
     size = "large",
     srcs = ["tests/test_horovod_trainer.py"],
-    tags = ["team:ml", "exclusive", "ray_air"],
+    tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
     deps = [":train_lib"]
 )
 
@@ -402,7 +402,7 @@ py_test(
     name = "test_lightgbm_trainer",
     size = "medium",
     srcs = ["tests/test_lightgbm_trainer.py"],
-    tags = ["team:ml", "exclusive", "ray_air"],
+    tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
     deps = [":train_lib"]
 )
 
@@ -490,7 +490,7 @@ py_test(
     name = "test_session",
     size = "small",
     srcs = ["tests/test_session.py"],
-    tags = ["team:ml", "exclusive"],
+    tags = ["team:ml", "exclusive", "no_new_storage"],
     deps = [":train_lib", ":conftest"]
 )
 
@@ -506,15 +506,15 @@ py_test(
     name = "test_sklearn_trainer",
     size = "medium",
     srcs = ["tests/test_sklearn_trainer.py"],
-    tags = ["team:ml", "exclusive", "ray_air"],
+    tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
     deps = [":train_lib"]
 )
 
 py_test(
     name = "test_tensorflow_checkpoint",
     size = "small",
     srcs = ["tests/test_tensorflow_checkpoint.py"],
-    tags = ["team:ml", "exclusive"],
+    tags = ["team:ml", "exclusive", "no_new_storage"],
     deps = [":train_lib"]
 )
 
@@ -530,7 +530,7 @@ py_test(
     name = "test_tensorflow_trainer",
     size = "medium",
     srcs = ["tests/test_tensorflow_trainer.py"],
-    tags = ["team:ml", "exclusive", "ray_air"],
+    tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
     deps = [":train_lib"]
 )
 
@@ -562,7 +562,7 @@ py_test(
     name = "test_torch_trainer",
     size = "large",
     srcs = ["tests/test_torch_trainer.py"],
-    tags = ["team:ml", "exclusive", "ray_air"],
+    tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
     deps = [":train_lib"]
 )
 
@@ -578,7 +578,7 @@ py_test(
     name = "test_training_iterator",
     size = "large",
     srcs = ["tests/test_training_iterator.py"],
-    tags = ["team:ml", "exclusive", "ray_air"],
+    tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
     deps = [":train_lib"]
 )
 
@@ -602,23 +602,23 @@ py_test(
     name = "test_transformers_trainer_steps",
     size = "enormous", # TODO: Reduce this.
     srcs = ["tests/test_transformers_trainer_steps.py"],
-    tags = ["team:ml", "exclusive", "ray_air"],
+    tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
     deps = [":train_lib"]
 )
 
 py_test(
     name = "test_transformers_trainer",
     size = "large",
     srcs = ["tests/test_transformers_trainer.py"],
-    tags = ["team:ml", "exclusive", "ray_air"],
+    tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
     deps = [":train_lib"]
 )
 
 py_test(
     name = "test_tune",
     size = "large",
     srcs = ["tests/test_tune.py"],
-    tags = ["team:ml", "exclusive", "tune"],
+    tags = ["team:ml", "exclusive", "tune", "no_new_storage"],
     deps = [":train_lib", ":conftest"]
 )
 
@@ -634,7 +634,7 @@ py_test(
     name = "test_e2e_wandb_integration",
     size = "small",
     srcs = ["tests/test_e2e_wandb_integration.py"],
-    tags = ["team:ml", "exclusive"],
+    tags = ["team:ml", "exclusive", "no_new_storage"],
     deps = [":train_lib"]
 )
 
@@ -658,7 +658,7 @@ py_test(
     name = "test_xgboost_trainer",
     size = "medium",
     srcs = ["tests/test_xgboost_trainer.py"],
-    tags = ["team:ml", "exclusive", "ray_air"],
+    tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"],
     deps = [":train_lib"]
 )
 
@@ -670,6 +670,7 @@ py_test(
         "exclusive",
         "ray_air",
         "team:ml",
+        "no_new_storage",
     ],
     deps = [":train_lib", ":conftest"],
 )

diff --git a/python/ray/train/_internal/storage.py b/python/ray/train/_internal/storage.py
@@ -169,7 +169,7 @@ def _download_from_fs_path(
         else:
             _pyarrow_fs_copy_files(fs_path, local_path, source_filesystem=fs)
     except Exception as e:
-        # Clean up the directory if downloading was unsuccessful.
+        # Clean up the directory if downloading was unsuccessful
         if not exists_before:
             shutil.rmtree(local_path, ignore_errors=True)
         raise e