Commit

[Train] Added Llama 2 workspace template release tests (ray-project#37871)

Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
Signed-off-by: e428265 <arvind.chandramouli@lmco.com>
kouroshHakha authored and arvind-chandra committed Aug 31, 2023
1 parent 4eface7 commit 8d97d41
Showing 12 changed files with 655 additions and 18 deletions.
@@ -517,7 +517,8 @@ def main():
             "env_vars": {
                 "HF_HOME": "/mnt/local_storage/.cache/huggingface",
                 "TUNE_RESULT_DIR": os.environ["TUNE_RESULT_DIR"],
-            }
+            },
+            "working_dir": ".",
         }
     )

@@ -63,7 +63,10 @@ def download_model_files_on_all_nodes(hf_model_id: str):
 if __name__ == "__main__":
 
     ray.init(
-        runtime_env={"env_vars": {"HF_HOME": "/mnt/local_storage/.cache/huggingface"}}
+        runtime_env={
+            "env_vars": {"HF_HOME": "/mnt/local_storage/.cache/huggingface"},
+            "working_dir": ".",
+        }
     )
 
     pargs = _parse_args()
@@ -37,7 +37,7 @@ def run(cmd: str):
     parser.add_argument("args", nargs="*", type=str, help="string args to function")
     args = parser.parse_args()
 
-    ray.init()
+    ray.init(runtime_env={"working_dir": "."})
     if args.function not in globals():
         raise ValueError(f"{args.function} doesn't exist")
     fn = globals()[args.function]

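The three hunks above all make the same change: ray.init() now receives a runtime_env so every worker inherits the Hugging Face cache location and a copy of the launch directory. A minimal sketch of that pattern outside the diff context (the show_env task is hypothetical and only demonstrates that the environment variable propagates):

import os

import ray

# Cache path mirroring the diff above; adjust for your own cluster storage.
HF_HOME = "/mnt/local_storage/.cache/huggingface"

ray.init(
    runtime_env={
        # Environment variables set on every Ray worker process.
        "env_vars": {"HF_HOME": HF_HOME},
        # The local working directory is uploaded to the cluster, so relative
        # imports and data files resolve the same way on every node.
        "working_dir": ".",
    }
)


@ray.remote
def show_env() -> str:
    # Hypothetical task used only to confirm the env var reached the worker.
    return os.environ.get("HF_HOME", "<unset>")


print(ray.get(show_env.remote()))
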
@@ -1,12 +1,9 @@
 # See https://hub.docker.com/r/anyscale/ray for full list of
 # available Ray, Python, and CUDA versions.
-base_image: "anyscale/ray:2.6.1-py39-cu117"
+base_image: anyscale/ray:nightly-py39-cu118
 
 env_vars: {}
 
-debian_packages: [
-  libaio1
-]
+debian_packages:
+  - libaio1
 
 python:
   pip_packages: [
@@ -30,4 +27,7 @@ python:
   ]
   conda_packages: []
 
-post_build_cmds: []
+post_build_cmds:
+  # Install Ray
+  - pip3 uninstall -y ray || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
+  - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}

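The post_build_cmds entries appear to be Jinja-style templates: env["RAY_WHEELS"] selects a specific wheel when that variable is set and falls back to the released ray package otherwise. A rough sketch of that substitution using jinja2 directly (illustrative only, not the actual release tooling; the command string is copied from the config above):

import os

from jinja2 import Template

# Command template taken verbatim from the build config above.
CMD = 'pip3 uninstall -y ray || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}'

# With RAY_WHEELS unset, the default filter substitutes the plain "ray" package;
# when it is set, the wheel URL is installed instead.
rendered = Template(CMD).render(env=os.environ)
print(rendered)
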
@@ -0,0 +1,23 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

head_node_type:
  name: head_node_type
  instance_type: g5.48xlarge
  resources:
    custom_resources:
      large_cpu_mem: 1

worker_node_types:
- name: gpu_worker
  instance_type: g5.48xlarge
  min_workers: 3
  max_workers: 3
  use_spot: false

aws:
  TagSpecifications:
    - ResourceType: "instance"
      Tags:
        - Key: ttl-hours
          Value: '24'

@@ -0,0 +1,29 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

head_node_type:
  name: head_node_type
  instance_type: g5.48xlarge
  resources:
    custom_resources:
      large_cpu_mem: 1

worker_node_types:
- name: large_gpu_worker
  instance_type: g5.48xlarge
  min_workers: 2
  max_workers: 2
  use_spot: false

- name: medium_gpu_worker
  instance_type: g5.24xlarge
  min_workers: 2
  max_workers: 2
  use_spot: false

aws:
  TagSpecifications:
    - ResourceType: "instance"
      Tags:
        - Key: ttl-hours
          Value: '24'

@@ -0,0 +1,27 @@
# 1 g5.16xlarge + 15 g5.4xlarge --> 16 GPUs, 256G RAM on trainer and 64G RAM on workers
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

head_node_type:
  name: head_node
  instance_type: g5.16xlarge
  resources:
    custom_resources:
      large_cpu_mem: 1

worker_node_types:
- name: worker_node
  instance_type: g5.4xlarge
  min_workers: 15
  max_workers: 15
  use_spot: false
  resources:
    custom_resources:
      medium_cpu_mem: 1

aws:
  TagSpecifications:
    - ResourceType: "instance"
      Tags:
        - Key: ttl-hours
          Value: '24'

@@ -0,0 +1,21 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west1
allowed_azs:
  - us-west1-b

head_node_type:
  name: head_node_type
  instance_type: n1-highmem-64-nvidia-k80-12gb-1
  resources:
    custom_resources:
      large_cpu_mem: 1

worker_node_types:
- name: gpu_worker
  instance_type: n1-standard-16-nvidia-k80-12gb-1
  min_workers: 15
  max_workers: 15
  use_spot: false
  resources:
    custom_resources:
      medium_cpu_mem: 1

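The compute configs above attach custom resources (large_cpu_mem on the high-memory head/trainer node, medium_cpu_mem on the smaller workers) so that workloads can be pinned to a node class by resource request rather than by instance type. A hedged sketch of how such a resource is typically requested from Ray code (the task and its body are illustrative; only the resource name comes from the configs):

import ray

ray.init()


# Requesting one unit of the custom resource restricts scheduling to nodes
# that advertise large_cpu_mem, i.e. the high-memory head node defined above.
@ray.remote(resources={"large_cpu_mem": 1})
def run_on_large_node() -> str:
    return "running on the high-memory node"


print(ray.get(run_on_large_node.remote()))

Ray Train's ScalingConfig accepts similar resource dictionaries (for example via trainer_resources and resources_per_worker), which is presumably how the release tests target these node types.
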
3 changes: 2 additions & 1 deletion release/ray_release/byod/requirements_debian_byod.txt
@@ -10,4 +10,5 @@ libjemalloc-dev
 libosmesa6-dev
 patchelf
 unzip
-zip
+zip
+libaio1
8 changes: 8 additions & 0 deletions release/ray_release/byod/requirements_ml_byod_3.9.in
@@ -13,3 +13,11 @@ transformers
 torch
 torchtext
 torchvision
+bitsandbytes
+wandb
+pytorch-lightning
+protobuf<3.21.0
+torchmetrics
+lm_eval
+tiktoken
+sentencepiece