In [None]:
# Install (or upgrade, via -U) the codeflare-sdk package into the kernel's environment.
# NOTE(review): no version is pinned, so the SDK version used depends on when this cell runs.
%pip install codeflare-sdk -U

In [None]:
import os, sys
import yaml

# Import pieces from codeflare-sdk
from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication
from codeflare_sdk.job import RayJobClient
from time import sleep

In [None]:
# Print the installed codeflare-sdk package metadata so the version in use is recorded in the output.
%pip show codeflare-sdk

In [None]:
#parameters
# Placeholder values for the run; every one must be replaced with a real value
# before the cells below can work.
# NOTE(review): presumably these are overridden by a parameter-injection tool
# (e.g. a tagged "parameters" cell) — confirm.
#
# How the later cells use them:
#   namespace                     -> ClusterConfiguration(namespace=...)
#   openshift_api_url             -> TokenAuthentication(server=...)
#   kubernetes_user_bearer_token  -> TokenAuthentication(token=...) and the job client's Authorization header
#   num_gpus                      -> passed through int(), so it must be an int or a numeric string
#   s3_bucket_name                -> bucket part of the job's --storage-path
#   s3_access_key_id / s3_secret_access_key / s3_default_region
#                                 -> AWS_* environment variables of the submitted job
namespace = "has to be specified"
openshift_api_url = "has to be specified"
kubernetes_user_bearer_token = "has to be specified"
num_gpus = "has to be specified"
s3_bucket_name = "has to be specified"
s3_access_key_id = "has to be specified"
s3_secret_access_key = "has to be specified"
s3_default_region = "has to be specified"

In [None]:
# Copy every file from /opt/app-root/notebooks into the current working directory,
# then list the directory so the copied files are visible in the cell output.
!cp /opt/app-root/notebooks/* ./
!ls 

In [None]:
# Install the `datasets` package, then run the local create_dataset module's main().
# The import sits here rather than in the import cell because it must run after the
# copy cell above; presumably create_dataset.py is one of the copied files — confirm.
%pip install datasets
import create_dataset
create_dataset.main()

In [None]:
# Log in to the cluster API with the bearer token supplied in the parameters cell.
# skip_tls=True is passed through unchanged from the original notebook.
auth = TokenAuthentication(
    server=openshift_api_url,
    token=kubernetes_user_bearer_token,
    skip_tls=True,
)
auth.login()

In [None]:
# Describe the Ray cluster and build the Cluster handle for it.
# Nothing is brought up here; cluster.up() is called in a later cell.
cluster_config = ClusterConfiguration(
    name='ray-finetune-test',
    namespace=namespace,
    # Head node sizing
    head_cpus=2,
    head_memory=48,
    head_gpus=0,
    # Worker sizing: one worker, with min and max set to the same values
    num_workers=1,
    min_cpus=4,
    max_cpus=4,
    min_memory=48,
    max_memory=48,
    # num_gpus comes from the parameters cell and may be a string, hence int()
    num_gpus=int(num_gpus),
    # A later cell edits the generated YAML on disk before the cluster is brought up
    write_to_file=True,
    verify_tls=False,
)
cluster = Cluster(cluster_config)

In [None]:
# Patch the cluster YAML that was written to disk when the Cluster was created
# (write_to_file=True) so the worker pod tolerates the GPU node taint.
directory_path = os.path.expanduser("~/.codeflare/resources/")
# The generated file is named after the cluster, so derive the file name from the
# cluster configuration instead of hard-coding a name that can drift out of sync.
outfile = os.path.join(directory_path, f"{cluster.config.name}.yaml")

# safe_load is sufficient for a plain Kubernetes manifest and avoids constructing
# arbitrary Python objects from the file.
with open(outfile) as f:
    cluster_yaml = yaml.safe_load(f)

# Add toleration for GPU nodes to Ray cluster worker pod
cluster_yaml["spec"]["workerGroupSpecs"][0]["template"]["spec"]["tolerations"] = [
    {"key": "nvidia.com/gpu", "value": "NONE", "effect": "NoSchedule"}
]

# Write the patched manifest back to the same path.
with open(outfile, "w") as f:
    yaml.dump(cluster_yaml, f, default_flow_style=False)

In [None]:
# Bring up the cluster configured in the cells above.
cluster.up()
# Wait (via wait_ready) until the cluster status is updated before moving on;
# the later cells need the cluster's dashboard URI.
cluster.wait_ready()

In [None]:
# Show the cluster's current status in the cell output.
cluster.status()

In [None]:
# Show the cluster's details in the cell output.
cluster.details()

In [None]:
# Build a job client against the cluster's dashboard URI, authenticating with the
# same bearer token used for login. verify=False is kept from the original notebook.
ray_dashboard = cluster.cluster_dashboard_uri()
header = {"Authorization": f"Bearer {kubernetes_user_bearer_token}"}
client = RayJobClient(address=ray_dashboard, headers=header, verify=False)

# Assemble the entrypoint from separate arguments and join them with single spaces.
# Relying on adjacent string literals previously dropped the space after
# --storage-path, fusing it with --batch-size-per-device into one broken argument.
entrypoint_args = [
    "python ray_finetune_llm_deepspeed.py",
    "--model-name=meta-llama/Llama-2-7b-chat-hf",
    "--lora",
    "--num-devices=1",
    "--num-epochs=1",
    "--ds-config=zero_3_llama_2_7b.json",
    f"--storage-path=s3://{s3_bucket_name}/ray-finetune-llm-deepspeed3/",
    "--batch-size-per-device=32",
    "--eval-batch-size-per-device=32",
    "--as-test",
]

submission_id = client.submit_job(
    entrypoint=" ".join(entrypoint_args),
    runtime_env={
        # S3 credentials from the parameters cell, exposed to the job as AWS_* variables.
        "env_vars": {
            "AWS_ACCESS_KEY_ID": s3_access_key_id,
            "AWS_SECRET_ACCESS_KEY": s3_secret_access_key,
            "AWS_DEFAULT_REGION": s3_default_region,
        },
        "pip": "/opt/app-root/src/ray_finetune_requirements.txt",
        "working_dir": "/opt/app-root/src",
        "excludes": ["/docs/", "*.ipynb", "*.md"]
    },
)
print(submission_id)

In [None]:
# Poll the job once per second until it reaches a terminal status.
# The loop previously exited only on "SUCCEEDED", so a failed or stopped job kept
# it spinning forever and the failure message below could never be printed.
TERMINAL_JOB_STATUSES = ("SUCCEEDED", "FAILED", "STOPPED")

status = client.get_job_status(submission_id)
while status not in TERMINAL_JOB_STATUSES:
    sleep(1)
    status = client.get_job_status(submission_id)

# Any terminal status other than SUCCEEDED is reported as a failure.
finished = (status == "SUCCEEDED")
if finished:
    print("Job completed Successfully !")
else:
    print("Job failed !")

In [None]:
# Ask the job client to stop the submitted job.
# NOTE(review): the polling cell above already waits for the job, so this is
# presumably a cleanup safeguard — confirm.
client.stop_job(submission_id)

In [None]:
# Tear the cluster down again; this is the counterpart of the earlier cluster.up().
cluster.down()