From 8407bd31e77918ff4c5d73c08dfaa94c3b098aa8 Mon Sep 17 00:00:00 2001
From: Kai-Hsun Chen
Date: Mon, 7 Nov 2022 11:07:24 -0800
Subject: [PATCH] [Feature] Test sample RayCluster YAMLs to catch invalid or out of date ones (#678)

Use #605 to test sample RayCluster YAMLs.

Signed-off-by: Kai-Hsun Chen
---
 .../actions/configuration/action.yaml         | 76 ++++++++++++++++++
 .github/workflows/test-job.yaml               | 19 +++++
 ray-operator/README.md                        | 10 +--
 .../samples/ray-cluster.getting-started.yaml  |  4 +-
 .../samples/ray-cluster.heterogeneous.yaml    | 26 +++++++
 .../config/samples/ray-cluster.mini.yaml      |  8 +-
 tests/framework/config/requirements.txt       |  3 +
 tests/framework/prototype.py                  | 63 ++++++++++-----
 .../framework/test_sample_raycluster_yamls.py | 78 +++++++++++++++++++
 9 files changed, 251 insertions(+), 36 deletions(-)
 create mode 100644 .github/workflows/actions/configuration/action.yaml
 create mode 100644 tests/framework/config/requirements.txt
 create mode 100644 tests/framework/test_sample_raycluster_yamls.py

diff --git a/.github/workflows/actions/configuration/action.yaml b/.github/workflows/actions/configuration/action.yaml
new file mode 100644
index 00000000000..8b3226de798
--- /dev/null
+++ b/.github/workflows/actions/configuration/action.yaml
@@ -0,0 +1,76 @@
+name: "Configuration test for sample YAML files"
+description: "Run configuration tests"
+
+inputs:
+  ray_version:
+    description: "Version of Ray"
+    required: true
+
+runs:
+  using: "composite"
+  steps:
+    - name: Get revision SHA
+      id: vars
+      run: echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
+      shell: bash
+
+    - name: Install Kind
+      run: |
+        curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.11.1/kind-linux-amd64
+        chmod +x ./kind
+        sudo mv ./kind /usr/local/bin/kind
+      shell: bash
+
+    - name: Checkout repository
+      uses: actions/checkout@v2
+
+    - name: Setup Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.x'
+
+    - name: Install Python dependencies
+      run: pip install -r ./tests/framework/config/requirements.txt
+      shell: bash
+
+    - name: Set up Docker
+      uses: docker-practice/actions-setup-docker@master
+
+    - name: Download Artifact Operator
+      uses: actions/download-artifact@v2
+      with:
+        name: operator_img
+        path: /tmp
+
+    - name: Load KubeRay Operator Docker Image
+      run: |
+        docker load --input /tmp/operator.tar
+        docker image ls -a
+      shell: bash
+
+    - name: Download Artifact Apiserver
+      uses: actions/download-artifact@v2
+      with:
+        name: apiserver_img
+        path: /tmp
+
+    - name: Load KubeRay Apiserver Docker Image
+      run: |
+        docker load --input /tmp/apiserver.tar
+        docker image ls -a
+      shell: bash
+
+    - name: Run configuration tests for sample YAML files
+      # The configuration test depends on the operator & apiserver images built in previous steps.
+      run: |
+        pushd manifests/base/
+        kustomize edit set image kuberay/operator=kuberay/operator:${{ steps.vars.outputs.sha_short }}
+        kustomize edit set image kuberay/apiserver=kuberay/apiserver:${{ steps.vars.outputs.sha_short }}
+        popd
+        echo "Using Ray image ${{ inputs.ray_version }}"
+        cd tests/framework
+        GITHUB_ACTIONS=true \
+        RAY_IMAGE="rayproject/ray:${{ inputs.ray_version }}" \
+        OPERATOR_IMAGE="kuberay/operator:${{ steps.vars.outputs.sha_short }}" \
+        APISERVER_IMAGE="kuberay/apiserver:${{ steps.vars.outputs.sha_short }}" python test_sample_raycluster_yamls.py
+      shell: bash
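For reference, the step above can be reproduced outside CI. This is a minimal sketch (not part of the patch) that drives the same entry point with the same environment variables, assuming a local Kind cluster plus locally built `kuberay/operator:nightly` and `kuberay/apiserver:nightly` images:

```python
import os
import subprocess

# Mirror the environment the composite action exports. Setting
# GITHUB_ACTIONS to "false" runs the full sample set instead of the
# reduced subset selected for CI in test_sample_raycluster_yamls.py.
env = dict(os.environ)
env.update({
    "GITHUB_ACTIONS": "false",
    "RAY_IMAGE": "rayproject/ray:2.0.0",
    "OPERATOR_IMAGE": "kuberay/operator:nightly",
    "APISERVER_IMAGE": "kuberay/apiserver:nightly",
})
subprocess.run(["python", "test_sample_raycluster_yamls.py"],
               cwd="tests/framework", env=env, check=True)
```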
diff --git a/.github/workflows/test-job.yaml b/.github/workflows/test-job.yaml
index f7c87ac3b32..8300c025f3c 100644
--- a/.github/workflows/test-job.yaml
+++ b/.github/workflows/test-job.yaml
@@ -339,3 +339,22 @@ jobs:
       - uses: ./.github/workflows/actions/compatibility
         with:
           ray_version: nightly
+
+  sample-yaml-config-test-2_0_0:
+    needs:
+      - build_operator
+      - build_apiserver
+      - lint
+    runs-on: ubuntu-latest
+    name: Sample YAML Config Test - 2.0.0
+    steps:
+      - name: Check out code into the Go module directory
+        uses: actions/checkout@v2
+        with:
+          # When checking out the repository that triggered a workflow,
+          # this defaults to the reference or SHA for that event.
+          # The default value works for both pull_request and merge (push) events.
+          ref: ${{github.event.pull_request.head.sha}}
+      - uses: ./.github/workflows/actions/configuration
+        with:
+          ray_version: 2.0.0
diff --git a/ray-operator/README.md b/ray-operator/README.md
index 969948f13c1..7096833a677 100644
--- a/ray-operator/README.md
+++ b/ray-operator/README.md
@@ -84,15 +84,9 @@ Sample  | Description
 experimentation in local kind or minikube environments.
 
 ```shell
-# Create a configmap with a hello world Ray code.
-kubectl create -f config/samples/config-map-ray-code.yaml
-configmap/ray-code created
-```
-
-
-```shell
-# Create a cluster.
+# Create a RayCluster and a ConfigMap with hello world Ray code.
 $ kubectl create -f config/samples/ray-cluster.heterogeneous.yaml
+configmap/ray-code created
 raycluster.ray.io/raycluster-heterogeneous created
 
 # List running clusters.
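The README keeps `kubectl` as the interface, but the same listing can be done with the Kubernetes Python client that the test framework below uses. A small sketch, assuming a kubeconfig pointing at the target cluster:

```python
from kubernetes import client, config

config.load_kube_config()  # assumes a kubeconfig for the target cluster
# RayCluster is a custom resource under the ray.io/v1alpha1 API group.
rayclusters = client.CustomObjectsApi().list_namespaced_custom_object(
    group='ray.io', version='v1alpha1', namespace='default', plural='rayclusters')
for item in rayclusters['items']:
    print(item['metadata']['name'])
```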
diff --git a/ray-operator/config/samples/ray-cluster.getting-started.yaml b/ray-operator/config/samples/ray-cluster.getting-started.yaml
index c6bee98e0d8..02784ad45bf 100644
--- a/ray-operator/config/samples/ray-cluster.getting-started.yaml
+++ b/ray-operator/config/samples/ray-cluster.getting-started.yaml
@@ -10,7 +10,7 @@ metadata:
   # A unique identifier for the head node and workers of this cluster.
   name: raycluster-getting-started
 spec:
-  rayVersion: '1.8.0' # should match the Ray version in the image of the containers
+  rayVersion: '2.0.0' # should match the Ray version in the image of the containers
 ######################headGroupSpecs#################################
   # head group template and specs, (perhaps 'group' is not needed in the name)
   headGroupSpec:
@@ -45,7 +45,7 @@ spec:
         spec:
           containers:
           - name: ray-head
-            image: rayproject/ray:1.8.0
+            image: rayproject/ray:2.0.0
             env:
             - name: MY_POD_IP
               valueFrom:
diff --git a/ray-operator/config/samples/ray-cluster.heterogeneous.yaml b/ray-operator/config/samples/ray-cluster.heterogeneous.yaml
index b4a0ae2561b..6a651eecbe7 100644
--- a/ray-operator/config/samples/ray-cluster.heterogeneous.yaml
+++ b/ray-operator/config/samples/ray-cluster.heterogeneous.yaml
@@ -1,3 +1,29 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: ray-code
+data:
+  sample_code.py: |
+    import ray
+    from os import environ
+    redis_pass = environ.get("REDIS_PASSWORD")
+    print("trying to connect to Ray!")
+    ray.init(address="auto", _redis_password=redis_pass)
+    print("now executing some code with Ray!")
+    import time
+    start = time.time()
+    @ray.remote
+    def f():
+        time.sleep(0.01)
+        return ray._private.services.get_node_ip_address()
+    values = set(ray.get([f.remote() for _ in range(1000)]))
+    print("Ray Nodes: ", str(values))
+    file = open("/tmp/ray_nodes.txt", "a")
+    file.write("available nodes: %s\n" % str(values))
+    file.close()
+    end = time.time()
+    print("Execution time = ", end - start)
+---
 # The resource requests and limits in this config are too small for production!
 # For examples with more realistic resource configuration, see
 # ray-cluster.complete.large.yaml and
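ray-cluster.heterogeneous.yaml now bundles two Kubernetes documents (the `ray-code` ConfigMap and the RayCluster) in a single file. A short sketch of extracting the RayCluster from such a multi-document manifest with PyYAML, the same way the new test driver does:

```python
import yaml

# Documents in a multi-document manifest are separated by '---';
# safe_load_all yields one Python object per document.
with open('ray-cluster.heterogeneous.yaml', encoding='utf-8') as f:  # path is illustrative
    raycluster = next(doc for doc in yaml.safe_load_all(f)
                      if doc['kind'] == 'RayCluster')
print(raycluster['metadata']['name'])  # -> raycluster-heterogeneous
```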
diff --git a/ray-operator/config/samples/ray-cluster.mini.yaml b/ray-operator/config/samples/ray-cluster.mini.yaml
index 9f068e18742..c297f9f0da6 100644
--- a/ray-operator/config/samples/ray-cluster.mini.yaml
+++ b/ray-operator/config/samples/ray-cluster.mini.yaml
@@ -10,7 +10,7 @@ metadata:
   # A unique identifier for the head node and workers of this cluster.
   name: raycluster-mini
 spec:
-  rayVersion: '1.12.1' # should match the Ray version in the image of the containers
+  rayVersion: '2.0.0' # should match the Ray version in the image of the containers
 ######################headGroupSpecs#################################
   # head group template and specs, (perhaps 'group' is not needed in the name)
   headGroupSpec:
@@ -46,9 +46,7 @@ spec:
         spec:
           containers:
           - name: ray-head
-            image: rayproject/ray:1.12.1
-            #image: rayproject/ray:nightly
-            #image: bonsaidev.azurecr.io/bonsai/lazer-0-9-0-cpu:dev
+            image: rayproject/ray:2.0.0
             env:
             - name: MY_POD_IP
               valueFrom:
@@ -59,7 +57,7 @@ spec:
               cpu: 1
               memory: 2Gi
             requests:
-              cpu: 1
+              cpu: 500m
               memory: 2Gi
             ports:
             - containerPort: 6379
diff --git a/tests/framework/config/requirements.txt b/tests/framework/config/requirements.txt
new file mode 100644
index 00000000000..6112a9386d9
--- /dev/null
+++ b/tests/framework/config/requirements.txt
@@ -0,0 +1,3 @@
+docker
+kubernetes
+jsonpatch
\ No newline at end of file
diff --git a/tests/framework/prototype.py b/tests/framework/prototype.py
index e1115d4396d..116bd08d64e 100644
--- a/tests/framework/prototype.py
+++ b/tests/framework/prototype.py
@@ -56,7 +56,10 @@ def download_images(docker_images):
     """Download Docker images from DockerHub"""
     docker_client = docker.from_env()
     for image in docker_images:
-        docker_client.images.pull(image)
+        # Only pull the image from DockerHub when the image does not
+        # exist in the local docker registry.
+        if os.system(f'docker image inspect {image} > /dev/null') != 0:
+            docker_client.images.pull(image)
     docker_client.close()
 
 def kind_load_images(docker_images):
@@ -119,11 +122,14 @@ class CREvent:
     CREvent: Custom Resource Event can be mainly divided into 3 categories.
     (1) Add (create) CR (2) Update CR (3) Delete CR
     """
-    def __init__(self, custom_resource, rulesets: List[RuleSet], timeout, namespace):
+    def __init__(self, custom_resource_object,
+                 rulesets: List[RuleSet], timeout, namespace, filepath = None):
         self.rulesets = rulesets
         self.timeout = timeout
         self.namespace = namespace
-        self.custom_resource = custom_resource
+        self.custom_resource_object = custom_resource_object
+        # A file may consist of multiple Kubernetes resources (ex: ray-cluster.external-redis.yaml)
+        self.filepath = filepath
     def trigger(self):
         """ The member functions integrate together in `trigger()`.
@@ -146,7 +152,7 @@ def wait(self):
     def check_rule_sets(self):
         """When the system converges, check all registered RuleSets."""
         for ruleset in self.rulesets:
-            ruleset.check_rule_set(self.custom_resource, self.namespace)
+            ruleset.check_rule_set(self.custom_resource_object, self.namespace)
 
 # My implementations
 class HeadPodNameRule(Rule):
@@ -159,7 +165,7 @@ def assert_rule(self, custom_resource=None, cr_namespace='default'):
         expected_val = search_path(custom_resource,
             "spec.headGroupSpec.template.spec.containers.0.name".split('.'))
         headpods = client.CoreV1Api().list_namespaced_pod(
-            namespace = cr_namespace, label_selector='rayNodeType=head')
+            namespace = cr_namespace, label_selector='ray.io/node-type=head')
         assert headpods.items[0].spec.containers[0].name == expected_val
 
 class HeadSvcRule(Rule):
@@ -178,7 +184,7 @@ class EasyJobRule(Rule):
     """Submit a very simple Ray job to test the basic functionality of the Ray cluster."""
     def assert_rule(self, custom_resource=None, cr_namespace='default'):
         headpods = client.CoreV1Api().list_namespaced_pod(
-            namespace = cr_namespace, label_selector='rayNodeType=head')
+            namespace = cr_namespace, label_selector='ray.io/node-type=head')
         headpod_name = headpods.items[0].metadata.name
         rtn = os.system(
             f"kubectl exec {headpod_name} --" +
@@ -188,9 +194,12 @@ def assert_rule(self, custom_resource=None, cr_namespace='default'):
 class RayClusterAddCREvent(CREvent):
     """CREvent for RayCluster addition"""
     def exec(self):
-        client.CustomObjectsApi().create_namespaced_custom_object(
-            group = 'ray.io',version = 'v1alpha1', namespace = self.namespace,
-            plural = 'rayclusters', body = self.custom_resource)
+        if not self.filepath:
+            client.CustomObjectsApi().create_namespaced_custom_object(
+                group = 'ray.io', version = 'v1alpha1', namespace = self.namespace,
+                plural = 'rayclusters', body = self.custom_resource_object)
+        else:
+            os.system(f"kubectl apply -n {self.namespace} -f {self.filepath}")
 
     def wait(self):
         def check_pod_running(pods) -> bool:
@@ -199,24 +208,36 @@ def check_pod_running(pods) -> bool:
                     return False
             return True
         start_time = time.time()
-        expected_head_pods = search_path(self.custom_resource,
-            "spec.headGroupSpec.replicas".split('.'), default_value=0)
-        expected_worker_pods = search_path(self.custom_resource,
-            "spec.workerGroupSpecs.0.replicas".split('.'), default_value=0)
+        expected_head_pods = search_path(self.custom_resource_object,
+            "spec.headGroupSpec.replicas".split('.'), default_value=1)
+        worker_group_specs = search_path(self.custom_resource_object,
+            "spec.workerGroupSpecs".split('.'), default_value=[])
+        expected_worker_pods = 0
+        for spec in worker_group_specs:
+            expected_worker_pods += spec['replicas']
         # Wait until:
         #   (1) The number of head pods and worker pods are as expected.
         #   (2) All head pods and worker pods are "Running".
         for _ in range(self.timeout):
             headpods = client.CoreV1Api().list_namespaced_pod(
-                namespace = self.namespace, label_selector='rayNodeType=head')
+                namespace = self.namespace, label_selector='ray.io/node-type=head')
             workerpods = client.CoreV1Api().list_namespaced_pod(
-                namespace = self.namespace, label_selector='rayNodeType=worker')
+                namespace = self.namespace, label_selector='ray.io/node-type=worker')
             if (len(headpods.items) == expected_head_pods
                     and len(workerpods.items) == expected_worker_pods
                     and check_pod_running(headpods.items) and check_pod_running(workerpods.items)):
                 logger.info("--- RayClusterAddCREvent %s seconds ---", time.time() - start_time)
                 return
             time.sleep(1)
+
+        # Fail to converge. Print some information to debug.
+        logger.info("RayClusterAddCREvent failed to converge in %d seconds.", self.timeout)
+        logger.info("expected_head_pods: %d, expected_worker_pods: %d",
+            expected_head_pods, expected_worker_pods)
+        os.system(f'kubectl get all -n={self.namespace}')
+        os.system(f'kubectl describe pods -n={self.namespace}')
+
+        # Raise an exception
         raise Exception("RayClusterAddCREvent wait() timeout")
 
 class RayClusterDeleteCREvent(CREvent):
@@ -224,15 +245,15 @@ class RayClusterDeleteCREvent(CREvent):
     def exec(self):
         client.CustomObjectsApi().delete_namespaced_custom_object(
             group = 'ray.io', version = 'v1alpha1', namespace = self.namespace,
-            plural = 'rayclusters', name = self.custom_resource['metadata']['name'])
+            plural = 'rayclusters', name = self.custom_resource_object['metadata']['name'])
 
     def wait(self):
         start_time = time.time()
         for _ in range(self.timeout):
             headpods = client.CoreV1Api().list_namespaced_pod(
-                namespace = self.namespace, label_selector='rayNodeType=head')
+                namespace = self.namespace, label_selector='ray.io/node-type=head')
             workerpods = client.CoreV1Api().list_namespaced_pod(
-                namespace = self.namespace, label_selector='rayNodeType=worker')
+                namespace = self.namespace, label_selector='ray.io/node-type=worker')
             if (len(headpods.items) == 0 and len(workerpods.items) == 0):
                 logger.info("--- RayClusterDeleteCREvent %s seconds ---", time.time() - start_time)
                 return
@@ -265,8 +286,8 @@ def runtest(self):
 
     def tearDown(self) -> None:
         try:
-            delete_event = RayClusterDeleteCREvent(
-                self.cr_event.custom_resource, [], self.cr_event.timeout, self.cr_event.namespace)
+            delete_event = RayClusterDeleteCREvent(self.cr_event.custom_resource_object,
+                [], self.cr_event.timeout, self.cr_event.namespace)
             delete_event.trigger()
         except Exception as ex:
             logger.error(str(ex))
@@ -321,5 +342,5 @@ def tearDown(self) -> None:
     for new_cr in mut.mutate():
         addEvent = RayClusterAddCREvent(new_cr, [rs], 90, NAMESPACE)
         test_cases.addTest(GeneralTestCase('runtest', images, addEvent))
-    runner=unittest.TextTestRunner()
+    runner = unittest.TextTestRunner()
     runner.run(test_cases)
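With the new `filepath` parameter, `RayClusterAddCREvent.exec()` applies a whole sample file via kubectl while the parsed RayCluster object still drives the convergence checks in `wait()`. A hand-driven sketch of one add/delete cycle (illustrative path; assumes a running Kind cluster with the KubeRay operator installed and a kubeconfig pointing at it):

```python
import yaml
from kubernetes import config
from prototype import RuleSet, RayClusterAddCREvent, RayClusterDeleteCREvent, EasyJobRule

config.load_kube_config()  # assumes an existing kubeconfig for the Kind cluster

PATH = 'ray-operator/config/samples/ray-cluster.mini.yaml'  # illustrative path
with open(PATH, encoding='utf-8') as f:
    cr = next(doc for doc in yaml.safe_load_all(f) if doc['kind'] == 'RayCluster')

# Apply the file, wait until all head/worker pods are Running, then run the rules.
add_event = RayClusterAddCREvent(cr, [RuleSet([EasyJobRule()])], 90, 'default', PATH)
add_event.trigger()

# Clean up: delete the RayCluster and wait until its pods are gone.
delete_event = RayClusterDeleteCREvent(cr, [], 90, 'default')
delete_event.trigger()
```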
diff --git a/tests/framework/test_sample_raycluster_yamls.py b/tests/framework/test_sample_raycluster_yamls.py
new file mode 100644
index 00000000000..c1dd1b29f0e
--- /dev/null
+++ b/tests/framework/test_sample_raycluster_yamls.py
@@ -0,0 +1,78 @@
+''' Test sample RayCluster YAML files to catch invalid and outdated ones. '''
+import unittest
+import os
+import logging
+import yaml
+
+from prototype import (
+    RuleSet,
+    GeneralTestCase,
+    RayClusterAddCREvent,
+    HeadPodNameRule,
+    EasyJobRule,
+    HeadSvcRule,
+)
+
+logger = logging.getLogger(__name__)
+
+if __name__ == '__main__':
+    NAMESPACE = 'default'
+    SAMPLE_PATH = '../../ray-operator/config/samples/'
+
+    sample_yaml_files = []
+
+    # The free plan of GitHub Actions (i.e. KubeRay CI) only supports 2-core CPU runners. Most
+    # sample YAMLs cannot schedule all pods on Kubernetes nodes due to insufficient CPUs. We
+    # decided to just run some tests on KubeRay CI and run all tests in the Ray CI.
+    GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS", default="False").lower() == "true"
+    github_action_tests = set([
+        "ray-cluster.getting-started.yaml",
+        "ray-cluster.ingress.yaml",
+        "ray-cluster.mini.yaml"
+    ])
+
+    for filename in os.scandir(SAMPLE_PATH):
+        if filename.is_file():
+            with open(filename, encoding="utf-8") as cr_yaml:
+                if GITHUB_ACTIONS and filename.name not in github_action_tests:
+                    continue
+                for k8s_object in yaml.safe_load_all(cr_yaml):
+                    if k8s_object['kind'] == 'RayCluster':
+                        sample_yaml_files.append(
+                            {'path': filename.path, 'name': filename.name, 'cr': k8s_object}
+                        )
+                        break
+
+    skip_tests = {
+        'ray-cluster.complete.large.yaml': 'Skip this test because it requires a lot of resources.',
+        'ray-cluster.external-redis.yaml':
+            'It installs multiple Kubernetes resources and cannot be cleaned up by DeleteCREvent.',
+        'ray-cluster.autoscaler.large.yaml':
+            'Skip this test because it requires a lot of resources.'
+    }
+
+    rs = RuleSet([HeadPodNameRule(), EasyJobRule(), HeadSvcRule()])
+    images = [
+        os.getenv('RAY_IMAGE', default='rayproject/ray:2.0.0'),
+        os.getenv('OPERATOR_IMAGE', default='kuberay/operator:nightly'),
+        os.getenv('APISERVER_IMAGE', default='kuberay/apiserver:nightly')
+    ]
+    logger.info(images)
+    # Build a test plan
+    logger.info("Build a test plan ...")
+    test_cases = unittest.TestSuite()
+    for index, new_cr in enumerate(sample_yaml_files):
+        if new_cr['name'] in skip_tests:
+            logger.info('[SKIP TEST %d] %s: %s', index, new_cr['name'], skip_tests[new_cr['name']])
+            continue
+        logger.info('[TEST %d]: %s', index, new_cr['name'])
+        addEvent = RayClusterAddCREvent(new_cr['cr'], [rs], 90, NAMESPACE, new_cr['path'])
+        test_cases.addTest(GeneralTestCase('runtest', images, addEvent))
+
+    # Execute all tests
+    runner = unittest.TextTestRunner()
+    test_result = runner.run(test_cases)
+
+    # Without this line, the exit code will always be 0.
+    assert test_result.wasSuccessful()
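The `RuleSet` passed to each test is open-ended, so coverage can grow without touching the driver above. A sketch of one more rule in the style of `HeadPodNameRule` (hypothetical, not part of this patch; assumes `Rule` and `search_path` are importable from `prototype`):

```python
from kubernetes import client
from prototype import Rule, search_path

class WorkerImageRule(Rule):
    """Worker pods should run the image declared in the first worker group spec."""
    def assert_rule(self, custom_resource=None, cr_namespace='default'):
        # Read the expected image from the RayCluster spec, ...
        expected = search_path(custom_resource,
            "spec.workerGroupSpecs.0.template.spec.containers.0.image".split('.'))
        # ... then compare it against every running worker pod.
        workerpods = client.CoreV1Api().list_namespaced_pod(
            namespace=cr_namespace, label_selector='ray.io/node-type=worker')
        for pod in workerpods.items:
            assert pod.spec.containers[0].image == expected
```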