Skip to content

Commit

Permalink
[Feature] Test sample RayCluster YAMLs to catch invalid or out of dat…
Browse files Browse the repository at this point in the history
…e ones (ray-project#678)

Use ray-project#605 to test sample RayCluster YAMLs.

Signed-off-by: Kai-Hsun Chen <kaihsun@apache.org>
  • Loading branch information
kevin85421 committed Nov 7, 2022
1 parent 353df33 commit 8407bd3
Show file tree
Hide file tree
Showing 9 changed files with 251 additions and 36 deletions.
76 changes: 76 additions & 0 deletions .github/workflows/actions/configuration/action.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
name: "Configuration test for sample YAML files"
description: "Run configuration tests"

inputs:
ray_version:
description: "version of ray"
required: true

runs:
using: "composite"
steps:
- name: Get revision SHA
id: vars
run: echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
shell: bash

- name: Install Kind
run: |
curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.11.1/kind-linux-amd64
chmod +x ./kind
sudo mv ./kind /usr/local/bin/kind
shell: bash

- name: Checkout Python
uses: actions/checkout@v2

- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: '3.x'

- name: Install Python dependencies
run: pip install -r ./tests/framework/config/requirements.txt
shell: bash

- name: Set up Docker
uses: docker-practice/actions-setup-docker@master

- name: Download Artifact Operator
uses: actions/download-artifact@v2
with:
name: operator_img
path: /tmp

- name: Load KubeRay Operator Docker Image
run: |
docker load --input /tmp/operator.tar
docker images ls -a
shell: bash

- name: Download Artifact Apiserver
uses: actions/download-artifact@v2
with:
name: apiserver_img
path: /tmp

- name: Load KubeRay Apiserver Docker Image
run: |
docker load --input /tmp/apiserver.tar
docker images ls -a
shell: bash

- name: Run configuration tests for sample YAML files.
# compatibility test depends on operator & apiserver images built in previous steps.
run: |
pushd manifests/base/
kustomize edit set image kuberay/operator=kuberay/operator:${{ steps.vars.outputs.sha_short }}
kustomize edit set image kuberay/apiserver=kuberay/apiserver:${{ steps.vars.outputs.sha_short }}
popd
echo "Using Ray image ${{ inputs.ray_version }}"
cd tests/framework
GITHUB_ACTIONS=true \
RAY_IMAGE="rayproject/ray:${{ inputs.ray_version }}" \
OPERATOR_IMAGE="kuberay/operator:${{ steps.vars.outputs.sha_short }}" \
APISERVER_IMAGE="kuberay/apiserver:${{ steps.vars.outputs.sha_short }}" python test_sample_raycluster_yamls.py
shell: bash
19 changes: 19 additions & 0 deletions .github/workflows/test-job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -339,3 +339,22 @@ jobs:
- uses: ./.github/workflows/actions/compatibility
with:
ray_version: nightly

sample-yaml-config-test-2_0_0:
needs:
- build_operator
- build_apiserver
- lint
runs-on: ubuntu-latest
name: Sample YAML Config Test - 2.0.0
steps:
- name: Check out code into the Go module directory
uses: actions/checkout@v2
with:
# When checking out the repository that
# triggered a workflow, this defaults to the reference or SHA for that event.
# Default value should work for both pull_request and merge(push) event.
ref: ${{github.event.pull_request.head.sha}}
- uses: ./.github/workflows/actions/configuration
with:
ray_version: 2.0.0
10 changes: 2 additions & 8 deletions ray-operator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,15 +84,9 @@ Sample | Description
experimentation in local kind or minikube environments.

```shell
# Create a configmap with a hello world Ray code.
kubectl create -f config/samples/config-map-ray-code.yaml
configmap/ray-code created
```


```shell
# Create a cluster.
# Create a RayCluster and a ConfigMap with hello world Ray code.
$ kubectl create -f config/samples/ray-cluster.heterogeneous.yaml
configmap/ray-code created
raycluster.ray.io/raycluster-heterogeneous created

# List running clusters.
Expand Down
4 changes: 2 additions & 2 deletions ray-operator/config/samples/ray-cluster.getting-started.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ metadata:
# An unique identifier for the head node and workers of this cluster.
name: raycluster-getting-started
spec:
rayVersion: '1.8.0' # should match the Ray version in the image of the containers
rayVersion: '2.0.0' # should match the Ray version in the image of the containers
######################headGroupSpecs#################################
# head group template and specs, (perhaps 'group' is not needed in the name)
headGroupSpec:
Expand Down Expand Up @@ -45,7 +45,7 @@ spec:
spec:
containers:
- name: ray-head
image: rayproject/ray:1.8.0
image: rayproject/ray:2.0.0
env:
- name: MY_POD_IP
valueFrom:
Expand Down
26 changes: 26 additions & 0 deletions ray-operator/config/samples/ray-cluster.heterogeneous.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,29 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: ray-code
data:
sample_code.py: |
import ray
from os import environ
redis_pass = environ.get("REDIS_PASSWORD")
print("trying to connect to Ray!")
ray.init(address="auto", _redis_password=redis_pass)
print("now executing some code with Ray!")
import time
start = time.time()
@ray.remote
def f():
time.sleep(0.01)
return ray._private.services.get_node_ip_address()
values=set(ray.get([f.remote() for _ in range(1000)]))
print("Ray Nodes: ",str(values))
file = open("/tmp/ray_nodes.txt","a")
file.write("available nodes: %s\n" % str(values))
file.close()
end = time.time()
print("Execution time = ",end - start)
---
# The resource requests and limits in this config are too small for production!
# For examples with more realistic resource configuration, see
# ray-cluster.complete.large.yaml and
Expand Down
8 changes: 3 additions & 5 deletions ray-operator/config/samples/ray-cluster.mini.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ metadata:
# An unique identifier for the head node and workers of this cluster.
name: raycluster-mini
spec:
rayVersion: '1.12.1' # should match the Ray version in the image of the containers
rayVersion: '2.0.0' # should match the Ray version in the image of the containers
######################headGroupSpecs#################################
# head group template and specs, (perhaps 'group' is not needed in the name)
headGroupSpec:
Expand Down Expand Up @@ -46,9 +46,7 @@ spec:
spec:
containers:
- name: ray-head
image: rayproject/ray:1.12.1
#image: rayproject/ray:nightly
#image: bonsaidev.azurecr.io/bonsai/lazer-0-9-0-cpu:dev
image: rayproject/ray:2.0.0
env:
- name: MY_POD_IP
valueFrom:
Expand All @@ -59,7 +57,7 @@ spec:
cpu: 1
memory: 2Gi
requests:
cpu: 1
cpu: 500m
memory: 2Gi
ports:
- containerPort: 6379
Expand Down
3 changes: 3 additions & 0 deletions tests/framework/config/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
docker
kubernetes
jsonpatch
63 changes: 42 additions & 21 deletions tests/framework/prototype.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,10 @@ def download_images(docker_images):
"""Download Docker images from DockerHub"""
docker_client = docker.from_env()
for image in docker_images:
docker_client.images.pull(image)
# Only pull the image from DockerHub when the image does not
# exist in the local docker registry.
if os.system(f'docker image inspect {image} > /dev/null') != 0:
docker_client.images.pull(image)
docker_client.close()

def kind_load_images(docker_images):
Expand Down Expand Up @@ -119,11 +122,14 @@ class CREvent:
CREvent: Custom Resource Event can be mainly divided into 3 categories.
(1) Add (create) CR (2) Update CR (3) Delete CR
"""
def __init__(self, custom_resource, rulesets: List[RuleSet], timeout, namespace):
def __init__(self, custom_resource_object,
rulesets: List[RuleSet], timeout, namespace, filepath = None):
self.rulesets = rulesets
self.timeout = timeout
self.namespace = namespace
self.custom_resource = custom_resource
self.custom_resource_object = custom_resource_object
# A file may consists of multiple Kubernetes resources (ex: ray-cluster.external-redis.yaml)
self.filepath = filepath
def trigger(self):
"""
The member functions integrate together in `trigger()`.
Expand All @@ -146,7 +152,7 @@ def wait(self):
def check_rule_sets(self):
"""When the system converges, check all registered RuleSets."""
for ruleset in self.rulesets:
ruleset.check_rule_set(self.custom_resource, self.namespace)
ruleset.check_rule_set(self.custom_resource_object, self.namespace)

# My implementations
class HeadPodNameRule(Rule):
Expand All @@ -159,7 +165,7 @@ def assert_rule(self, custom_resource=None, cr_namespace='default'):
expected_val = search_path(custom_resource,
"spec.headGroupSpec.template.spec.containers.0.name".split('.'))
headpods = client.CoreV1Api().list_namespaced_pod(
namespace = cr_namespace, label_selector='rayNodeType=head')
namespace = cr_namespace, label_selector='ray.io/node-type=head')
assert headpods.items[0].spec.containers[0].name == expected_val

class HeadSvcRule(Rule):
Expand All @@ -178,7 +184,7 @@ class EasyJobRule(Rule):
"""Submit a very simple Ray job to test the basic functionality of the Ray cluster."""
def assert_rule(self, custom_resource=None, cr_namespace='default'):
headpods = client.CoreV1Api().list_namespaced_pod(
namespace = cr_namespace, label_selector='rayNodeType=head')
namespace = cr_namespace, label_selector='ray.io/node-type=head')
headpod_name = headpods.items[0].metadata.name
rtn = os.system(
f"kubectl exec {headpod_name} --" +
Expand All @@ -188,9 +194,12 @@ def assert_rule(self, custom_resource=None, cr_namespace='default'):
class RayClusterAddCREvent(CREvent):
"""CREvent for RayCluster addition"""
def exec(self):
client.CustomObjectsApi().create_namespaced_custom_object(
group = 'ray.io',version = 'v1alpha1', namespace = self.namespace,
plural = 'rayclusters', body = self.custom_resource)
if not self.filepath:
client.CustomObjectsApi().create_namespaced_custom_object(
group = 'ray.io',version = 'v1alpha1', namespace = self.namespace,
plural = 'rayclusters', body = self.custom_resource_object)
else:
os.system(f"kubectl apply -n {self.namespace} -f {self.filepath}")

def wait(self):
def check_pod_running(pods) -> bool:
Expand All @@ -199,40 +208,52 @@ def check_pod_running(pods) -> bool:
return False
return True
start_time = time.time()
expected_head_pods = search_path(self.custom_resource,
"spec.headGroupSpec.replicas".split('.'), default_value=0)
expected_worker_pods = search_path(self.custom_resource,
"spec.workerGroupSpecs.0.replicas".split('.'), default_value=0)
expected_head_pods = search_path(self.custom_resource_object,
"spec.headGroupSpec.replicas".split('.'), default_value=1)
worker_group_specs = search_path(self.custom_resource_object,
"spec.workerGroupSpecs".split('.'), default_value=[])
expected_worker_pods = 0
for spec in worker_group_specs:
expected_worker_pods += spec['replicas']
# Wait until:
# (1) The number of head pods and worker pods are as expected.
# (2) All head pods and worker pods are "Running".
for _ in range(self.timeout):
headpods = client.CoreV1Api().list_namespaced_pod(
namespace = self.namespace, label_selector='rayNodeType=head')
namespace = self.namespace, label_selector='ray.io/node-type=head')
workerpods = client.CoreV1Api().list_namespaced_pod(
namespace = self.namespace, label_selector='rayNodeType=worker')
namespace = self.namespace, label_selector='ray.io/node-type=worker')
if (len(headpods.items) == expected_head_pods
and len(workerpods.items) == expected_worker_pods
and check_pod_running(headpods.items) and check_pod_running(workerpods.items)):
logger.info("--- RayClusterAddCREvent %s seconds ---", time.time() - start_time)
return
time.sleep(1)

# Fail to converge. Print some information to debug.
logger.info("RayClusterAddCREvent failed to converge in %d seconds.", self.timeout)
logger.info("expected_head_pods: %d, expected_worker_pods: %d",
expected_head_pods, expected_worker_pods)
os.system(f'kubectl get all -n={self.namespace}')
os.system(f'kubectl describe pods -n={self.namespace}')

# Raise an exception
raise Exception("RayClusterAddCREvent wait() timeout")

class RayClusterDeleteCREvent(CREvent):
"""CREvent for RayCluster deletion"""
def exec(self):
client.CustomObjectsApi().delete_namespaced_custom_object(
group = 'ray.io', version = 'v1alpha1', namespace = self.namespace,
plural = 'rayclusters', name = self.custom_resource['metadata']['name'])
plural = 'rayclusters', name = self.custom_resource_object['metadata']['name'])

def wait(self):
start_time = time.time()
for _ in range(self.timeout):
headpods = client.CoreV1Api().list_namespaced_pod(
namespace = self.namespace, label_selector='rayNodeType=head')
namespace = self.namespace, label_selector='ray.io/node-type=head')
workerpods = client.CoreV1Api().list_namespaced_pod(
namespace = self.namespace, label_selector='rayNodeType=worker')
namespace = self.namespace, label_selector='ray.io/node-type=worker')
if (len(headpods.items) == 0 and len(workerpods.items) == 0):
logger.info("--- RayClusterDeleteCREvent %s seconds ---", time.time() - start_time)
return
Expand Down Expand Up @@ -265,8 +286,8 @@ def runtest(self):

def tearDown(self) -> None:
try:
delete_event = RayClusterDeleteCREvent(
self.cr_event.custom_resource, [], self.cr_event.timeout, self.cr_event.namespace)
delete_event = RayClusterDeleteCREvent(self.cr_event.custom_resource_object,
[], self.cr_event.timeout, self.cr_event.namespace)
delete_event.trigger()
except Exception as ex:
logger.error(str(ex))
Expand Down Expand Up @@ -321,5 +342,5 @@ def tearDown(self) -> None:
for new_cr in mut.mutate():
addEvent = RayClusterAddCREvent(new_cr, [rs], 90, NAMESPACE)
test_cases.addTest(GeneralTestCase('runtest', images, addEvent))
runner=unittest.TextTestRunner()
runner = unittest.TextTestRunner()
runner.run(test_cases)
Loading

0 comments on commit 8407bd3

Please sign in to comment.