In [1]:
import os
import random
import sys
import time

import numpy as np
import ray

In [2]:
large_matrix = np.random.rand(1024, 1024, 1024//8) # approx. 1 GB
size_in_bytes = sys.getsizeof(large_matrix)

print(f"large_matrix has: {size_in_bytes/1024/1024/1024:.2f} GB")

large_matrix has: 1.00 GB


In [3]:
obj_ref = ray.put(large_matrix)
print(f"Object reference: {obj_ref}")

2025-07-19 11:14:21,259	INFO worker.py:1879 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8267 [39m[22m


Object reference: ObjectRef(00ffffffffffffffffffffffffffffffffffffff0100000001e1f505)


[36m(mm pid=378546)[0m Took 4.094043016433716 s
[36m(mm pid=378700)[0m Took 2.7503392696380615 s


[33m(raylet)[0m [2025-07-19 11:31:21,270 E 359872 359872] (raylet) node_manager.cc:3287: 2 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: fc72b861b9298a6fd07f7354e2553dc2347052d8f00296b430fd6bc1, IP: 172.16.0.2) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 172.16.0.2`
[33m(raylet)[0m 
[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.
[33m(raylet)[0m [2025-07-19 11:33:21,273 E 359872 359872] (raylet) node_manager.cc:3287: 1

In [7]:
large_mat_from_object_store = ray.get(obj_ref)
np.array_equal(large_mat_from_object_store, large_matrix)


True

In [6]:
large_mat_from_object_store is large_matrix


False

In [8]:
@ray.remote
def compute(x, y):
    return int(np.matmul(x,y).sum())

mat1_ref = ray.put(np.random.rand(32, 32))
mat2_ref = ray.put(np.random.rand(32, 32))

collection = []
for _ in range(10):
    collection.append(compute.remote(mat1_ref, mat2_ref))

results = ray.get(collection)
results

[8285, 8285, 8285, 8285, 8285, 8285, 8285, 8285, 8285, 8285]

### Chaining Tasks

In [10]:
@ray.remote
def remote_add(a, b):
    return a + b

@ray.remote
def expensive_square(x):
    time.sleep(5)
    return x**2

# Chaining tasks
square_ref = expensive_square.remote(10)
squared_result = ray.get(square_ref)

# Chaining tasks with remote_add
sum_ref = remote_add.remote(1, squared_result)
final_result = ray.get(sum_ref)

sum_value = ray.get(sum_ref)
print(f"Final result: {final_result}, Sum value: {sum_value}")

Final result: 101, Sum value: 101


### Task retries -> system errors, application-level error

In [11]:
@ray.remote
def incorrect_square(x: int, probability: float) -> int:
    if random.random() < probability:
        raise ValueError("Simulated error")
    return x ** 2

try:
    ray.get([incorrect_square.remote(x=4, probability=0.5) for _ in range(10)])
except Exception as e:
    print("At least one of the tasks failed", flush=True)

At least one of the tasks failed


2025-07-19 11:24:20,742	ERROR worker.py:421 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): [36mray::incorrect_square()[39m (pid=359934, ip=172.16.0.2)
  File "/tmp/ipykernel_348556/3426736041.py", line 4, in incorrect_square
ValueError: Simulated error
2025-07-19 11:24:20,743	ERROR worker.py:421 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): [36mray::incorrect_square()[39m (pid=359929, ip=172.16.0.2)
  File "/tmp/ipykernel_348556/3426736041.py", line 4, in incorrect_square
ValueError: Simulated error
2025-07-19 11:24:20,744	ERROR worker.py:421 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): [36mray::incorrect_square()[39m (pid=359933, ip=172.16.0.2)
  File "/tmp/ipykernel_348556/3426736041.py", line 4, in incorrect_square
ValueError: Simulated error
2025-07-19 11:24:20,747	ERROR worker.py:421 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): [36mray::incorrect_square()[39m (pid=359930, ip=172.16.0.2

2025-07-19 11:24:20,749	ERROR worker.py:421 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): [36mray::incorrect_square()[39m (pid=359935, ip=172.16.0.2)
  File "/tmp/ipykernel_348556/3426736041.py", line 4, in incorrect_square
ValueError: Simulated error


### Runtime-Environments

In [12]:
@ray.remote(runtime_env={"env_vars": {"MY_CUSTOM_ENV": "production"}})
def f():
    env = os.environ["MY_CUSTOM_ENV"]
    return f"My custom environment variable is set to: {env}"

ray.get(f.remote())

'My custom environment variable is set to: production'

### Resource Allocation & Management

In [13]:
@ray.remote(num_cpus=1)
def add_random_function(x, y):
    time.sleep(1)
    return x + y

@ray.remote(num_cpus=1)
def mm(n: int = 4000):
    A = np.random.rand(n, n)
    B = np.random.rand(n, n)

    start = time.time()
    C = np.matmul(A, B)
    end = time.time()
    print(f"Took {end - start} s", flush=True)

ray.get(mm.options(runtime_env={"env_vars": {"OMP_NUM_THREADS": "1"}}).remote())
ray.get(mm.options(runtime_env={"env_vars": {"OMP_NUM_THREADS": "8"}}).remote())

In [18]:
print(ray.available_resources())
print(ray.cluster_resources())

@ray.remote(num_cpus=1, num_gpus=1)
def gpu_task():
    import torch
    return torch.rand(100, 100).cuda()
gpu_task_ref = gpu_task.options(num_gpus=1).remote()
gpu_task_result = ray.get(gpu_task_ref)
print(f"GPU task result shape: {gpu_task_result.shape}")


@ray.remote(num_cpus=0.5)
def cpu_task():
    import time
    time.sleep(1)
    return "CPU task completed"

cpu_task_ref = cpu_task.options(num_cpus=0.5).remote()
cpu_task_result = ray.get(cpu_task_ref)
print(f"CPU task result: {cpu_task_result}")

@ray.remote(num_cpus=0.5)
def remote_add(a, b):
    return a + b
ref = remote_add.remote(3, 2)
ref
result = ray.get(ref)
print(f"Result of remote_add: {result}")

@ray.remote
def main():
    square_ref = expensive_square.remote(3)
    square_ref_2 = expensive_square.remote(4)
    add_ref = remote_add.remote(square_ref, square_ref_2)
    return ray.get(add_ref)
result = ray.get(main.remote())
print(f"Result of main: {result}")

{'CPU': 8.0, 'node:__internal_head__': 1.0, 'accelerator_type:G': 1.0, 'GPU': 1.0, 'memory': 3136458752.0, 'node:172.16.0.2': 1.0, 'object_store_memory': 270437636.0}
{'CPU': 8.0, 'node:__internal_head__': 1.0, 'accelerator_type:G': 1.0, 'GPU': 1.0, 'memory': 3136458752.0, 'node:172.16.0.2': 1.0, 'object_store_memory': 1344196608.0}
GPU task result shape: torch.Size([100, 100])
CPU task result: CPU task completed
Result of remote_add: 5
Result of main: 25


### Ray Actors

In [24]:
@ray.remote
class Accounting:
    def __init__(self):
        self.total = 0
    
    def add(self, amount):
        self.total += amount
        
    def remove(self, amount):
        self.total -= amount
        
    def total(self):
        return self.total

acc = Accounting.remote()
acc.total.remote()
ray.get(acc.add.remote(100))
acc.add.remote(100)
acc.remove.remote(10)
ray.get(acc.total.remote())

190

In [25]:
@ray.remote
class LinearModel:
    def __init__(self, w0, w1):
        self.w0 = w0
        self.w1 = w1

    def convert(self, celsius):
        return self.w1 * celsius + self.w0

model = LinearModel.remote(w1=9/5, w0=32)
ray.get(model.convert.remote(100))

212.0