# RVAI SDK - Ray runtime

In [None]:
!pip install -qqq rvai==1.1.0rc51 pygraphviz

## Creating the cells

For testing purposes we will create a dummy cell that takes an image as input, sleeps for a configurable amount of time to mimic computation, and produces a copy of the image as output.

In [None]:
import time
# import base classes
from dataclasses import dataclass
from typing import Type
from rvai.base.cell import Cell, cell
from rvai.base.data import (
    Annotations,
    Inputs,
    Outputs,
    Parameters,
    ProcessedParameters,
)
from rvai.base.context import InferenceContext
from rvai.types import Float, Image, Integer

### BenchmarkCell

In [None]:
# Inputs
@dataclass
class BenchmarkInputs(Inputs):
    """Inputs consumed by ``BenchmarkCell``: an image and a counter."""

    # Image handed to the cell; the cell works on a copy of it.
    image_in: Image = Inputs.field(
        name="Image", description="An input image."
    )
    # Counter value; the cell emits this value incremented by one.
    counter_in: Integer = Inputs.field(
        name="Counter", description="Counter input value."
    )

# Outputs
@dataclass
class BenchmarkOutputs(Outputs):
    """Outputs produced by ``BenchmarkCell``: an image and a counter."""

    # Copy of the input image as returned by the cell.
    # The description previously read "An input image." (copy-paste slip).
    image_out: Image = Outputs.field(
        name="Image", description="An output image."
    )
    # Input counter incremented by one.
    counter_out: Integer = Outputs.field(
        name="Counter", description="Counter output value."
    )

# Parameters
@dataclass
class BenchmarkParameters(Parameters):
    """Tunable parameters of ``BenchmarkCell``."""

    # Time in seconds the cell sleeps on every call to simulate work
    # (defaults to 0.1 s).
    delay: Float = Parameters.field(
        default=Float(0.1),
        name="Processing delay", description="Processing delay in seconds"
    )

# Cell
@cell
class BenchmarkCell(Cell):
    """Dummy cell that sleeps for a configurable delay and forwards its inputs.

    The image is copied and slightly modified, and the counter is
    incremented by one, so the number of cells a request passed through
    can be read from the final counter value.
    """

    @classmethod
    def call(
        cls,
        context: InferenceContext,
        parameters: BenchmarkParameters,
        inputs: BenchmarkInputs,
    ) -> BenchmarkOutputs:
        """Simulate one processing step.

        Args:
            context: Inference context (not used by this cell).
            parameters: Provides ``delay``, the sleep time in seconds.
            inputs: Provides ``image_in`` and ``counter_in``.

        Returns:
            A ``BenchmarkOutputs`` holding the modified image copy and
            ``counter_in + 1``.
        """
        # Sleep to mimic processing
        time.sleep(parameters.delay)

        # Work on a copy so the caller's image is left untouched.
        image_copy = inputs.image_in.copy()
        # Halve the values stored at index [0, 0, :] of the copy.
        image_copy[0, 0, :] = image_copy[0, 0, :] / 2

        image_out = Image(image_copy)
        counter_out = Integer(int(inputs.counter_in) + 1)
        return BenchmarkOutputs(image_out=image_out, counter_out=counter_out)


## Creating a pipeline

In [None]:
from rvai.base.pipeline import DeclarativePipeline, PipelineCells, pipeline

We create two types of pipelines, each with 4 benchmarking cells:
- SingleInput: the pipeline input image is connected to each of the 4 cells, counters are chained
- Chained: the pipeline input image is connected to the first cell, both image and counter are chained

In [None]:
class BenchmarkingPipelineCells(PipelineCells):
    """The four ``BenchmarkCell`` instances (a, b, c, d) shared by both pipelines."""

    a: BenchmarkCell
    b: BenchmarkCell  # given the bottleneck delay in `benchmark`
    c: BenchmarkCell
    d: BenchmarkCell

@pipeline
class SingleInputPipeline(DeclarativePipeline):
    """Pipeline in which every cell receives the pipeline's input image.

    Only the counter is chained a -> b -> c -> d; the pipeline output is
    the counter produced by cell ``d``.
    """

    cells = BenchmarkingPipelineCells
    inputs = {
        # The same pipeline image is fanned out to all four cells.
        "image": [cells.a.inputs.image_in, cells.b.inputs.image_in, cells.c.inputs.image_in, cells.d.inputs.image_in],
        # The pipeline counter enters at the first cell only.
        "counter": cells.a.inputs.counter_in,
    }
    outputs = {"counter": cells.d.outputs.counter_out}

    # Counter chain: each cell's counter output feeds the next cell.
    connections = [
        (cells.a.outputs.counter_out, cells.b.inputs.counter_in),
        (cells.b.outputs.counter_out, cells.c.inputs.counter_in),
        (cells.c.outputs.counter_out, cells.d.inputs.counter_in),
    ]

@pipeline
class ChainedPipeline(DeclarativePipeline):
    """Pipeline in which both image and counter flow a -> b -> c -> d.

    The pipeline inputs enter at cell ``a`` only; the pipeline output is
    the counter produced by cell ``d``.
    """

    cells = BenchmarkingPipelineCells
    inputs = {
        "image": cells.a.inputs.image_in,
        "counter": cells.a.inputs.counter_in,
    }
    outputs = {"counter": cells.d.outputs.counter_out}

    # Each cell passes both its counter and its image on to the next cell.
    connections = [
        (cells.a.outputs.counter_out, cells.b.inputs.counter_in),
        (cells.a.outputs.image_out, cells.b.inputs.image_in),
        (cells.b.outputs.counter_out, cells.c.inputs.counter_in),
        (cells.b.outputs.image_out, cells.c.inputs.image_in),
        (cells.c.outputs.counter_out, cells.d.inputs.counter_in),
        (cells.c.outputs.image_out, cells.d.inputs.image_in),
    ]

In [None]:
# Build the single-input pipeline; it is selectable for benchmarking further down.
singleinput_pipeline = SingleInputPipeline.build()
# %matplotlib inline
# Presumably renders the pipeline graph -- TODO confirm what show() displays.
singleinput_pipeline.show()

In [None]:
# Build the chained pipeline; it is selectable for benchmarking further down.
chained_pipeline = ChainedPipeline.build()
# %matplotlib inline
# Presumably renders the pipeline graph -- TODO confirm what show() displays.
chained_pipeline.show()

# Define benchmarking setup

## Benchmarking parameters

In [None]:
# Image shapes to benchmark, keyed by a display name. Each tuple is used as the
# `size` of the random uint8 arrays generated in `benchmark`
# (presumably height x width x channels -- TODO confirm).
image_sizes = {
    "dummy": (1, 1, 3),
    "480p": (480, 852, 3),
    "720p": (720, 1280, 3),
    "1080p": (1080, 1920, 3),
}
# (default_fps, bottleneck_fps) pairs. `benchmark` converts each rate to a
# per-call delay of 1/fps: cells a, c and d get the default rate, cell b the
# (slower) bottleneck rate.
pipeline_fps = [(20, 10), (40, 20), (100, 50), (200, 100)]

## Benchmarking function

In [None]:
import numpy as np
import timeit
from collections import defaultdict
from rvai.base.runtime import Inference
import matplotlib.pyplot as plt 

def benchmark_task(runtime, task, benchmark_images, replicas=None):
    """Measure throughput and mean request latency of one inference task.

    Args:
        runtime: Object exposing ``start_inference(task)``; the returned
            process must provide ``predict``, ``set_replicas`` and ``stop``.
        task: The inference task handed to ``runtime.start_inference``.
        benchmark_images: Non-empty sequence of images. One request per
            image is issued for the throughput measurement.
        replicas: Optional value forwarded to ``proc.set_replicas``; skipped
            when ``None``.

    Returns:
        ``(throughput, latency)`` where ``throughput`` is requests per second
        over the whole batch and ``latency`` is the mean time in seconds of a
        single blocking request.

    Raises:
        ValueError: If ``benchmark_images`` is empty.
        AssertionError: If a warm-up prediction does not return a counter
            of 4 (one increment per cell of the 4-cell pipelines).
    """
    n_iter = len(benchmark_images)
    if n_iter == 0:
        raise ValueError("benchmark_images must contain at least one image")

    def make_request(image):
        # Every request starts with a zero counter.
        return {"image": image, "counter": Integer(0)}

    # Start an inference process for the task
    proc = runtime.start_inference(task)
    try:
        # Scale if needed
        if replicas is not None:
            proc.set_replicas(replicas)

        # Do a couple of predictions for warm-up. The index wraps around so
        # that fewer than 5 images no longer raises an IndexError.
        for i in range(5):
            out = proc.predict(make_request(benchmark_images[i % n_iter])).result()
            # Check that output is valid
            assert int(out["counter"]) == 4, "unexpected counter in warm-up output"

        # Throughput: submit every request first, then wait for all of them
        # so the measurement does not rely on results completing in order.
        start = time.perf_counter()
        futures = [proc.predict(make_request(image)) for image in benchmark_images]
        for fut in futures:
            fut.result()
        elapsed = time.perf_counter() - start
        throughput = n_iter / elapsed if elapsed > 0 else float("inf")

        # Latency: time n_iter sequential blocking requests and average them.
        # The lambda is essential: passing `proc.predict(...).result` directly
        # would issue a single request and time repeated reads of one future.
        total_time = timeit.timeit(
            lambda: proc.predict(make_request(benchmark_images[0])).result(),
            number=n_iter,
        )
        latency = total_time / n_iter
    finally:
        # Stop the process even if a prediction or check failed
        proc.stop()

    return throughput, latency


def benchmark(runtime, pipeline, n_iter, replicas=None):
    """Benchmark a pipeline for every image size and FPS configuration.

    For each entry of the module-level ``image_sizes`` a batch of ``n_iter``
    random uint8 images is generated, and for each ``(default_fps,
    bottleneck_fps)`` pair of ``pipeline_fps`` an inference task is built in
    which cell ``b`` sleeps ``1/bottleneck_fps`` seconds and cells ``a``,
    ``c`` and ``d`` sleep ``1/default_fps`` seconds.

    Args:
        runtime: Runtime passed through to ``benchmark_task``.
        pipeline: Built pipeline to run.
        n_iter: Number of random images generated per image size.
        replicas: Optional replica configuration passed to ``benchmark_task``.

    Returns:
        Mapping ``img_name -> {(default_fps, bottleneck_fps): (throughput,
        latency)}``.
    """
    results = defaultdict(dict)
    for img_name, image_size in image_sizes.items():
        # Prepare data for testing. randint's upper bound is exclusive, so
        # 256 is needed to cover the full uint8 range 0..255.
        images = [
            Image(np.random.randint(256, size=image_size, dtype=np.uint8))
            for _ in range(n_iter)
        ]
        # Touching the key creates the entry even when pipeline_fps is empty.
        image_results = results[img_name]
        for default_fps, bottleneck_fps in pipeline_fps:
            default_delay = Float(1. / default_fps)
            bottleneck_delay = Float(1. / bottleneck_fps)
            # Construct inference task: cell b is the bottleneck
            inference = Inference(pipeline=pipeline, parameters={
                "a": BenchmarkParameters(delay=default_delay),
                "b": BenchmarkParameters(delay=bottleneck_delay),
                "c": BenchmarkParameters(delay=Float(1. / default_fps)),
                "d": BenchmarkParameters(delay=Float(1. / default_fps)),
            })
            # Perform benchmarking
            image_results[(default_fps, bottleneck_fps)] = benchmark_task(
                runtime, inference, images, replicas=replicas
            )
    return results
            

def print_results(results):
    """Print throughput and latency of a single benchmark run.

    Args:
        results: Mapping ``img_name -> {(default_fps, bottleneck_fps):
            (throughput, latency)}`` as returned by ``benchmark``. Every
            ``img_name`` must be a key of the module-level ``image_sizes``.

    Each measured value is printed next to its theoretical reference: the
    sequential and bottleneck FPS for throughput, and the sum of the four
    cell delays for latency.
    """
    for img_name, img_results in results.items():
        # Look the shape up in image_sizes; the previous code read an
        # undefined global `images` and failed with a NameError.
        image_size = image_sizes[img_name]
        dims = "x".join(str(dim) for dim in image_size)
        print(f'Results for {img_name} image ({dims})')
        for (default_fps, bottleneck_fps), (throughput, latency) in img_results.items():
            # Calculate theoretical performance: three default cells plus
            # one bottleneck cell executed back to back.
            theoretical_latency = (3./default_fps + 1./bottleneck_fps)
            sequence_fps = 1./theoretical_latency

            print(f'  * Default cell {default_fps:.2f}FPS - bottleneck {bottleneck_fps:.2f}FPS')
            print(f'    - Throughput {throughput:.2f}FPS ({sequence_fps:.2f}FPS - {bottleneck_fps:.2f}FPS)')
            print(f'    - Latency {latency:.3f}s ({theoretical_latency:.3f}s)')
            
            
def print_combined(debug_results, ray_results, scaled_results):
    """Print and plot the results of the three benchmark runs side by side.

    Args:
        debug_results: Results of the debug runtime, shaped
            ``img_name -> {(default_fps, bottleneck_fps): (throughput,
            latency)}``. Its keys drive the iteration.
        ray_results: Results of the Ray runtime with the same keys.
        scaled_results: Results of the Ray runtime with the bottleneck
            scaled, with the same keys.

    For every image size a text report is printed and one subplot is drawn
    showing measured throughput against the theoretical bottleneck FPS. The
    subplot grid has one column per image size instead of a fixed four.
    """
    n_images = len(debug_results)
    plt.figure(figsize=(20, 5))
    for plot_index, img_name in enumerate(debug_results, start=1):
        image_size = image_sizes[img_name]
        print(f'Results for {img_name} image ({"x".join([str(i) for i in image_size])})')

        # Series for this image's subplot, one point per FPS configuration.
        theoretical_fps = []
        debug_fps = []
        ray_fps = []
        scaled_fps = []
        for config in debug_results[img_name]:
            default_fps, bottleneck_fps = config
            # Calculate theoretical performance
            theoretical_latency = (3./default_fps + 1./bottleneck_fps)
            sequence_fps = 1./theoretical_latency

            # Get results
            debug_throughput, debug_latency = debug_results[img_name][config]
            ray_throughput, ray_latency = ray_results[img_name][config]
            scaled_throughput, scaled_latency = scaled_results[img_name][config]

            print(f'  * Default cell {default_fps:.2f}FPS - bottleneck {bottleneck_fps:.2f}FPS')
            print('    - Throughput:')
            print(f'      - Theoretical         {bottleneck_fps:03.2f}FPS ({sequence_fps:03.2f}FPS sequential)')
            print(f'      - Debug runtime       {debug_throughput:03.2f}FPS')
            print(f'      - Ray runtime         {ray_throughput:03.2f}FPS')
            print(f'      - Scaled bottleneck   {scaled_throughput:03.2f}FPS')
            print('    - Latency:')
            print(f'      - Theoretical         {theoretical_latency:.3f}s')
            print(f'      - Debug runtime       {debug_latency:.3f}s')
            print(f'      - Ray runtime         {ray_latency:.3f}s')
            print(f'      - Scaled bottleneck   {scaled_latency:.3f}s')
            theoretical_fps.append(bottleneck_fps)
            debug_fps.append(debug_throughput)
            ray_fps.append(ray_throughput)
            scaled_fps.append(scaled_throughput)

        # One subplot per image size, laid out on a single row
        plt.subplot(1, n_images, plot_index)
        plt.plot(theoretical_fps, theoretical_fps, label='Theoretic', linestyle='dotted')
        plt.plot(theoretical_fps, debug_fps, label='Debug')
        plt.plot(theoretical_fps, ray_fps, label='Ray')
        plt.plot(theoretical_fps, scaled_fps, label='Ray Scaled')

        # naming the x axis
        plt.xlabel('Theoretical Bottleneck FPS')
        # naming the y axis
        plt.ylabel('Experimental Pipeline FPS')

        # giving a title to the graph
        plt.title(f'Runtime Benchmarks FPS - {img_name}')
        plt.legend()
        plt.grid(True)

    # function to show the plot
    plt.show()

# Run benchmarking

In [None]:
from rvai.base.runtime import init

# Benchmark size: number of random images generated per image size
n_loops = 10

# Choose which pipeline to benchmark: True -> chained, False -> single input
chained = True
# NOTE(review): this assignment rebinds `pipeline`, which until now referred
# to the decorator imported from rvai.base.pipeline; re-running the cell that
# defines the @pipeline classes after this one will fail.
pipeline = chained_pipeline if chained else singleinput_pipeline

## Debug runtime benchmarking

In [None]:
%%capture
# init debug runtime
debug_rt = init("debug")

try:
    # Perform benchmarking
    debug_results = benchmark(debug_rt, pipeline, n_loops)
finally:
    # Stop the runtime, even when benchmarking raised
    debug_rt.stop()

## Ray runtime benchmarking

In [None]:
#%%capture
# init ray runtime
ray_rt = init("ray")

try:
    # Perform benchmarking
    ray_results = benchmark(ray_rt, pipeline, n_loops)
finally:
    # Stop the runtime, even when benchmarking raised
    ray_rt.stop()

## Scaled bottleneck Ray runtime benchmarking

In [None]:
%%capture
# shutdown ray to be sure no earlier Ray instance is still running
import ray
ray.shutdown()

# init ray runtime
ray_rt = init("ray")

# scale bottleneck to 2
replicas = {'b': 2}  # cell b is the bottleneck

try:
    # Perform benchmarking
    scaled_results = benchmark(ray_rt, pipeline, n_loops, replicas=replicas)
finally:
    # Stop the runtime, even when benchmarking raised
    ray_rt.stop()

## Print benchmarking results

In [None]:
# Print the per-configuration report and plot throughput for the debug,
# Ray and scaled-Ray runs.
print_combined(debug_results, ray_results, scaled_results)