# Example of using Triton Server Wrapper in Jupyter notebook

## Triton server setup with custom model

Install dependencies

In [1]:
import sys
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install cupy-cuda115 --extra-index-url=https://pypi.ngc.nvidia.com
!pip install -U nvidia-pytriton

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting cupy-cuda115
  Using cached cupy_cuda115-10.6.0-cp38-cp38-manylinux1_x86_64.whl (83.3 MB)
Collecting fastrlock>=0.5 (from cupy-cuda115)
  Using cached fastrlock-0.8.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_24_x86_64.whl (48 kB)
Installing collected packages: fastrlock, cupy-cuda115
Successfully installed cupy-cuda115-10.6.0 fastrlock-0.8.1
Collecting nvidia-pytriton
  Using cached nvidia_pytriton-0.1.4-py3-none-manylinux_2_31_x86_64.whl (36.7 MB)
Collecting pyzmq~=23.0 (from nvidia-pytriton)
  Downloading pyzmq-23.2.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hCollecting sh~=1.14 (from nvidia-pytriton)
  Using cached sh-1.14.3.tar.gz (62 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting tritonclient[al

Required imports:

In [2]:
import numpy as np

from pytriton.model_config import ModelConfig, Tensor
from pytriton.triton import Triton

Define inference callable:

In [3]:
def _add_sub(**inputs):
    a_batch, b_batch = inputs.values()
    add_batch = a_batch + b_batch
    sub_batch = a_batch - b_batch
    return {"add": add_batch, "sub": sub_batch}

Instantiate titon wrapper class and load model with defined callable:

In [4]:
triton = Triton()

In [5]:
triton.bind(
        model_name="AddSub",
        infer_func=_add_sub,
        inputs=[
            Tensor(dtype=np.float32, shape=(-1,)),
            Tensor(dtype=np.float32, shape=(-1,)),
        ],
        outputs=[
            Tensor(name="add", dtype=np.float32, shape=(-1,)),
            Tensor(name="sub", dtype=np.float32, shape=(-1,)),
        ],
        config=ModelConfig(max_batch_size=128),
    )

Run triton server with defined model inference callable

In [6]:
triton.run()

I0511 08:33:07.682040 36642 pinned_memory_manager.cc:240] Pinned memory pool is created at '0x7f41f4000000' with size 268435456
I0511 08:33:07.682708 36642 cuda_memory_manager.cc:105] CUDA memory pool is created on device 0 with size 67108864
I0511 08:33:07.687473 36642 model_lifecycle.cc:459] loading: AddSub:1
I0511 08:33:09.017897 36642 python_be.cc:1856] TRITONBACKEND_ModelInstanceInitialize: AddSub_0 (CPU device 0)
I0511 08:33:09.210668 36642 model_lifecycle.cc:694] successfully loaded 'AddSub' version 1
I0511 08:33:09.210796 36642 server.cc:563] 
+------------------+------+
| Repository Agent | Path |
+------------------+------+
+------------------+------+

I0511 08:33:09.210875 36642 server.cc:590] 
+---------+---------------------------------+---------------------------------+
| Backend | Path                            | Config                          |
+---------+---------------------------------+---------------------------------+
| python  | /home/Startupcolors/doze/wound/ |

## Example inference performed with ModelClient calling triton server

In [7]:
from pytriton.client import ModelClient
batch_size = 2
a_batch = np.ones((batch_size, 1), dtype=np.float32)
b_batch = np.ones((batch_size, 1), dtype=np.float32)

In [8]:
with ModelClient("localhost", "AddSub") as client:
    result_batch = client.infer_batch(a_batch, b_batch)

for output_name, data_batch in result_batch.items():
    print(f"{output_name}: {data_batch.tolist()}")

Error occurred during calling model callable: Traceback (most recent call last):
  File "/home/Startupcolors/doze/wound/notebooks/.venv/lib/python3.8/site-packages/pytriton/proxy/inference_handler.py", line 107, in run
    outputs = self._model_callable(inputs)
TypeError: _add_sub() takes 0 positional arguments but 1 was given



PyTritonClientInferenceServerError: Error occurred on Triton Inference Server side:
 Failed to process the request(s) for model instance 'AddSub_0', message: TritonModelException: Traceback (most recent call last):
  File "/home/Startupcolors/doze/wound/notebooks/.venv/lib/python3.8/site-packages/pytriton/proxy/inference_handler.py", line 107, in run
    outputs = self._model_callable(inputs)
TypeError: _add_sub() takes 0 positional arguments but 1 was given


At:
  /home/Startupcolors/.cache/pytriton/workspace_yv9yvp7s/model-store/AddSub/1/model.py(192): _exec_requests
  /home/Startupcolors/.cache/pytriton/workspace_yv9yvp7s/model-store/AddSub/1/model.py(110): execute




## Re-setup triton server with modified inference callable

Stop triton server

In [None]:
triton.stop()

Redefine inference callable

In [None]:
def _add_sub(**inputs):
    a_batch, b_batch = inputs.values()
    add_batch = (a_batch + b_batch) * 2
    sub_batch = (a_batch - b_batch) * 3
    return {"add": add_batch, "sub": sub_batch}

Load model again

In [None]:
triton.bind(
        model_name="AddSub",
        infer_func=_add_sub,
        inputs=[
            Tensor(dtype=np.float32, shape=(-1,)),
            Tensor(dtype=np.float32, shape=(-1,)),
        ],
        outputs=[
            Tensor(name="add", dtype=np.float32, shape=(-1,)),
            Tensor(name="sub", dtype=np.float32, shape=(-1,)),
        ],
        config=ModelConfig(max_batch_size=128),
    )

Run triton server with new model inference callable

In [None]:
triton.run()

## The same inference performed with modified inference callable

In [None]:
with ModelClient("localhost", "AddSub") as client:
    result_batch = client.infer_batch(a_batch, b_batch)

for output_name, data_batch in result_batch.items():
    print(f"{output_name}: {data_batch.tolist()}")

Stop server at the end

In [None]:
triton.stop()