Copyright (c) Microsoft Corporation.
Licensed under the MIT license.

The following example demonstrates how to initialize the MSCCL++ library and perform the necessary setup for communicating from GPU kernels. First, we define a function for registering memory, making connections, and creating channels.

In [2]:
import mscclpp

def setup_channels(comm, memory, proxy_service):
    """Register ``memory``, connect to every peer and build one proxy channel per peer.

    Args:
        comm: Communicator used to register memory, open connections and
            exchange registered-memory handles with the other ranks.
        memory: Buffer exposing ``data.ptr`` and ``nbytes`` (a CuPy array in
            this example).
        proxy_service: Proxy service that supplies the semaphores, memory IDs
            and channel handles the proxy channels are built from.

    Returns:
        A list of ``mscclpp.SimpleProxyChannel`` objects, one per remote rank,
        ordered by ascending remote rank.
    """
    transport = mscclpp.Transport.CudaIpc
    my_rank = comm.bootstrap.rank

    # Make the local buffer known to the communicator
    local_mem = comm.register_memory(memory.data.ptr, memory.nbytes, transport)

    # For every rank except our own: open a connection, send our registered
    # memory and request theirs.
    pending_connections = []
    pending_memories = []
    for peer in (r for r in range(comm.bootstrap.size) if r != my_rank):
        pending_connections.append(comm.connect(peer, 0, transport))
        comm.send_memory(local_mem, peer, 0)
        pending_memories.append(comm.recv_memory(peer, 0))

    # connect() and recv_memory() hand back futures; resolve all connections
    # first, then all remote memories.
    established = [future.get() for future in pending_connections]
    peer_memories = [future.get() for future in pending_memories]

    # Both lists were filled in the same loop, so they pair up one-to-one.
    channels = []
    for conn, peer_mem in zip(established, peer_memories):
        semaphore_id = proxy_service.build_and_add_semaphore(comm, conn)
        channels.append(
            mscclpp.SimpleProxyChannel(
                proxy_service.proxy_channel(semaphore_id),
                proxy_service.add_memory(peer_mem),
                proxy_service.add_memory(local_mem),
            )
        )

    return channels

Now we are ready to write the top-level code for each rank.

In [3]:
import cupy as cp

def run(rank, world_size, if_ip_port_trio):
    """Run the per-rank part of the example.

    Selects a GPU, allocates a buffer, bootstraps a communicator, builds the
    proxy channels and runs the proxy service around the (placeholder) kernel
    section.

    Args:
        rank: Rank of this process; also used as the CUDA device ordinal.
        world_size: Total number of ranks taking part.
        if_ip_port_trio: Interface/IP/port trio of the root rank, handed to
            the bootstrapper's ``initialize``.
    """

    def _log(message):
        # Only rank 0 reports progress so the output is not duplicated per rank
        if rank == 0:
            print(message)

    # Use the right GPU for this rank
    cp.cuda.Device(rank).use()

    # Allocate memory on the GPU
    memory = cp.zeros(1024, dtype=cp.int32)

    # Initialize a bootstrapper using a known interface/IP/port trio for the root rank
    boot = mscclpp.TcpBootstrap.create(rank, world_size)
    boot.initialize(if_ip_port_trio)

    # Create a communicator for the processes in the bootstrapper
    comm = mscclpp.Communicator(boot)

    # Create a proxy service, which enables GPU kernels to use connections
    proxy_service = mscclpp.ProxyService()

    _log("Setting up channels")
    proxy_channels = setup_channels(comm, memory, proxy_service)

    _log("Starting proxy service")
    proxy_service.start_proxy()
    try:
        # This is where we could launch a GPU kernel that uses proxy_channels[i].device_handle
        # to initiate communication. See include/mscclpp/proxy_channel_device.hpp for details.
        _log("GPU kernels that use the proxy go here.")
    finally:
        # Stop the proxy even if the kernel section raises, so a started
        # proxy service is always explicitly stopped.
        _log("Stopping proxy service")
        proxy_service.stop_proxy()

Finally, to test the code we can run each process using the `multiprocessing` package.

In [4]:
import multiprocessing as mp

world_size = 2

# One OS process per rank; every rank is given the same root-rank
# interface/IP/port trio.
processes = [
    mp.Process(target=run, args=(rank, world_size, "eth0:localhost:50051"))
    for rank in range(world_size)
]

# Start every rank before waiting on any of them: the ranks connect to each
# other during setup, so they need to be running at the same time.
for p in processes:
    p.start()

# Block until all ranks have finished.
for p in processes:
    p.join()

Setting up channels
Starting proxy service
GPU kernels that use the proxy go here.
Stopping proxy service

Starting proxy service
GPU kernels that use the proxy go here.
Stopping proxy service
