In [2]:
!poetry add -D cupy

Using version [1m^11.1.0[0m for [36mcupy[0m

[34mUpdating dependencies[0m
[2K[34mResolving dependencies...[0m [39;2m(35.5s)[0m[34mResolving dependencies...[0m [39;2m(3.6s)[0m[34mResolving dependencies...[0m [39;2m(8.2s)[0m[34mResolving dependencies...[0m [39;2m(12.6s)[0m[34mResolving dependencies...[0m [39;2m(14.4s)[0m[34mResolving dependencies...[0m [39;2m(21.3s)[0m[34mResolving dependencies...[0m [39;2m(25.8s)[0m

[34mWriting lock file[0m

[1mPackage operations[0m: [34m2[0m installs, [34m0[0m updates, [34m0[0m removals

  [34;1m•[0m [39mInstalling [0m[36mfastrlock[0m[39m ([0m[39;1m0.8[0m[39m)[0m: [34mPending...[0m
[1A[0J  [34;1m•[0m [39mInstalling [0m[36mfastrlock[0m[39m ([0m[39;1m0.8[0m[39m)[0m: [34mDownloading...[0m [1m0%[0m
[1A[0J  [34;1m•[0m [39mInstalling [0m[36mfastrlock[0m[39m ([0m[39;1m0.8[0m[39m)[0m: [34mDownloading...[0m [1m100%[0m
[1A[0J  [34;1m•[0m [39mInstalling [0m[36mfa

In [3]:
# An example CUDA Array Interface implementation that wraps a pointer provided
# by cudaMalloc.

from numba import cuda
from ctypes import CDLL, POINTER, byref, c_void_p, c_size_t
import cupy as cp

In [4]:

class MyArray:
    """Minimal object exposing externally allocated device memory through
    the CUDA Array Interface.

    The instance only records the description of the buffer (shape, element
    type string and pointer); it does not allocate, copy or free anything.

    Parameters
    ----------
    shape : int or iterable of int
        Array dimensions. A bare ``int`` is treated as the length of a
        one-dimensional array; any other iterable is converted to a tuple.
    typestr : str
        Element type string reported under the ``'typestr'`` key.
    data
        Pointer value reported as the first item of the ``'data'`` entry.
    """

    def __init__(self, shape, typestr, data):
        if isinstance(shape, int):
            # Shorthand: a single integer describes a 1-D array.
            shape = (shape,)
        else:
            # The interface expects a tuple, so lists (or any other
            # iterable of dimensions) are normalised here instead of being
            # passed through verbatim.
            shape = tuple(shape)

        self._shape = shape
        self._data = data
        self._typestr = typestr

    @property
    def __cuda_array_interface__(self):
        """Return the interface dictionary describing the wrapped buffer.

        The ``'data'`` entry is a ``(pointer, read_only)`` pair; the second
        item is always ``False``, i.e. the buffer is advertised as writable.
        """
        return {
            'shape': self._shape,
            'typestr': self._typestr,
            'data': (self._data, False),
            'version': 2
        }


In [5]:


# Use ctypes to get the cudaMalloc function from Python
cudart = CDLL('libcudart.so')
cudaMalloc = cudart.cudaMalloc
cudaMalloc.argtypes = [POINTER(c_void_p), c_size_t]

# Allocate some Numba-external memory with cudaMalloc
ptr = c_void_p()
float32_size = 4
nelems = 32
alloc_size = float32_size * nelems

# cudaMalloc reports success or failure through its integer return value
# (0 means success). Check it so that a failed allocation is reported here
# instead of a null pointer being wrapped and handed to a kernel later.
status = cudaMalloc(byref(ptr), alloc_size)
if status != 0:
    raise RuntimeError(
        f"cudaMalloc of {alloc_size} bytes failed with CUDA error code {status}"
    )

# Wrap our memory in a CUDA Array Interface object
arr = MyArray(nelems, 'f4', ptr.value)


In [7]:
# Call a kernel on our object wrapping the pointer

@cuda.jit
def initialize(x):
    """Kernel: store ``3.14 + index`` into every element of ``x``.

    Each thread handles the single element whose index equals the thread's
    one-dimensional grid position.
    """
    idx = cuda.grid(1)
    # Threads positioned past the end of the array have nothing to write.
    if idx >= len(x):
        return
    x[idx] = 3.14 + idx


# Derive the launch configuration from the element count instead of putting
# every element in one block: a single block cannot hold more threads than
# the device's per-block limit, so a fixed [1, nelems] launch stops working
# for larger arrays. Ceiling division guarantees at least one thread per
# element; the bounds check inside the kernel covers any surplus threads.
threads_per_block = 32
blocks_per_grid = (nelems + threads_per_block - 1) // threads_per_block
initialize[blocks_per_grid, threads_per_block](arr)

# Kernel launches are asynchronous, and version 2 of the interface carries
# no stream information, so wait for the kernel explicitly before another
# library reads the memory.
cuda.synchronize()

# Use CuPy for a convenient way to print our data to show that the kernel
# initialized it
print(cp.asarray(arr))


[ 3.14  4.14  5.14  6.14  7.14  8.14  9.14 10.14 11.14 12.14 13.14 14.14
 15.14 16.14 17.14 18.14 19.14 20.14 21.14 22.14 23.14 24.14 25.14 26.14
 27.14 28.14 29.14 30.14 31.14 32.14 33.14 34.14]


