diff --git a/test/arrayimpl.py b/test/arrayimpl.py index 5df8cd9f9..84ebf7b05 100644 --- a/test/arrayimpl.py +++ b/test/arrayimpl.py @@ -240,6 +240,47 @@ def size(self): return self.array.size +try: + import dlpackimpl as dlpack +except ImportError: + dlpack = None + +class BaseDLPackCPU(object): + + def __dlpack_device__(self): + return (dlpack.DLDeviceType.kDLCPU, 0) + + def __dlpack__(self, stream=None): + assert stream is None + capsule = dlpack.make_py_capsule(self.array) + return capsule + + def as_raw(self): + return self + + +if dlpack is not None and array is not None: + + @add_backend + class DLPackArray(BaseDLPackCPU, ArrayArray): + + backend = 'dlpack-array' + + def __init__(self, arg, typecode, shape=None): + super(DLPackArray, self).__init__(arg, typecode, shape) + + +if dlpack is not None and numpy is not None: + + @add_backend + class DLPackNumPy(BaseDLPackCPU, ArrayNumPy): + + backend = 'dlpack-numpy' + + def __init__(self, arg, typecode, shape=None): + super(DLPackNumPy, self).__init__(arg, typecode, shape) + + def typestr(typecode, itemsize): typestr = '' if sys.byteorder == 'little': diff --git a/test/dlpackimpl.py b/test/dlpackimpl.py new file mode 100644 index 000000000..9279e1261 --- /dev/null +++ b/test/dlpackimpl.py @@ -0,0 +1,230 @@ +import sys +import ctypes +try: + from enum import IntEnum +except ImportError: + IntEnum = object +if hasattr(sys, 'pypy_version_info'): + raise ImportError("unsupported on PyPy") + +class DLDeviceType(IntEnum): + kDLCPU = 1 + kDLCUDA = 2 + kDLCUDAHost = 3 + kDLOpenCL = 4 + kDLVulkan = 7 + kDLMetal = 8 + kDLVPI = 9 + kDLROCM = 10 + kDLROCMHost = 11 + kDLExtDev = 12 + kDLCUDAManaged = 13 + +class DLDevice(ctypes.Structure): + _fields_ = [ + ("device_type", ctypes.c_uint), + ("device_id", ctypes.c_int), + ] + +class DLDataTypeCode(IntEnum): + kDLInt = 0 + kDLUInt = 1 + kDLFloat = 2 + kDLOpaqueHandle = 3 + kDLBfloat = 4 + kDLComplex = 5 + +class DLDataType(ctypes.Structure): + _fields_ = [ + ("code", 
ctypes.c_uint8), + ("bits", ctypes.c_uint8), + ("lanes", ctypes.c_uint16), + ] + +class DLTensor(ctypes.Structure): + _fields_ = [ + ("data", ctypes.c_void_p), + ("device", DLDevice), + ("ndim", ctypes.c_int), + ("dtype", DLDataType), + ("shape", ctypes.POINTER(ctypes.c_int64)), + ("strides", ctypes.POINTER(ctypes.c_int64)), + ("byte_offset", ctypes.c_uint64), + ] + +DLManagedTensorDeleter = ctypes.CFUNCTYPE(None, ctypes.c_void_p) + +class DLManagedTensor(ctypes.Structure): + _fields_ = [ + ("dl_tensor", DLTensor), + ("manager_ctx", ctypes.c_void_p), + ("deleter", DLManagedTensorDeleter), +] + +pyapi = ctypes.pythonapi + +DLManagedTensor_p = ctypes.POINTER(DLManagedTensor) + +Py_IncRef = pyapi.Py_IncRef +Py_IncRef.restype = None +Py_IncRef.argtypes = [ctypes.py_object] + +Py_DecRef = pyapi.Py_DecRef +Py_DecRef.restype = None +Py_DecRef.argtypes = [ctypes.py_object] + +PyCapsule_Destructor = ctypes.CFUNCTYPE(None, ctypes.c_void_p) + +PyCapsule_New = pyapi.PyCapsule_New +PyCapsule_New.restype = ctypes.py_object +PyCapsule_New.argtypes = [ctypes.c_void_p, ctypes.c_char_p, PyCapsule_Destructor] + +PyCapsule_IsValid = pyapi.PyCapsule_IsValid +PyCapsule_IsValid.restype = ctypes.c_int +PyCapsule_IsValid.argtypes = [ctypes.py_object, ctypes.c_char_p] + +PyCapsule_GetPointer = pyapi.PyCapsule_GetPointer +PyCapsule_GetPointer.restype = ctypes.c_void_p +PyCapsule_GetPointer.argtypes = [ctypes.py_object, ctypes.c_char_p] + +PyCapsule_SetContext = pyapi.PyCapsule_SetContext +PyCapsule_SetContext.restype = ctypes.c_int +PyCapsule_SetContext.argtypes = [ctypes.py_object, ctypes.c_void_p] + +PyCapsule_GetContext = pyapi.PyCapsule_GetContext +PyCapsule_GetContext.restype = ctypes.c_void_p +PyCapsule_GetContext.argtypes = [ctypes.py_object] + + +def make_dl_datatype(typecode, itemsize): + code = None + bits = itemsize * 8 + lanes = 1 + if typecode in "bhilqnp": + code = DLDataTypeCode.kDLInt + if typecode in "BHILQNP": + code = DLDataTypeCode.kDLUInt + if typecode in "efdg": + code = 
DLDataTypeCode.kDLFloat + if typecode in "FDG": + code = DLDataTypeCode.kDLComplex + if typecode == "G" and itemsize == 32: + code = DLDataTypeCode.kDLFloat + bits //= 2 + lanes *= 2 + datatype = DLDataType() + datatype.code = code + datatype.bits = bits + datatype.lanes = lanes + return datatype + + +def make_dl_shape(shape, order=None, strides=None): + null = ctypes.cast(0, ctypes.POINTER(ctypes.c_int64)) + if isinstance(shape, int): + shape = [shape] + ndim = len(shape) + if ndim == 0: + shape = null + strides = null + else: + shape = (ctypes.c_int64*ndim)(*shape) + if order == 'C': + size = 1 + strides = [] + for i in range(ndim-1, -1, -1): + strides.append(size) + size *= shape[i] + strides = (ctypes.c_int64*ndim)(*strides) + elif order == 'F': + size = 1 + strides = [] + for i in range(ndim): + strides.append(size) + size *= shape[i] + strides = (ctypes.c_int64*ndim)(*strides) + elif strides is not None: + strides = (ctypes.c_int64*ndim)(*strides) + else: + strides = null + return ndim, shape, strides + + +def make_dl_tensor(obj): + try: + data, size = obj.buffer_info() + typecode = obj.typecode + itemsize = obj.itemsize + except AttributeError: + data = obj.ctypes.data + size = obj.size + typecode = obj.dtype.char + itemsize = obj.itemsize + + device = DLDevice(DLDeviceType.kDLCPU, 0) + datatype = make_dl_datatype(typecode, itemsize) + ndim, shape, strides = make_dl_shape(size) + + dltensor = DLTensor() + dltensor.data = data if size > 0 else 0 + dltensor.device = device + dltensor.ndim = ndim + dltensor.dtype = datatype + dltensor.shape = shape + dltensor.strides = strides + dltensor.byte_offset = 0 + return dltensor + + +def make_dl_manager_ctx(obj): + py_obj = ctypes.py_object(obj) + if False: Py_IncRef(py_obj) + void_p = ctypes.c_void_p.from_buffer(py_obj) + return void_p + + +@DLManagedTensorDeleter +def dl_managed_tensor_deleter(void_p): + managed = ctypes.cast(void_p, DLManagedTensor_p) + manager_ctx = managed.contents.manager_ctx + py_obj = 
ctypes.cast(manager_ctx, ctypes.py_object) + if False: Py_DecRef(py_obj) + + +def make_dl_managed_tensor(obj): + managed = DLManagedTensor() + managed.dl_tensor = make_dl_tensor(obj) + managed.manager_ctx = make_dl_manager_ctx(obj) + managed.deleter = dl_managed_tensor_deleter + return managed + + +def make_py_context(context): + py_obj = ctypes.py_object(context) + Py_IncRef(py_obj) + context = ctypes.c_void_p.from_buffer(py_obj) + return ctypes.c_void_p(context.value) + + +@PyCapsule_Destructor +def py_capsule_destructor(void_p): + capsule = ctypes.cast(void_p, ctypes.py_object) + if PyCapsule_IsValid(capsule, b"dltensor"): + pointer = PyCapsule_GetPointer(capsule, b"dltensor") + managed = ctypes.cast(pointer, DLManagedTensor_p) + deleter = managed.contents.deleter + if deleter: + deleter(managed) + context = PyCapsule_GetContext(capsule) + managed = ctypes.cast(context, ctypes.py_object) + Py_DecRef(managed) + + +def make_py_capsule(managed): + if not isinstance(managed, DLManagedTensor): + managed = make_dl_managed_tensor(managed) + pointer = ctypes.pointer(managed) + capsule = PyCapsule_New(pointer, b"dltensor", py_capsule_destructor) + context = make_py_context(managed) + PyCapsule_SetContext(capsule, context) + return capsule diff --git a/test/test_msgspec.py b/test/test_msgspec.py index 8bd1f8207..dafeef8ab 100644 --- a/test/test_msgspec.py +++ b/test/test_msgspec.py @@ -41,21 +41,10 @@ # --- -class GPUBuf(object): +class BaseBuf(object): - def __init__(self, typecode, initializer, readonly=False): + def __init__(self, typecode, initializer): self._buf = array.array(typecode, initializer) - address = self._buf.buffer_info()[0] - typecode = self._buf.typecode - itemsize = self._buf.itemsize - self.__cuda_array_interface__ = dict( - version = 0, - data = (address, readonly), - typestr = typestr(typecode, itemsize), - shape = (len(self._buf), 1, 1), - strides = (itemsize,) * 3, - descr = [('', typestr(typecode, itemsize))], - ) def __eq__(self, other): return 
self._buf == other._buf @@ -72,6 +61,54 @@ def __getitem__(self, item): def __setitem__(self, item, value): self._buf[item] = value._buf +# --- + +try: + import dlpackimpl as dlpack +except ImportError: + dlpack = None + +class DLPackBuf(BaseBuf): + + def __init__(self, typecode, initializer): + super(DLPackBuf, self).__init__(typecode, initializer) + self.managed = dlpack.make_dl_managed_tensor(self._buf) + + def __del__(self): + self.managed = None + if not pypy and sys.getrefcount(self._buf) > 2: + raise RuntimeError('dlpack: possible reference leak') + + def __dlpack_device__(self): + device = self.managed.dl_tensor.device + return (device.device_type, device.device_id) + + def __dlpack__(self, stream=None): + managed = self.managed + if managed.dl_tensor.device.device_type == \ + dlpack.DLDeviceType.kDLCPU: + assert stream is None + capsule = dlpack.make_py_capsule(managed) + return capsule + +# --- + +class GPUBuf(BaseBuf): + + def __init__(self, typecode, initializer, readonly=False): + super(GPUBuf, self).__init__(typecode, initializer) + address = self._buf.buffer_info()[0] + typecode = self._buf.typecode + itemsize = self._buf.itemsize + self.__cuda_array_interface__ = dict( + version = 0, + data = (address, readonly), + typestr = typestr(typecode, itemsize), + shape = (len(self._buf), 1, 1), + strides = (itemsize,) * 3, + descr = [('', typestr(typecode, itemsize))], + ) + cupy_issue_2259 = False if cupy is not None: @@ -386,6 +423,15 @@ def testNotContiguous(self): +@unittest.skipIf(array is None, 'array') +@unittest.skipIf(dlpack is None, 'dlpack') +class TestMessageSimpleDLPackBuf(unittest.TestCase, + BaseTestMessageSimpleArray): + + def array(self, typecode, initializer): + return DLPackBuf(typecode, initializer) + + @unittest.skipIf(array is None, 'array') class TestMessageSimpleGPUBuf(unittest.TestCase, BaseTestMessageSimpleArray): @@ -470,6 +516,125 @@ def testNotContiguous(self): # --- +@unittest.skipIf(array is None, 
'array') +@unittest.skipIf(dlpack is None, 'dlpack') +class TestMessageDLPackBuf(unittest.TestCase): + + def testDevice(self): + buf = DLPackBuf('i', [0,1,2,3]) + buf.__dlpack_device__ = None + MPI.Get_address(buf) + buf.__dlpack_device__ = lambda: None + self.assertRaises(TypeError, MPI.Get_address, buf) + buf.__dlpack_device__ = lambda: (None, 0) + self.assertRaises(TypeError, MPI.Get_address, buf) + buf.__dlpack_device__ = lambda: (1, None) + self.assertRaises(TypeError, MPI.Get_address, buf) + buf.__dlpack_device__ = lambda: (1,) + self.assertRaises(ValueError, MPI.Get_address, buf) + buf.__dlpack_device__ = lambda: (1, 0, 1) + self.assertRaises(ValueError, MPI.Get_address, buf) + del buf.__dlpack_device__ + MPI.Get_address(buf) + + def testCapsule(self): + buf = DLPackBuf('i', [0,1,2,3]) + # + capsule = buf.__dlpack__() + MPI.Get_address(buf) + MPI.Get_address(buf) + del capsule + # + capsule = buf.__dlpack__() + retvals = [capsule] * 2 + buf.__dlpack__ = lambda *args, **kwargs: retvals.pop() + MPI.Get_address(buf) + self.assertRaises(BufferError, MPI.Get_address, buf) + del buf.__dlpack__ + del capsule + # + buf.__dlpack__ = lambda *args, **kwargs: None + self.assertRaises(BufferError, MPI.Get_address, buf) + del buf.__dlpack__ + + def testNdim(self): + buf = DLPackBuf('i', [0,1,2,3]) + dltensor = buf.managed.dl_tensor + # + for ndim in (2, 1, 0): + dltensor.ndim = ndim + MPI.Get_address(buf) + # + dltensor.ndim = -1 + self.assertRaises(BufferError, MPI.Get_address, buf) + # + del dltensor + + def testShape(self): + buf = DLPackBuf('i', [0,1,2,3]) + dltensor = buf.managed.dl_tensor + # + dltensor.ndim = 1 + dltensor.shape[0] = -1 + self.assertRaises(BufferError, MPI.Get_address, buf) + # + dltensor.ndim = 0 + dltensor.shape = None + MPI.Get_address(buf) + # + dltensor.ndim = 1 + dltensor.shape = None + self.assertRaises(BufferError, MPI.Get_address, buf) + # + del dltensor + + def testStrides(self): + buf = DLPackBuf('i', range(8)) + dltensor = 
buf.managed.dl_tensor + # + for order in ('C', 'F'): + dltensor.ndim, dltensor.shape, dltensor.strides = \ + dlpack.make_dl_shape([2, 2, 2], order=order) + MPI.Get_address(buf) + dltensor.strides[0] = -1 + self.assertRaises(BufferError, MPI.Get_address, buf) + # + del dltensor + + def testContiguous(self): + buf = DLPackBuf('i', range(8)) + dltensor = buf.managed.dl_tensor + # + dltensor.ndim, dltensor.shape, dltensor.strides = \ + dlpack.make_dl_shape([2, 2, 2], order='C') + s = dltensor.strides + strides = [s[i] for i in range(dltensor.ndim)] + s[0], s[1], s[2] = [strides[i] for i in [0, 1, 2]] + MPI.Get_address(buf) + s[0], s[1], s[2] = [strides[i] for i in [2, 1, 0]] + MPI.Get_address(buf) + s[0], s[1], s[2] = [strides[i] for i in [0, 2, 1]] + self.assertRaises(BufferError, MPI.Get_address, buf) + s[0], s[1], s[2] = [strides[i] for i in [1, 0, 2]] + self.assertRaises(BufferError, MPI.Get_address, buf) + del s + # + del dltensor + + def testByteOffset(self): + buf = DLPackBuf('B', [0,1,2,3]) + dltensor = buf.managed.dl_tensor + # + dltensor.ndim = 1 + for i in range(len(buf)): + dltensor.byte_offset = i + mem = MPI.memory(buf) + self.assertEqual(mem[0], buf[i]) + # + del dltensor + +# --- + @unittest.skipIf(array is None, 'array') class TestMessageGPUBufInterface(unittest.TestCase):