In [1]:
# https://wiki.tiker.net/PyCuda/Examples/KernelConcurrency
#! /usr/bin/env python
# A simple program to illustrate kernel concurrency with PyCUDA.
# Reference: Chapter 3.2.6.5 in the CUDA C Programming Guide, Version 3.2.
# Jesse Lu, 2011-04-04

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule

In [2]:
#
# Set up test scenario.
#

# CUDA source for the test kernel. Each thread handles the element selected by
# its threadIdx.x. The paired doubling/halving loops are busy-work that
# nominally cancels out; the final statement doubles the element once, so one
# launch leaves every touched element multiplied by two.
kernel_source = """
__global__ void my_kernel(float *d) {
    const int i = threadIdx.x;
    for (int m=0; m<100; m++) {
        for (int k=0; k<100 ; k++)
            d[i] = d[i] * 2.0;
        for (int k=0; k<100 ; k++)
            d[i] = d[i] / 2.0;
    }
    d[i] = d[i] * 2.0;
}
"""

# Compile the source and look up a callable handle to the kernel.
mod = SourceModule(kernel_source)
my_kernel = mod.get_function("my_kernel")

In [3]:
# Create the test data on the host.
N = 800 # Size of datasets.
n = 32 # Number of datasets (and concurrent operations) used.
SEED = 0 # Fixed seed so every run checks the same inputs.

# A private generator keeps the notebook reproducible on "Restart & Run All"
# without touching NumPy's global random state.
rng = np.random.RandomState(SEED)

# data:       host arrays that are sent to the device and later overwritten
#             with the results.
# data_check: untouched copies of the inputs, used to verify the results.
# d_data:     device allocations, one per host array.
data, data_check, d_data = [], [], []
for k in range(n):
    host_array = rng.randn(N).astype(np.float32) # Create random float32 data.
    data.append(host_array)
    data_check.append(host_array.copy()) # For checking the result afterwards.
    d_data.append(drv.mem_alloc(host_array.nbytes)) # Allocate memory on device.

#
# Start concurrency test.
#

# Use this event as a reference point: the timings reported later are
# measured from it.
ref = drv.Event()
ref.record()

<pycuda._driver.Event at 0x7f93a4742d50>

In [4]:
# One stream per dataset, plus one event per marker name for each stream.
marker_names = ['kernel_begin', 'kernel_end']
stream, event = [], []
for _ in range(n):
    stream.append(drv.Stream())
    event.append({name: drv.Event() for name in marker_names})

# Upload every host array to its device buffer.
for device_buf, host_array in zip(d_data, data):
    drv.memcpy_htod(device_buf, host_array)

# Launch the kernel repeatedly on every stream. The events are re-recorded on
# each pass, so only the timings of the last pass survive. The loop variable
# `j` is read again by the result check further down.
for j in range(10):
    for strm, markers, device_buf in zip(stream, event, d_data):
        markers['kernel_begin'].record(strm)
        my_kernel(device_buf, block=(N,1,1), stream=strm)
    # The end markers are recorded in a separate pass on purpose: commenting
    # out the `for` line below (which folds the record call into the launch
    # loop above) should break concurrency.
    for strm, markers in zip(stream, event):
        markers['kernel_end'].record(strm)

# Download the results, overwriting the host arrays.
for host_array, device_buf in zip(data, d_data):
    drv.memcpy_dtoh(host_array, device_buf)

In [5]:
# Report the device capabilities relevant to this test.
print('=== Device attributes')
dev = pycuda.autoinit.device
print('Name:', dev.name())
print('Compute capability:', dev.compute_capability())
print('Concurrent Kernels:',
    bool(dev.get_attribute(drv.device_attribute.CONCURRENT_KERNELS)))

print('\n=== Checking answers')
# `j` is the final index left behind by the kernel launch loop, so j + 1 is
# the number of launches each dataset went through. Every launch doubles the
# data, hence the expected result is the input scaled by 2**(j+1).
expected_scale = 2**(j+1)
for k in range(n):
    # Scaling by a power of two is expected to be exact, so the check demands
    # a difference of exactly zero.
    if (np.linalg.norm((data_check[k] * expected_scale) - data[k]) == 0.0):
        print('Dataset %d: passed.' % k)
    else:
        print('Dataset %d: FAILED!' % k)

print('\n=== Timing info (for last set of kernel launches)')
for k in range(n):
    # Named `line` rather than `str`, so the builtin `str` stays usable in
    # any cell executed after this one.
    line = 'Dataset %d:\t' % k
    for m in marker_names:
        # Elapsed time from the reference event to this marker.
        line += '%s: %.2f\t' % (m, ref.time_till(event[k][m]))
    print(line)

=== Device attributes
Name: GeForce GTX TITAN X
Compute capability: (5, 2)
Concurrent Kernels: True

=== Checking answers
Dataset 0: passed.
Dataset 1: passed.
Dataset 2: passed.
Dataset 3: passed.
Dataset 4: passed.
Dataset 5: passed.
Dataset 6: passed.
Dataset 7: passed.
Dataset 8: passed.
Dataset 9: passed.
Dataset 10: passed.
Dataset 11: passed.
Dataset 12: passed.
Dataset 13: passed.
Dataset 14: passed.
Dataset 15: passed.
Dataset 16: passed.
Dataset 17: passed.
Dataset 18: passed.
Dataset 19: passed.
Dataset 20: passed.
Dataset 21: passed.
Dataset 22: passed.
Dataset 23: passed.
Dataset 24: passed.
Dataset 25: passed.
Dataset 26: passed.
Dataset 27: passed.
Dataset 28: passed.
Dataset 29: passed.
Dataset 30: passed.
Dataset 31: passed.

=== Timing info (for last set of kernel launches)
Dataset 0:	kernel_begin: 74.67	kernel_end: 74.82	
Dataset 1:	kernel_begin: 74.69	kernel_end: 74.83	
Dataset 2:	kernel_begin: 74.71	kernel_end: 74.84	
Dataset 3:	kernel_begin: 74.73	kernel_end: 74.8