In [1]:
  
# Add with a single thread on the GPU

import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

import numpy



In [6]:
# Define CUDA function
mod = SourceModule("""
__global__ void add(int *a, int *b, int *c)  {
  int id = blockIdx.x;
  c[id] = a[id] + b[id];
}""")

func = mod.get_function("add")

# Vector size
N = 100

# Host vectors
a = numpy.array(range(0,N))
b = 1 - a
c = numpy.zeros(N)

a = a.astype(numpy.int32)
b = b.astype(numpy.int32)
c = c.astype(numpy.int32)

# Allocate on device
a_gpu = cuda.mem_alloc(a.size * a.dtype.itemsize)
b_gpu = cuda.mem_alloc(b.size * b.dtype.itemsize)
c_gpu = cuda.mem_alloc(c.size * c.dtype.itemsize)

# Copy from host to device
cuda.memcpy_htod(a_gpu, a)
cuda.memcpy_htod(b_gpu, b)

func(a_gpu, b_gpu, c_gpu, block=(1,1,1), grid=(N,1))

# Copy result to host
cuda.memcpy_dtoh(c, c_gpu)

# Display results
print("Should be %d" % N)
print("Results: %d" % numpy.sum(c))

Should be 100
Results: 100


In [4]:
import pycuda.gpuarray as gpuarray 
a_gpu = gpuarray.to_gpu(numpy.random.randn(4,4).astype(numpy.float32))
a_doubled = (a_gpu).get()
print(a_doubled)
print(a_gpu)

[[-0.39456362 -0.2233163  -1.5685459  -1.1875296 ]
 [-0.8722133   0.21616156 -0.03867308 -0.49743065]
 [-0.83817786 -2.2521834  -1.7828822  -0.3493693 ]
 [ 0.03721766 -0.17830046  0.848356    0.345347  ]]
[[-0.39456362 -0.2233163  -1.5685459  -1.1875296 ]
 [-0.8722133   0.21616156 -0.03867308 -0.49743065]
 [-0.83817786 -2.2521834  -1.7828822  -0.3493693 ]
 [ 0.03721766 -0.17830046  0.848356    0.345347  ]]
