In [1]:
import pycuda
import pycuda.driver as cuda
import pycuda.autoinit

from pycuda.compiler import SourceModule

import numpy as np
import time

# Report the installed PyCUDA version (a tuple such as (2019, 1, 1)).
print(pycuda.VERSION)

# The bare string below is an expression statement, not a comment: because it
# is the last expression in the cell, the notebook echoes its repr as the
# cell's output.
"""
Add two vectors
"""

(2019, 1, 1)


'\nAdd two vectors\n'

In [2]:
# Host operands: `a` is a (64*64) x (64*64) float32 matrix of zeros and `b`
# is a matrix of ones with the same shape and dtype.
a = np.zeros((64*64, 64*64), dtype=np.float32)
b = np.ones_like(a)
# Upload the operands; to_device allocates a device buffer sized for the host
# array and copies the array's contents into it in a single call.
a_gpu = cuda.to_device(a)
b_gpu = cuda.to_device(b)
# Uninitialised device buffer for the sum, same byte size as the inputs.
result_gpu = cuda.mem_alloc(a.nbytes)

In [3]:
# Compile the CUDA source below into a module exposing the kernel `addTest2D`.
#
# addTest2D(a, b, res) writes res[idx] = a[idx] + b[idx] for one element per
# thread.  idx is a flat index built from a 2D grid of 3D blocks:
#   - blockId numbers the blocks row by row across the grid
#     (blockIdx.x + blockIdx.y * gridDim.x);
#   - each block covers blockDim.x * blockDim.y * blockDim.z consecutive
#     elements, and the thread's offset inside that range is
#     threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * blockDim.x + threadIdx.x.
#
# NOTE(review): the kernel does not compare idx against any array length, so
# every launched thread reads a[idx], b[idx] and writes res[idx].  The launch
# configuration must therefore create no more threads than the buffers have
# elements.
mod = SourceModule("""
    __global__ void addTest2D(float *a, float *b, float *res)
    {
        int blockId = blockIdx.x + blockIdx.y * gridDim.x;
        int idx = blockId * (blockDim.x * blockDim.y * blockDim.z)
        + (threadIdx.z * (blockDim.x * blockDim.y))
        + (threadIdx.y * blockDim.x) + threadIdx.x;
        // Add a and b
        res[idx] = a[idx] + b[idx];
    }
""")

In [4]:
# Let's compare operating time
# GPU
# Resolve the kernel handle before starting the clock so that the module
# lookup is not counted as part of the GPU operation.
func = mod.get_function("addTest2D")

# addTest2D has no bounds check, so the launch must create exactly one thread
# per element.  `a` holds 4096 * 4096 = 2**24 floats and a (16, 16, 4) block
# has 1024 threads, so 128 * 128 = 16384 blocks are required.  A 256 x 256
# grid would launch four times too many threads, which would read and write
# past the end of the three device buffers.
blockShape = (16, 16, 4)
gridShape = (128, 128)
threadsPerBlock = blockShape[0] * blockShape[1] * blockShape[2]
assert threadsPerBlock * gridShape[0] * gridShape[1] == a.size, \
    "launch configuration must match the number of elements"

startTime = time.perf_counter()
func(a_gpu, b_gpu, result_gpu, block=blockShape, grid=gridShape)
# The kernel launch returns before the GPU has finished; wait for all work on
# the current context to complete so the timer covers the actual execution.
cuda.Context.synchronize()
consumedTime = time.perf_counter() - startTime
print("Time for gpu operation : ", consumedTime)

Time for gpu operation :  0.0005121231079101562


In [5]:
# CPU
# Time the same element-wise addition performed by NumPy on the host.
# perf_counter is used instead of time.time because it is monotonic and has
# the highest available resolution for measuring short intervals.
startTime = time.perf_counter()
result_cpu = a + b
consumedTime = time.perf_counter() - startTime
print("Time for cpu operation : ", consumedTime)

Time for cpu operation :  0.02899312973022461


In [6]:
# Bring the sum back to the host: allocate an uninitialised array with the
# shape and dtype of `a`, then fill it from the device result buffer.
result = np.empty(a.shape, dtype=a.dtype)
cuda.memcpy_dtoh(result, result_gpu)
# All three device buffers are no longer needed; release them explicitly.
result_gpu.free()
b_gpu.free()
a_gpu.free()

In [7]:
# Check that the GPU sum matches the NumPy sum element for element
print("Is it same? : ", np.array_equal(result, result_cpu))

Is it same? :  True
