In [1]:
import pycuda
import pycuda.driver as cuda
import pycuda.autoinit

from pycuda.compiler import SourceModule

import numpy as np
import time

print(pycuda.VERSION)

(2019, 1, 1)


In [2]:
# Create a tensor and copy it to gpu memory
# Build the host array directly as float32: asking np.zeros for the final
# dtype avoids first allocating a float64 array of the same shape and then
# copying it through astype().
a = np.zeros(shape=(64*64, 64*64), dtype=np.float32)
# Allocate a device buffer of exactly the host array's size in bytes
a_gpu = cuda.mem_alloc(a.nbytes)
# Copy the host array into the device buffer
cuda.memcpy_htod(a_gpu, a)

In [3]:
mod = SourceModule("""
    // Adds 1 to one element of `a` per thread, addressing the buffer as a flat
    // array whose row length equals the total number of threads along x.
    // There is no bounds check, so the launch configuration must cover the
    // buffer exactly (threads along x * threads along y == element count).
    __global__ void cudaTest2D(float *a)
    {
        // Global x (column) index of this thread
        int col = blockIdx.x * blockDim.x + threadIdx.x;
        // Global y (row) index of this thread
        int row = blockIdx.y * blockDim.y + threadIdx.y;
        // Row stride is the thread count along x: blockDim.x * gridDim.x.
        // (Using blockDim.y here only gives the right answer for square blocks.)
        int rowWidth = blockDim.x * gridDim.x;
        // Do your own operations
        a[row * rowWidth + col] += 1;
    }
""")

In [4]:
# Let's compare operating time
# GPU
# Look up the kernel by the name it is defined under in the SourceModule
# ("cudaTest2D"), and do it before starting the clock so the lookup is not
# counted as compute time.
func = mod.get_function("cudaTest2D")
startTime = time.time()
# 256 blocks * 16 threads = 4096 threads per axis, one per element of the
# 4096 x 4096 array uploaded to a_gpu.
func(a_gpu, block=(16,16,1), grid=(256,256))
# The launch returns before the kernel has finished; block until the device
# is idle so the measurement covers the actual execution, not just the launch.
cuda.Context.synchronize()
consumedTime = time.time() - startTime
print("Time for gpu operation : ", consumedTime)

Time for gpu operation :  0.0004508495330810547


In [5]:
# CPU
# Host-side reference: the same "+1" applied element-wise with NumPy,
# timed with the wall clock for comparison against the GPU run.
cpuStart = time.time()
aNp = np.add(a, 1)
cpuEnd = time.time()
consumedTime = cpuEnd - cpuStart
print("Time for cpu operation : ", consumedTime)

Time for cpu operation :  0.025727510452270508


In [6]:
# Bring the GPU output back to the host.
# Uninitialised host buffer with the same shape and dtype as `a`;
# every element is overwritten by the copy below.
a_result = np.empty(shape=a.shape, dtype=a.dtype)
# Device -> host transfer of the kernel's output
cuda.memcpy_dtoh(a_result, a_gpu)
# Release the device allocation now that the data lives on the host
a_gpu.free()

In [12]:
# Compare the results
# aNp is the NumPy (CPU) reference and a_result is the array copied back from
# the GPU. The previous name `a_doubled` is not defined anywhere in the
# notebook and raises NameError on a fresh kernel.
print("Is it same? : ", np.array_equal(aNp, a_result))

Is it same? :  True
