In [1]:
import pycuda
import pycuda.driver as cuda
import pycuda.autoinit

from pycuda.compiler import SourceModule

import numpy as np
import time

# Show the installed PyCUDA version (a tuple such as (2019, 1, 1)).
print(pycuda.VERSION)

# Bare string as the cell's last expression: the notebook echoes it as output.
"""
ReLU operation
"""

(2019, 1, 1)


'\nReLU operation\n'

In [2]:
# Tensor dimensions, stored as 0-d int32 arrays so each one can be copied to a
# device buffer. int32 is used explicitly because the kernel reads these
# buffers as CUDA `int` (32 bits); the removed `np.int` alias mapped to the
# platform's default integer, which is usually 64 bits wide.
ROW = np.array(32, dtype=np.int32)
COL = np.array(32, dtype=np.int32)
CHANNEL = np.array(4, dtype=np.int32)

# Sample input tensor of shape (CHANNEL, ROW, COL), uploaded to the device.
a = np.random.normal(size=(CHANNEL,ROW,COL)).astype(np.float32)
a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)

# Host buffer that receives the result, plus a device buffer of the same size.
res = np.zeros(shape=(CHANNEL,ROW,COL), dtype=np.float32)
res_gpu = cuda.mem_alloc(res.nbytes)

# One small device buffer per dimension value.
row_gpu = cuda.mem_alloc(ROW.nbytes)
col_gpu = cuda.mem_alloc(COL.nbytes)
channel_gpu = cuda.mem_alloc(CHANNEL.nbytes)

cuda.memcpy_htod(row_gpu, ROW)
cuda.memcpy_htod(col_gpu, COL)
cuda.memcpy_htod(channel_gpu, CHANNEL)

In [3]:
# Passing single integer parameters:
# each integer lives in its own one-element device buffer, and the kernel
# binds that buffer through an `int &` parameter. The kernel reads each value
# as a CUDA `int` (32 bits), so the host-side buffers should hold 32-bit
# integers.
mod = SourceModule("""
    // Element-wise ReLU over a row-major (channel, row, col) float tensor:
    // res[i] = a[i] if a[i] > 0, otherwise 0.
    //   a, res  : input / output buffers of channel * row * col floats
    //   row     : number of rows per channel
    //   col     : number of columns per row
    //   channel : number of channels
    // Thread x indexes columns, y indexes rows and z indexes channels.
    __global__ void ReLU3D(float *a, float *res, int &row, int &col, int &channel)
    {
        const int xIdx = blockDim.x * blockIdx.x + threadIdx.x;
        const int yIdx = blockDim.y * blockIdx.y + threadIdx.y;
        const int zIdx = blockDim.z * blockIdx.z + threadIdx.z;

        // Threads that fall outside the tensor must not touch memory; this
        // keeps the kernel safe when the launch grid over-covers the data.
        if (xIdx >= col || yIdx >= row || zIdx >= channel)
        {
            return;
        }

        // Row-major offset: one row holds `col` elements and one channel
        // holds `row * col` elements.
        const int globalIdx = (zIdx * row + yIdx) * col + xIdx;

        const float value = a[globalIdx];
        res[globalIdx] = (value > 0) ? value : 0.0f;
    }
""")

In [4]:
# GPU
# Look the kernel up before starting the clock so only the launch and the
# kernel execution are timed.
func = mod.get_function("ReLU3D")

startTime = time.perf_counter()
func(a_gpu, res_gpu, row_gpu, col_gpu, channel_gpu, block=(16,16,2), grid =(2,2,2))
# Kernel launches return immediately; wait for the device to finish so the
# measurement covers the actual execution and not just the launch call.
cuda.Context.synchronize()
consumedTime = time.perf_counter() - startTime
print("Time for gpu operation : ", consumedTime)

Time for gpu operation :  0.0004622936248779297


In [5]:
# Copy the result from device to host
cuda.memcpy_dtoh(res, res_gpu)

# Release every device buffer allocated for this run, including the three
# small buffers that carry the tensor dimensions.
a_gpu.free()
res_gpu.free()
row_gpu.free()
col_gpu.free()
channel_gpu.free()

In [6]:
# Largest value in channel 3 of the host-side input tensor.
a[3].max()

3.311512

In [7]:
# Verify the whole GPU result against a NumPy reference ReLU. Comparing a
# single channel's maximum cannot detect negatives that were left in place.
np.testing.assert_array_equal(res, np.maximum(a, 0))

# Largest value in channel 3 of the GPU result.
res[3].max()

3.311512