In [1]:
import pycuda
import pycuda.driver as cuda
import pycuda.autoinit

from pycuda.compiler import SourceModule

import numpy as np
import time

print(pycuda.VERSION)

"""
Dot product (Inner product)
"""

(2019, 1, 1)


'\nDot product (Inner product)\n'

In [2]:
VECTOR_LENGTH = 64
BLOCKSIZE = 16
GRIDSIZE = int(np.ceil(VECTOR_LENGTH/BLOCKSIZE))
THREAD_PER_BLOCK = 16

In [3]:
# Create a tensor and copy it to gpu memory
# Randomly initialize a vector
#a = np.ones(shape=(64*64), dtype=np.float32)
a = np.random.randn(VECTOR_LENGTH)
a = a.astype(np.float32)
b = np.ones(shape=(VECTOR_LENGTH), dtype=np.float32)
# Allocate memory at device
a_gpu = cuda.mem_alloc(a.nbytes)
b_gpu = cuda.mem_alloc(b.nbytes)
# Allocate same size of memory for result vector
result = np.zeros(4, dtype=np.float32)
result_gpu = cuda.mem_alloc(result.nbytes)
# Copy the cpu vector to gpu
cuda.memcpy_htod(a_gpu, a)
cuda.memcpy_htod(b_gpu, b)

In [4]:
mod = SourceModule("""
    __global__ void addTest2D(float *a, float *b, float *res)
    {
        __shared__ float cache[16];
        int blockId = blockIdx.x + blockIdx.y * gridDim.x;
        int idx = blockId * (blockDim.x * blockDim.y * blockDim.z)
        + (threadIdx.z * (blockDim.x * blockDim.y))
        + (threadIdx.y * blockDim.x)
        + threadIdx.x;
        // Add a and b
        cache[threadIdx.x] = a[idx] * b[idx];
        
        // waiting to finish the jobs
        __syncthreads();
        
        // sum all elements of cache at parallel
        int i = blockDim.x/2;
        while(i != 0)
        {
            if(threadIdx.x < i)
            {
                cache[threadIdx.x] += cache[threadIdx.x + i];
            }
            // syncthread makes stop every thread until they call syncthread each.
            // In this case, if this sync is in the if sentence, this will make the system stopped.
            __syncthreads();
            i /= 2;
        }
        
        // need to do final cummulatiuon on cpu for efficiency
        if(threadIdx.x == 0)
        {
            res[blockIdx.x] = cache[0];
        }
    }
""")

In [5]:
# Let's compare operating time
# GPU
startTime = time.time()
func = mod.get_function("addTest2D")
func(a_gpu, b_gpu, result_gpu, block=(THREAD_PER_BLOCK,1,1), grid =(GRIDSIZE,1))
consumedTime = time.time() - startTime
print("Time for gpu operation : ", consumedTime)

Time for gpu operation :  0.0006589889526367188


In [6]:
# CPU
startTime = time.time()
result_cpu = np.dot(a, b)
consumedTime = time.time() - startTime
print("Time for cpu operation : ", consumedTime)

Time for cpu operation :  9.703636169433594e-05


In [7]:
# Copy the result from device to host
result = np.zeros(shape=(4), dtype=np.float32)
cuda.memcpy_dtoh(result, result_gpu)
a_gpu.free()
b_gpu.free()
result_gpu.free()

In [8]:
# Compare the results
print("Is it same? : ", (result == result_cpu).all())

Is it same? :  False


In [9]:
result_cpu

-6.5065165

In [10]:
result.sum()

-6.506517