In [0]:
!nvcc --version
!gcc --version
!g++ --version

In [0]:
!sudo apt-get install python-numpy -y
!sudo apt-get install build-essential python-dev python-setuptools libboost-python-dev libboost-thread-dev -y
!apt-cache search pycuda
!apt install libnvidia-compute-390 python3-pycuda

In [0]:
!apt-cache search pycuda
!apt-get install python3-pycuda


In [51]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np

ELEM_NUMBER = 1024
BLOCK_SIZE = 256

A = np.random.randint(0, 2, dtype=np.int, size=ELEM_NUMBER)
print (A)
B = np.random.randint(0, 2, dtype=np.int, size=ELEM_NUMBER)
print (B)

A_gpu = cuda.mem_alloc(A.nbytes)
B_gpu = cuda.mem_alloc(B.nbytes)
C_gpu = cuda.mem_alloc(A.nbytes)


cuda.memcpy_htod(A_gpu, A)
cuda.memcpy_htod(B_gpu, B)

mod = SourceModule("""
    __global__ void vecAdd(float *A, float *B, float *result) {

      unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
      result[idx] = A[idx] + B[idx];
    }
  """)

func = mod.get_function("vecAdd")


gridDim = ( int((A.size + BLOCK_SIZE - 1)/BLOCK_SIZE),1)
blockDim = (BLOCK_SIZE,1,1)
print("block: "+str(blockDim)+" grid: "+str(gridDim))

func(A_gpu, B_gpu, C_gpu, block=blockDim, grid=gridDim)

C = numpy.empty_like(A)
cuda.memcpy_dtoh(C, C_gpu)
print (C)


[1 0 0 ... 1 0 0]
[0 1 0 ... 0 0 1]
block: (256, 1, 1) grid: (4, 1)
[1 1 0 ... 1 0 0]


In [66]:
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit
import numpy  as np

NUMBER_ELEMENT = 1024
BLOCK_SIZE = 256

A = np.random.randint(0, 2, dtype=np.int, size=NUMBER_ELEMENT)
print (A)

B = np.random.randint(0, 2, dtype=np.int, size=NUMBER_ELEMENT)
print (B)

A_gpu = gpuarray.to_gpu(A)
print (A_gpu.get())

B_gpu = gpuarray.to_gpu(B)
print (B_gpu.get())

C_gpu = gpuarray.to_gpu(np.empty(NUMBER_ELEMENT).astype(np.int))
print (C_gpu.get())

mod = SourceModule("""
    __global__ void vecAdd(float *A, float *B, float *result) {

      unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
      result[idx] = A[idx] + B[idx];
    }
  """)

gridDim = ( int((A.size + BLOCK_SIZE - 1)/BLOCK_SIZE),1)
blockDim = (BLOCK_SIZE,1,1)
print("block: "+str(blockDim)+" grid: "+str(gridDim))

func = mod.get_function("vecAdd")
func(A_gpu, B_gpu, C_gpu, block=blockDim, grid=gridDim)

result = C_gpu.get()
print (result)



[0 0 1 ... 1 1 0]
[0 1 1 ... 0 1 0]
[0 0 1 ... 1 1 0]
[0 1 1 ... 0 1 0]
[0 0 0 ... 0 0 0]
block: (256, 1, 1) grid: (4, 1)
[0 1 2 ... 0 0 0]


In [75]:
!pip3 install numba
import os
os.environ['NUMBAPRO_LIBDEVICE'] = "/usr/lib/nvidia-cuda-toolkit/libdevice"
os.environ['NUMBAPRO_NVVM'] = "/usr/lib/x86_64-linux-gnu/libnvvm.so"




In [99]:
from numba import cuda
import numpy  as np

NUMBER_ELEMENT = 102400
BLOCK_SIZE = 256

@cuda.jit
def vecADD(A,B,C):
    # Thread id in a 1D block
    idThreadX = cuda.threadIdx.x
    # Block id in a 1D grid
    idBlock = cuda.blockIdx.x
    # Block width, i.e. number of threads per block
    blockDim = cuda.blockDim.x
    # Compute flattened index inside the array
    idGlobal = idBlock * blockDim + idThreadX
    if idGlobal < A.size:  # Check array boundaries
        C[idGlobal] = A[idGlobal] + B[idGlobal]

        
A = np.random.randint(0, 2, dtype=np.int, size=NUMBER_ELEMENT)
print (A)

B = np.random.randint(0, 2, dtype=np.int, size=NUMBER_ELEMENT)
print (B)

C = np.empty(NUMBER_ELEMENT).astype(np.int)
print (C)

blockspergrid = (A.size + (BLOCK_SIZE - 1)) 

startEvent = cuda.event()
stopEvent = cuda.event()

startEvent.record()

vecADD[blockspergrid, BLOCK_SIZE](A,B,C)     

stopEvent.record()
stopEvent.wait()
stopEvent.synchronize()

startEvent.elapsed_time(stopEvent)

time = startEvent.elapsed_time(stopEvent)

print("Execution Time GPU is "+ str(time)+" ms" )
print(C)

[0 1 1 ... 1 1 0]
[1 1 1 ... 1 1 0]
[0 0 0 ... 0 0 0]
Execution Time GPU is 185.06285095214844 ms
[1 2 2 ... 2 2 0]
