In [1]:
from numba import cuda
import numpy as np
import math
from time import time


In [2]:
# create a function for vector sum, run it over GPU
@cuda.jit
def gpu_add(a, b, result, n):
    # a, b为输入向量，result为输出向量
    # 向量维度为n
    # 得到当前thread的编号
    idx = cuda.threadIdx.x + cuda.blockDim.x * cuda.blockIdx.x
    if idx < n:
        result[idx] = a[idx] + b[idx]


In [3]:
#  create 2 x vectors in 20M-dimension. covert them into int32
#  Pass them to the function as the parameters
n = 20000000
x = np.arange(n).astype(np.int32)
y = 2 * x

In [4]:
# 创建n维全0向量, 作为vector sum的初始值
# gpu_result = np.zeros(n)
# cpu_result = np.zeros(n)

# manually copy data from the main memory to the GPU memory
x_device = cuda.to_device(x)
y_device = cuda.to_device(y)



# 在显卡设备上初始化一块GPU memory, for storing GPU计算结果, 以避免结果被回送到CPU
gpu_result = cuda.device_array(n)
# CPU’s calculation result will still be oin the main memory
cpu_result = np.empty(n)


In [5]:
# calculate CUDA execution configuration [gridDim, blockDim]
threads_per_block = 1024
blocks_per_grid = math.ceil(n / threads_per_block)

In [6]:
# get time-stamp when start
start = time()

# use GPU to do do vector sum, with execution configuration [19532, 1024]
# gpu_add[blocks_per_grid, threads_per_block](x, y, gpu_result, n)
# use data which has been manually copied into the GPU memory, instead of data in CPU memory
gpu_add[blocks_per_grid, threads_per_block](x_device, y_device, gpu_result, n)
cuda.synchronize()

# print time consumed by GPU
print("gpu vector add time " + str(time() - start))

gpu vector add time 0.176041841506958


In [7]:
print("- gpu_result=", gpu_result)
print("- gpu_result[1]=", gpu_result[1])

- gpu_result= <numba.cuda.cudadrv.devicearray.DeviceNDArray object at 0x7ff6c419fd90>
- gpu_result[1]= 3.0


In [8]:
# get time-stamp when start
start = time()

# use numpy function add() to do vector sum, by CPU
cpu_result = np.add(x, y)

# print time consumed by CPU
print("cpu vector add time " + str(time() - start))

cpu vector add time 0.03424239158630371


In [9]:
print("- cpu_result=", cpu_result)
print("- cpu_result[1]=", cpu_result[1])

- cpu_result= [       0        3        6 ... 59999991 59999994 59999997]
- cpu_result[1]= 3


In [10]:
if (np.array_equal(cpu_result, gpu_result)):
    print("result correct")

result correct
