In [1]:
import numpy as np
import time
import math
from numba import jit
from numba import cuda


In [2]:

@cuda.jit
def mutiply(arr_A, arr_B, arr_C):
    # (row, col) 为当前thread的横坐标和纵坐标
    row, col = cuda.grid(2)
    if row < arr_C.shape[0] and col < arr_C.shape[1]:
        tmp = 0.
        for k in range(arr_A.shape[1]):
            tmp += arr_A[row, k] * arr_B[k, col]
        arr_C[row, col] = tmp


In [3]:
M = 6000
N = 4800
P = 4000
A = np.random.random((M, N)) # 随机生成的 [M x N] 矩阵
B = np.random.random((N, P)) # 随机生成的 [N x P] 矩阵
C_gpu = np.zeros((M, P))
# manually copy numpy matrices from the main memory to the GPU global memory
A_global_mem = cuda.to_device(A)
B_global_mem = cuda.to_device(B)

# 在显卡设备上初始化一块GPU memory, for storing GPU计算结果, 以避免结果被回送到CPU
C_global_mem = cuda.device_array((A.shape[0], B.shape[1]))


# execute configuration
threads_per_block = (8, 16)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(B.shape[1] / threads_per_block[1]))
blocksPerGrid = (blocks_per_grid_x, blocks_per_grid_y)


start = time.time()
# mutiply[blocksPerGrid, threads_per_block](A, B, C_gpu)
mutiply[blocksPerGrid, threads_per_block](A_global_mem, B_global_mem, C_global_mem)
cuda.synchronize()

print("GPU time required:  %s" %(time.time() - start))

# Copy the result matrix back to the host
C_gpu = C_global_mem.copy_to_host()

# 验证正确性
if np.allclose(C_gpu, np.dot(A, B)):
    print("gpu result correct")



GPU time required:  3.2184762954711914
gpu result correct
