<a href="https://colab.research.google.com/github/nariba/hello-pycuda/blob/main/2dim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip3 install pycuda

Collecting pycuda
  Downloading https://files.pythonhosted.org/packages/46/61/47d3235a4c13eec5a5f03594ddb268f4858734e02980afbcd806e6242fa5/pycuda-2020.1.tar.gz (1.6MB)
     |████████████████████████████████| 1.6MB 13.0MB/s
Collecting pytools>=2011.2
  Downloading https://files.pythonhosted.org/packages/b7/30/c9362a282ef89106768cba9d884f4b2e4f5dc6881d0c19b478d2a710b82b/pytools-2020.4.3.tar.gz (62kB)
     |████████████████████████████████| 71kB 10.4MB/s
Collecting appdirs>=1.4.0
  Downloading https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl
Collecting mako
  Downloading https://files.pythonhosted.org/packages/a6/37/0e706200d22172eb8fa17d68a7ae22dec7631a0a92266634fb518a88a5b2/Mako-1.1.3-py2.py3-none-any.whl (75kB)
     |████████████████████████████████| 81kB 12.1MB/s
Building wheels for collected packages: pycuda, pytools
  Building wheel for pycuda (setup.py) .

In [2]:
import math
import numpy
import pycuda.gpuarray
from pycuda.compiler import SourceModule
import pycuda.autoinit

In [3]:
# Build a PyCUDA module from inline CUDA C source that defines one kernel,
# `add_two_array_2d(nx, ny, res, arr1, arr2)`.
#
# Each thread derives a 2-D coordinate (x, y) from its thread and block indices
# and maps it to the linear offset `nx * y + x`, so the three pointers are
# addressed as flat buffers whose rows are `nx` elements long. Threads with
# x >= nx or y >= ny do nothing, which lets the launch grid overshoot the array
# extent. Every in-range thread stores `arr1[ij] + arr2[ij]` into `res[ij]`.
module = SourceModule("""
__global__ void add_two_array_2d(int nx, int ny, float *res, float *arr1, float *arr2){
  int x = threadIdx.x + blockDim.x * blockIdx.x;
  int y = threadIdx.y + blockDim.y * blockIdx.y;
  int ij = nx * y + x;
  if (x < nx && y < ny) {
    res[ij] = arr1[ij] + arr2[ij];
  }
}
""")

In [4]:
# Look up the kernel in the module by its CUDA-side name; the returned handle
# is the Python callable used to launch it.
add_two_array = module.get_function("add_two_array_2d")

In [5]:
# Array extent: num_x values per row, num_y rows. Both are kept as numpy.int32
# because they are passed straight to the kernel's `int nx, int ny` parameters.
num_x, num_y = numpy.int32(5), numpy.int32(2)
# Multiply as Python ints: an int32 * int32 product overflows (wrapping, with
# at most a RuntimeWarning) once the element count exceeds 2**31 - 1.
num_components = int(num_x) * int(num_y)

In [6]:
# First host operand: the float32 values 0, 1, ..., num_components - 1,
# arranged as num_y rows of num_x values.
arr1 = numpy.reshape(
    numpy.arange(num_components, dtype=numpy.float32),
    (num_y, num_x),
)

In [7]:
# Second host operand: pseudo-random values scaled by 10, with the same
# (num_y, num_x) shape as arr1. The fixed seed makes the values repeatable, and
# the cast to float32 matches the kernel's `float *` parameters.
numpy.random.seed(123)
arr2 = (10 * numpy.random.rand(num_y, num_x)).astype(numpy.float32)

In [8]:
# Zero-filled float32 host buffer with the operands' shape; its GPU copy
# receives the kernel output.
res = numpy.zeros(shape=(num_y, num_x), dtype=numpy.float32)

In [9]:
# Copy both operands and the output buffer to the GPU, in that order.
arr1_gpu, arr2_gpu, res_gpu = [
    pycuda.gpuarray.to_gpu(host_array) for host_array in (arr1, arr2, res)
]

In [10]:
# Launch geometry: 16 x 16 threads per block, and along each axis enough
# blocks (rounded up) to cover num_x and num_y elements respectively.
threads_per_block = (16, 16, 1)
block_x, block_y = (
    math.ceil(extent / threads)
    # zip stops after the two extents, so the z entry of threads_per_block is unused.
    for extent, threads in zip((num_x, num_y), threads_per_block)
)
blocks_per_grid = (block_x, block_y, 1)


In [11]:
# Launch the kernel. Positional arguments follow the kernel signature
# (nx, ny, res, arr1, arr2); the keyword arguments set the launch geometry.
add_two_array(
    num_x,
    num_y,
    res_gpu,
    arr1_gpu,
    arr2_gpu,
    block=threads_per_block,
    grid=blocks_per_grid,
)

In [12]:
# Fetch the kernel output from the GPU as a host array; as the cell's last
# expression it is displayed as the cell output.
res_gpu.get()

array([[ 6.9646916,  3.8613935,  4.2685146,  8.513147 , 11.19469  ],
       [ 9.231065 , 15.807642 , 13.848297 , 12.809319 , 12.921175 ]],
      dtype=float32)

In [13]:
# CPU reference for the same element-wise sum. Verify the GPU result against
# it programmatically instead of comparing the two printed arrays by eye;
# the assertion is silent when they agree.
numpy.testing.assert_allclose(res_gpu.get(), arr1 + arr2, rtol=1e-6)
# Display the reference result as the cell output.
arr1 + arr2

array([[ 6.9646916,  3.8613935,  4.2685146,  8.513147 , 11.19469  ],
       [ 9.231065 , 15.807642 , 13.848297 , 12.809319 , 12.921175 ]],
      dtype=float32)