<a href="https://colab.research.google.com/github/nariba/hello-pycuda/blob/main/CPUvsGPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install PyCUDA into the runtime; the output recorded below shows pycuda-2020.1 being downloaded.
# NOTE(review): the version is not pinned, so a re-run may install a different release - consider pinning.
!pip3 install pycuda

Collecting pycuda
[?25l  Downloading https://files.pythonhosted.org/packages/46/61/47d3235a4c13eec5a5f03594ddb268f4858734e02980afbcd806e6242fa5/pycuda-2020.1.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 13.2MB/s 
[?25hCollecting pytools>=2011.2
[?25l  Downloading https://files.pythonhosted.org/packages/b7/30/c9362a282ef89106768cba9d884f4b2e4f5dc6881d0c19b478d2a710b82b/pytools-2020.4.3.tar.gz (62kB)
[K     |████████████████████████████████| 71kB 9.9MB/s 
Collecting appdirs>=1.4.0
  Downloading https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl
Collecting mako
[?25l  Downloading https://files.pythonhosted.org/packages/a6/37/0e706200d22172eb8fa17d68a7ae22dec7631a0a92266634fb518a88a5b2/Mako-1.1.3-py2.py3-none-any.whl (75kB)
[K     |████████████████████████████████| 81kB 11.0MB/s 
Building wheels for collected packages: pycuda, pytools
  Building wheel for pycuda (setup.py) ..

In [2]:
import os

In [3]:
# Create the directory that later serves as the CUDA include directory.
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError when the directory is already there (e.g. on a second run).
# NOTE(review): no visible cell writes kernel_functions_for_math_1d.cu into this
# directory; presumably it has to be uploaded before the SourceModule cell runs.
os.makedirs("cuda", exist_ok=True)

In [4]:
import math
import numpy
import time
import pycuda.gpuarray
from pycuda.compiler import SourceModule

In [5]:
import pycuda.autoinit

In [6]:
# Absolute path of the ./cuda directory; it is handed to SourceModule as an include dir.
cuda_dir_relative = os.path.join(os.curdir, "cuda")
cuda_file_path = os.path.abspath(cuda_dir_relative)

In [7]:
# Compile the CUDA module. The source text is a single #include, so the actual
# kernel code must come from kernel_functions_for_math_1d.cu in cuda_file_path.
kernel_include_name = "kernel_functions_for_math_1d.cu"
kernel_include_path = os.path.join(cuda_file_path, kernel_include_name)
if not os.path.isfile(kernel_include_path):
    # Fail early with an actionable message rather than a compiler error about
    # a missing include: nothing in this notebook creates the file itself.
    raise FileNotFoundError(
        "CUDA source not found: {0}. Place {1} in {2} before running this cell.".format(
            kernel_include_path, kernel_include_name, cuda_file_path))
module = SourceModule("""
#include "kernel_functions_for_math_1d.cu"
""", include_dirs=[cuda_file_path])

In [8]:
# Fetch the kernel handle from the compiled module by name.
kernel_name = "plus_one_kernel"
plus_one_kernel = module.get_function(kernel_name)

In [9]:
# Problem size as a 32-bit integer (it is later passed straight to the kernel)
# and the matching int32 input 0, 1, ..., num_components - 1.
num_components = numpy.int32(10)
x = numpy.arange(0, num_components, dtype=numpy.int32)

In [10]:
# Device-side arrays: a zero-filled int32 output of num_components elements,
# and the input uploaded from the host array x.
y_gpu = pycuda.gpuarray.zeros(num_components, dtype=numpy.int32)
x_gpu = pycuda.gpuarray.to_gpu(x)

In [11]:
# Launch geometry: 256 threads per block along x, and enough blocks
# (rounded up) for the grid to span all num_components elements.
threads_per_block = (256, 1, 1)
num_blocks = math.ceil(num_components / threads_per_block[0])
blocks_per_grid = (num_blocks, 1, 1)

In [12]:
# Launch the kernel with the block/grid geometry defined for this problem size.
plus_one_kernel(
    num_components,
    y_gpu,
    x_gpu,
    grid=blocks_per_grid,
    block=threads_per_block,
)

In [13]:
# Fetch the output array from y_gpu and display it as the cell result.
y_host = y_gpu.get()
y_host

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int32)

In [14]:
# Measure CPU time
# time.perf_counter() is the clock meant for timing short intervals; time.time()
# is a wall clock that can be adjusted and may be too coarse for a sub-millisecond
# operation. Only the difference of the two readings is used afterwards.
# NOTE: `x = x + 1` rebinds x, so re-running this cell increments it again and x
# no longer matches the data that was uploaded to x_gpu.
time_start_cpu = time.perf_counter()
x = x + 1
time_end_cpu = time.perf_counter()

In [15]:
# Scale the elapsed seconds by 1000 so the report is in milliseconds.
elapsed_cpu_msec = 1000 * (time_end_cpu - time_start_cpu)
print("CPU calculation {0} [msec]".format(elapsed_cpu_msec))

CPU calculation 0.08797645568847656 [msec]


In [16]:
# Measure GPU time
import pycuda.driver

In [17]:
# Two CUDA events, reused by every GPU timing cell below: one recorded before
# the timed work and one after it.
time_start_gpu, time_end_gpu = (pycuda.driver.Event() for _ in range(2))

In [18]:
# Timed kernel launch: the start event is recorded, the kernel is launched, and
# the end event is recorded, in that order.
time_start_gpu.record()
plus_one_kernel(num_components, y_gpu, x_gpu, block=threads_per_block, grid=blocks_per_grid)
time_end_gpu.record()
# Synchronize on the end event before the elapsed time is queried in the next
# cell. As the cell's last expression, the returned Event is echoed as output.
time_end_gpu.synchronize()

<pycuda._driver.Event at 0x7fe16bd0bce0>

In [19]:
# Elapsed time between the two recorded events, reported with a [msec] label.
elapsed_gpu_msec = time_start_gpu.time_till(time_end_gpu)
print("GPU calculation {0} [msec]".format(elapsed_gpu_msec))

GPU calculation 0.1987839937210083 [msec]


In [20]:
# Second problem size: ten thousand elements (10 ** 4), again as int32,
# with the input 0, 1, ..., num_components - 1.
num_components = numpy.int32(10 ** 4)
x = numpy.arange(0, num_components, dtype=numpy.int32)

In [21]:
# Re-create the device arrays for the new size: zero-filled int32 output,
# then the input uploaded from the host array x.
y_gpu = pycuda.gpuarray.zeros(num_components, dtype=numpy.int32)
x_gpu = pycuda.gpuarray.to_gpu(x)

In [22]:
# Launch geometry for the new size: 256 threads per block along x, and the
# block count rounded up so the grid spans all num_components elements.
threads_per_block = (256, 1, 1)
num_blocks = math.ceil(num_components / threads_per_block[0])
blocks_per_grid = (num_blocks, 1, 1)

In [23]:
# Untimed launch at this size; the same call is repeated between event
# records in a later cell.
plus_one_kernel(
    num_components,
    y_gpu,
    x_gpu,
    grid=blocks_per_grid,
    block=threads_per_block,
)

In [24]:
# Measure CPU time
# time.perf_counter() is the clock meant for timing short intervals; time.time()
# is a wall clock that can be adjusted and may be too coarse for a sub-millisecond
# operation. Only the difference of the two readings is used afterwards.
# NOTE: `x = x + 1` rebinds x, so re-running this cell increments it again and x
# no longer matches the data that was uploaded to x_gpu.
time_start_cpu = time.perf_counter()
x = x + 1
time_end_cpu = time.perf_counter()

In [25]:
# Scale the elapsed seconds by 1000 so the report is in milliseconds.
elapsed_cpu_msec = 1000 * (time_end_cpu - time_start_cpu)
print("CPU calculation {0} [msec]".format(elapsed_cpu_msec))

CPU calculation 0.09751319885253906 [msec]


In [26]:
# Timed kernel launch: the start event is recorded, the kernel is launched, and
# the end event is recorded, in that order. The two Event objects are reused.
time_start_gpu.record()
plus_one_kernel(num_components, y_gpu, x_gpu, block=threads_per_block, grid=blocks_per_grid)
time_end_gpu.record()
# Synchronize on the end event before the elapsed time is queried in the next
# cell. As the cell's last expression, the returned Event is echoed as output.
time_end_gpu.synchronize()

<pycuda._driver.Event at 0x7fe16bd0bce0>

In [27]:
# Elapsed time between the two recorded events, reported with a [msec] label.
elapsed_gpu_msec = time_start_gpu.time_till(time_end_gpu)
print("GPU calculation {0} [msec]".format(elapsed_gpu_msec))

GPU calculation 0.12636800110340118 [msec]


In [28]:
# Third problem size: ten million elements (10 ** 7), again as int32,
# with the input 0, 1, ..., num_components - 1.
num_components = numpy.int32(10 ** 7)
x = numpy.arange(0, num_components, dtype=numpy.int32)

In [29]:
# Re-create the device arrays for the new size: zero-filled int32 output,
# then the input uploaded from the host array x.
y_gpu = pycuda.gpuarray.zeros(num_components, dtype=numpy.int32)
x_gpu = pycuda.gpuarray.to_gpu(x)

In [30]:
# Launch geometry for the new size: 256 threads per block along x, and the
# block count rounded up so the grid spans all num_components elements.
threads_per_block = (256, 1, 1)
num_blocks = math.ceil(num_components / threads_per_block[0])
blocks_per_grid = (num_blocks, 1, 1)

In [31]:
# Untimed launch at this size; the same call is repeated between event
# records in a later cell.
plus_one_kernel(
    num_components,
    y_gpu,
    x_gpu,
    grid=blocks_per_grid,
    block=threads_per_block,
)

In [32]:
# Measure CPU time
# time.perf_counter() is the clock meant for timing short intervals; time.time()
# is a wall clock that can be adjusted while the measurement is running.
# Only the difference of the two readings is used afterwards.
# NOTE: `x = x + 1` rebinds x, so re-running this cell increments it again and x
# no longer matches the data that was uploaded to x_gpu.
time_start_cpu = time.perf_counter()
x = x + 1
time_end_cpu = time.perf_counter()

In [33]:
# Scale the elapsed seconds by 1000 so the report is in milliseconds.
elapsed_cpu_msec = 1000 * (time_end_cpu - time_start_cpu)
print("CPU calculation {0} [msec]".format(elapsed_cpu_msec))

CPU calculation 16.999006271362305 [msec]


In [34]:
# Timed kernel launch: the start event is recorded, the kernel is launched, and
# the end event is recorded, in that order. The two Event objects are reused.
time_start_gpu.record()
plus_one_kernel(num_components, y_gpu, x_gpu, block=threads_per_block, grid=blocks_per_grid)
time_end_gpu.record()
# Synchronize on the end event before the elapsed time is queried in the next
# cell. As the cell's last expression, the returned Event is echoed as output.
time_end_gpu.synchronize()

<pycuda._driver.Event at 0x7fe16bd0bce0>

In [35]:
# Elapsed time between the two recorded events, reported with a [msec] label.
elapsed_gpu_msec = time_start_gpu.time_till(time_end_gpu)
print("GPU calculation {0} [msec]".format(elapsed_gpu_msec))

GPU calculation 0.49670401215553284 [msec]


In [36]:
# Time y_gpu.get() on its own: the events bracket only the get() call, with no
# kernel launch inside the timed region.
# NOTE(review): get() returned a host NumPy array in an earlier cell, so this
# presumably measures the device-to-host copy rather than computation.
time_start_gpu.record()
# (no kernel launch in this timed region)
y_gpu.get()
time_end_gpu.record()
# Synchronize on the end event before the elapsed time is queried in the next cell.
time_end_gpu.synchronize()


<pycuda._driver.Event at 0x7fe16bd0bce0>

In [37]:
# The timed region in the preceding cell contains only y_gpu.get() (the kernel
# launch there is commented out), so the figure is labelled as a transfer
# instead of reusing the "GPU calculation" label of the kernel timings.
elapsed_transfer_msec = time_start_gpu.time_till(time_end_gpu)
print("GPU to CPU transfer {0} [msec]".format(elapsed_transfer_msec))

GPU calculation 15.400383949279785 [msec]
