#  Without CUDA

In [23]:
import numpy as np
from timeit import default_timer as timer
import cProfile

In [31]:
def vectoradd(a, b, c):
    """Add ``a`` and ``b`` element by element, storing each sum in ``c``.

    This is the loop-based baseline: one Python-level iteration per element,
    wrapped in a progress bar when ``tqdm`` is installed.

    Args:
        a: First operand; must expose ``.size`` and support integer indexing.
        b: Second operand; indexed at the same positions as ``a``.
        c: Output container; ``c[i]`` is overwritten in place for every
            ``i`` in ``range(a.size)``.

    Returns:
        None. The result is delivered through ``c``.
    """
    # Import locally so this cell works on a fresh kernel: the notebook's
    # import cell does not bring ``tqdm`` into scope.
    try:
        from tqdm import tqdm
    except ImportError:
        # No progress bar available; iterate without one rather than failing.
        def tqdm(iterable):
            return iterable

    for i in tqdm(range(a.size)):
        c[i] = a[i] + b[i]

In [32]:
def without_cuda():
    """Run the loop-based vector addition on 32,000,000 float32 elements.

    Builds two arrays of ones and a zero-filled output array, fills the
    output through ``vectoradd(A, B, C)``, then prints the first and last
    five results.
    """
    N = 32000000

    A = np.ones(N, dtype=np.float32)
    B = np.ones(N, dtype=np.float32)
    C = np.zeros(N, dtype=np.float32)  # output buffer, filled by vectoradd

    vectoradd(A, B, C)

    print("C[:5] = " + str(C[:5]))
    # The second print used to repeat the head of the array; show the tail
    # instead so both ends of the result are checked.
    print("C[-5:] = " + str(C[-5:]))
    
# Profile the complete loop-based run when this cell executes as the main module.
if __name__ == "__main__":
    cProfile.run("without_cuda()")

100%|██████████| 32000000/32000000 [00:17<00:00, 1857512.33it/s]


C[:5] = [2. 2. 2. 2. 2.]
C[:5] = [2. 2. 2. 2. 2.]
         32775929 function calls (32775919 primitive calls) in 17.260 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(amax)
        2    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(amin)
        2    0.000    0.000    0.027    0.013 <__array_function__ internals>:2(copyto)
        3    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:1009(_handle_fromlist)
        1   12.146   12.146   17.229   17.229 <ipython-input-31-ba7be37d253c>:1(vectoradd)
        1    0.000    0.000   17.256   17.256 <ipython-input-32-606f2da09c1c>:1(without_cuda)
        1    0.004    0.004   17.260   17.260 <string>:1(<module>)
        2    0.000    0.000    0.000    0.000 _asarray.py:16(asarray)
        1    0.000    0.000    0.000    0.000 _monitor.py:32(__init__)
        1    0.000  

#  With CUDA

In [33]:
import numpy as np
from timeit import default_timer as timer
import cProfile
from numba import vectorize

In [34]:
def vectoradd(a, b):
    """Return ``a + b``.

    For NumPy arrays this is a single element-wise addition that yields a
    new array; the inputs are not modified.

    NOTE(review): no ``numba.vectorize`` decorator is applied to this
    function even though ``vectorize`` is imported and the section is titled
    "With CUDA" -- confirm whether the decorator was dropped by mistake.
    """
    total = a + b
    return total

In [35]:
def with_cuda():
    """Run the array-level vector addition on 32,000,000 float32 elements.

    Builds two arrays of ones, adds them with ``vectoradd(A, B)``, then
    prints the first and last five results.
    """
    N = 32000000

    A = np.ones(N, dtype=np.float32)
    B = np.ones(N, dtype=np.float32)

    # vectoradd returns its result, so no zero-filled output buffer is
    # allocated beforehand (the old one was overwritten without being read).
    C = vectoradd(A, B)

    print("C[:5] = " + str(C[:5]))
    # The second print used to repeat the head of the array; show the tail
    # instead so both ends of the result are checked.
    print("C[-5:] = " + str(C[-5:]))
    
# Profile the complete array-level run when this cell executes as the main module.
if __name__ == "__main__":
    cProfile.run("with_cuda()")

C[:5] = [2. 2. 2. 2. 2.]
C[:5] = [2. 2. 2. 2. 2.]
         404 function calls (394 primitive calls) in 0.085 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(amax)
        2    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(amin)
        2    0.000    0.000    0.028    0.014 <__array_function__ internals>:2(copyto)
        1    0.055    0.055    0.055    0.055 <ipython-input-34-06aadaa186b6>:1(vectoradd)
        1    0.000    0.000    0.083    0.083 <ipython-input-35-f4711a7a1b51>:1(with_cuda)
        1    0.001    0.001    0.085    0.085 <string>:1(<module>)
        2    0.000    0.000    0.000    0.000 _asarray.py:16(asarray)
        4    0.000    0.000    0.000    0.000 _ufunc_config.py:139(geterr)
        4    0.000    0.000    0.000    0.000 _ufunc_config.py:39(seterr)
        2    0.000    0.000    0.000    0.000 _ufunc_co