In [22]:
!hostname

nikizadeh-nzcacobaltgpuoffloadcopyp3-00012-1-0001


In [23]:
!nvidia-smi

Tue Aug 20 16:32:28 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-16GB            Off| 00000000:00:1E.0 Off |                    0 |
| N/A   32C    P0               38W / 300W|   1074MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [24]:
!conda env list

# conda environments:
#
base                     /contrib/Niki.Zadeh/opt/miniconda3
platforms                /contrib/Niki.Zadeh/opt/miniconda3/envs/platforms
plattorch                /contrib/Niki.Zadeh/opt/miniconda3/envs/plattorch
plattorch2            *  /contrib/Niki.Zadeh/opt/miniconda3/envs/plattorch2
                         /home/Niki.Zadeh/pw/.miniconda
                         /home/Niki.Zadeh/source



In [25]:
#!which pip

In [26]:
#!pip install numba

In [27]:
import numpy as np

a = np.array([1, 2, 3, 4])
b = np.array([10, 20, 30, 40])
c = np.arange(4*4).reshape((4,4))

In [28]:
%timeit np.add(b, c)   # NumPy on CPU

1.47 µs ± 15.1 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [29]:
from numba import vectorize
@vectorize(['int64(int64, int64)'], target='cuda') # Type signature and target are required for the GPU
def add_ufunc(x, y):
    return x + y

In [30]:
%timeit add_ufunc(b, c) # Numba on GPU



1.27 ms ± 3.73 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [31]:
#import scipy.stats # for definition of gaussian distribution, so we can compare CPU to GPU time
#norm_pdf = scipy.stats.norm
#%timeit norm_pdf.pdf(x, loc=mean, scale=sigma)

In [32]:
import math # Note that for the CUDA target, we need to use the scalar functions from the math module, not NumPy

SQRT_2PI = np.float32((2*math.pi)**0.5)  # Precompute this constant as a float32.  Numba will inline it at compile time.

@vectorize(['float32(float32, float32, float32)'], target='cuda')
def gaussian_pdf(x, mean, sigma):
    '''Compute the value of a Gaussian probability density function at x with given mean and sigma.'''
    return math.exp(-0.5 * ((x - mean) / sigma)**2) / (sigma * SQRT_2PI)


@vectorize
def cpu_gaussian_pdf(x, mean, sigma):
    '''Compute the value of a Gaussian probability density function at x with given mean and sigma.'''
    return math.exp(-0.5 * ((x - mean) / sigma)**2) / (sigma * SQRT_2PI)

In [33]:
import numpy as np
# Evaluate the Gaussian a million times!
x = np.random.uniform(-3, 3, size=1000000).astype(np.float32)
mean = np.float32(0.0)
sigma = np.float32(1.0)

# Quick test on a single element just to make sure it works
gaussian_pdf(x[0], 0.0, 1.0)



array([0.02808081], dtype=float32)

In [34]:
%timeit gaussian_pdf(x, mean, sigma)

3.44 ms ± 72 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [35]:
%timeit cpu_gaussian_pdf(x, mean, sigma)

26.6 ms ± 169 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [36]:
from numba import cuda

@cuda.jit(device=True)
def polar_to_cartesian(rho, theta):
    x = rho * math.cos(theta)
    y = rho * math.sin(theta)
    return x, y

@vectorize(['float32(float32, float32, float32, float32)'], target='cuda')
def polar_distance(rho1, theta1, rho2, theta2):
    x1, y1 = polar_to_cartesian(rho1, theta1) # We can use device functions inside our GPU ufuncs
    x2, y2 = polar_to_cartesian(rho2, theta2)
    
    return ((x1 - x2)**2 + (y1 - y2)**2)**0.5

In [37]:
n = 1000000
rho1 = np.random.uniform(0.5, 1.5, size=n).astype(np.float32)
theta1 = np.random.uniform(-np.pi, np.pi, size=n).astype(np.float32)
rho2 = np.random.uniform(0.5, 1.5, size=n).astype(np.float32)
theta2 = np.random.uniform(-np.pi, np.pi, size=n).astype(np.float32)

In [38]:
%timeit polar_distance(rho1, theta1, rho2, theta2)

6.69 ms ± 886 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [39]:
def polar_to_cartesian_np(rho, theta):
    x = rho * np.cos(theta)
    y = rho * np.sin(theta)
    return x, y

def polar_distance_np(rho1, theta1, rho2, theta2):
    x1, y1 = polar_to_cartesian_np(rho1, theta1) # We can use device functions inside our GPU ufuncs
    x2, y2 = polar_to_cartesian_np(rho2, theta2)
    
    return ((x1 - x2)**2 + (y1 - y2)**2)**0.5

In [40]:
%timeit polar_distance_np(rho1, theta1, rho2, theta2)

28.7 ms ± 589 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [41]:
from numba import jit, cuda
import numpy as np
# to measure exec time
from timeit import default_timer as timer 

# normal function to run on cpu
def func(a):
    for i in range(100000000):
        a[i]+= 1
    return a

@jit 
def func_numba(a):
    for i in range(100000000):
        a[i]+= 1
    return a

@jit 
def func_numba_nonvecable(a):
    asum=0.0
    for i in range(100000000):
        a[i]+= 1
        asum += a[i]
    return asum

def func_nonvecable(a):
    asum=0.0
    for i in range(100000000):
        a[i]+= 1
        asum += a[i]
    return asum



n = 100000000
a = np.ones(n, dtype = np.float64)
start = timer()
b=func(a)
print("numpy:                 ", timer()-start, " result= ",b.sum()) 

a = np.ones(n, dtype = np.float64)
start = timer()
b=func_nonvecable(a)
print("numpy nonvecable:     ", timer()-start, " result= ",b)

a = np.ones(n, dtype = np.float64)
start = timer()
b=func_numba_nonvecable(a)
print("numba jit nonvecable:", timer()-start, " result= ",b)

a = np.ones(n, dtype = np.float64)
start = timer()
b=func_numba(a)
print("numba jit:           ", timer()-start, " result= ",b.sum())


numpy:                  25.569903383031487  result=  200000000.0
numpy nonvecable:      43.80844003899256  result=  200000000.0
numba jit nonvecable: 0.24698784801876172  result=  200000000.0
numba jit:            0.24826496798777953  result=  200000000.0


In [42]:
from numba import vectorize
@vectorize(['float32(float32)'], target='cuda')
def func_add1_numba_cuda(a):
    a += 1
    return a


@jit
def func_add1_numba(a):
    a += 1
    return a

def func_add1(a):
    a += 1
    return a

a = np.ones(n).astype(np.float32)
start = timer()
b=func_add1_numba_cuda(a)
print("numba jit cuda:      ", timer()-start, " result= ",b.sum())

a = np.ones(n).astype(np.float32)
start = timer()
b=func_add1_numba(a)
print("numba jit     :      ", timer()-start, " result= ",b.sum())

a = np.ones(n).astype(np.float32)
start = timer()
b=func_add1(a)
print("numpy         :      ", timer()-start, " result= ",b.sum())

numba jit cuda:       0.529195251991041  result=  200000000.0
numba jit     :       0.352728470985312  result=  200000000.0
numpy         :       0.08615502197062597  result=  200000000.0
