In [13]:
import time
from numba import cuda
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_uniform_float32
import numpy as np

In [2]:
@cuda.jit
def compute_pi(rng_states, iterations, out):
    """Monte Carlo estimate of pi, one independent estimate per GPU thread.

    Each thread draws ``iterations`` random (x, y) points using its own
    entry of ``rng_states``, counts how many satisfy ``x**2 + y**2 <= 1``,
    and stores ``4 * inside / iterations`` in ``out[thread_id]``.  Averaging
    ``out`` on the host combines the per-thread estimates.

    Parameters
    ----------
    rng_states : device array of xoroshiro128+ states, indexed by the
        thread's global 1-D index (one state per thread).
    iterations : number of random points each thread draws.
    out : 1-D array; element ``thread_id`` receives that thread's estimate.
    """
    thread_id = cuda.grid(1)

    # Guard against launch configurations that start more threads than
    # there are RNG states or output slots; without it the extra threads
    # would index past the end of the arrays.
    if thread_id >= out.shape[0] or thread_id >= rng_states.shape[0]:
        return

    # Draw random (x, y) points and count those falling inside the unit
    # circle; that fraction of the sampled square approximates pi / 4.
    inside = 0
    for _ in range(iterations):
        x = xoroshiro128p_uniform_float32(rng_states, thread_id)
        y = xoroshiro128p_uniform_float32(rng_states, thread_id)
        if x**2 + y**2 <= 1.0:
            inside += 1

    out[thread_id] = 4.0 * inside / iterations

In [14]:
# Launch geometry: the grid is `blocks` x `threads_per_block` threads.
blocks = 4096
threads_per_block = 64

# One output slot and one RNG state per launched thread.
out = np.zeros(shape=blocks * threads_per_block, dtype=np.float32)
rng_states = create_xoroshiro128p_states(blocks * threads_per_block, seed=1)

In [19]:
# Number of random points drawn by each GPU thread.  Named `iterations`
# rather than `iter` so the Python builtin `iter()` is not shadowed for
# the rest of the kernel session.
iterations = 10000

# NOTE: the first launch of a @cuda.jit kernel may also include JIT
# compilation time; re-run this cell for a steady-state measurement.
start = time.perf_counter()
compute_pi[blocks, threads_per_block](rng_states, iterations, out)
# Kernel launches are asynchronous with respect to the host: wait for the
# device to finish before stopping the clock so the full run is timed.
cuda.synchronize()
end = time.perf_counter()
telapsed = end - start

total_iterations = iterations * threads_per_block * blocks
print('pi:', out.mean())
print(f"Time elapsed = {telapsed:.6g} s -> {total_iterations / telapsed:.6g} iterations/sec")

pi: 3.1415925
Time elapsed = 0.493069 s -> 5.31658e+09 iterations/sec
