# Prototype timing arccos with gt4py
This notebook compares different techniques for timing a gt4py arccos kernel.

In [7]:
import os
import gt4py.next as gtx
from gt4py.next import broadcast
from gt4py.next.ffront.fbuiltins import arccos
import numpy as np

In [8]:
# Named axes used by the field types below.
I, J, K = (gtx.Dimension(axis_name) for axis_name in ("I", "J", "K"))

# float64 field type aliases: one over I only, one over (I, J, K).
IField = gtx.Field[gtx.Dims[I], gtx.float64]
IJKField = gtx.Field[gtx.Dims[I, J, K], gtx.float64]

In [9]:
# Fixed seed so that x_np / ref_np are identical on every Restart & Run All.
SEED = 42
n_max = 100_000_000  # number of samples; an int, so no cast is needed below
rng = np.random.default_rng(SEED)
x_np = 2 * rng.random(n_max) - 1  # uniform samples in [-1, 1)

two_by_pi = 2 / np.pi


def arccos_rescaled_np(x):
    """NumPy reference for the rescaled arccos: ``2/pi * arccos(x) - 1``.

    arccos maps [-1, 1] onto [0, pi], so the rescaled result lies in [-1, 1]
    again and the function can be applied to its own output.
    """
    return two_by_pi * np.arccos(x) - 1


# Reference result: the rescaled arccos applied four times in a row.
ref_np = arccos_rescaled_np(arccos_rescaled_np(arccos_rescaled_np(arccos_rescaled_np(x_np))))

In [10]:
# Show the reference result next to the raw input.
display(ref_np, x_np)

# The input must be one-dimensional: the domain below spans its full length along I.
assert x_np.ndim == 1
domain_all = gtx.domain({I: (0, len(x_np))})

array([-0.12665501, -0.24328836,  0.08006521, ..., -0.13992951,
        0.10686117, -0.03258796])

array([-0.6632596 , -0.93167756,  0.45773221, ..., -0.71086848,
        0.58310831, -0.19629008])

In [12]:
# Single, unscaled arccos of every element of x.
@gtx.field_operator
def arccos_once(x: IField) -> IField:
    return arccos(x)

### Rescaled variants: the "2^k" tags give the number of arccos applications.

@gtx.field_operator # 2^0
def arccos_rescaled(x: IField) -> IField:
    # 2/pi * arccos(x) - 1: arccos maps [-1, 1] onto [0, pi], so the rescaled
    # result is back in [-1, 1] and can be fed into arccos again.
    # Both scalar constants are broadcast over the I dimension.
    res = broadcast(2.0 / 3.141592653589793, (I,)) * arccos(x) - broadcast(1.0, (I,))
    return res

# Two chained applications of arccos_rescaled.
@gtx.field_operator # 2^1
def arccos_twice(x: IField) -> IField:
    return arccos_rescaled(arccos_rescaled(x))

# Four chained applications of arccos_rescaled; this is the operator timed below.
@gtx.field_operator # 2^2
def arccos_four_times(x: IField) -> IField:
    return arccos_rescaled(arccos_rescaled(arccos_rescaled(arccos_rescaled(x))))

In [15]:
# Backend under test. Alternatives tried in this cell: None, gtx.gtfn_cpu.
backend = gtx.gtfn_gpu
# Four-fold operator bound to that backend
# (single-application variant: arccos_once.with_backend(backend)).
gtx_4arcosc = arccos_four_times.with_backend(backend)

# Wrap the NumPy reference and input as fields, using the backend as allocator.
ref, x = (
    gtx.as_field(data=host_array, domain=domain_all, allocator=backend)
    for host_array in (ref_np, x_np)
)

# Uninitialised output field with the same domain and dtype as the input.
out_field = gtx.empty(domain=domain_all, dtype=x.dtype, allocator=backend)

In [16]:
%timeit gtx_4arcosc(x=x, out=out_field, domain=domain_all)

3.1 ms ± 311 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
# Element-wise comparison of the field result against the NumPy reference
# (np.isclose default tolerances), reduced to a single boolean.
np.all(np.isclose(ref_np, out_field.asnumpy()))

np.True_

In [20]:
# One program plus matching input/output fields per backend, so that CPU and
# GPU can be timed side by side.
# NOTE(review): `backend` is rebound here but not used by the lines below.
backend = gtx.gtfn_cpu
gtx_4arcosc_cpu = arccos_four_times.with_backend(gtx.gtfn_cpu)
gtx_4arcosc_gpu = arccos_four_times.with_backend(gtx.gtfn_gpu)

x_cpu = gtx.as_field(data=x_np, domain=domain_all, allocator=gtx.gtfn_cpu)
x_gpu = gtx.as_field(data=x_np, domain=domain_all, allocator=gtx.gtfn_gpu)
# The output dtype comes from the matching input field, not from the global
# `x` of an earlier cell, so this cell does not depend on that cell's state.
out_field_cpu = gtx.empty(domain=domain_all, dtype=x_cpu.dtype, allocator=gtx.gtfn_cpu)
out_field_gpu = gtx.empty(domain=domain_all, dtype=x_gpu.dtype, allocator=gtx.gtfn_gpu)

In [21]:
%timeit gtx_4arcosc_cpu(x=x_cpu, out=out_field_cpu, domain=domain_all)

34.8 ms ± 6.76 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
# Time the four-fold arccos on the GPU backend, with GPU-allocated input and output fields.
%timeit gtx_4arcosc_gpu(x=x_gpu, out=out_field_gpu, domain=domain_all)

3.13 ms ± 21.2 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [23]:
import timeit
# One timing batch of n_reps back-to-back calls, reported as the mean per call.
n_reps = 100
execution_time = timeit.Timer(
    "gtx_4arcosc_gpu(x=x_gpu, out=out_field_gpu, domain=domain_all)",
    globals=globals(),
).timeit(number=n_reps)
print(f"Average time per run: {execution_time / n_reps:.6f} seconds")

Average time per run: 0.003197 seconds


In [24]:
import timeit
number = 100  # calls per timing batch (inner loop)
repeats = 10  # number of timing batches (outer loop)
# Each entry of `times` is the total wall time of one batch of `number` calls.
times = timeit.Timer(
    "gtx_4arcosc_gpu(x=x_gpu, out=out_field_gpu, domain=domain_all)",
    globals=globals(),
).repeat(repeat=repeats, number=number)
mean_time = np.mean(times)
std_time = np.std(times)  # computed per batch; not included in the printout below
print(f"Average time per run: {mean_time / number:.6f} seconds")

Average time per run: 0.003170 seconds


## Test if cupy synchronization is required

In [28]:
import timeit
import statistics
import cupy as cp  # Required for GPU sync

def gt4py_timeit(func, *args, number=10, repeat=5, warm_up=1, synchronize=True, **kwargs):
    """
    Time a GT4Py function with optional GPU synchronization.

    Runs ``warm_up + repeat`` timing batches of ``number`` back-to-back calls
    each. The per-execution times of the warm-up batches are printed and then
    discarded; the statistics are computed over the remaining ``repeat`` batches.

    Args:
        func: Function to time.
        *args, **kwargs: Arguments to pass to the function.
        number: How many executions per timing batch (must be >= 1).
        repeat: How many timing batches enter the statistics (must be >= 1).
        warm_up: How many leading batches are run, printed and discarded
            (must be >= 0).
        synchronize: If True, synchronize GPU (CUDA device 0, via CuPy)
            before/after timing each batch.

    Returns:
        mean_time_per_exec: Mean execution time (seconds).
        std_time_per_exec: Standard deviation (seconds).
        all_times: List of per-execution times (seconds), one per kept batch.

    Raises:
        ValueError: If ``number`` or ``repeat`` is < 1, or ``warm_up`` is < 0.
    """
    # Reject counts that would divide by zero, produce NaN statistics, or
    # mis-slice the warm-up batches.
    if number < 1:
        raise ValueError(f"number must be >= 1, got {number}")
    if repeat < 1:
        raise ValueError(f"repeat must be >= 1, got {repeat}")
    if warm_up < 0:
        raise ValueError(f"warm_up must be >= 0, got {warm_up}")

    times = []

    for _batch in range(repeat + warm_up):
        if synchronize:
            # Synchronize before starting the clock so earlier work is not timed.
            cp.cuda.Device(0).synchronize()

        start = timeit.default_timer()

        for _call in range(number):
            func(*args, **kwargs)

        if synchronize:
            # Synchronize again so the batch has finished when the clock stops.
            cp.cuda.Device(0).synchronize()

        end = timeit.default_timer()
        times.append((end - start) / number)  # Per-execution time

    # Show the warm-up timings, then drop them from the statistics.
    for warm_up_time in times[:warm_up]:
        print("    ", warm_up_time)
    times = times[warm_up:]
    mean_time = np.mean(times)
    std_time = np.std(times)

    # Not every callable has __name__ (e.g. functools.partial); fall back to repr.
    label = getattr(func, "__name__", None) or repr(func)
    print(f"{label} mean time per execution: {mean_time:.6e} s ± {std_time:.6e} s over {repeat} repeats with {warm_up} warmups")

    return mean_time, std_time, times


In [29]:
# GPU timing with explicit device synchronization around every batch.
gt4py_timeit(
    gtx_4arcosc_gpu,
    number=number,
    repeat=repeats,
    warm_up=1,
    synchronize=True,
    x=x_gpu,
    out=out_field_gpu,
    domain=domain_all,
)


     0.0032729881100021885
arccos_four_times mean time per execution: 3.219137e-03 s ± 8.199510e-06 s over 10 repeats with 1 warmups


(np.float64(0.0032191365610015053),
 np.float64(8.19950985000852e-06),
 [0.003217868930005352,
  0.0032195104700076626,
  0.0032086868100122956,
  0.003215228370008845,
  0.0032077297199975875,
  0.0032230844399964553,
  0.003222750049990282,
  0.0032209206599873143,
  0.0032169697500103213,
  0.00323861640999894])

In [27]:
# Same measurement without explicit device synchronization, for comparison.
gt4py_timeit(
    gtx_4arcosc_gpu,
    number=number,
    repeat=repeats,
    warm_up=1,
    synchronize=False,
    x=x_gpu,
    out=out_field_gpu,
    domain=domain_all,
)


     0.003259036260005814
arccos_four_times mean time per execution: 3.203960e-03 s ± 6.854653e-06 s over 10 repeats with 1 warmups


(np.float64(0.0032039603140019608),
 np.float64(6.854652866948029e-06),
 [0.0032063067600029173,
  0.003197313140008191,
  0.0032051416900139885,
  0.0031969649900020158,
  0.003197736160000204,
  0.0032157403800010796,
  0.0032027894500060937,
  0.003207083049983339,
  0.003214692740002647,
  0.0031958347799991315])



<div class="alert alert-block alert-success">
The measurements above give almost the same times with and without the CuPy synchronization. This suggests that a gt4py call only returns once the computation has actually finished, so we do not need to synchronize manually.
</div>

----------
## Test time_arccos from arccos_gt4py

In [None]:
# Test time_arccos with and without the transfer back and forth to the GPU
from arccos_gt4py import time_arccos

In [None]:
# Timing with incl_transfer=True.
# NOTE(review): the meaning of the positional arguments 4 and 128 is defined in arccos_gt4py — confirm there.
time_arccos(
    4,
    128,
    number=1,
    repeats=10,
    do_print=True,
    incl_transfer=True,
)

### 4 128 -1 0.17052842390003206


(4, 128, np.float64(0.17052842390003206))

In [None]:
# Same call with incl_transfer=False, for comparison with the run above.
time_arccos(
    4,
    128,
    number=1,
    repeats=10,
    do_print=True,
    incl_transfer=False,
)

### 4 128 -1 0.004496450499573257


(4, 128, np.float64(0.004496450499573257))

--------
## Time manually

In [None]:
import time
import cupy as cp

In [None]:
# Time a single GPU call manually.
# Synchronize first so that work still pending on the device is not billed to this call.
cp.cuda.runtime.deviceSynchronize()
tic = time.perf_counter()
gtx_4arcosc_gpu(x=x_gpu, out=out_field_gpu, domain=domain_all)
# Synchronize again before stopping the clock so the call has completed.
cp.cuda.runtime.deviceSynchronize()
toc = time.perf_counter()

In [None]:
# Elapsed wall-clock time of the manually timed call, in seconds.
toc - tic

0.007939551000163192

-------
## Test path handling

In [None]:
import os
# Current working directory of the kernel; the relative paths below are resolved against it.
os.getcwd()

'/users/class182/project-overlapping/src/gt4py'

In [None]:
# Entries of the current working directory.
os.listdir()

['.gt4py_cache',
 '.ipynb_checkpoints',
 'TEMP_arccos.ipynb',
 '__pycache__',
 'arccos_gt4py.py',
 'run_arccos_gt4py.py',
 'run_arccos_gt4py.sh']

In [None]:
# Entries of the build directory, addressed relative to the working directory.
os.listdir("../../build/gt4py/")

['.gt4py_cache',
 '.ipynb_checkpoints',
 'TEMP_arccos.ipynb',
 '__pycache__',
 'arccos_gt4py.py',
 'run_arccos_gt4py.py',
 'run_arccos_gt4py.sh']

In [None]:
# Check that the run script can be reached through the relative build path.
os.path.isfile("../../build/gt4py/run_arccos_gt4py.py")

True