# Why GT4Py?

# Example 1: A simple point-wise stencil

First, we compare NumPy, CuPy and GT4Py implementations of the point-wise stencil
```
d[i, j, k] = a[i, j, k] + b[i, j, k] - c[i, j, k]
```
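Timings are measured using the `%timeit` magic command. This command times a single statement or function call by averaging over multiple runs. Note that `%timeit` only measures host-side wall-clock time and has no knowledge of the GPU: because GPU kernels are launched asynchronously, timings of GPU code are reliable only if the device is synchronized before the timed statement returns (for CuPy, e.g. by calling `cp.cuda.Device().synchronize()` inside the timed function, or by using `cupyx.profiler.benchmark` instead).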

### NumPy

In [1]:
import numpy as np

# Extent of the three array axes; reused by every array allocated in this cell.
shape = (512, 512, 128)

def f_numpy(a, b, c, d):
    """Write the point-wise result ``a + b - c`` into ``d`` in place.

    ``d`` must already be allocated; its entire content is overwritten
    and nothing is returned.
    """
    combined = a + b - c
    d[...] = combined
    
# Three independent random inputs (drawn in the order a, b, c) and an
# uninitialised output buffer of matching shape and dtype.
a, b, c = (np.random.rand(*shape) for _ in range(3))
d = np.empty_like(a)

%timeit f_numpy(a, b, c, d)

75.9 ms ± 1.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### CuPy

In [2]:
import cupy as cp

# Same extent as in the NumPy cell, so both versions operate on equally sized arrays.
shape = (512, 512, 128)

def f_cupy(a, b, c, d):
    """Write the point-wise result ``a + b - c`` into ``d`` in place.

    The body only relies on ``+``, ``-`` and ellipsis assignment; in this
    notebook it is called with CuPy arrays.  Nothing is returned.
    """
    combined = a + b - c
    d[...] = combined
    
# Three random inputs generated with NumPy and converted with cp.asarray
# (in the order a, b, c), plus an uninitialised CuPy output buffer.
a, b, c = (cp.asarray(np.random.rand(*shape)) for _ in range(3))
d = cp.empty_like(a)

%timeit f_cupy(a, b, c, d)

ModuleNotFoundError: No module named 'cupy'

### GT4Py

In [6]:
import numpy as np

import gt4py.next as gtx
from gt4py.next.iterator.embedded import MutableLocatedField
from gt4py.next import neighbor_sum, where, Dims


def random_field(
    domain: gtx.Domain, low: float = -1.0, high: float = 1.0, *, allocator=None
) -> MutableLocatedField:
    """Return a field over ``domain`` filled with uniform random samples.

    Samples are drawn with ``numpy.random.Generator.uniform`` between
    ``low`` and ``high``.  A fresh, unseeded generator is created on every
    call, so the values differ from call to call.  ``allocator`` is
    forwarded unchanged to ``gtx.as_field``.
    """
    rng = np.random.default_rng()
    samples = rng.uniform(low=low, high=high, size=domain.shape)
    return gtx.as_field(domain, samples, allocator=allocator)



In [20]:
import gt4py.next as gtx
import numpy as np
import os

# Backend selector; must be one of the keys of backend_str_to_backend below.
backend = "cpu"

# Maps the selector string to the GT4Py backend object.  The "None" entry
# maps to None (presumably GT4Py's default, non-compiled execution -- confirm
# against the GT4Py documentation).
backend_str_to_backend = {"None": None, "cpu": gtx.gtfn_cpu, "gpu": gtx.gtfn_gpu}

# A KeyError here means `backend` is not one of the supported selector strings.
actual_backend = backend_str_to_backend[backend]

# Named dimensions that label the three field axes.
I = gtx.Dimension("I")
J = gtx.Dimension("J")
K = gtx.Dimension("K")

# One (start, stop) pair per dimension.  NOTE(review): this matches the
# (512, 512, 128) arrays of the NumPy cell if `stop` is exclusive -- confirm.
field_domain = gtx.domain({
    I: (0, 512),
    J: (0, 512),
    K: (0, 128),
})

# Type alias for a float64 field over (I, J, K); used in the operator signature.
IJKField = gtx.Field[gtx.Dims[I, J, K], gtx.float64]

# Point-wise stencil d = a + b - c written as a GT4Py field operator.
# The operator returns its result; the destination field is supplied by the
# caller through the `out=` keyword when the operator is invoked.
@gtx.field_operator
def f_gt4py(a: IJKField, b: IJKField, c: IJKField) -> IJKField:
    result = a + b - c
    return result
        
# Three random input fields (created in the order a, b, c) and a
# zero-initialised output field, all allocated through the selected backend.
a, b, c = (random_field(field_domain, allocator=actual_backend) for _ in range(3))
d = gtx.zeros(field_domain, dtype=gtx.float64, allocator=actual_backend)

# Bind the operator to the backend once; the timed statement reuses this object.
f_gt4py_stencil = f_gt4py.with_backend(actual_backend)

# Untimed first call, made before the %timeit measurement that follows
# (presumably so that one-off compilation cost is excluded -- confirm).
f_gt4py_stencil(
    a=a, b=b, c=c, out=d, domain=field_domain, offset_provider={"_IOff": I, "_JOff": J}
)

%timeit f_gt4py_stencil(a=a, b=b, c=c, out=d, domain=field_domain, offset_provider={"_IOff": I, "_JOff": J})


38.7 ms ± 1.38 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Example 2: Laplacian

The next example compares NumPy, CuPy and GT4Py implementations of the Laplacian stencil from the stencil2d code that we have been working with:
```
lap[i, j, k] = - 4 * phi[  i,   j, k] 
               +     phi[i-1,   j, k] 
               +     phi[i+1,   j, k] 
               +     phi[  i, j-1, k] 
               +     phi[  i, j+1, k]
```
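Timings are measured using the `%timeit` magic command. This command times a single statement or function call by averaging over multiple runs. Note that `%timeit` only measures host-side wall-clock time and has no knowledge of the GPU: because GPU kernels are launched asynchronously, timings of GPU code are reliable only if the device is synchronized before the timed statement returns (for CuPy, e.g. by calling `cp.cuda.Device().synchronize()` inside the timed function, or by using `cupyx.profiler.benchmark` instead).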

### NumPy

In [21]:
import numpy as np

# Extent of the three array axes; the Laplacian below acts on the first two.
shape = (512, 512, 128)

def lap_numpy(phi, lap):
    """Store the 5-point Laplacian of ``phi`` in the interior of ``lap``.

    The stencil acts on the first two axes; any trailing axes are carried
    along unchanged.  Only ``lap[1:-1, 1:-1]`` is written, so the outermost
    ring of ``lap`` along those two axes keeps whatever it held before.
    Nothing is returned.
    """
    inner = slice(1, -1)      # interior points
    minus = slice(None, -2)   # interior shifted by -1
    plus = slice(2, None)     # interior shifted by +1

    center = phi[inner, inner]
    # Summation order is kept as centre, i-1, i+1, j-1, j+1.
    lap[inner, inner] = (
        -4.0 * center
        + phi[minus, inner]
        + phi[plus, inner]
        + phi[inner, minus]
        + phi[inner, plus]
    )
    
# Random input and an uninitialised output buffer.  lap_numpy only writes
# lap[1:-1, 1:-1], so the outer ring of `lap` along the first two axes holds
# arbitrary values after the call.
phi = np.random.rand(*shape)
lap = np.empty_like(phi)

%timeit lap_numpy(phi, lap)

269 ms ± 23.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### CuPy

In [5]:
import cupy as cp

# Same extent as in the NumPy Laplacian cell, so both versions operate on equally sized arrays.
shape = (512, 512, 128)

def lap_cupy(phi, lap):
    """Store the 5-point Laplacian of ``phi`` in the interior of ``lap``.

    The body uses only slicing and arithmetic; in this notebook it is called
    with CuPy arrays.  The stencil acts on the first two axes, and only
    ``lap[1:-1, 1:-1]`` is written, so the outermost ring of ``lap`` along
    those axes keeps whatever it held before.  Nothing is returned.
    """
    inner = slice(1, -1)      # interior points
    minus = slice(None, -2)   # interior shifted by -1
    plus = slice(2, None)     # interior shifted by +1

    center = phi[inner, inner]
    # Summation order is kept as centre, i-1, i+1, j-1, j+1.
    lap[inner, inner] = (
        -4.0 * center
        + phi[minus, inner]
        + phi[plus, inner]
        + phi[inner, minus]
        + phi[inner, plus]
    )
    
# Random input generated with NumPy and converted with cp.asarray, plus an
# uninitialised CuPy output buffer.  lap_cupy only writes lap[1:-1, 1:-1], so
# the outer ring of `lap` along the first two axes holds arbitrary values
# after the call.
phi = cp.asarray(np.random.rand(*shape))
lap = cp.empty_like(phi)

%timeit lap_cupy(phi, lap)

7.74 ms ± 37.5 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### GT4Py

In [25]:
import gt4py.next as gtx
import numpy as np
import os

# Backend selector; must be one of the keys of backend_str_to_backend below.
backend = "cpu"

# Maps the selector string to the GT4Py backend object.  The "None" entry
# maps to None (presumably GT4Py's default, non-compiled execution -- confirm
# against the GT4Py documentation).
backend_str_to_backend = {"None": None, "cpu": gtx.gtfn_cpu, "gpu": gtx.gtfn_gpu}

# A KeyError here means `backend` is not one of the supported selector strings.
actual_backend = backend_str_to_backend[backend]

# Named dimensions that label the three field axes.
I = gtx.Dimension("I")
J = gtx.Dimension("J")
K = gtx.Dimension("K")

# Allocation domain: extends 4 points beyond the 0..512 / 0..512 / 0..128
# compute domain (defined further down) on every side of every dimension.
# The Laplacian only reads offsets of +/-1 along I and J, so this halo is
# wider than the stencil strictly needs, and K needs none.
field_domain = gtx.domain({
    I: (-4, 516),
    J: (-4, 516),
    K: (-4, 132),
})

# Type alias for a float64 field over (I, J, K); used in the operator signature.
IJKField = gtx.Field[gtx.Dims[I, J, K], gtx.float64]

# 5-point Laplacian in the I/J plane written as a GT4Py field operator.
# The four shifted reads correspond to the phi[i-1], phi[i+1], phi[j-1] and
# phi[j+1] terms of the stencil formula; K is not shifted.
# The summation order is kept as centre, I-1, I+1, J-1, J+1.
@gtx.field_operator
def lap_gt4py(in_field: IJKField) -> IJKField:
    return (
        -4.0 * in_field
        + in_field(I - 1)
        + in_field(I + 1)
        + in_field(J - 1)
        + in_field(J + 1)
    )


# Region on which the Laplacian is evaluated; it lies strictly inside
# field_domain, so the shifted reads along I and J stay within allocated data.
compute_domain = gtx.domain({I: (0, 512), J: (0, 512), K: (0, 128)})

# Random input and zero-initialised output over the full (halo-including) domain.
phi = random_field(field_domain, allocator=actual_backend)
lap = gtx.zeros(field_domain, dtype=gtx.float64, allocator=actual_backend)

# Bind the operator to the backend once; the timed statement reuses this object.
lap_gt4py_stencil = lap_gt4py.with_backend(actual_backend)

# Untimed first call, made before the %timeit measurement that follows
# (presumably so that one-off compilation cost is excluded -- confirm).
lap_gt4py_stencil(
    in_field=phi, out=lap, domain=compute_domain, offset_provider={"_IOff": I, "_JOff": J}
)

%timeit lap_gt4py_stencil(in_field=phi, out=lap, domain=compute_domain, offset_provider={"_IOff": I, "_JOff": J})


26.6 ms ± 1.91 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
