# Why GT4Py?

# Example 1: A simple point-wise stencil

First, we compare NumPy, CuPy and GT4Py implementations of the point-wise stencil
```
d[i, j, k] = a[i, j, k] + b[i, j, k] - c[i, j, k]
```
Timings are measured using the `%timeit` magic command. This command times a single statement or function call by averaging over multiple runs. Note that `%timeit` itself does not synchronize the CPU and the GPU: GPU kernels are launched asynchronously, so the device has to be synchronized at the end of each run for the GPU timings to be reliable.

In [None]:
# Number of grid points per dimension, shared by every implementation below.
shape = (512, 512, 128)

### NumPy

In [None]:
import numpy as np

def f_numpy(a, b, c, d):
    """Store the element-wise result of ``a + b - c`` in ``d``.

    ``d`` is overwritten in place and nothing is returned.
    """
    partial_sum = a + b
    d[...] = partial_sum - c
    
a = np.random.rand(*shape)
b = np.random.rand(*shape)
c = np.random.rand(*shape)
d = np.empty_like(a)

%timeit f_numpy(a, b, c, d)

### CuPy

In [None]:
import cupy as cp

def f_cupy(a, b, c, d):
    """Write the element-wise result of ``a + b - c`` into ``d`` in place.

    Called below with CuPy arrays; nothing is returned.
    """
    stencil_result = a + b - c
    d[...] = stencil_result
    
a = cp.asarray(np.random.rand(*shape))
b = cp.asarray(np.random.rand(*shape))
c = cp.asarray(np.random.rand(*shape))
d = cp.empty_like(a)

%timeit f_cupy(a, b, c, d)

### GT4Py

In [None]:
import numpy as np
import gt4py.next as gtx

# Select the backend by leaving exactly one of the following lines uncommented.
#backend = None             # Embedded, native Python execution
#backend = gtx.gtfn_cpu     # Translated to C++ code
backend = gtx.gtfn_gpu     # Translated to GPU (CUDA or HIP) code

# Named dimensions of the three-dimensional grid.
I = gtx.Dimension("I")
J = gtx.Dimension("J")
K = gtx.Dimension("K")

# Index range covered by every field: (0, shape[n]) along each dimension.
field_domain = gtx.domain({
    I: (0, shape[0]),
    J: (0, shape[1]),
    K: (0, shape[2]),
})

# Type alias for a double-precision field defined on (I, J, K).
IJKField = gtx.Field[gtx.Dims[I, J, K], gtx.float64]

# Point-wise stencil `a + b - c` written as a GT4Py field operator.
# The three inputs and the result are all double-precision (I, J, K) fields.
@gtx.field_operator
def f_gt4py(a: IJKField, b: IJKField, c: IJKField) -> IJKField:
    return a + b - c

# GT4Py program that applies `f_gt4py` to the inputs `a`, `b` and `c` and
# stores the result in `out`.
# `nx`, `ny` and `nz` set the upper bounds of the I, J and K ranges of the
# domain on which the field operator is evaluated; every range starts at 0.
@gtx.program
def f_gt4py_program(
    a: IJKField,
    b: IJKField,
    c: IJKField,
    nx: gtx.int32,
    ny: gtx.int32,
    nz: gtx.int32,
    out: IJKField
):
    f_gt4py(
        a=a,
        b=b,
        c=c,
        out=out,
        domain={
            I: (0, nx),
            J: (0, ny),
            K: (0, nz),
        },
    )
        
a = gtx.as_field(field_domain, np.random.rand(*field_domain.shape), allocator=backend)
b = gtx.as_field(field_domain, np.random.rand(*field_domain.shape), allocator=backend)
c = gtx.as_field(field_domain, np.random.rand(*field_domain.shape), allocator=backend)
d = gtx.zeros(field_domain, dtype=gtx.float64, allocator=backend)

f_gt4py_stencil = f_gt4py_program.with_backend(backend)

compute_domain = field_domain

f_gt4py_stencil(
    a=a, 
    b=b,
    c=c,
    nx=shape[0],
    ny=shape[1],
    nz=shape[2],
    out=d
)

%timeit f_gt4py_stencil(a=a, b=b, c=c, nx=shape[0], ny=shape[1], nz=shape[2], out=d)

# Example 2: Laplacian

The next example compares NumPy, CuPy and GT4Py implementations of the Laplacian stencil from the stencil2d code that we have been working with:
```
lap[i, j, k] = - 4 * phi[  i,   j, k] 
               +     phi[i-1,   j, k] 
               +     phi[i+1,   j, k] 
               +     phi[  i, j-1, k] 
               +     phi[  i, j+1, k]
```
Timings are measured using the `%timeit` magic command. This command times a single statement or function call by averaging over multiple runs. Note that `%timeit` itself does not synchronize the CPU and the GPU: GPU kernels are launched asynchronously, so the device has to be synchronized at the end of each run for the GPU timings to be reliable.

In [None]:
# Array shape including one extra point on each side of the first two
# dimensions; the third dimension is not padded.
shape_with_halo = (shape[0] + 2, shape[1] + 2, shape[2])

### NumPy

In [None]:
import numpy as np

def lap_numpy(phi, lap):
    """Write the five-point Laplacian of ``phi`` into the interior of ``lap``.

    Only ``lap[1:-1, 1:-1]`` is assigned; the first and last index along the
    first two axes are left untouched. Nothing is returned.
    """
    interior = slice(1, -1)
    lower = slice(None, -2)
    upper = slice(2, None)

    lap[interior, interior] = (
        -4.0 * phi[interior, interior]
        + phi[lower, interior]
        + phi[upper, interior]
        + phi[interior, lower]
        + phi[interior, upper]
    )
    
phi = np.random.rand(*shape_with_halo)
lap = np.empty_like(phi)

%timeit lap_numpy(phi, lap)

### CuPy

In [None]:
import cupy as cp

def lap_cupy(phi, lap):
    """Compute the five-point Laplacian of ``phi`` on the interior of ``lap``.

    Called below with CuPy arrays. Only ``lap[1:-1, 1:-1]`` is written;
    nothing is returned.
    """
    center = phi[1:-1, 1:-1]
    i_minus = phi[:-2, 1:-1]
    i_plus = phi[2:, 1:-1]
    j_minus = phi[1:-1, :-2]
    j_plus = phi[1:-1, 2:]

    lap[1:-1, 1:-1] = -4.0 * center + i_minus + i_plus + j_minus + j_plus
    
phi = cp.asarray(np.random.rand(*shape_with_halo))
lap = cp.empty_like(phi)

%timeit lap_cupy(phi, lap)

### GT4Py

In [None]:
import gt4py.next as gtx
import numpy as np

# Select the backend by leaving exactly one of the following lines uncommented.
#backend = None             # Embedded, native Python execution
#backend = gtx.gtfn_cpu     # Translated to C++ code
backend = gtx.gtfn_gpu     # Translated to GPU (CUDA or HIP) code

# Named dimensions of the three-dimensional grid.
I = gtx.Dimension("I")
J = gtx.Dimension("J")
K = gtx.Dimension("K")

# The Laplacian reads neighbours along I and J only, so the fields carry one
# halo point on each side of I and J and none along K. This gives the fields
# the same shape as `shape_with_halo` used by the NumPy and CuPy versions.
field_domain = gtx.domain({
    I: (-1, shape[0] + 1),
    J: (-1, shape[1] + 1),
    K: (0, shape[2]),
})

# Type alias for a double-precision field defined on (I, J, K).
IJKField = gtx.Field[gtx.Dims[I, J, K], gtx.float64]

# Five-point Laplacian written as a GT4Py field operator: -4 times the field
# itself plus the field shifted by -1 and +1 along I and along J.
# No shift is applied along K.
@gtx.field_operator
def lap_gt4py(in_field: IJKField) -> IJKField:
    lap_field = (
        -4.0 * in_field
        + in_field(I - 1)
        + in_field(I + 1)
        + in_field(J - 1)
        + in_field(J + 1)
    )
    return lap_field

# GT4Py program that applies `lap_gt4py` to `in_field` and stores the result
# in `out`.
# `nx`, `ny` and `nz` set the upper bounds of the I, J and K ranges of the
# domain on which the field operator is evaluated; every range starts at 0.
@gtx.program
def lap_gt4py_program(
    in_field: IJKField,
    nx: gtx.int32,
    ny: gtx.int32,
    nz: gtx.int32,
    out: IJKField,
) -> None:
    lap_gt4py(in_field, out=out, domain={I: (0, nx), J: (0, ny), K: (0, nz)})

phi = gtx.as_field(field_domain, np.random.rand(*field_domain.shape), allocator=backend)
lap = gtx.zeros(field_domain, dtype=gtx.float64, allocator=backend)

lap_gt4py_stencil = lap_gt4py_program.with_backend(backend)

lap_gt4py_stencil(
    in_field=phi,
    nx = shape[0],
    ny = shape[1],
    nz = shape[2],
    out=lap,
)

%timeit lap_gt4py_stencil(in_field=phi, nx=shape[0], ny=shape[1], nz=shape[2], out=lap)
