# Exercise 3 - Templating
This notebook shows an example of how to autogenerate C kernels that are specialized for a user-requested platform. With this approach, you can write Python scripts that take a user input and, depending on it, launch the C code either on the CPU or on the GPU. This is useful because we have to write the kernel template only once, without knowing what platform a future user might want to run our software on. As you will see, this is much simpler than it sounds.

In our example, the kernel is a simple function that doubles the elements of a 1D vector. This is the same kernel as in exercise 2, but with some extra comments added. These comments are markers: if requested, they are replaced by extra lines specific to the parallel programming model (CUDA or OpenCL), which turn our serial C function into parallelized code for the GPU.

In [None]:
# C declaration of the kernel; it is handed to cffi's cdef() when the kernel
# is built for the CPU.
source_sig = """void elementwise(int, const double*, double*);"""
# Kernel template. The special comments are markers that the templating step
# rewrites for the requested platform:
#   /*begin_gpukern*/ ... /*end_gpukern*/  wrap the kernel function
#   /*gpuglmem*/                           precedes each pointer argument
#   //begin_parallel <var> <limit> ... //end_parallel  wrap the loop to
#                                          parallelize
source_str = r"""
/*begin_gpukern*/
void elementwise(int n, 
    /*gpuglmem*/ const double* x, 
    /*gpuglmem*/ double* y)
{
  for(int i=0; i<n; i++){//begin_parallel i n
    y[i] = 2 * x[i];
  }//end_parallel
}
/*end_gpukern*/
"""

This is the user input

In [None]:
# Platform the kernel is specialized for; the cells below handle
# "cpu", "gpu_pyopencl" and "gpu_cupy".
specialize_for = "gpu_pyopencl"

The following code fine-tunes the kernel template for the requested platform.

In [None]:
new_lines = []  # specialized kernel source, one entry per output line

# Fail early with a clear message instead of silently dropping the parallel
# markers and hitting a bare KeyError in the replacement tables below.
if specialize_for not in ("cpu", "gpu_pyopencl", "gpu_cupy"):
    raise ValueError(
        f"Unknown platform {specialize_for!r}; "
        "expected 'cpu', 'gpu_pyopencl' or 'gpu_cupy'"
    )

for ll in source_str.splitlines():
    if "//begin_parallel" in ll:
        # The marker is followed by the loop variable and its upper limit,
        # e.g. "//begin_parallel i n". The whole template line is replaced.
        varname, limname = ll.split("//begin_parallel")[-1].split()

        if specialize_for == "cpu":
            # Plain serial loop.
            new_lines.append(
                f"for (int {varname}=0; {varname}<{limname}; {varname}++)"
                + "{"
            )

        elif specialize_for == "gpu_pyopencl":
            # One work-item per element. The guard keeps work-items beyond
            # the limit from touching memory when the global size is padded.
            new_lines.append(f"int {varname};")
            new_lines.append(f"{varname}=get_global_id(0);")
            new_lines.append(f"if ({varname}<{limname})" + "{")

        elif specialize_for == "gpu_cupy":
            # One CUDA thread per element, guarded against the limit.
            new_lines.append(f"int {varname};")
            new_lines.append(
                f"{varname}=blockDim.x * blockIdx.x + threadIdx.x;"
            )
            new_lines.append(f"if ({varname}<{limname})" + "{")

    elif "//end_parallel" in ll:
        # Every platform opened exactly one brace at the begin marker
        # (the loop on CPU, the bounds guard on GPU), so close it here.
        new_lines.append("}")

    else:
        new_lines.append(ll)

new_source_src = "\n".join(new_lines)

# Qualifier (and, for CUDA, the extern "C" block) placed before the kernel.
new_source_src = new_source_src.replace(
    "/*begin_gpukern*/",
    {
        "cpu": " ",
        "gpu_pyopencl": " __kernel ",
        "gpu_cupy": "extern \"C\"{\n__global__",
    }[specialize_for],
)

# Closes the extern "C" block for CUDA; nothing is needed elsewhere.
new_source_src = new_source_src.replace(
    "/*end_gpukern*/",
    {
        "cpu": " ",
        "gpu_pyopencl": " ",
        "gpu_cupy": "}",
    }[specialize_for],
)

# Address-space qualifier for pointer arguments (only OpenCL requires one).
new_source_src = new_source_src.replace(
    "/*gpuglmem*/",
    {
        "cpu": " ",
        "gpu_pyopencl": " __global ",
        "gpu_cupy": " ",
    }[specialize_for],
)
print(new_source_src)

Build and load the kernel function

In [None]:
import numpy as np

# Length of the test vector.
n = 10
# Random input vector that the kernel will double.
x = np.random.randn(n)

In [None]:
# Build the specialized kernel for the requested platform, run it on x and
# check the result against the NumPy reference x*2.
if specialize_for == "cpu":
    import cffi
    print("using cffi")

    # Compile the generated C source into an importable extension module.
    ffi_interface = cffi.FFI()
    ffi_interface.cdef(source_sig)
    ffi_interface.set_source("_exercise_3", new_source_src)
    ffi_interface.compile(verbose=True)

    from _exercise_3 import ffi, lib

    # Pass the NumPy buffer directly (no copy) and allocate the output in C.
    x_cffi = ffi.cast("double *", ffi.from_buffer(x))
    y_cffi = ffi.new("double[]", len(x))
    # Use the actual vector length rather than a hard-coded size.
    lib.elementwise(len(x), x_cffi, y_cffi)
    y = ffi.unpack(y_cffi, len(x))

    assert np.allclose(x*2, y)
    print("passed")

elif specialize_for == "gpu_cupy":
    import cupy as cp
    print("using cupy")

    module = cp.RawModule(code=new_source_src)
    elementwise_kernel = module.get_function("elementwise")

    x_gpu = cp.array(x)
    y_gpu = cp.zeros_like(x_gpu)

    blocksize = 1
    n_blocks = int(np.ceil(len(x_gpu) / blocksize))
    # CuPy does not validate raw-kernel argument types and passes a Python
    # int as a 64-bit integer; the kernel declares `int n`, so pass an
    # explicit 32-bit value.
    elementwise_kernel(
        grid=(n_blocks,),
        block=(blocksize,),
        args=(np.int32(len(x)), x_gpu, y_gpu),
    )
    y_cpu = y_gpu.get()
    assert np.allclose(x*2, y_cpu)
    print("passed")

elif specialize_for == "gpu_pyopencl":
    import pyopencl as cl
    # pyopencl.array is a submodule and must be imported explicitly;
    # `import pyopencl` alone does not provide `cl.array`.
    import pyopencl.array as cl_array
    print("using pyopencl")

    ctx = cl.create_some_context(interactive=False)
    queue = cl.CommandQueue(ctx)

    prg = cl.Program(ctx, new_source_src).build()

    x_gpu = cl_array.to_device(queue, x)
    y_gpu = cl_array.zeros_like(x_gpu)

    grid_size = len(x)
    workgroup_size = 1
    # Scalars must be passed as sized NumPy types to match the kernel
    # signature (`int n`).
    prg.elementwise(
        queue,
        (grid_size,),
        (workgroup_size,),
        np.int32(len(x)),
        x_gpu.data,
        y_gpu.data,
    )
    y_cpu = y_gpu.get()
    assert np.allclose(x*2, y_cpu)
    print("passed")

else:
    # Previously an unsupported value was silently ignored here.
    raise ValueError(
        f"Unknown platform {specialize_for!r}; "
        "expected 'cpu', 'gpu_pyopencl' or 'gpu_cupy'"
    )
