# Exercises: Warp Generics

Test your understanding of the concepts from `core_02_generics.ipynb`.

In [1]:
import numpy as np
import warp as wp

wp.config.quiet = True
wp.init()

## Exercise 1: Function Overloading

Define two overloads of a `@wp.func` called `norm` that computes the L2 norm:
1. One that takes a `wp.vec2` and returns `float`
2. One that takes a `wp.vec3` and returns `float`

Then write a kernel that calls both overloads and stores the results in a 2-element output array.

Test with `wp.vec2(3.0, 4.0)` (expect `5.0`) and `wp.vec3(1.0, 2.0, 2.0)` (expect `3.0`).

In [2]:
# L2 norm of a 2D vector; this definition handles `wp.vec2` arguments.
@wp.func
def norm(v: wp.vec2):
    length = wp.norm_l2(v)
    return length

# Second overload under the same name; this one handles `wp.vec3` arguments.
@wp.func
def norm(v: wp.vec3):
    length = wp.norm_l2(v)
    return length

# Calls both `norm` overloads and writes the two results to out[0] and out[1].
@wp.kernel
def norm_kernel(v1: wp.vec2, v2: wp.vec3, out: wp.array(dtype=wp.float32)):
    out[0] = norm(v1)  # vec2 argument
    out[1] = norm(v2)  # vec3 argument

# Check both overloads on known cases: |(3, 4)| = 5 and |(1, 2, 2)| = 3.
v1 = wp.vec2(3.0, 4.0)
v2 = wp.vec3(1.0, 2.0, 2.0)
# Spell out the dtype so the buffer visibly matches the kernel's float32 output.
out = wp.zeros(shape=(2,), dtype=wp.float32)
wp.launch(norm_kernel, dim=1, inputs=(v1, v2), outputs=(out,))

res = out.numpy()
print(res)
# Compare with a tolerance instead of `==`: the norms are computed in float32,
# and a failing check reports the mismatching values.
np.testing.assert_allclose(res, [5.0, 3.0], rtol=1e-6)


[5. 3.]


## Exercise 2: Generic Functions with `wp.Scalar`

Write a generic `@wp.func` called `abs_val` that takes a `wp.Scalar` and returns its absolute value.

Hint: You can use a simple `if x < type(x)(0)` check and negate.

Then write **two separate kernels** (not a generic kernel) that use `abs_val`:
1. `abs_kernel_i32` operating on `wp.array(dtype=wp.int32)`
2. `abs_kernel_f64` operating on `wp.array(dtype=wp.float64)`

Test that both work correctly with arrays containing negative values.

In [3]:
# Generic absolute value for any type bound to `wp.Scalar`.
@wp.func
def abs_val(x: wp.Scalar) -> wp.Scalar:
    # Build the zero constant in x's own type so both sides of `<` match.
    zero = type(x)(0)
    if x < zero:
        return -x
    return x

# Replaces each int32 element with its absolute value, one element per thread.
@wp.kernel
def abs_kernel_i32(arr: wp.array(dtype=wp.int32)):
    tid = wp.tid()
    value = arr[tid]
    arr[tid] = abs_val(value)

# Replaces each float64 element with its absolute value, one element per thread.
@wp.kernel
def abs_kernel_f64(arr: wp.array(dtype=wp.float64)):
    tid = wp.tid()
    value = arr[tid]
    arr[tid] = abs_val(value)

# int32 case: print the array before and after the in-place launch.
arr1 = wp.array(data=(1, -3, 2, -9), dtype=wp.int32)
print("before: arr1=", arr1)
wp.launch(kernel=abs_kernel_i32, dim=arr1.shape, outputs=[arr1])
print("after: arr1=", arr1)

# float64 case: only the result is printed.
arr2 = wp.array(data=(10.0, -30.0, 2.0, -90.0), dtype=wp.float64)
wp.launch(kernel=abs_kernel_f64, dim=arr2.shape, outputs=[arr2])
print("arr2=", arr2)

before: arr1= [ 1 -3  2 -9]
after: arr1= [1 3 2 9]
arr2= [10. 30.  2. 90.]


## Exercise 3: Implicit Generic Kernel Instantiation

Write a single generic kernel `add_constant_kernel` that:
- Takes `arr: wp.array(dtype=wp.Scalar)` and `val: wp.Scalar`
- Adds `val` to every element of `arr` in-place

Launch it **three times** with different types to test implicit instantiation:
1. `wp.int16` array with a `wp.int16` constant
2. `wp.float32` array with a `wp.float32` constant
3. `wp.float64` array with a `wp.float64` constant

Print results to verify each launch produced correct output.

In [4]:
# Generic kernel: adds `val` to every element of `arr` in place. The array's
# element type and `val` are both declared as `wp.Scalar`.
@wp.kernel
def add_constant_kernel(arr: wp.array(dtype=wp.Scalar), val: wp.Scalar):
    tid = wp.tid()
    arr[tid] += val

# Launch the same generic kernel once per scalar type; the constant is built
# in the array's own type each time.
for scalar_type, values in (
    (wp.int16, (1, 2, 3, 4)),
    (wp.float32, (1.0, 2.0, 30.0, 4.0)),
    (wp.float64, (10.0, 2.0, 111.0, 4.0)),
):
    arr = wp.array(values, dtype=scalar_type)
    wp.launch(add_constant_kernel, dim=arr.shape, inputs=(arr, scalar_type(10)))
    print(arr)




[11 12 13 14]
[11. 12. 40. 14.]
[ 20.  12. 121.  14.]


## Exercise 4: Explicit Instantiation with `@wp.overload`

Define the same generic kernel from Exercise 3, but this time use `@wp.overload` to explicitly
create instantiations for `wp.int32` and `wp.float32` **before** launching.

Launch the kernel with both types and verify correct results.

In [5]:
# Generic kernel (same as in Exercise 3): adds `val` to every element of `arr`
# in place. The overloads that follow instantiate it for concrete types.
@wp.kernel
def add_constant_kernel(arr: wp.array(dtype=wp.Scalar), val: wp.Scalar):
    tid = wp.tid()
    arr[tid] += val

# Explicit instantiation of the generic kernel for int32. The `...` body is a
# placeholder: only the concrete argument types in the signature are given here.
@wp.overload
def add_constant_kernel(arr: wp.array(dtype=wp.int32), val: wp.int32):
    ...

# Explicit instantiation of the generic kernel for float32.
@wp.overload
def add_constant_kernel(arr: wp.array(dtype=wp.float32), val: wp.float32):
    ...

# Exercise both explicit instantiations: (element type, input values, constant).
for scalar_type, values, constant in (
    (wp.int32, (10, 21, 33, 40), 10),
    (wp.float32, (100.0, 210.0, 3113.0, 9940.5), 10.5),
):
    arr = wp.array(values, dtype=scalar_type)
    wp.launch(add_constant_kernel, dim=arr.shape, inputs=(arr, scalar_type(constant)))
    print(arr)




[20 31 43 50]
[ 110.5  220.5 3123.5 9951. ]


## Exercise 5: Type Introspection with `arr.dtype`

Write a generic kernel `fill_range_kernel` that takes a `wp.array(dtype=wp.Scalar)` and fills
each element with its thread index, cast to the array's element type.

Use `arr.dtype(value)` inside the kernel to cast the integer thread index to the correct type.

Test with:
1. A `wp.int16` array of length 5 (expect `[0, 1, 2, 3, 4]`)
2. A `wp.float64` array of length 4 (expect `[0.0, 1.0, 2.0, 3.0]`)

In [6]:
# Writes each thread's index into its own slot, converted through `arr.dtype`
# so the stored value has the array's element type.
@wp.kernel
def fill_range_kernel(arr: wp.array(dtype=wp.Scalar)):
    tid = wp.tid()
    arr[tid] = arr.dtype(tid)

# One launch per (element type, length) pair; each array starts zeroed.
for scalar_type, length in ((wp.int16, 5), (wp.float64, 4)):
    arr = wp.zeros(shape=(length,), dtype=scalar_type)
    wp.launch(fill_range_kernel, dim=arr.shape, inputs=(arr,))
    print(arr)

[0 1 2 3 4]
[0. 1. 2. 3.]


## Exercise 6: Generic Dot Product

Write a generic `@wp.func` called `dot2` that computes the dot product of two 2-element sequences.
It should take arguments `a0: wp.Scalar, a1: wp.Scalar, b0: wp.Scalar, b1: wp.Scalar` and
return `a0 * b0 + a1 * b1`.

Write a generic kernel `dot_kernel` that:
- Takes two input arrays and one output array (all `wp.array(dtype=wp.Scalar)`)
- Reads pairs of consecutive elements from each input array
- Stores the dot product result

Test with `wp.float32` arrays: inputs `[1, 2, 3, 4]` and `[5, 6, 7, 8]`,
output should be `[1*5 + 2*6, 3*7 + 4*8]` = `[17, 53]`.

In [7]:
# Dot product of the 2-element sequences (a0, a1) and (b0, b1).
@wp.func
def dot2(a0: wp.Scalar, a1: wp.Scalar, b0: wp.Scalar, b1: wp.Scalar):
    result = a0 * b0 + a1 * b1
    return result

# Thread i reads elements 2*i and 2*i+1 from each input array and stores their
# 2-element dot product in out[i].
@wp.kernel
def dot_kernel(arr1: wp.array(dtype=wp.Scalar), arr2: wp.array(dtype=wp.Scalar), out: wp.array(dtype=wp.Scalar)):
    i = wp.tid()
    first = i * 2
    second = first + 1
    out[i] = dot2(arr1[first], arr1[second], arr2[first], arr2[second])

# Each input holds two consecutive pairs, so the output has two entries.
arr1 = wp.array([1, 2, 3, 4], dtype=wp.float32)
arr2 = wp.array([5, 6, 7, 8], dtype=wp.float32)
out = wp.zeros(shape=(2,), dtype=wp.float32)
wp.launch(kernel=dot_kernel, dim=out.shape, inputs=[arr1, arr2], outputs=[out])
print(out)

[17. 53.]


## Exercise 7: Dynamic Code Generation

Write a closure `create_power_kernel(exponent: int)` that returns a kernel raising each array
element to the given power using repeated multiplication.

The returned kernel should operate on `wp.array(dtype=float)`.

Test:
1. `create_power_kernel(2)` on `[2.0, 3.0, 4.0]` should give `[4.0, 9.0, 16.0]`
2. `create_power_kernel(3)` on `[2.0, 3.0, 4.0]` should give `[8.0, 27.0, 64.0]`

In [8]:
def create_power_kernel(exponent: int):
    """Build a kernel that raises every array element to ``exponent`` in place.

    The power is computed by repeated multiplication, so ``exponent`` must be a
    non-negative integer; ``exponent == 0`` yields 1 for every element.

    Args:
        exponent: Number of times each element is multiplied into the result.

    Returns:
        A Warp kernel taking a single ``wp.array(dtype=float)`` argument.

    Raises:
        ValueError: If ``exponent`` is negative. Repeated multiplication cannot
            express a negative power and would otherwise silently return 1.
    """
    if exponent < 0:
        raise ValueError(f"exponent must be non-negative, got {exponent}")

    # Named `repeated_power` rather than `pow` so the Python builtin of that
    # name is not shadowed inside this closure.
    @wp.func
    def repeated_power(val: wp.Scalar):
        # Start from 1 in the argument's own scalar type.
        res = type(val)(1)
        for _ in range(exponent):
            res = res * val
        return res

    @wp.kernel
    def kernel(arr: wp.array(dtype=float)):
        i = wp.tid()
        arr[i] = repeated_power(arr[i])

    return kernel

# Build the squaring and cubing kernels, then run each on a fresh copy of the
# same input (the kernels modify the array in place).
pow_2 = create_power_kernel(2)
pow_3 = create_power_kernel(3)
for power_kernel in (pow_2, pow_3):
    arr = wp.array((2.0, 3.0, 4.0), dtype=float)
    wp.launch(power_kernel, dim=arr.shape, outputs=(arr,))
    print(arr)


[ 4.  9. 16.]
[ 8. 27. 64.]


## Exercise 8: Dynamic Kernel with Type Parameter

Write a closure `create_clamp_kernel(dtype)` that takes a Warp scalar type (e.g. `wp.float32`,
`wp.int32`) and returns a kernel that clamps array elements to the range `[-10, 10]`.

Use the `dtype` parameter to cast the bounds inside the kernel.

Test:
1. With `wp.float32`: input `[-100.0, 5.0, 100.0]` should give `[-10.0, 5.0, 10.0]`
2. With `wp.int32`: input `[-100, 5, 100]` should give `[-10, 5, 10]`

In [9]:
def create_clamp_kernel(dtype):
    """Return a kernel that clamps array elements to [-10, 10] in place.

    Args:
        dtype: Scalar type used both for the array elements and for building
            the two bounds.

    Returns:
        A Warp kernel taking a single ``wp.array(dtype=dtype)`` argument.
    """
    # Bounds are built once, in the requested type, and captured by the closure.
    lo = dtype(-10)
    hi = dtype(10)

    @wp.func
    def clamp(val: dtype):
        clamped = val
        if clamped < lo:
            clamped = lo
        elif clamped > hi:
            clamped = hi
        return clamped

    @wp.kernel
    def clamp_kernel(arr: wp.array(dtype=dtype)):
        tid = wp.tid()
        arr[tid] = clamp(arr[tid])

    return clamp_kernel

# float32 case: values below, inside and above the [-10, 10] range.
clamp_f32 = create_clamp_kernel(wp.float32)
arr = wp.array([-100.0, 5.0, 100.0], dtype=wp.float32)
wp.launch(kernel=clamp_f32, dim=arr.shape, outputs=[arr])
print(arr)

# int32 case: same three values with an integer element type.
clamp_i32 = create_clamp_kernel(wp.int32)
arr = wp.array([-100, 5, 100], dtype=wp.int32)
wp.launch(kernel=clamp_i32, dim=arr.shape, outputs=[arr])
print(arr)

[-10.   5.  10.]
[-10   5  10]


## Exercise 9: Combining Overloading and Generics

Define two overloads of a `@wp.func` called `combine`:
1. One that takes two `wp.Scalar` values and returns their sum
2. One that takes two `wp.vec3` values and returns their cross product (`wp.cross()`)

Write a kernel that uses the scalar overload to add corresponding elements of two `float` arrays
into an output array.

Write a second kernel that uses the `vec3` overload to compute the cross product of corresponding
elements from two `vec3` arrays.

Test both kernels.

In [12]:
# Scalar overload: generic over `wp.Scalar`, returns the sum of its arguments.
@wp.func
def combine(n1: wp.Scalar, n2: wp.Scalar):
    return n1 + n2

# Vector overload: returns the cross product of two `wp.vec3` values.
@wp.func
def combine(v1: wp.vec3, v2: wp.vec3):
    return wp.cross(v1, v2)

# Element-wise sum of two `float` arrays, using the scalar overload.
@wp.kernel
def add(arr1: wp.array(dtype=float), arr2: wp.array(dtype=float), out: wp.array(dtype=float)):
    i = wp.tid()
    out[i] = combine(arr1[i], arr2[i])

# Element-wise cross product of two `vec3` arrays, using the vec3 overload.
# Each thread handles one pair of corresponding elements.
@wp.kernel
def cross(v1: wp.array(dtype=wp.vec3), v2: wp.array(dtype=wp.vec3), out: wp.array(dtype=wp.vec3)):
    i = wp.tid()
    out[i] = combine(v1[i], v2[i])

# Scalar test: [1, 5, 10] + [1, 2, 3] -> [2, 7, 13].
arr1 = wp.array((1.0, 5.0, 10.0), dtype=float)
arr2 = wp.array((1.0, 2.0, 3.0), dtype=float)
out = wp.zeros(shape=arr1.shape, dtype=float)
wp.launch(add, dim=out.shape, inputs=(arr1, arr2), outputs=(out,))
print(out)

# vec3 test: x cross y -> z. The inputs are one-element vec3 arrays, so the
# kernel runs over array elements as the exercise asks.
vec1 = wp.array([[1.0, 0.0, 0.0]], dtype=wp.vec3)
vec2 = wp.array([[0.0, 1.0, 0.0]], dtype=wp.vec3)
out = wp.zeros(shape=vec1.shape, dtype=wp.vec3)
wp.launch(cross, dim=out.shape, inputs=(vec1, vec2), outputs=(out,))
print("===========")
print(out)



[ 2.  7. 13.]
[[0. 0. 1.]]


## Exercise 10: Putting It All Together

Write a closure `create_reduce_kernel(op_fn, dtype)` that returns a kernel performing
a pairwise reduction on an array. The kernel should:
1. Read element `2*i` and `2*i+1` from the input array
2. Apply `op_fn` to combine them
3. Store the result in the output array at index `i`

Define two generic `@wp.func` operators:
- `op_add(a: wp.Scalar, b: wp.Scalar) -> wp.Scalar` (returns `a + b`)
- `op_max(a: wp.Scalar, b: wp.Scalar) -> wp.Scalar` (returns `wp.max(a, b)`)

Test:
1. Reduce `[1.0, 3.0, 5.0, 2.0, 8.0, 4.0]` with `op_add` and `wp.float32` -> `[4.0, 7.0, 12.0]`
2. Reduce `[1.0, 3.0, 5.0, 2.0, 8.0, 4.0]` with `op_max` and `wp.float32` -> `[3.0, 5.0, 8.0]`

In [23]:
# Generic binary operator: the sum of two scalars.
@wp.func
def op_add(a: wp.Scalar, b: wp.Scalar) -> wp.Scalar:
    total = a + b
    return total

# Generic binary operator: the larger of two scalars.
@wp.func
def op_max(a: wp.Scalar, b: wp.Scalar) -> wp.Scalar:
    larger = wp.max(a, b)
    return larger

def create_reduce_kernel(op_fn, dtype):
    """Return a kernel that combines consecutive element pairs with ``op_fn``.

    Args:
        op_fn: Two-argument function applied to elements ``2*i`` and ``2*i+1``
            of the input array.
        dtype: Element type of both the input and the output array.

    Returns:
        A Warp kernel ``(arr, out)`` that stores the combined pair ``i`` in
        ``out[i]``.
    """

    @wp.kernel
    def reduce_kernel(arr: wp.array(dtype=dtype), out: wp.array(dtype=dtype)):
        i = wp.tid()
        left = 2 * i
        out[i] = op_fn(arr[left], arr[left + 1])

    return reduce_kernel

arr1 = wp.array([1.0, 3.0, 5.0, 2.0, 8.0, 4.0], dtype=wp.float32)
(n,) = arr1.shape

# Each thread consumes one pair, so the output holds n // 2 results. Floor
# division keeps the dimension an int (n / 2 is a float in Python 3), and the
# dtype is given explicitly to match the dtype the kernels are built with.
out = wp.zeros(shape=(n // 2,), dtype=wp.float32)
k1 = create_reduce_kernel(op_add, wp.float32)
wp.launch(k1, dim=out.shape, inputs=(arr1,), outputs=(out,))
print(out)

out = wp.zeros(shape=(n // 2,), dtype=wp.float32)
k2 = create_reduce_kernel(op_max, wp.float32)
wp.launch(k2, dim=out.shape, inputs=(arr1,), outputs=(out,))
print(out)


[ 4.  7. 12.]
[3. 5. 8.]
