In [None]:
# SAVE AS invert_cupy.py (works in Colab too!)
import numpy as np
import cupy as cp
import time

# Image size (rows, columns) and a random 8-bit grayscale test image.
H, W = 4096, 4096
img = np.random.randint(0, 256, size=(H, W), dtype=np.uint8)

# cp.cuda.runtime.getDeviceCount() raises CUDARuntimeError when there is no
# usable device/driver, so it can never report False; is_available() returns
# a plain bool in that situation instead.
print("GPU available?:", cp.cuda.is_available())

# Raw CUDA kernel: inverts an H x W row-major 8-bit image (out = 255 - in).
#
# The launch (any 1D/2D grid and block shape providing at least H * W threads)
# is flattened into a single linear thread id. Threads of a warp are ordered by
# threadIdx.x first, so with this mapping neighbouring threads read and write
# neighbouring bytes (coalesced access). The previous mapping used threadIdx.x
# as the row index, which made neighbouring threads touch bytes W apart.
invert_kernel = cp.RawKernel(r'''
extern "C" __global__
void invert_kernel(const unsigned char* input_img, unsigned char* output_img,
                   int H, int W) {
    // 64-bit arithmetic so the flat index cannot overflow for large images.
    unsigned long long threads_per_block =
        (unsigned long long)blockDim.x * blockDim.y;
    unsigned long long block_id =
        (unsigned long long)blockIdx.y * gridDim.x + blockIdx.x;
    unsigned long long thread_in_block =
        (unsigned long long)threadIdx.y * blockDim.x + threadIdx.x;
    unsigned long long idx = block_id * threads_per_block + thread_in_block;
    unsigned long long n_pixels =
        (unsigned long long)H * (unsigned long long)W;

    // Surplus threads from rounding the grid up do nothing.
    if (idx < n_pixels) {
        output_img[idx] = 255 - input_img[idx];
    }
}
''', 'invert_kernel')


def run_gpu(block_dim):
    """Time a single launch of ``invert_kernel`` on the module-level ``img``.

    Args:
        block_dim: ``(bx, by)`` number of threads per block in x and y.

    Returns:
        ``(elapsed_ms, correct)`` where ``elapsed_ms`` is the kernel time
        measured with CUDA events and ``correct`` tells whether the output of
        the timed launch equals ``255 - img``.
    """
    bx, by = block_dim
    # Use the dimensions of the array that is actually processed.
    rows, cols = img.shape
    gx = (rows + bx - 1) // bx  # ceil-divide so the grid covers every pixel
    gy = (cols + by - 1) // by
    grid_dim = (gx, gy)

    d_in = cp.array(img)
    d_out = cp.empty_like(d_in)

    # CuPy passes a Python int as a 64-bit ``long long`` and does not validate
    # argument types; the kernel declares ``int`` parameters, so pass 32-bit
    # scalars explicitly.
    kernel_args = (d_in, d_out, np.int32(rows), np.int32(cols))

    # Warmup (first launch includes kernel compilation / module load)
    invert_kernel(grid_dim, (bx, by), kernel_args)
    cp.cuda.runtime.deviceSynchronize()

    # Clear the warm-up result so the check below verifies the timed launch.
    d_out.fill(0)

    # Timing with CUDA events
    start = cp.cuda.Event()
    end = cp.cuda.Event()
    start.record()
    invert_kernel(grid_dim, (bx, by), kernel_args)
    end.record()
    end.synchronize()
    elapsed_ms = cp.cuda.get_elapsed_time(start, end)

    out = cp.asnumpy(d_out)
    correct = bool(np.all(out == (255 - img)))

    return elapsed_ms, correct


# Block shapes to benchmark on the GPU.
blocks = [(8, 8), (16, 16), (32, 32)]
# map() is lazy, so each configuration is run right before its line is printed.
for b, (t_ms, correct) in zip(blocks, map(run_gpu, blocks)):
    print(f"Block {b}: {t_ms:.3f} ms, correct={correct}")


# CPU fallback for comparison
def invert_cpu(in_img):
    """Return the inverted image, computed on the host as ``255 - in_img``."""
    inverted = 255 - in_img
    return inverted

# Reference result, computed once outside the timed loop and by a different
# operation than invert_cpu uses, so the check is not comparing an expression
# with itself. For 8-bit unsigned data, bitwise NOT equals 255 - x.
# NOTE(review): relies on img being uint8, as created above.
cpu_expected = np.bitwise_not(img)

# The block size has no effect on the CPU path; the loop only repeats the
# measurement once per GPU configuration so the two tables line up.
for b in blocks:
    t0 = time.perf_counter()
    out = invert_cpu(img)
    t1 = time.perf_counter()
    print(f"Block {b} (CPU): {(t1-t0)*1000:.3f} ms, correct={np.array_equal(out, cpu_expected)}")


# Auto block size chooser
def choose_block_size_auto(img_shape, max_threads_per_block=1024):
    """Pick the largest candidate block shape that fits the thread limit.

    Candidates are (32, 32), (16, 16) and (8, 8), tried in that order. A
    candidate qualifies when its thread count does not exceed
    ``max_threads_per_block`` and is a multiple of 32. If none qualifies,
    (16, 16) is returned.

    Args:
        img_shape: Shape of the image; currently not used by the heuristic.
        max_threads_per_block: Upper bound on threads per block.

    Returns:
        The selected ``(bx, by)`` tuple.
    """
    fallback = (16, 16)
    largest_first = ((32, 32), (16, 16), (8, 8))
    qualifying = (
        (bx, by)
        for bx, by in largest_first
        if bx * by <= max_threads_per_block and (bx * by) % 32 == 0
    )
    return next(qualifying, fallback)

# Report the block shape the heuristic selects for this image.
auto_block = choose_block_size_auto(img.shape, max_threads_per_block=1024)
print("Auto-chosen block size:", auto_block)


GPU available?: True
Block (8, 8): 1.017 ms, correct=True
Block (16, 16): 1.884 ms, correct=True
Block (32, 32): 3.479 ms, correct=True
Block (8, 8) (CPU): 3.730 ms, correct=True
Block (16, 16) (CPU): 4.733 ms, correct=True
Block (32, 32) (CPU): 3.935 ms, correct=True
Auto-chosen block size: (32, 32)
