In [None]:
!pip install pycuda
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
# CUDA kernel: each thread stores its own global index into `out`.
# The `tid < n` guard keeps surplus threads (present whenever the grid is
# rounded up past n) from writing beyond the end of the buffer.
kernel_code = """
__global__ void hello(int *out, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) {
        out[tid] = tid;  // write thread ID
    }
}
"""
# Compile
mod = SourceModule(kernel_code)
hello = mod.get_function("hello")
# Prepare GPU array (8 threads total)
n_threads = 8
threads_per_block = 4
# Ceiling division: the grid is derived from n_threads so that changing
# either value can never leave elements unwritten.
blocks_per_grid = (n_threads + threads_per_block - 1) // threads_per_block
out_gpu = gpuarray.zeros(n_threads, dtype=np.int32)
# Launch <<<2,4>>> (for the default sizes above).
# The scalar is wrapped in np.int32 so its width matches the kernel's `int n`.
hello(out_gpu, np.int32(n_threads),
      block=(threads_per_block, 1, 1), grid=(blocks_per_grid, 1))
# Copy back and print
print("=== Hello from GPU threads ===")
for tid in out_gpu.get():   # .get() copies back to host
    print(f"Hello from thread {tid}")

=== Hello from GPU threads ===
Hello from thread 0
Hello from thread 1
Hello from thread 2
Hello from thread 3
Hello from thread 4
Hello from thread 5
Hello from thread 6
Hello from thread 7


In [None]:
!pip install pycuda
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import time

# CUDA kernel: element-wise c = a + b over n floats, one element per thread.
# The `idx < n` guard covers the partially filled last block.
kernel_code = """
__global__ void vector_add(float *a, float *b, float *c, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
"""

mod = SourceModule(kernel_code)
vector_add = mod.get_function("vector_add")

# Problem size
N = 10_000_000
a = np.random.rand(N).astype(np.float32)
b = np.random.rand(N).astype(np.float32)
c = np.zeros_like(a)

# Allocate GPU memory
a_gpu = cuda.mem_alloc(a.nbytes)
b_gpu = cuda.mem_alloc(b.nbytes)
c_gpu = cuda.mem_alloc(c.nbytes)

# Copy to GPU
cuda.memcpy_htod(a_gpu, a)
cuda.memcpy_htod(b_gpu, b)

# CPU timing
# perf_counter() is a monotonic, high-resolution clock; time.time() can be too
# coarse for intervals this short.
start = time.perf_counter()
c_cpu = a + b
cpu_time = time.perf_counter() - start
print(f"CPU Time: {cpu_time:.4f} sec")

# GPU timing
threads_per_block = 256
blocks_per_grid = (N + threads_per_block - 1) // threads_per_block
block_dim = (threads_per_block, 1, 1)
grid_dim = (blocks_per_grid, 1)

# Warm-up launch so that any one-time cost of the first launch is not
# attributed to the kernel itself.
vector_add(a_gpu, b_gpu, c_gpu, np.int32(N), block=block_dim, grid=grid_dim)
cuda.Context.synchronize()

# CUDA events are timestamped on the device, so the measurement covers the
# kernel only. NOTE: host<->device copies are NOT included in gpu_time, so the
# speedup below is a kernel-only figure.
start_event = cuda.Event()
stop_event = cuda.Event()
start_event.record()
vector_add(a_gpu, b_gpu, c_gpu, np.int32(N), block=block_dim, grid=grid_dim)
stop_event.record()
stop_event.synchronize()
gpu_time = start_event.time_till(stop_event) / 1000.0  # milliseconds -> seconds
print(f"GPU Time: {gpu_time:.4f} sec")

# Copy back
cuda.memcpy_dtoh(c, c_gpu)

# Verify correctness
print("Results match:", np.allclose(c, c_cpu))

# Speedup
# Guard the division: a zero elapsed time would otherwise raise
# ZeroDivisionError.
if gpu_time > 0:
    speedup = cpu_time / gpu_time
    print(f"Speedup = {speedup:.2f}x")
else:
    speedup = float("inf")
    print("Speedup = n/a (GPU time below timer resolution)")


CPU Time: 0.0170 sec
GPU Time: 0.0007 sec
Results match: True
Speedup = 23.89x


In [None]:
# SAVE AS invert_cupy_test.py
import numpy as np
import cupy as cp
import cv2
import time

# -------------------------
# Generate test image (grayscale gradient)
# -------------------------
H, W = 2048, 2048   # Adjust size if you want
# Horizontal 0..255 ramp repeated across the width. The modulo makes the
# uint8 wraparound explicit instead of relying on overflow when W > 256.
gradient_row = (np.arange(W) % 256).astype(np.uint8)
gray = np.tile(gradient_row, (H, 1))

cv2.imwrite("test_input.png", gray)
print(f"Generated test image: {H}x{W}")

# -------------------------
# CPU VERSION (explicit loop)
# -------------------------
# Deliberately a per-pixel Python loop to serve as the naive baseline; the
# vectorised NumPy equivalent would be `255 - gray`.
cpu_start = time.perf_counter()
cpu_inverted = np.empty_like(gray)
for i in range(H):
    for j in range(W):
        cpu_inverted[i, j] = 255 - gray[i, j]
cpu_time = time.perf_counter() - cpu_start
print(f"CPU loop inversion time: {cpu_time:.4f} sec")

# -------------------------
# GPU VERSION (CuPy RawKernel)
# -------------------------
# One thread per pixel on a 2-D grid; threads that fall outside the image
# (when W or H is not a multiple of the block size) do nothing.
invert_kernel = cp.RawKernel(r'''
extern "C" __global__
void invert(unsigned char* img, unsigned char* out, int H, int W) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    if (x < W && y < H) {
        int idx = y * W + x;
        out[idx] = 255 - img[idx];
    }
}
''', "invert")

threads = (16, 16)
blocks = ((W + threads[0] - 1) // threads[0],
          (H + threads[1] - 1) // threads[1])

# Warm-up on a 1x1 image: RawKernel is compiled on its first call, and that
# one-time cost should not be counted as inversion time.
warmup_in = cp.zeros((1, 1), dtype=cp.uint8)
warmup_out = cp.empty_like(warmup_in)
invert_kernel((1, 1), threads, (warmup_in, warmup_out, np.int32(1), np.int32(1)))
cp.cuda.Stream.null.synchronize()

# Timed region: host->device upload, kernel, device->host download.
gpu_start = time.perf_counter()

d_img = cp.array(gray)
d_out = cp.empty_like(d_img)

# CuPy passes a Python int as a 64-bit `long long` and does not validate
# argument types, so H and W are wrapped in np.int32 to match the kernel's
# `int` parameters.
invert_kernel(blocks, threads, (d_img, d_out, np.int32(H), np.int32(W)))

gpu_inverted = d_out.get()  # copies the result back to host memory
gpu_time = time.perf_counter() - gpu_start
print(f"GPU inversion time: {gpu_time:.4f} sec")

# -------------------------
# Verify correctness
# -------------------------
# Cast to int before subtracting so uint8 differences cannot wrap around.
diff = np.abs(cpu_inverted.astype(int) - gpu_inverted.astype(int)).sum()
print("Difference between CPU and GPU outputs:", diff)

# -------------------------
# Save results
# -------------------------
cv2.imwrite("cpu_inverted.png", cpu_inverted)
cv2.imwrite("gpu_inverted.png", gpu_inverted)
print("Saved test_input.png, cpu_inverted.png, and gpu_inverted.png")


Generated test image: 2048x2048
CPU loop inversion time: 5.6526 sec
GPU inversion time: 1.1336 sec
Difference between CPU and GPU outputs: 0
Saved test_input.png, cpu_inverted.png, and gpu_inverted.png
