In [4]:
import time
import cupy as cp
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
from numba import jit

num_gpus = cp.cuda.runtime.getDeviceCount()
print('CUPY GPUs', num_gpus)

start = time.time()
# Create two random arrays on the GPU
a = cp.random.rand(10, 10)
b = cp.random.rand(10, 10)
# Perform element-wise multiplication
c = a * b
# Transfer the result back to the host (CPU)
c_host = cp.asnumpy(c)
end = time.time()
print('CUPY', end-start)

print("PyCUDA GPUs", cuda.Device.count())
start = time.time()
# Create a random array on the host (CPU)
a = np.random.randn(10, 10)
# Allocate memory on the GPU
a_gpu = cuda.mem_alloc(a.nbytes)
# Transfer the data to the GPU
cuda.memcpy_htod(a_gpu, a)
# Define a simple CUDA kernel
mod = SourceModule("""
__global__ void doublify(float *a) {
    int idx = threadIdx.x + threadIdx.y * 4;
    a[idx] *= 2;
}
""")
# Get the kernel function
func = mod.get_function("doublify")
# Execute the kernel
func(a_gpu, block=(10, 10, 1))
# Transfer the result back to the host (CPU)
a_doubled = np.empty_like(a)
cuda.memcpy_dtoh(a_doubled, a_gpu)
end = time.time()
print('PyCuda', end-start)

start = time.time()
# Create two random arrays on the GPU
a = np.random.rand(10, 10)
b = np.random.rand(10, 10)
# Perform element-wise multiplication
c = a * b
# Transfer the result back to the host (CPU)
end = time.time()
print('Numpy', end-start)

@jit(nopython=True)
def mult():
    a = np.random.rand(10, 10)
    b = np.random.rand(10, 10)
    c = a * b
    return c
    
start = time.time()
mult()
end = time.time()
print('Numba', end-start)

CUPY GPUs 1
CUPY 0.0
PyCUDA GPUs 1
PyCuda 0.00304412841796875
Numpy 0.0
Numba 0.2577366828918457
