In [6]:
import cupy as cp
from cupy import cuda
from cupy.cuda import cublas

# Initialize cuBLAS handle
handle = cublas.create()

# Example setup: Create a list of blocks with varying dimensions (A, B matrices)
blocks = [
    (cp.random.randn(64, 32).astype(cp.float16), cp.random.randn(32, 64).astype(cp.float16)),
    (cp.random.randn(128, 64).astype(cp.float16), cp.random.randn(64, 128).astype(cp.float16)),
    (cp.random.randn(256, 128).astype(cp.float16), cp.random.randn(128, 256).astype(cp.float16)),
    (cp.random.randn(512, 256).astype(cp.float16), cp.random.randn(256, 512).astype(cp.float16)),
    # Add more blocks as needed...
]

# Create a list of output matrices
outputs = [cp.zeros((A.shape[0], B.shape[1]), dtype=cp.float16) for A, B in blocks]

# Define CUDA streams dynamically
num_blocks = len(blocks)
streams = [cp.cuda.Stream() for _ in range(num_blocks)]  # Create a stream for each block

# Define GEMM parameters
alpha = 1.0
beta = 0.0
transA = cublas.CUBLAS_OP_N  # No transpose for A
transB = cublas.CUBLAS_OP_N  # No transpose for B

# Function to dispatch GEMM for each block in parallel
def dispatch_gemm(i, A, B, C, stream):
    with stream:
        cublas.hgemm(handle, transA, transB, A.shape[0], B.shape[1], A.shape[1], 
                     alpha, A.data.ptr, A.shape[0], B.data.ptr, B.shape[0], 
                     beta, C.data.ptr, C.shape[0])

# Dispatch work for each block in parallel using streams
for i in range(num_blocks):
    A, B = blocks[i]
    C = outputs[i]
    dispatch_gemm(i, A, B, C, streams[i])

# Synchronize all streams to ensure all GEMMs are finished
for stream in streams:
    stream.synchronize()

# At this point, `outputs` contains the results of all matrix multiplications
for C in outputs:
    print(C)


AttributeError: module 'cupy_backends.cuda.libs.cublas' has no attribute 'hgemm'