In [None]:
# Testing the CUDA SpGEMM (sparse matrix-matrix product) implementation against the CPU one

In [None]:
import numpy as np
import torch
import numml.sparse as sp
import time
# Shortcut to the `numml_torch_cpp` attribute of numml.sparse (presumably the
# compiled extension backend -- TODO confirm). Not referenced by the cells
# visible in this part of the notebook.
sp_cpp = sp.numml_torch_cpp

In [None]:
# Larger Poisson test problem. The product under test is A @ B with B = -A.
# Assuming sp.eye follows numpy.eye's `k` convention (TODO confirm), A carries
# 2 on the main diagonal and -1 on the first sub- and super-diagonal.

N = 1024
A = (
    sp.eye(N) * 2        # main diagonal term
    - sp.eye(N, k=-1)    # k = -1 offset diagonal
    - sp.eye(N, k=1)     # k = +1 offset diagonal
)
B = (-A).copy()

In [None]:
# Transfer both operands to device 'cuda:0'; the names A and B stay bound to
# the CPU operands, which serve as the reference in the comparisons below.
A_c, B_c = A.to('cuda:0'), B.to('cuda:0')

In [None]:
# Sanity check: the GPU product must match the CPU product.
# Asserted (rather than merely displayed) so that a mismatch halts "Run All"
# instead of scrolling past as an unnoticed `False`.
# NOTE(review): only `.data` is compared -- presumably the stored nonzero
# values; this does not compare the sparsity pattern. Confirm that suffices.
assert torch.allclose((A_c @ B_c).cpu().data, (A @ B).data), \
    'CUDA spgemm result differs from the CPU result'

In [None]:
# Forward timing test: elapsed time of N_it products A @ B, CPU vs. GPU.

N_it = 300
print(f'Performing {N_it} sparse matmat (forward pass)')

# perf_counter() is a monotonic, high-resolution clock meant for interval
# timing; time.time() can jump when the system clock is adjusted.
t_start = time.perf_counter()
for i in range(N_it):
    C = A@B
t_cpu = time.perf_counter() - t_start
print('CPU time:', t_cpu)

# CUDA work is queued asynchronously: drain anything already pending before
# starting the clock, and synchronize again before stopping it, so the
# interval covers exactly the N_it products launched in the loop.
torch.cuda.synchronize()
t_start = time.perf_counter()
for i in range(N_it):
    C_c = A_c@B_c
torch.cuda.synchronize()
t_cuda = time.perf_counter() - t_start
print('GPU time:', t_cuda)
print()

In [None]:
# Gradient check: backpropagate sum(A @ B) on each device and compare the
# gradients that end up on the operands.

# GPU side
A_c.requires_grad = B_c.requires_grad = True
C_c = A_c @ B_c
C_c.sum().backward()

# CPU reference
A.requires_grad = B.requires_grad = True
C = A @ B
C.sum().backward()

# One boolean per line: first the comparison for A's grad, then for B's.
print(
    torch.allclose(A_c.grad.data.cpu(), A.grad.data),
    torch.allclose(B_c.grad.data.cpu(), B.grad.data),
    sep='\n',
)

In [None]:
# Backward timing test: elapsed time of N_it forward products followed by a
# backward pass through sum(A @ B), CPU vs. GPU.
# Relies on requires_grad having been enabled on A, B, A_c and B_c by an
# earlier cell. NOTE(review): repeated backward() calls presumably accumulate
# into `.grad` (standard autograd behaviour); this cell only measures time.

N_it = 100
print(f'Performing {N_it} sparse matmat (backward pass)')

# perf_counter() is a monotonic, high-resolution clock meant for interval
# timing; time.time() can jump when the system clock is adjusted.
t_start = time.perf_counter()
for i in range(N_it):
    C = A@B
    C.sum().backward()
t_cpu = time.perf_counter() - t_start
print('CPU time:', t_cpu)

# CUDA work is queued asynchronously: drain anything already pending before
# starting the clock, and synchronize again before stopping it, so the
# interval covers exactly the work launched in the loop.
torch.cuda.synchronize()
t_start = time.perf_counter()
for i in range(N_it):
    C_c = A_c@B_c
    C_c.sum().backward()
torch.cuda.synchronize()
t_cuda = time.perf_counter() - t_start
print('GPU time:', t_cuda)
print()