In [1]:
# Shell out to nvidia-smi to record the GPU, driver and CUDA version used for the timings below.
!nvidia-smi

Thu Oct 20 17:08:59 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.56.06    Driver Version: 516.94       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
| 30%   30C    P8    14W / 170W |   1550MiB / 12288MiB |     30%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import numpy as np
from cupyx.scipy import linalg
from scipy.sparse import csr_matrix
from cupyx.scipy.sparse import csr_matrix as csr_gpu
import cupy as cp

In [3]:
# Benchmark dimensions: REP_N matrix-vector products against an
# ARRAY_SIZE x ARRAY_SIZE matrix.
REP_N = 1000
ARRAY_SIZE = 1000

# Build the operands on the host with vectorised outer operations (instead of
# nested Python loops over every element), then upload them to the GPU.
# A[i, j] = i + j
A = cp.asarray(
    np.add.outer(np.arange(ARRAY_SIZE), np.arange(ARRAY_SIZE)).astype(np.float32))
# x[i, j] = i - j; row i is the vector multiplied in repetition i.
x = cp.asarray(
    np.subtract.outer(np.arange(REP_N), np.arange(ARRAY_SIZE)).astype(np.float32))
# Output buffer on the GPU, one result row per repetition (left uninitialised).
y = cp.empty((REP_N, ARRAY_SIZE), np.float32)

In [4]:
%%timeit
for i in range(REP_N):
    y[i] = cp.dot(A, x[i])

29.7 ms ± 5.47 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
# Pull the matrix, the input vectors and the output buffer back to the host as
# NumPy arrays for the CPU baseline.
A_cpu, x_cpu, y_cpu = (cp.asnumpy(gpu_array) for gpu_array in (A, x, y))

In [6]:
%%timeit
for i in range(REP_N):
    y_cpu[i] = np.dot(A_cpu, x_cpu[i])

11.7 ms ± 125 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [7]:
class XorShift32():
    """Minimal 32-bit xorshift pseudo-random number generator.

    The state is a single integer kept in the range [0, 0xFFFFFFFF] and
    advanced with the shift triple (13, 17, 5).
    """

    # Keeps every intermediate value within 32 bits (Python ints are unbounded).
    _MASK = 0xFFFFFFFF

    def __init__(self, seed=42):
        """Create a generator whose initial state is ``seed`` (see ``set``)."""
        self.set(seed)

    def set(self, seed):
        """Reset the generator state to ``seed``, truncated to 32 bits.

        Truncation guarantees that ``next()`` never returns a value above
        0xFFFFFFFF, even when a wider seed is supplied. A seed that is
        0 modulo 2**32 is a fixed point: every ``next()`` call returns 0.
        """
        self.x = seed & self._MASK

    def next(self):
        """Advance the state by one step and return it.

        Returns:
            int: the new state, in the range [0, 0xFFFFFFFF].
        """
        state = self.x
        # Left shifts can overflow 32 bits, so they are masked; the right
        # shift cannot grow the value and needs no mask.
        state ^= (state << 13) & self._MASK
        state ^= state >> 17
        state ^= (state << 5) & self._MASK
        self.x = state
        return state

# Fixed-seed generator so the sparsity pattern is the same on every run.
xor32 = XorShift32(42)

# Target fraction of the entries of A that are kept in the sparse matrix.
ratio = 0.05

# Coordinate-format triplets (value, row index, column index) of kept entries.
data = []
row = []
col = []
A_cpu = A.get()
for i in range(ARRAY_SIZE):
    for j in range(ARRAY_SIZE):
        # One draw per entry, scaled by the 32-bit maximum; the entry is kept
        # when the scaled draw falls below `ratio`.
        if (float(xor32.next()) / float(0xFFFFFFFF)) < ratio:
            row.append(i)
            col.append(j)
            data.append(A_cpu[i, j])

data_nd = cp.asarray(data)
row_nd = cp.asarray(row)
col_nd = cp.asarray(col)

# Sparse (CSR) matrix on the GPU and the output buffer for its products.
A_sparsed = csr_gpu((data_nd, (row_nd, col_nd)), shape=(ARRAY_SIZE, ARRAY_SIZE), dtype=np.float32)
y_sparsed = cp.empty((REP_N, ARRAY_SIZE), np.float32)

# Same matrix stored densely on the GPU, for a dense-kernel comparison.
A_sparsed_dense = A_sparsed.todense()
y_sparsed_dense = cp.empty((REP_N, ARRAY_SIZE), np.float32)

# Host copy of the densified matrix for the CPU comparison.
A_sparsed_dense_cpu = cp.asnumpy(A_sparsed_dense)
# Host output buffer with one row per repetition. Allocate it directly rather
# than copying the matrix, so its shape is right even when REP_N != ARRAY_SIZE.
y_sparsed_cpu = np.empty((REP_N, ARRAY_SIZE), np.float32)

In [8]:
%%timeit
for i in range(REP_N):
    y_sparsed[i] = A_sparsed.dot(x[i])

65.1 ms ± 6.28 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [9]:
%%timeit
for i in range(REP_N):
    y_sparsed_dense[i] = cp.dot(A_sparsed_dense, x[i])

27.3 ms ± 1.84 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [10]:
%%timeit
for i in range(REP_N):
    y_sparsed_cpu[i] = cp.dot(A_sparsed_dense_cpu, x_cpu[i])

11.7 ms ± 177 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
