# CuPy vs NumPy einsum performance

In [1]:
import cupy as cp
import numpy as np

In [2]:
# Side length of the square benchmark matrices; the batched inputs further
# down also use it for their trailing dimensions.
nsize = 500

In [3]:
# Host-side operands: two independent (nsize, nsize) float32 matrices.
a = np.random.rand(nsize, nsize).astype(np.float32)
b = np.random.rand(nsize, nsize).astype(np.float32)

# Device-side counterparts holding the same values, used by the CuPy timings.
a_cupy, b_cupy = cp.asarray(a), cp.asarray(b)

CuPy

In [4]:
%timeit cp.einsum('ij,jk->ijk',a_cupy,b_cupy)
%timeit cp.einsum('ij,ik->ijk',a_cupy,a_cupy)

NumPy

In [5]:
# CPU baseline: the same two einsum expressions as in the CuPy cell, evaluated
# on the host arrays. Neither expression sums over an index, so each call
# materialises an (nsize, nsize, nsize) array of pairwise products.
%timeit np.einsum('ij,jk->ijk',a,b)
%timeit np.einsum('ij,ik->ijk',a,a)

## SoftmaxTimesVector (one of the functions in crysx_nn)

In [10]:
# Batched operands: 32 matrices of shape (nsize, nsize) and 32 vectors of
# length nsize, all float32.
# NOTE: this rebinds a, b, a_cupy and b_cupy, so the 2-D einsum cells above
# must not be re-run after this cell without re-running their setup first.
a = np.random.rand(32, nsize, nsize).astype(np.float32)
b = np.random.rand(32, nsize).astype(np.float32)

# Device-side copies for the CuPy implementation.
a_cupy, b_cupy = cp.asarray(a), cp.asarray(b)

In [11]:
from numba import njit, prange, cuda

In [12]:
def softmaxTimesVector_cupy(a,b):
    """Batched matrix-vector product evaluated with CuPy.

    For every batch index ``i`` this computes ``a[i] @ b[i]`` via
    ``einsum('ijk,ik->ij')``.

    Parameters
    ----------
    a : 3-D array, indexed as (i, j, k).
    b : 2-D array, indexed as (i, k); its axes must match the first and
        last axes of ``a``.

    Returns
    -------
    2-D array indexed as (i, j).

    Notes
    -----
    Reference:
    https://stackoverflow.com/questions/59289754/numpy-multiply-3d-array-with-2d-array
    The original author noted that ``(a @ b[..., cp.newaxis])[..., 0]`` is an
    equally fast alternative that gives the same result.
    """
    return cp.einsum('ijk,ik->ij', a, b)

In [16]:


%timeit softmaxTimesVector_cupy(a_cupy,b_cupy)

In [17]:
@njit(cache=False, fastmath=True, parallel=False)  # the author found this faster without the parallel flag
def softmaxTimesVector(a,b):
    """Batched matrix-vector product compiled with Numba.

    For every index ``i`` along the first axis, stores ``np.dot(a[i], b[i])``
    in row ``i`` of the result.

    Parameters
    ----------
    a : 3-D array; ``a[i]`` is the matrix for batch entry ``i``.
    b : 2-D array; ``b[i]`` is the vector for batch entry ``i``.

    Returns
    -------
    float32 array of shape ``(a.shape[0], a.shape[1])``.
    """
    n_batch = a.shape[0]
    n_rows = a.shape[1]
    output = np.zeros((n_batch, n_rows), dtype=np.float32)
    # prange is kept so the loop is ready for parallel=True; see the Numba
    # documentation for how prange behaves when parallel is disabled.
    for batch in prange(n_batch):
        output[batch] = np.dot(a[batch], b[batch])
    return output

In [18]:
# Times the Numba version on the host arrays. NOTE(review): the first call of
# an @njit function triggers JIT compilation, which may skew the first timing
# loop — consider a warm-up call before timing.
%timeit softmaxTimesVector(a,b)