Example from a [Stack Overflow question](http://stackoverflow.com/questions/7409108/portfolio-variance-of-a-portfolio-of-n-assets-in-python) on computing the variance of a portfolio of N assets in Python.

In [6]:
import numpy as np

# Portfolio weights, one per asset, shaped as a (10, 1) float32 column vector.
weights = np.array(
    [0.09, 0.05, 0.15, 0.10, 0.15, 0.15, 0.08, 0.08, 0.10, 0.05],
    dtype=np.float32,
).reshape(-1, 1)

# 10 x 10 covariance matrix for the same assets, held as a float32 np.matrix
# so that `*` against it is a matrix product rather than element-wise.
covar = np.matrix(
    [
        [0.00154474, 0.00079555, 0.00099691, 0.00052596, 0.0005363,  0.00062005, 0.00064031, 0.00037494, 0.00018826, 0.00132809],
        [0.00079555, 0.00287429, 0.00058536, 0.00091774, 0.00046885, 0.00110434, 0.00137141, 0.00046724, 0.00030414, 0.0016615],
        [0.00099691, 0.00058536, 0.00155757, 0.00056336, 0.00052395, 0.00060104, 0.00057223, 0.00021365, 0.00017057, 0.00130247],
        [0.00052596, 0.00091774, 0.00056336, 0.00126312, 0.00031941, 0.00088137, 0.00024493, 0.00025136, 0.00011519, 0.00135475],
        [0.0005363,  0.00046885, 0.00052395, 0.00031941, 0.00054093, 0.00045649, 0.00042927, 0.00021928, 0.00016835, 0.00093471],
        [0.00062005, 0.00110434, 0.00060104, 0.00088137, 0.00045649, 0.00133081, 0.00060353, 0.0003967,  0.00024983, 0.00168281],
        [0.00064031, 0.00137141, 0.00057223, 0.00024493, 0.00042927, 0.00060353, 0.00468731, 0.00059557, 0.00020384, 0.00078669],
        [0.00037494, 0.00046724, 0.00021365, 0.00025136, 0.00021928, 0.0003967,  0.00059557, 0.00082333, 0.00017191, 0.00066816],
        [0.00018826, 0.00030414, 0.00017057, 0.00011519, 0.00016835, 0.00024983, 0.00020384, 0.00017191, 0.00036348, 0.0004505],
        [0.00132809, 0.0016615,  0.00130247, 0.00135475, 0.00093471, 0.00168281, 0.00078669, 0.00066816, 0.0004505,  0.00530036],
    ],
    dtype=np.float32,
)


In [7]:
# Portfolio variance as the quadratic form w^T * covar * w. Wrapping the
# weights in np.matrix makes every `*` below a matrix product, so the cell
# displays a 1 x 1 matrix.
np.matrix(weights).T * covar * weights

matrix([[ 0.00064654]], dtype=float32)

In [40]:
from numba import vectorize
#from numba import jit
#@jit(nopython=True)

@vectorize(["float32(float32, float32)"], target='cuda')
def VectorAdd(weights, covar):
    """Element-wise CUDA ufunc that currently returns its first operand unchanged.

    The signature declares two float32 scalar inputs and a float32 result.
    ``covar`` is accepted but not used by the body, so despite the name no
    addition is performed.
    """
    # Earlier attempt, kept for reference.
    # NOTE(review): with the scalar float32 signature above, `.T` / `.dot`
    # presumably cannot apply to the arguments here -- confirm the intended
    # computation before re-enabling.
    #return weights.T.dot(covar.dot(weights))
    return weights

CudaAPIError: [700] Call to cuLinkCreate results in CUDA_ERROR_LAUNCH_FAILED

In [33]:
# Apply the CUDA ufunc to the portfolio weights and covariance matrix.
# The recorded run of this cell failed with the CudaAPIError shown in its output.
VectorAdd(weights, covar)

CudaAPIError: [700] Call to cuMemcpyDtoH results in CUDA_ERROR_LAUNCH_FAILED

In [64]:
from numba import *
from timeit import default_timer as time

# Problem size: square n x n operands with n = bpg * tpb
# (presumably blocks-per-grid times threads-per-block).
bpg = 50
tpb = 32
n = bpg * tpb

# Seeded local generator so the benchmark inputs are identical on every
# Restart & Run All, without touching NumPy's global random state.
SEED = 0
rng = np.random.RandomState(SEED)

# Uniform [0, 1) operands stored as float32; C is an uninitialised buffer of
# the same shape and dtype that later receives the GPU result.
A = rng.random_sample((n, n)).astype(np.float32)
B = rng.random_sample((n, n)).astype(np.float32)
C = np.empty_like(A)

print("N = %d x %d" % (n, n))

# Host compute: np.matrix operands make `*` a matrix product.
Amat = np.matrix(A)
Bmat = np.matrix(B)

# Time only the multiplication itself, not the setup above.
s = time()
Cans = Amat * Bmat
e = time()
tcpu = e - s



#@cuda.jit(argtypes=[f4[:,:], f4[:,:], f4[:,:]])
@guvectorize(["void(float32[:,:], float32[:,:], float32[:,:])"], '(n,m),(m,n)->(n,n)', target='cuda')
def cu_square_matrix_mul(A, B, C):
    """Write the matrix product of ``A`` and ``B`` into ``C``.

    Parameters
    ----------
    A : float32[:, :]
        Left operand, shape (n, m) per the layout string.
    B : float32[:, :]
        Right operand, shape (m, n).
    C : float32[:, :]
        Output, shape (n, n); every element is overwritten.

    The previous body stored ``A[i, j] + B[i, j]`` (an element-wise sum),
    which disagreed with the function name, with the declared
    (n,m),(m,n)->(n,n) layout, and with the host reference ``Amat * Bmat``;
    it also indexed A and B with C's extents, which is out of bounds when
    m < n.

    NOTE(review): with plain 2-D inputs there is no extra loop dimension, so
    the whole product is presumably computed by a single core invocation --
    confirm this is acceptable for the intended benchmark.
    """
    inner = A.shape[1]  # shared dimension m
    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            # Accumulate directly in the float32 output, as the commented-out
            # kernel below does, so the arithmetic stays in float32.
            C[i, j] = 0
            for k in range(inner):
                C[i, j] += A[i, k] * B[k, j]
            
#     tx = cuda.threadIdx.x
#     ty = cuda.threadIdx.y
#     bx = cuda.blockIdx.x
#     by = cuda.blockIdx.y
#     bw = cuda.blockDim.x
#     bh = cuda.blockDim.y

#     x = tx + bx * bw
#     y = ty + by * bh

#     if x >= n or y >= n:
#         return

#     C[y, x] = 0
#     for i in range(n):
#         C[y, x] += A[y, i] * B[i, x]

# Import `cuda` explicitly rather than relying on the wildcard import to bind it.
from numba import cuda

# Time the full GPU round trip: host->device copies, the computation, and the
# device->host copy of the result.
s = time()
stream = cuda.stream()
with stream.auto_synchronize():  # blocks on exit until all queued work is done
    dA = cuda.to_device(A, stream)
    dB = cuda.to_device(B, stream)
    # C carries no input data, so allocate the output on the device instead
    # of uploading an uninitialised host buffer.
    dC = cuda.device_array_like(C, stream=stream)
    # A guvectorize-built gufunc is called like a function; the
    # [(grid), (block), stream] launch-configuration subscript only applies
    # to @cuda.jit kernels. `out=` keeps the result on the device and
    # `stream=` queues the work on the same stream as the copies.
    cu_square_matrix_mul(dA, dB, out=dC, stream=stream)
    # Copy the result back into the host buffer C.
    dC.copy_to_host(C, stream=stream)

e = time()
tcuda = e - s

# Display CPU vs. GPU wall-clock seconds.
tcpu, tcuda

N = 1600 x 1600


CudaAPIError: [700] Call to cuLinkCreate results in CUDA_ERROR_LAUNCH_FAILED