In [14]:
import sys
import os
import numpy as np
from numba import vectorize, cuda
from numba.cuda.cudadrv.error import CudaDriverError
import scipy.linalg.blas as blas
import pyculib.blas as cublas
import math
import pandas as pd
import torch
import pyculib
from pyculib.blas import Blas
from utils import (get_number_processors, get_ram_memory, get_total_gpu_memory, 
                   get_gpu_name, get_cuda_version, get_cudnn_version, AttributeDict,
                   get_object_size, clear_memory_all_gpus)

print("System version: {}".format(sys.version))
print("Numpy version: {}".format(np.__version__))
print("Pandas version: {}".format(pd.__version__))
print("PyTorch version: {}".format(torch.__version__))
print("Pyculib version: {}".format(pyculib.__version__))
print("BLAS info:") 
print(np.show_config())

%load_ext autoreload
%autoreload 2

System version: 3.6.7 |Anaconda, Inc.| (default, Oct 23 2018, 19:16:44) 
[GCC 7.3.0]
Numpy version: 1.16.0
Pandas version: 0.23.4
PyTorch version: 1.0.0
Pyculib version: 1.0.2+7.g9744803
BLAS info:
blas_mkl_info:
  NOT AVAILABLE
blis_info:
  NOT AVAILABLE
openblas_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None)]
blas_opt_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None)]
lapack_mkl_info:
  NOT AVAILABLE
openblas_lapack_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None)]
lapack_opt_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None)]
None
The autoreload extension is already loaded. To reload it, use:
  %reload_ext auto

In [9]:
a=np.array([[1,2],[3,4]],dtype=np.float32)
b=np.array([[1,1],[2,2]],dtype=np.float32)

In [22]:
def np_matmul(a,b):
    return np.dot(a,b)

def pytorch_matmul(a,b):
    at = torch.as_tensor(a).cuda() 
    bt = torch.as_tensor(b).cuda()
    return torch.mm(at,bt)

def pyculib_matmul(a, b):
    A_d = cuda.to_device(a)
    B_d = cuda.to_device(b)
    return cublas.gemm("N", "N", 1.0, A_d, B_d)



In [11]:
np_matmul(a,b)

array([[ 5.,  5.],
       [11., 11.]], dtype=float32)

In [12]:
pytorch_matmul(a,b)

tensor([[ 5.,  5.],
        [11., 11.]], device='cuda:0')

In [23]:
pyculib_matmul(a, b)

array([[ 5.,  5.],
       [11., 11.]])

In [21]:


A = np.random.randn(3, 3)
B = np.random.randn(3, 3)

C = blas.sgemm(1.0, A, B)
print(C)

A_d = cuda.to_device(A)
B_d = cuda.to_device(B)

C_d = cublas.gemm("N", "N", 1.0, A_d, B_d)
print(C_d)
#C_h = np.zeros((3, 3), dtype=np.float64)
#C_d.copy_to_host(C_h)
#print(C_h)

[[-0.36082494 -0.32567266  0.02785533]
 [ 3.0361838   1.5626514   4.9736047 ]
 [-1.3531643  -0.5507817  -3.2932088 ]]
[[-0.36082489 -0.32567268  0.02785538]
 [ 3.03618366  1.56265139  4.97360477]
 [-1.35316433 -0.55078167 -3.29320896]]


In [24]:
#https://nyu-cds.github.io/python-numba/05-cuda/
#https://stackoverflow.com/questions/36526708/comparing-python-numpy-numba-and-c-for-matrix-multiplication
#http://jiajiamomomo.blogspot.com/2017/04/running-numba-example-of-matrix.html
#http://numba.pydata.org/numba-doc/0.17.0/cuda/examples.html

