# Benchmark of Matrix Multiplications
In this benchmark we compare several element-wise matrix operations implemented with numpy, numexpr and numba (CPU & GPU).

In [1]:
import sys
import os
import numpy as np
import numexpr as ne
from numba import vectorize
import math
from functools import reduce
import pandas as pd
import bokeh
from utils import (get_number_processors, get_ram_memory, get_total_gpu_memory, 
                   get_gpu_name, get_cuda_version, get_cudnn_version, AttributeDict,
                   get_object_size)

# Report the interpreter and library versions this benchmark was run with.
for label, version in (
    ("System", sys.version),
    ("Numpy", np.__version__),
    ("Pandas", pd.__version__),
    ("Numexpr", ne.__version__),
):
    print("{} version: {}".format(label, version))


%load_ext autoreload
%autoreload 2

System version: 3.6.7 |Anaconda, Inc.| (default, Oct 23 2018, 19:16:44) 
[GCC 7.3.0]
Numpy version: 1.16.0
Pandas version: 0.23.4
Numexpr version: 2.6.9


## Helper functions with numpy

In [2]:
def multiply(a,b):
    """Return ``a*b``.

    For numpy arrays ``*`` is the element-wise product, not the matrix
    product. This is the numpy baseline of the ``a*b`` benchmark.
    """
    return a*b

def exponential(a, b):
    """Return ``a*exp(b)`` computed with numpy (element-wise for arrays)."""
    return a*np.exp(b)

def sine(a, b):
    """Return ``a*sin(b)`` computed with numpy (element-wise for arrays)."""
    return a*np.sin(b)

# Generic variant that multiplies an arbitrary number of matrices. It was
# measured to be 28% slower than writing the product out explicitly, so the
# benchmark does not use it; it is kept here only for reference.
def multiply_list(l):
    """Return the product of all items of ``l``, folded from left to right."""
    def _times(partial, factor):
        return partial * factor

    return reduce(_times, l)

def multiply3(a, b, c):
    """Return ``a*b*c`` (element-wise for numpy arrays), evaluated left to right."""
    return a*b*c

def multiply5(a, b, c, d, e):
    """Return ``a*b*c*d*e`` (element-wise for numpy arrays), evaluated left to right."""
    return a*b*c*d*e

def exponential_sine(a, b, c):
    """Return ``a*exp(b)*sin(c)`` computed with numpy (element-wise for arrays)."""
    return a*np.exp(b)*np.sin(c)

## Helper functions with numexpr

In [3]:
def ne_multiply(a,b):
    """Return ``a*b`` evaluated by numexpr.

    No variable dictionary is passed to ``ne.evaluate``, so the names in the
    expression string are resolved from this function's scope: the parameter
    names must stay in sync with the expression.
    """
    return ne.evaluate("a*b")

def ne_exponential(a, b):
    """Return ``a*exp(b)`` evaluated by numexpr.

    The expression string refers to the parameters by name, so the parameter
    names must stay in sync with the expression.
    """
    return ne.evaluate("a*exp(b)")

def ne_sine(a, b):
    """Return ``a*sin(b)`` evaluated by numexpr.

    The expression string refers to the parameters by name, so the parameter
    names must stay in sync with the expression.
    """
    return ne.evaluate("a*sin(b)")

def ne_multiply3(a, b, c):
    """Return ``a*b*c`` evaluated by numexpr.

    The expression string refers to the parameters by name, so the parameter
    names must stay in sync with the expression.
    """
    return ne.evaluate("a*b*c")

def ne_multiply5(a, b, c, d, e):
    """Return ``a*b*c*d*e`` evaluated by numexpr.

    The expression string refers to the parameters by name, so the parameter
    names must stay in sync with the expression.
    """
    return ne.evaluate("a*b*c*d*e")

def ne_exponential_sine(a, b, c):
    """Return ``a*exp(b)*sin(c)`` evaluated by numexpr.

    The expression string refers to the parameters by name, so the parameter
    names must stay in sync with the expression.
    """
    return ne.evaluate("a*exp(b)*sin(c)")


## Helper functions for numba
NOTE: For the numba solutions, preallocating an empty result array speeds things up by around 10%:
```
r0 = np.empty((S1, S2), dtype=np.int16)
r0 = multicpu(a, b)
```
source: https://devblogs.nvidia.com/numba-python-cuda-acceleration/

In [4]:
# numba ufunc built for the CPU target: a*b on int16 inputs, int16 result.
@vectorize(["int16(int16, int16)"], target="cpu")
def multicpu(a, b):
    return a * b

# numba ufunc built for the CUDA (GPU) target: a*b on int16 inputs, int16 result.
@vectorize(["int16(int16, int16)"], target="cuda")
def multicuda(a, b):
    return a * b

# numba ufunc built for the CPU target: a*b on float32 inputs, float32 result.
@vectorize(["float32(float32, float32)"], target="cpu")
def multfcpu(a, b):
    return a * b

# numba ufunc built for the CUDA (GPU) target: a*b on float32 inputs, float32 result.
@vectorize(["float32(float32, float32)"], target="cuda")
def multfcuda(a, b):
    return a * b

# numba ufunc built for the CPU target: a*exp(b) on float32 inputs.
# The scalar math.exp is used because the body is written per element.
@vectorize(["float32(float32, float32)"], target="cpu")
def expcpu(a, b):
    return a*math.exp(b)

# numba ufunc built for the CUDA (GPU) target: a*exp(b) on float32 inputs.
# The scalar math.exp is used because the body is written per element.
@vectorize(["float32(float32, float32)"], target="cuda")
def expcuda(a, b):
    return a*math.exp(b)

# numba ufunc built for the CPU target: a*sin(b) on float32 inputs.
# The scalar math.sin is used because the body is written per element.
@vectorize(["float32(float32, float32)"], target="cpu")
def sincpu(a, b):
    return a*math.sin(b)

# numba ufunc built for the CUDA (GPU) target: a*sin(b) on float32 inputs.
# The scalar math.sin is used because the body is written per element.
@vectorize(["float32(float32, float32)"], target="cuda")
def sincuda(a, b):
    return a*math.sin(b)

# numba ufunc built for the CPU target: a*b*c on float32 inputs, float32 result.
@vectorize(["float32(float32, float32, float32)"], target="cpu")
def multfcpu3(a, b, c):
    return a * b * c

# numba ufunc built for the CUDA (GPU) target: a*b*c on float32 inputs, float32 result.
@vectorize(["float32(float32, float32, float32)"], target="cuda")
def multfcuda3(a, b, c):
    return a * b * c

# numba ufunc built for the CPU target: a*b*c*d*e on float32 inputs, float32 result.
@vectorize(["float32(float32, float32, float32, float32, float32)"], target="cpu")
def multfcpu5(a, b, c, d, e):
    return a * b * c * d * e

# numba ufunc built for the CUDA (GPU) target: a*b*c*d*e on float32 inputs, float32 result.
@vectorize(["float32(float32, float32, float32, float32, float32)"], target="cuda")
def multfcuda5(a, b, c, d, e):
    return a * b * c * d * e

# numba ufunc built for the CPU target: a*exp(b)*sin(c) on float32 inputs.
# The scalar math functions are used because the body is written per element.
@vectorize(["float32(float32, float32, float32)"], target="cpu")
def expsincpu(a, b, c):
    return a*math.exp(b)*math.sin(c)

# numba ufunc built for the CUDA (GPU) target: a*exp(b)*sin(c) on float32 inputs.
# The scalar math functions are used because the body is written per element.
@vectorize(["float32(float32, float32, float32)"], target="cuda")
def expsincuda(a, b, c):
    return a*math.exp(b)*math.sin(c)

## Parameters

In [5]:
# (rows, columns) of the matrices to benchmark, ordered from smallest to largest.
size_combinations = [
    (100, 100),
    (1_000, 1_000),
    (10_000, 10_000),
    (100_000, 10_000),
    (100_000, 100_000),
]

In [6]:
# Column layout shared by every result table: the machine description first,
# then the benchmark case, then one timing column per library.
_machine_columns = ["n_processors", "cpu_memory", "gpu_name", "gpu_memory"]
_case_columns = ["data_type", "size1", "size2", "operation"]
_timing_columns = ["numpy", "numexpr", "numba_cpu", "numba_gpu"]
columns = _machine_columns + _case_columns + _timing_columns

In [7]:
# Describe the machine once; these four values prefix every result row and
# line up with the first four entries of `columns`.
n_processors = get_number_processors()
cpu_memory = get_ram_memory(units="Gb")
# The helpers are indexed with [0]: only the first reported GPU is recorded.
gpu_names = get_gpu_name()
gpu_memories = get_total_gpu_memory(units="Gb")
gpu_name, gpu_memory = gpu_names[0], gpu_memories[0]
header = [n_processors, cpu_memory, gpu_name, gpu_memory]

In [8]:
# Prefix of every result file: the GPU name with spaces turned into dashes.
filebase = "-".join(gpu_name.split(" "))
filebase

'Tesla-V100-PCIE-16GB'

In [9]:
# Directory that receives the result CSV files; created on first use and
# left untouched when it already exists.
folder = "data"
os.makedirs(name=folder, exist_ok=True)

## Data

In [10]:
def factors_int(s1=100, s2=100):
    """Return two random int16 matrices of shape ``(s1, s2)``.

    ``a`` holds integers in [1, 5) and ``b`` integers in [1, 10), drawn in
    that order from numpy's global random state.
    """
    shape = (s1, s2)
    a = np.random.randint(low=1, high=5, size=shape, dtype=np.int16)
    b = np.random.randint(low=1, high=10, size=shape, dtype=np.int16)
    return a, b

def factors_float(s1=100, s2=100):
    """Return two float32 matrices of shape ``(s1, s2)``.

    Both are standard-normal samples drawn (``a`` first, then ``b``) from
    numpy's global random state and cast to float32.
    """
    shape = (s1, s2)
    a = np.random.standard_normal(shape).astype(np.float32)
    b = np.random.standard_normal(shape).astype(np.float32)
    return a, b

def factors_float3(s1=100, s2=100):
    """Return three float32 matrices of shape ``(s1, s2)``.

    ``a`` and ``b`` are standard-normal samples and ``c`` is uniform on
    [0, 10); they are drawn in that order from numpy's global random state.
    """
    shape = (s1, s2)
    a = np.random.standard_normal(shape).astype(np.float32)
    b = np.random.standard_normal(shape).astype(np.float32)
    c = np.random.uniform(low=0, high=10, size=shape).astype(np.float32)
    return a, b, c

def factors_float5(s1=100, s2=100):
    """Return five float32 matrices of shape ``(s1, s2)``.

    ``a`` and ``b`` are standard-normal samples; ``c``, ``d`` and ``e`` are
    uniform on [0, 10), [5, 15) and [0, 30) respectively. They are drawn in
    that order from numpy's global random state.
    """
    shape = (s1, s2)
    a = np.random.standard_normal(shape).astype(np.float32)
    b = np.random.standard_normal(shape).astype(np.float32)
    # The generator is consumed left to right, so c, d, e are drawn in order.
    c, d, e = (
        np.random.uniform(low=lo, high=hi, size=shape).astype(np.float32)
        for lo, hi in ((0, 10), (5, 15), (0, 30))
    )
    return a, b, c, d, e

Checking data sizes in Gb

In [11]:
# Print the size of one matrix of the largest benchmark shape, first for the
# integer data and then for the float data.
largest_s1, largest_s2 = size_combinations[-1]
for make_factors in (factors_int, factors_float):
    a, _ = make_factors(largest_s1, largest_s2)
    print(get_object_size(a, units="Gb"))

18.6264515966177
37.25290308892727


## Benchmark

#### Integer matrix multiplication

In [11]:
df = pd.DataFrame(columns=columns)
operation = "a*b"
for s1, s2 in size_combinations:
    a, b = factors_int(s1, s2)
    r1 = %timeit -o multiply(a,b)
    r2 = %timeit -o ne_multiply(a,b)
    r3 = %timeit -o multicpu(a,b)
    try:
        r4 = %timeit -o multicuda(a,b)
    except: # in case of Out Of Memory (OOM)
        r4 = AttributeDict()
        r4["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average]
    df.loc[len(df)] = row


2.1 µs ± 20.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
464 µs ± 18.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.89 µs ± 2.34 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
1.43 ms ± 4.47 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
213 µs ± 813 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
573 µs ± 8.45 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
214 µs ± 873 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
4.62 ms ± 17.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
89.5 ms ± 591 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
21.4 ms ± 420 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
88 ms ± 845 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
145 ms ± 2.36 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
900 ms ± 5.09 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
157 ms ± 7.82 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
927

In [12]:
# Write the integer timings to <folder>/<filebase>_<operation>_int.csv and
# show the table.
filename = "{}_{}_int.csv".format(filebase, operation)
csv_path = os.path.join(folder, filename)
df.to_csv(csv_path, index=False)
df

Unnamed: 0,n_processors,cpu_memory,gpu_name,gpu_memory,data_type,size1,size2,operation,numpy,numexpr,numba_cpu,numba_gpu
0,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.int16'>,100,100,a*b,2e-06,0.000464,2e-06,0.00143465
1,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.int16'>,1000,1000,a*b,0.000213,0.000573,0.000214,0.00461905
2,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.int16'>,10000,10000,a*b,0.08945,0.021387,0.088043,0.145048
3,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.int16'>,100000,10000,a*b,0.899621,0.157267,0.927014,1.43426
4,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.int16'>,100000,100000,a*b,8.922588,1.671576,8.855293,OOM


#### Float matrix multiplication

In [11]:
df = pd.DataFrame(columns=columns)
operation = "a*b"
for s1, s2 in size_combinations:
    a, b = factors_float(s1, s2)
    r1 = %timeit -o multiply(a,b)
    r2 = %timeit -o ne_multiply(a,b)
    r3 = %timeit -o multfcpu(a,b)
    try:
        r4 = %timeit -o multfcuda(a,b)
    except: # in case of Out Of Memory (OOM)
        r4 = AttributeDict()
        r4["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average]
    df.loc[len(df)] = row

2.87 µs ± 3.39 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
470 µs ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.06 µs ± 11.6 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
1.43 ms ± 106 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
399 µs ± 730 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
577 µs ± 20.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
423 µs ± 713 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
5.11 ms ± 14.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
177 ms ± 914 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
31.8 ms ± 455 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
176 ms ± 822 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
279 ms ± 36 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.74 s ± 6.37 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
292 ms ± 10.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.73 s ± 5.57 

In [12]:
# Write the float timings to <folder>/<filebase>_<operation>_float.csv and
# show the table.
filename = "{}_{}_float.csv".format(filebase, operation)
csv_path = os.path.join(folder, filename)
df.to_csv(csv_path, index=False)
df

Unnamed: 0,n_processors,cpu_memory,gpu_name,gpu_memory,data_type,size1,size2,operation,numpy,numexpr,numba_cpu,numba_gpu
0,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100,100,a*b,3e-06,0.00047,3e-06,0.0014349
1,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,1000,1000,a*b,0.000399,0.000577,0.000423,0.00511176
2,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,10000,10000,a*b,0.177347,0.031833,0.175587,0.279195
3,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,10000,a*b,1.735279,0.291996,1.732133,2.81469
4,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,100000,a*b,17.586249,2.701125,17.506866,OOM


#### Exponential matrix multiplication

In [13]:
df = pd.DataFrame(columns=columns)
operation = "a*exp(b)"
for s1, s2 in size_combinations:
    a, b = factors_float(s1, s2)
    r1 = %timeit -o multiply(a,b)
    r2 = %timeit -o ne_multiply(a,b)
    r3 = %timeit -o multfcpu(a,b)
    try:
        r4 = %timeit -o multfcuda(a,b)
    except: # in case of Out Of Memory (OOM)
        r4 = AttributeDict()
        r4["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2)) 
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average]
    df.loc[len(df)] = row

2.87 µs ± 5.14 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
451 µs ± 34.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.27 µs ± 5.01 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
1.43 ms ± 5.83 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

398 µs ± 3.68 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
576 µs ± 15.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
422 µs ± 1.04 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
5.05 ms ± 12.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

176 ms ± 678 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
32 ms ± 395 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
175 ms ± 1.75 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
281 ms ± 38.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

1.74 s ± 7.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
290 ms ± 7.63 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.73

In [14]:
# Write the timings of the current `operation` to
# <folder>/<filebase>_<operation>.csv and show the table.
filename = "{}_{}.csv".format(filebase, operation)
csv_path = os.path.join(folder, filename)
df.to_csv(csv_path, index=False)
df

Unnamed: 0,n_processors,cpu_memory,gpu_name,gpu_memory,data_type,size1,size2,operation,numpy,numexpr,numba_cpu,numba_gpu
0,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100,100,a*exp(b),3e-06,0.000451,3e-06,0.00142927
1,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,1000,1000,a*exp(b),0.000398,0.000576,0.000422,0.00505483
2,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,10000,10000,a*exp(b),0.176065,0.03199,0.174955,0.281395
3,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,10000,a*exp(b),1.740126,0.290254,1.731988,2.77891
4,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,100000,a*exp(b),17.499531,2.82723,17.461843,OOM


#### Sine matrix multiplication

In [15]:
df = pd.DataFrame(columns=columns)
operation = "a*sin(b)"
for s1, s2 in size_combinations:
    a, b = factors_float(s1, s2)
    r1 = %timeit -o multiply(a,b)
    r2 = %timeit -o ne_multiply(a,b)
    r3 = %timeit -o multfcpu(a,b)
    try:
        r4 = %timeit -o multfcuda(a,b)
    except: # in case of Out Of Memory (OOM)
        r4 = AttributeDict()
        r4["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))        
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average]
    df.loc[len(df)] = row

2.84 µs ± 4.84 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
456 µs ± 12.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.29 µs ± 5.75 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
1.43 ms ± 2.91 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

399 µs ± 466 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
568 µs ± 11.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
422 µs ± 917 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
5.33 ms ± 663 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

176 ms ± 455 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
31.8 ms ± 381 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
176 ms ± 1.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
282 ms ± 44.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

1.75 s ± 10.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
285 ms ± 6.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.74

In [16]:
# Write the timings of the current `operation` to
# <folder>/<filebase>_<operation>.csv and show the table.
filename = "{}_{}.csv".format(filebase, operation)
csv_path = os.path.join(folder, filename)
df.to_csv(csv_path, index=False)
df

Unnamed: 0,n_processors,cpu_memory,gpu_name,gpu_memory,data_type,size1,size2,operation,numpy,numexpr,numba_cpu,numba_gpu
0,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100,100,a*sin(b),3e-06,0.000456,3e-06,0.00143073
1,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,1000,1000,a*sin(b),0.000399,0.000568,0.000422,0.00533127
2,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,10000,10000,a*sin(b),0.175555,0.031786,0.175853,0.282201
3,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,10000,a*sin(b),1.746111,0.284796,1.741112,2.80535
4,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,100000,a*sin(b),17.572413,2.796824,17.473053,OOM


#### Multiple matrix multiplication (3 factors)

In [17]:
df = pd.DataFrame(columns=columns)
operation = "a*b*c"
for s1, s2 in size_combinations:
    a, b, c = factors_float3(s1, s2)
    r1 = %timeit -o multiply3(a,b,c)
    r2 = %timeit -o ne_multiply3(a,b,c)
    r3 = %timeit -o multfcpu3(a,b,c)
    try:
        r4 = %timeit -o multfcuda3(a,b,c)
    except: # in case of Out Of Memory (OOM)
        r4 = AttributeDict()
        r4["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average]
    df.loc[len(df)] = row

6.41 µs ± 26.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
414 µs ± 53.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
5.19 µs ± 55.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
1.87 ms ± 11 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

712 µs ± 1.45 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
611 µs ± 25.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
529 µs ± 1.38 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
6.71 ms ± 22.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

255 ms ± 1.83 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
42.1 ms ± 674 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
215 ms ± 1.86 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
345 ms ± 39.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

2.5 s ± 12.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
401 ms ± 19.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
2.08 s 

In [18]:
# Write the timings of the current `operation` to
# <folder>/<filebase>_<operation>.csv and show the table.
filename = "{}_{}.csv".format(filebase, operation)
csv_path = os.path.join(folder, filename)
df.to_csv(csv_path, index=False)
df

Unnamed: 0,n_processors,cpu_memory,gpu_name,gpu_memory,data_type,size1,size2,operation,numpy,numexpr,numba_cpu,numba_gpu
0,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100,100,a*b*c,6e-06,0.000414,5e-06,0.00187432
1,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,1000,1000,a*b*c,0.000712,0.000611,0.000529,0.00670541
2,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,10000,10000,a*b*c,0.255325,0.042129,0.214533,0.344591
3,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,10000,a*b*c,2.504561,0.400799,2.080108,3.40734
4,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,100000,a*b*c,25.455447,3.727236,20.779519,OOM


#### Multiple matrix multiplication (5 factors)

In [19]:
df = pd.DataFrame(columns=columns)
operation = "a*b*c*d*e"
for s1, s2 in size_combinations:
    a, b, c, d, e = factors_float5(s1, s2)
    r1 = %timeit -o multiply5(a,b,c,d,e)
    r2 = %timeit -o ne_multiply5(a,b,c,d,e)
    r3 = %timeit -o multfcpu5(a,b,c,d,e)
    try:
        r4 = %timeit -o multfcuda5(a,b,c,d,e)
    except: # in case of Out Of Memory (OOM)
        r4 = AttributeDict()
        r4["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average]
    df.loc[len(df)] = row

13.6 µs ± 33.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
465 µs ± 11.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
8.28 µs ± 19.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
2.81 ms ± 4.21 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

1.33 ms ± 4.36 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
642 µs ± 17.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
727 µs ± 5.72 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
9.89 ms ± 33.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

413 ms ± 1.02 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
67 ms ± 854 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
284 ms ± 2.18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
465 ms ± 40.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

4.12 s ± 22.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
638 ms ± 17.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
2.79 s

In [20]:
# Write the timings of the current `operation` to
# <folder>/<filebase>_<operation>.csv and show the table.
filename = "{}_{}.csv".format(filebase, operation)
csv_path = os.path.join(folder, filename)
df.to_csv(csv_path, index=False)
df

Unnamed: 0,n_processors,cpu_memory,gpu_name,gpu_memory,data_type,size1,size2,operation,numpy,numexpr,numba_cpu,numba_gpu
0,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100,100,a*b*c*d*e,1.4e-05,0.000465,8e-06,0.00280596
1,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,1000,1000,a*b*c*d*e,0.001334,0.000642,0.000727,0.00989134
2,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,10000,10000,a*b*c*d*e,0.412506,0.066981,0.284481,0.464562
3,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,10000,a*b*c*d*e,4.115293,0.637788,2.78639,OOM
4,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,100000,a*b*c*d*e,47.783457,6.025675,32.410867,OOM


#### Exponential sine matrix multiplication

In [21]:
df = pd.DataFrame(columns=columns)
operation = "a*exp(b)*sin(c)"
for s1, s2 in size_combinations:
    a, b, c = factors_float3(s1, s2)
    r1 = %timeit -o exponential_sine(a,b,c)
    r2 = %timeit -o ne_exponential_sine(a,b,c)
    r3 = %timeit -o expsincpu(a,b,c)
    try:
        r4 = %timeit -o expsincuda(a,b,c)
    except: # in case of Out Of Memory (OOM)
        r4 = AttributeDict()
        r4["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))        
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average]
    df.loc[len(df)] = row

215 µs ± 1.63 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
472 µs ± 16.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
244 µs ± 972 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.89 ms ± 5.79 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

21.2 ms ± 135 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
1.35 ms ± 8.32 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
24.4 ms ± 194 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
6.71 ms ± 30.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

2.39 s ± 10.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
98.7 ms ± 192 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
2.61 s ± 16.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
345 ms ± 39.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

24.1 s ± 65.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
965 ms ± 11.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
25.9 s ± 82.6 

In [22]:
# Write the timings of the current `operation` to
# <folder>/<filebase>_<operation>.csv and show the table.
filename = "{}_{}.csv".format(filebase, operation)
csv_path = os.path.join(folder, filename)
df.to_csv(csv_path, index=False)
df

Unnamed: 0,n_processors,cpu_memory,gpu_name,gpu_memory,data_type,size1,size2,operation,numpy,numexpr,numba_cpu,numba_gpu
0,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100,100,a*exp(b)*sin(c),0.000215,0.000472,0.000244,0.00189101
1,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,1000,1000,a*exp(b)*sin(c),0.02125,0.001346,0.024411,0.00670868
2,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,10000,10000,a*exp(b)*sin(c),2.394257,0.098687,2.6066,0.344887
3,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,10000,a*exp(b)*sin(c),24.050354,0.96521,25.900561,3.43568
4,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,100000,a*exp(b)*sin(c),264.156864,9.863348,274.233718,OOM
