# Benchmark of Matrix Multiplications
In this benchmark we compare several element-wise matrix operations using numpy, numexpr, numba (CPU & GPU) and PyTorch (GPU).

In [1]:
import sys
import os
import numpy as np
import numexpr as ne
from numba import vectorize
from numba.cuda.cudadrv.error import CudaDriverError
import math
from functools import reduce
import pandas as pd
import torch
from utils import (get_number_processors, get_ram_memory, get_total_gpu_memory, 
                   get_gpu_name, get_cuda_version, get_cudnn_version, AttributeDict,
                   get_object_size)

print("System version: {}".format(sys.version))
print("Numpy version: {}".format(np.__version__))
print("Pandas version: {}".format(pd.__version__))
print("Numexpr version: {}".format(ne.__version__))
print("PyTorch version: {}".format(torch.__version__))

%load_ext autoreload
%autoreload 2

System version: 3.6.7 |Anaconda, Inc.| (default, Oct 23 2018, 19:16:44) 
[GCC 7.3.0]
Numpy version: 1.16.0
Pandas version: 0.23.4
Numexpr version: 2.6.9
PyTorch version: 1.0.0


## Helper functions for numpy

In [2]:
def multiply(a,b):
    """Return the product ``a*b`` (element-wise for the numpy arrays used in the benchmark)."""
    result = a * b
    return result

def exponential(a, b):
    """Return ``a`` multiplied by ``exp(b)``, with the exponential computed by numpy."""
    result = a * np.exp(b)
    return result

def sine(a, b):
    """Return ``a`` multiplied by ``sin(b)``, with the sine computed by numpy."""
    result = a * np.sin(b)
    return result

# Generic variant that multiplies an arbitrary number of matrices.
# It was measured to be 28% slower than multiplying the factors directly,
# so the benchmark does not use it; it is kept here only for reference.
def multiply_list(l):
    """Fold the items of ``l`` with ``*`` from left to right and return the product."""
    def _times(x, y):
        return x*y

    return reduce(_times, l)

def multiply3(a, b, c):
    """Return the product of the three factors, evaluated left to right."""
    result = (a * b) * c
    return result

def multiply5(a, b, c, d, e):
    """Return the product of the five factors, evaluated left to right."""
    result = (((a * b) * c) * d) * e
    return result

def exponential_sine(a, b, c):
    """Return ``a * exp(b) * sin(c)`` using numpy's ``exp`` and ``sin``."""
    result = (a * np.exp(b)) * np.sin(c)
    return result

def dot(a, b):
    """Return ``np.dot(a, b)``."""
    result = np.dot(a, b)
    return result

## Helper functions for numexpr

In [3]:
def ne_multiply(a,b):
    """Evaluate the expression ``a*b`` with numexpr and return its result."""
    # Hand the operands over explicitly instead of letting numexpr look them up in this frame.
    operands = {"a": a, "b": b}
    return ne.evaluate("a*b", local_dict=operands)

def ne_exponential(a, b):
    """Evaluate the expression ``a*exp(b)`` with numexpr and return its result."""
    # Hand the operands over explicitly instead of letting numexpr look them up in this frame.
    operands = {"a": a, "b": b}
    return ne.evaluate("a*exp(b)", local_dict=operands)

def ne_sine(a, b):
    """Evaluate the expression ``a*sin(b)`` with numexpr and return its result."""
    # Hand the operands over explicitly instead of letting numexpr look them up in this frame.
    operands = {"a": a, "b": b}
    return ne.evaluate("a*sin(b)", local_dict=operands)

def ne_multiply3(a, b, c):
    """Evaluate the expression ``a*b*c`` with numexpr and return its result."""
    # Hand the operands over explicitly instead of letting numexpr look them up in this frame.
    operands = {"a": a, "b": b, "c": c}
    return ne.evaluate("a*b*c", local_dict=operands)

def ne_multiply5(a, b, c, d, e):
    """Evaluate the expression ``a*b*c*d*e`` with numexpr and return its result."""
    # Hand the operands over explicitly instead of letting numexpr look them up in this frame.
    operands = {"a": a, "b": b, "c": c, "d": d, "e": e}
    return ne.evaluate("a*b*c*d*e", local_dict=operands)

def ne_exponential_sine(a, b, c):
    """Evaluate the expression ``a*exp(b)*sin(c)`` with numexpr and return its result."""
    # Hand the operands over explicitly instead of letting numexpr look them up in this frame.
    operands = {"a": a, "b": b, "c": c}
    return ne.evaluate("a*exp(b)*sin(c)", local_dict=operands)


## Helper functions for numba
NOTE: For numba solutions, creating an empty array for the result beforehand speeds things up by around 10%:
```
r0 = np.empty((S1, S2), dtype=np.int16)
r0 = multicpu(a, b)
```
source: https://devblogs.nvidia.com/numba-python-cuda-acceleration/

In [4]:
@vectorize(["int16(int16, int16)"], target="cpu")
def multicpu(a, b):
    """Scalar kernel ``a * b`` for int16 inputs, vectorized by numba for the CPU target."""
    product = a * b
    return product

@vectorize(["int16(int16, int16)"], target="cuda")
def multicuda(a, b):
    """Scalar kernel ``a * b`` for int16 inputs, vectorized by numba for the CUDA target."""
    product = a * b
    return product

@vectorize(["float32(float32, float32)"], target="cpu")
def multfcpu(a, b):
    """Scalar kernel ``a * b`` for float32 inputs, vectorized by numba for the CPU target."""
    product = a * b
    return product

@vectorize(["float32(float32, float32)"], target="cuda")
def multfcuda(a, b):
    """Scalar kernel ``a * b`` for float32 inputs, vectorized by numba for the CUDA target."""
    product = a * b
    return product

@vectorize(["float32(float32, float32)"], target="cpu")
def expcpu(a, b):
    """Scalar kernel ``a * exp(b)`` for float32 inputs, vectorized by numba for the CPU target."""
    growth = math.exp(b)
    return a * growth

@vectorize(["float32(float32, float32)"], target="cuda")
def expcuda(a, b):
    """Scalar kernel ``a * exp(b)`` for float32 inputs, vectorized by numba for the CUDA target."""
    growth = math.exp(b)
    return a * growth

@vectorize(["float32(float32, float32)"], target="cpu")
def sincpu(a, b):
    """Scalar kernel ``a * sin(b)`` for float32 inputs, vectorized by numba for the CPU target."""
    wave = math.sin(b)
    return a * wave

@vectorize(["float32(float32, float32)"], target="cuda")
def sincuda(a, b):
    """Scalar kernel ``a * sin(b)`` for float32 inputs, vectorized by numba for the CUDA target."""
    wave = math.sin(b)
    return a * wave

@vectorize(["float32(float32, float32, float32)"], target="cpu")
def multfcpu3(a, b, c):
    """Scalar kernel ``a * b * c`` for float32 inputs, vectorized by numba for the CPU target."""
    partial = a * b
    return partial * c

@vectorize(["float32(float32, float32, float32)"], target="cuda")
def multfcuda3(a, b, c):
    """Scalar kernel ``a * b * c`` for float32 inputs, vectorized by numba for the CUDA target."""
    partial = a * b
    return partial * c

@vectorize(["float32(float32, float32, float32, float32, float32)"], target="cpu")
def multfcpu5(a, b, c, d, e):
    """Scalar kernel ``a * b * c * d * e`` for float32 inputs, vectorized by numba for the CPU target."""
    product = (((a * b) * c) * d) * e
    return product

@vectorize(["float32(float32, float32, float32, float32, float32)"], target="cuda")
def multfcuda5(a, b, c, d, e):
    """Scalar kernel ``a * b * c * d * e`` for float32 inputs, vectorized by numba for the CUDA target."""
    product = (((a * b) * c) * d) * e
    return product

@vectorize(["float32(float32, float32, float32)"], target="cpu")
def expsincpu(a, b, c):
    """Scalar kernel ``a * exp(b) * sin(c)`` for float32 inputs, vectorized by numba for the CPU target."""
    scaled = a * math.exp(b)
    return scaled * math.sin(c)

@vectorize(["float32(float32, float32, float32)"], target="cuda")
def expsincuda(a, b, c):
    """Scalar kernel ``a * exp(b) * sin(c)`` for float32 inputs, vectorized by numba for the CUDA target."""
    scaled = a * math.exp(b)
    return scaled * math.sin(c)

## Helper functions for PyTorch

*Note on performance*: 

`torch.as_tensor(a)` does not make a copy of `a` on the CPU. Adding `.cuda()` copies the array to GPU memory.

More info: https://pytorch.org/docs/stable/tensors.html

In [5]:
def pt_multiply(a,b):
    """Copy ``a`` and ``b`` to the GPU with PyTorch and return their product as a CUDA tensor.

    The result is left on the GPU; it is not copied back to host memory.
    """
    lhs = torch.as_tensor(a).cuda()
    rhs = torch.as_tensor(b).cuda()
    return torch.mul(lhs, rhs)

def pt_exponential(a, b):
    """Copy ``a`` and ``b`` to the GPU with PyTorch and return ``a * exp(b)`` as a CUDA tensor.

    The result is left on the GPU; it is not copied back to host memory.
    """
    lhs = torch.as_tensor(a).cuda()
    rhs = torch.as_tensor(b).cuda()
    return torch.mul(lhs, torch.exp(rhs))

def pt_sine(a, b):
    """Copy ``a`` and ``b`` to the GPU with PyTorch and return ``a * sin(b)`` as a CUDA tensor.

    The result is left on the GPU; it is not copied back to host memory.
    """
    lhs = torch.as_tensor(a).cuda()
    rhs = torch.as_tensor(b).cuda()
    return torch.mul(lhs, torch.sin(rhs))

def pt_multiply3(a, b, c):
    """Copy the three factors to the GPU with PyTorch and return ``a * b * c`` as a CUDA tensor.

    The result is left on the GPU; it is not copied back to host memory.
    """
    first = torch.as_tensor(a).cuda()
    second = torch.as_tensor(b).cuda()
    third = torch.as_tensor(c).cuda()
    partial = first * second
    return partial * third

def pt_multiply5(a, b, c, d, e):
    """Copy the five factors to the GPU with PyTorch and return their product as a CUDA tensor.

    All factors are transferred first, in argument order, and then multiplied
    left to right. The result is left on the GPU.
    """
    gpu_factors = [torch.as_tensor(factor).cuda() for factor in (a, b, c, d, e)]
    product = gpu_factors[0]
    for factor in gpu_factors[1:]:
        product = product * factor
    return product

def pt_exponential_sine(a, b, c):
    """Copy the inputs to the GPU with PyTorch and return ``a * exp(b) * sin(c)`` as a CUDA tensor.

    The result is left on the GPU; it is not copied back to host memory.
    """
    first = torch.as_tensor(a).cuda()
    second = torch.as_tensor(b).cuda()
    third = torch.as_tensor(c).cuda()
    scaled = first * torch.exp(second)
    return scaled * torch.sin(third)


## Parameters

In [6]:
# Matrix shapes (s1, s2) to benchmark, from the smallest to the largest.
size_combinations = [
    (10**2, 10**2),
    (10**3, 10**3),
    (10**4, 10**4),
    (10**5, 10**4),
    (10**5, 10**5),
]

In [7]:
# Schema of the results table: machine description, test-case description,
# then one timing column per backend.
machine_columns = ["n_processors", "cpu_memory", "gpu_name", "gpu_memory"]
case_columns = ["data_type", "size1", "size2", "operation"]
timing_columns = ["numpy", "numexpr", "numba_cpu", "numba_gpu", "pytorch"]
columns = machine_columns + case_columns + timing_columns

In [8]:
# Describe the machine once; `header` becomes the first four values of every result row.
n_processors = get_number_processors()
cpu_memory = get_ram_memory(units="Gb")
gpu_names = get_gpu_name()
gpu_name = gpu_names[0]  # only the first reported GPU is recorded
gpu_memories = get_total_gpu_memory(units="Gb")
gpu_memory = gpu_memories[0]
header = [n_processors, cpu_memory, gpu_name, gpu_memory]

In [9]:
# Prefix for the result files: the GPU name with every space turned into a dash.
filebase = "-".join(gpu_name.split(" "))
filebase

'Tesla-V100-PCIE-16GB'

In [10]:
# Directory that receives the CSV results; it is created if it does not exist yet.
folder = "data"
os.makedirs(name=folder, exist_ok=True)

## Data

In [11]:
def factors_int(s1=100, s2=100):
    """Create two random int16 matrices of shape ``(s1, s2)``.

    Returns:
        tuple: ``(a, b)`` where ``a`` holds integers in [1, 5) and ``b`` integers in [1, 10).
    """
    shape = (s1, s2)
    a = np.random.randint(low=1, high=5, size=shape, dtype=np.int16)
    b = np.random.randint(low=1, high=10, size=shape, dtype=np.int16)
    return a, b

def factors_float(s1=100, s2=100):
    """Create two float32 matrices of shape ``(s1, s2)`` drawn with ``np.random.randn``.

    Returns:
        tuple: ``(a, b)``.
    """
    a, b = [np.random.randn(s1, s2).astype(np.float32) for _ in range(2)]
    return a, b

def factors_float3(s1=100, s2=100):
    """Create three float32 matrices of shape ``(s1, s2)``.

    Returns:
        tuple: ``(a, b, c)``; ``a`` and ``b`` are drawn with ``np.random.randn``,
        ``c`` uniformly between 0 and 10.
    """
    normal_pair = [np.random.randn(s1, s2).astype(np.float32) for _ in range(2)]
    c = np.random.uniform(low=0, high=10, size=(s1,s2)).astype(np.float32)
    return normal_pair[0], normal_pair[1], c

def factors_float5(s1=100, s2=100):
    """Create five float32 matrices of shape ``(s1, s2)``.

    Returns:
        tuple: ``(a, b, c, d, e)``; ``a`` and ``b`` are drawn with ``np.random.randn``,
        ``c``, ``d`` and ``e`` uniformly from (0, 10), (5, 15) and (0, 30) respectively.
    """
    shape = (s1, s2)
    normals = [np.random.randn(s1, s2).astype(np.float32) for _ in range(2)]
    uniforms = [
        np.random.uniform(low=lower, high=upper, size=shape).astype(np.float32)
        for lower, upper in ((0, 10), (5, 15), (0, 30))
    ]
    return tuple(normals + uniforms)

Checking data sizes in Gb

In [11]:
# Print the size of one matrix of the largest benchmarked shape,
# first for the integer factors and then for the float factors.
for make_factors in (factors_int, factors_float):
    a, _ = make_factors(size_combinations[-1][0], size_combinations[-1][1])
    print(get_object_size(a, units="Gb"))

18.6264515966177
37.25290308892727


## Benchmark

#### Integer matrix multiplication

In [12]:
df = pd.DataFrame(columns=columns)
operation = "a*b"
for s1, s2 in size_combinations:
    a, b = factors_int(s1, s2)
    r1 = %timeit -o multiply(a,b)
    r2 = %timeit -o ne_multiply(a,b)
    r3 = %timeit -o multicpu(a,b)
    try:
        r4 = %timeit -o multicuda(a,b)
    except CudaDriverError: # in case of Out Of Memory (OOM)
        r4 = AttributeDict()
        r4["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    try:
        r5 = %timeit -o pt_multiply(a,b)
    except RuntimeError: # in case of Out Of Memory (OOM)
        r5 = AttributeDict()
        r5["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average, r5.average]
    df.loc[len(df)] = row


2 µs ± 5.44 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
457 µs ± 15.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.9 µs ± 4.5 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
1.47 ms ± 87.5 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
85.5 µs ± 28.6 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)

213 µs ± 852 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
593 µs ± 13.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
208 µs ± 192 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
4.3 ms ± 26.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
577 µs ± 3.35 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

88.6 ms ± 539 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
21.6 ms ± 565 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
88.2 ms ± 184 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
137 ms ± 2.22 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
46.9 

In [13]:
# Write the integer results to <folder>/<filebase>_<operation>_int.csv and display the table.
filename = "".join([filebase, "_", operation, "_int", ".csv"])
csv_path = os.path.join(folder, filename)
df.to_csv(csv_path, index=False)
df

Unnamed: 0,n_processors,cpu_memory,gpu_name,gpu_memory,data_type,size1,size2,operation,numpy,numexpr,numba_cpu,numba_gpu,pytorch
0,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.int16'>,100,100,a*b,2e-06,0.000457,2e-06,0.00146729,8.5454e-05
1,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.int16'>,1000,1000,a*b,0.000213,0.000593,0.000208,0.00430323,0.000577085
2,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.int16'>,10000,10000,a*b,0.088591,0.021648,0.08824,0.136882,0.046876
3,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.int16'>,100000,10000,a*b,0.900896,0.183345,0.887828,1.32111,0.473809
4,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.int16'>,100000,100000,a*b,8.931365,1.754425,8.77594,OOM,OOM


#### Float matrix multiplication

In [14]:
df = pd.DataFrame(columns=columns)
operation = "a*b"
for s1, s2 in size_combinations:
    a, b = factors_float(s1, s2)
    r1 = %timeit -o multiply(a,b)
    r2 = %timeit -o ne_multiply(a,b)
    r3 = %timeit -o multfcpu(a,b)
    try:
        r4 = %timeit -o multfcuda(a,b)
    except: # in case of Out Of Memory (OOM)
        r4 = AttributeDict()
        r4["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    try:
        r5 = %timeit -o pt_multiply(a,b)
    except RuntimeError: # in case of Out Of Memory (OOM)
        r5 = AttributeDict()
        r5["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average, r5.average]
    df.loc[len(df)] = row

3.05 µs ± 13.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
458 µs ± 11.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.1 µs ± 10.1 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
1.48 ms ± 5.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
80.1 µs ± 810 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

397 µs ± 4.98 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
565 µs ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
429 µs ± 3.35 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
4.79 ms ± 37 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
944 µs ± 11.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

176 ms ± 899 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
31.5 ms ± 257 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
177 ms ± 2.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
265 ms ± 37.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop eac

In [15]:
# Write the float results to <folder>/<filebase>_<operation>_float.csv and display the table.
filename = "".join([filebase, "_", operation, "_float", ".csv"])
csv_path = os.path.join(folder, filename)
df.to_csv(csv_path, index=False)
df

Unnamed: 0,n_processors,cpu_memory,gpu_name,gpu_memory,data_type,size1,size2,operation,numpy,numexpr,numba_cpu,numba_gpu,pytorch
0,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100,100,a*b,3e-06,0.000458,3e-06,0.00147518,8.0105e-05
1,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,1000,1000,a*b,0.000397,0.000565,0.000429,0.00479381,0.000943827
2,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,10000,10000,a*b,0.175588,0.031534,0.177034,0.264888,0.0946686
3,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,10000,a*b,1.742532,0.287435,1.748335,2.67361,0.954018
4,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,100000,a*b,17.414992,2.741685,17.431799,OOM,OOM


#### Exponential matrix multiplication

In [None]:
df = pd.DataFrame(columns=columns)
operation = "a*exp(b)"
for s1, s2 in size_combinations:
    a, b = factors_float(s1, s2)
    r1 = %timeit -o multiply(a,b)
    r2 = %timeit -o ne_multiply(a,b)
    r3 = %timeit -o multfcpu(a,b)
    try:
        r4 = %timeit -o multfcuda(a,b)
    except: # in case of Out Of Memory (OOM)
        r4 = AttributeDict()
        r4["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2)) 
    try:
        r5 = %timeit -o pt_multiply(a,b)
    except RuntimeError: # in case of Out Of Memory (OOM)
        r5 = AttributeDict()
        r5["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average, r5.average]
    df.loc[len(df)] = row

2.98 µs ± 7.78 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
432 µs ± 40.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.08 µs ± 7.77 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
1.44 ms ± 13.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
77.7 µs ± 739 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

397 µs ± 4.51 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
589 µs ± 28.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
428 µs ± 6.84 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
4.86 ms ± 136 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
938 µs ± 12.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

175 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
31.8 ms ± 496 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
174 ms ± 317 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
264 ms ± 36.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop e

In [None]:
# Write the results to <folder>/<filebase>_<operation>.csv and display the table.
filename = "".join([filebase, "_", operation, ".csv"])
csv_path = os.path.join(folder, filename)
df.to_csv(csv_path, index=False)
df

Unnamed: 0,n_processors,cpu_memory,gpu_name,gpu_memory,data_type,size1,size2,operation,numpy,numexpr,numba_cpu,numba_gpu,pytorch
0,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100,100,a*exp(b),3e-06,0.000432,3e-06,0.00144215,7.76956e-05
1,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,1000,1000,a*exp(b),0.000397,0.000589,0.000428,0.00485586,0.000938266
2,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,10000,10000,a*exp(b),0.17512,0.031774,0.174266,0.263535,0.0953557
3,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,10000,a*exp(b),1.738481,0.284795,1.747448,2.68874,0.957177
4,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,100000,a*exp(b),17.508171,2.832436,17.421199,OOM,OOM


#### Sine matrix multiplication

In [None]:
df = pd.DataFrame(columns=columns)
operation = "a*sin(b)"
for s1, s2 in size_combinations:
    a, b = factors_float(s1, s2)
    r1 = %timeit -o multiply(a,b)
    r2 = %timeit -o ne_multiply(a,b)
    r3 = %timeit -o multfcpu(a,b)
    try:
        r4 = %timeit -o multfcuda(a,b)
    except: # in case of Out Of Memory (OOM)
        r4 = AttributeDict()
        r4["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))        
    try:
        r5 = %timeit -o pt_multiply(a,b)
    except RuntimeError: # in case of Out Of Memory (OOM)
        r5 = AttributeDict()
        r5["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average, r5.average]
    df.loc[len(df)] = row

3.04 µs ± 51.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
469 µs ± 16.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.09 µs ± 17.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
1.45 ms ± 3.87 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
78.3 µs ± 375 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

401 µs ± 1.34 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
556 µs ± 12.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
428 µs ± 600 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
4.79 ms ± 52.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
947 µs ± 12.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

175 ms ± 1.53 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
31.6 ms ± 165 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
173 ms ± 855 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
258 ms ± 37.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop 

In [None]:
# Write the results to <folder>/<filebase>_<operation>.csv and display the table.
name_parts = [filebase, "_", operation, ".csv"]
filename = "".join(name_parts)
df.to_csv(os.path.join(folder, filename), index=False)
df

Unnamed: 0,n_processors,cpu_memory,gpu_name,gpu_memory,data_type,size1,size2,operation,numpy,numexpr,numba_cpu,numba_gpu,pytorch
0,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100,100,a*sin(b),3e-06,0.000469,3e-06,0.00144839,7.83187e-05
1,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,1000,1000,a*sin(b),0.000401,0.000556,0.000428,0.00478838,0.00094656
2,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,10000,10000,a*sin(b),0.175495,0.03156,0.172847,0.258442,0.0941856
3,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,10000,a*sin(b),1.725402,0.285651,1.756081,2.66361,0.956336
4,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,100000,a*sin(b),17.438768,2.767125,17.360333,OOM,OOM


#### Multiple matrix multiplication (3 factors)

In [None]:
df = pd.DataFrame(columns=columns)
operation = "a*b*c"
for s1, s2 in size_combinations:
    a, b, c = factors_float3(s1, s2)
    r1 = %timeit -o multiply3(a,b,c)
    r2 = %timeit -o ne_multiply3(a,b,c)
    r3 = %timeit -o multfcpu3(a,b,c)
    try:
        r4 = %timeit -o multfcuda3(a,b,c)
    except: # in case of Out Of Memory (OOM)
        r4 = AttributeDict()
        r4["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    try:
        r5 = %timeit -o pt_multiply(a,b)
    except RuntimeError: # in case of Out Of Memory (OOM)
        r5 = AttributeDict()
        r5["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average, r5.average]
    df.loc[len(df)] = row

6.46 µs ± 27.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
449 µs ± 11.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
5.68 µs ± 42 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
1.88 ms ± 78.9 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
77.7 µs ± 648 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

713 µs ± 1.96 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
626 µs ± 31.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
530 µs ± 5.15 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
6.38 ms ± 97.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
946 µs ± 12.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

257 ms ± 3.84 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
43.8 ms ± 703 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
208 ms ± 2.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
323 ms ± 39.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
94.

In [None]:
# Write the results to <folder>/<filebase>_<operation>.csv and display the table.
filename = "".join((filebase, "_", operation, ".csv"))
target_path = os.path.join(folder, filename)
df.to_csv(target_path, index=False)
df

#### Multiple matrix multiplication (5 factors)

In [None]:
df = pd.DataFrame(columns=columns)
operation = "a*b*c*d*e"
for s1, s2 in size_combinations:
    a, b, c, d, e = factors_float5(s1, s2)
    r1 = %timeit -o multiply5(a,b,c,d,e)
    r2 = %timeit -o ne_multiply5(a,b,c,d,e)
    r3 = %timeit -o multfcpu5(a,b,c,d,e)
    try:
        r4 = %timeit -o multfcuda5(a,b,c,d,e)
    except: # in case of Out Of Memory (OOM)
        r4 = AttributeDict()
        r4["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    try:
        r5 = %timeit -o pt_multiply(a,b)
    except RuntimeError: # in case of Out Of Memory (OOM)
        r5 = AttributeDict()
        r5["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average, r5.average]
    df.loc[len(df)] = row

In [None]:
# Write the results to <folder>/<filebase>_<operation>.csv and display the table.
filename = "".join([filebase, "_", operation, ".csv"])
output_path = os.path.join(folder, filename)
df.to_csv(output_path, index=False)
df

#### Exponential sine matrix multiplication

In [None]:
df = pd.DataFrame(columns=columns)
operation = "a*exp(b)*sin(c)"
for s1, s2 in size_combinations:
    a, b, c = factors_float3(s1, s2)
    r1 = %timeit -o exponential_sine(a,b,c)
    r2 = %timeit -o ne_exponential_sine(a,b,c)
    r3 = %timeit -o expsincpu(a,b,c)
    try:
        r4 = %timeit -o expsincuda(a,b,c)
    except: # in case of Out Of Memory (OOM)
        r4 = AttributeDict()
        r4["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))        
    try:
        r5 = %timeit -o pt_multiply(a,b)
    except RuntimeError: # in case of Out Of Memory (OOM)
        r5 = AttributeDict()
        r5["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average, r5.average]
    df.loc[len(df)] = row

In [None]:
# Write the results to <folder>/<filebase>_<operation>.csv and display the table.
pieces = (filebase, "_", operation, ".csv")
filename = "".join(pieces)
df.to_csv(os.path.join(folder, filename), index=False)
df