In [1]:
import sys
import os
import numpy as np
import numexpr as ne
from numba import vectorize
import math
import pandas as pd
import bokeh
from utils import (get_number_processors, get_ram_memory, get_total_gpu_memory, 
                   get_gpu_name, get_cuda_version, get_cudnn_version)

# Record the interpreter and library versions used for this benchmark run.
print("System version: {}".format(sys.version))
print("Numpy version: {}".format(np.__version__))
print("Pandas version: {}".format(pd.__version__))
print("Numexpr version: {}".format(ne.__version__))


# Enable IPython's autoreload extension so that edits to imported modules
# (e.g. the local `utils` helpers) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

System version: 3.6.7 |Anaconda, Inc.| (default, Oct 23 2018, 19:16:44) 
[GCC 7.3.0]
Numpy version: 1.15.4
Pandas version: 0.23.4
Numexpr version: 2.6.9


## Helper functions with numpy

In [2]:
def multiply(a, b):
    """Return ``a * b`` (the element-wise product when given NumPy arrays)."""
    product = a * b
    return product

def exponential(a, b):
    """Return ``a`` scaled by ``exp(b)``, computed with NumPy."""
    growth = np.exp(b)
    return a * growth

def sine(a, b):
    """Return ``a`` scaled by ``sin(b)``, computed with NumPy."""
    wave = np.sin(b)
    return a * wave

## Helper functions with numexpr

In [3]:
def ne_multiply(a, b):
    """Return ``a * b`` evaluated by numexpr."""
    # numexpr looks up the operands `a` and `b` by name in this function's
    # frame, so the parameter names must match the expression string.
    expression = "a*b"
    return ne.evaluate(expression)

def ne_exponential(a, b):
    """Return ``a * exp(b)`` evaluated by numexpr."""
    # numexpr looks up the operands `a` and `b` by name in this function's
    # frame, so the parameter names must match the expression string.
    expression = "a*exp(b)"
    return ne.evaluate(expression)

def ne_sine(a, b):
    """Return ``a * sin(b)`` evaluated by numexpr."""
    # numexpr looks up the operands `a` and `b` by name in this function's
    # frame, so the parameter names must match the expression string.
    expression = "a*sin(b)"
    return ne.evaluate(expression)

## Helper functions for numba
NOTE: For the numba solutions, preallocating an empty result array speeds things up by around 10%:
```
r0 = np.empty((S1, S2), dtype=np.int16)
r0 = multicpu(a, b)
```
source: https://devblogs.nvidia.com/numba-python-cuda-acceleration/

In [4]:
@vectorize(["int16(int16, int16)"], target="cpu")
def multicpu(a, b):
    """Scalar kernel for the int16 product, vectorized by numba for the CPU target."""
    product = a * b
    return product

@vectorize(["int16(int16, int16)"], target="cuda")
def multicuda(a, b):
    """Scalar kernel for the int16 product, vectorized by numba for the CUDA target."""
    product = a * b
    return product

@vectorize(["float32(float32, float32)"], target="cpu")
def multfcpu(a, b):
    """Scalar kernel for the float32 product, vectorized by numba for the CPU target."""
    product = a * b
    return product

@vectorize(["float32(float32, float32)"], target="cuda")
def multfcuda(a, b):
    """Scalar kernel for the float32 product, vectorized by numba for the CUDA target."""
    product = a * b
    return product

@vectorize(["float32(float32, float32)"], target="cpu")
def expcpu(a, b):
    """Scalar kernel for ``a * exp(b)`` in float32, vectorized by numba for the CPU target."""
    growth = math.exp(b)
    return a * growth

@vectorize(["float32(float32, float32)"], target="cuda")
def expcuda(a, b):
    """Scalar kernel for ``a * exp(b)`` in float32, vectorized by numba for the CUDA target."""
    growth = math.exp(b)
    return a * growth

@vectorize(["float32(float32, float32)"], target="cpu")
def sincpu(a, b):
    """Scalar kernel for ``a * sin(b)`` in float32, vectorized by numba for the CPU target."""
    wave = math.sin(b)
    return a * wave

@vectorize(["float32(float32, float32)"], target="cuda")
def sincuda(a, b):
    """Scalar kernel for ``a * sin(b)`` in float32, vectorized by numba for the CUDA target."""
    wave = math.sin(b)
    return a * wave

## Data

In [None]:
def factors_int(S1=100, S2=100):
    """Build two random int16 operand arrays of shape ``(S1, S2)``.

    Returns:
        A tuple ``(a, b)`` where ``a`` holds integers drawn from [1, 5) and
        ``b`` holds integers drawn from [1, 10).
    """
    shape = (S1, S2)
    first = np.random.randint(low=1, high=5, size=shape, dtype=np.int16)
    second = np.random.randint(low=1, high=10, size=shape, dtype=np.int16)
    return first, second

def factors_float(S1=100, S2=100):
    """Build two random float32 operand arrays of shape ``(S1, S2)``.

    Returns:
        A tuple ``(a, b)`` of independent standard-normal samples cast to
        float32.
    """
    first, second = (
        np.random.randn(S1, S2).astype(np.float32) for _ in range(2)
    )
    return first, second

## Benchmark

In [None]:
# Each entry is an (S1, S2) array shape; every benchmark loop below generates
# operands of each shape in turn.
size_combinations = [
    (100, 100),
    (1000, 1000),
    (10000, 10000),
    (100000, 10000),
    # (100000, 100000)  -- left disabled
]

In [None]:
# Result table layout: machine description first, then the workload, then one
# timing column per backend.
machine_columns = ["n_processors", "cpu_memory", "gpu_name", "gpu_memory"]
workload_columns = ["data_type", "size1", "size2", "operation"]
timing_columns = ["numpy", "numexpr", "numba_cpu", "numba_gpu"]

columns = machine_columns + workload_columns + timing_columns
# Start with an empty frame; the benchmark loops append one row per run.
df = pd.DataFrame(columns=columns)


In [None]:
# Collect the machine description once; it is prepended to every result row.
# The helpers come from the local `utils` module.
n_processors = get_number_processors()
cpu_memory = get_ram_memory(units="Gb")
# NOTE(review): the GPU helpers are indexed with [0], so they presumably return
# one entry per device and only the first device is recorded -- confirm in utils.
gpu_name = get_gpu_name()[0]
gpu_memory = get_total_gpu_memory(units="Gb")[0]
# Order matches the first four entries of `columns`.
header = [n_processors, cpu_memory, gpu_name, gpu_memory]

#### Integer element-wise multiplication

In [None]:
# Benchmark the int16 product `a * b` for every configured array shape.
for S1, S2 in size_combinations:
    a, b = factors_int(S1, S2)
    operation = "a * b"
    # `%timeit -o` returns the timing result object so its average can be stored.
    # Backends, in column order: numpy, numexpr, numba CPU, numba CUDA.
    r1 = %timeit -o multiply(a,b)
    r2 = %timeit -o ne_multiply(a,b)
    r3 = %timeit -o multicpu(a,b)
    r4 = %timeit -o multicuda(a,b)
    # The element type of `a` is recorded as the row's data_type.
    row = header + [type(a[0,0]), S1, S2, operation, r1.average, r2.average, r3.average, r4.average]
    # Append as the next row of the shared results frame.
    df.loc[len(df)] = row


2.15 µs ± 14.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
478 µs ± 37.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.96 µs ± 4.25 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
1.4 ms ± 9.38 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
209 µs ± 2.01 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
586 µs ± 20.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
213 µs ± 2.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
4.16 ms ± 67.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
88 ms ± 475 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
21.1 ms ± 183 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
90.6 ms ± 147 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
137 ms ± 2.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
892 ms ± 8.74 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
155 ms ± 2.76 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
94

#### Float element-wise multiplication

In [None]:
# Benchmark the float32 product `a * b` for every configured array shape.
for S1, S2 in size_combinations:
    a, b = factors_float(S1, S2)
    operation = "a * b"
    # `%timeit -o` returns the timing result object so its average can be stored.
    # Backends, in column order: numpy, numexpr, numba CPU, numba CUDA.
    r1 = %timeit -o multiply(a,b)
    r2 = %timeit -o ne_multiply(a,b)
    r3 = %timeit -o multfcpu(a,b)
    r4 = %timeit -o multfcuda(a,b)
    # The element type of `a` is recorded as the row's data_type.
    row = header + [type(a[0,0]), S1, S2, operation, r1.average, r2.average, r3.average, r4.average]
    # Append as the next row of the shared results frame.
    df.loc[len(df)] = row

3.06 µs ± 13.1 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
460 µs ± 14 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.14 µs ± 35.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
1.4 ms ± 7.32 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
41.3 µs ± 1.53 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
577 µs ± 32.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
423 µs ± 1.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
4.71 ms ± 64.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
21.4 ms ± 1.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
30.5 ms ± 367 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
177 ms ± 2.98 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
258 ms ± 37.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
242 ms ± 21.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
267 ms ± 6.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.76 

#### Exponential element-wise multiplication

In [None]:
for S1, S2 in size_combinations:
    a, b = factors_float(S1, S2)
    operation = "a * exp(b)"
    r1 = %timeit -o multiply(a,b)
    r2 = %timeit -o ne_multiply(a,b)
    r3 = %timeit -o multfcpu(a,b)
    r4 = %timeit -o multfcuda(a,b)
    row = header + [type(a[0,0]), S1, S2, operation, r1.average, r2.average, r3.average, r4.average]
    df.loc[len(df)] = row

3.44 µs ± 25.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
483 µs ± 21.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.1 µs ± 10.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
1.43 ms ± 9.07 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
41.4 µs ± 534 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
566 µs ± 38.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
424 µs ± 2.14 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
4.78 ms ± 13 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
22.2 ms ± 1.22 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
28.7 ms ± 417 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
183 ms ± 5.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
260 ms ± 36.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
225 ms ± 7.41 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
267 ms ± 8.78 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.77 

#### Sine element-wise multiplication

In [None]:
for S1, S2 in size_combinations:
    a, b = factors_float(S1, S2)
    operation = "a * sin(b)"
    r1 = %timeit -o multiply(a,b)
    r2 = %timeit -o ne_multiply(a,b)
    r3 = %timeit -o multfcpu(a,b)
    r4 = %timeit -o multfcuda(a,b)
    row = header + [type(a[0,0]), S1, S2, operation, r1.average, r2.average, r3.average, r4.average]
    df.loc[len(df)] = row

3.05 µs ± 9.83 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
470 µs ± 15.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.2 µs ± 17.6 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
1.42 ms ± 3.37 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
40.7 µs ± 849 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
563 µs ± 11 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
424 µs ± 4.68 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
4.77 ms ± 77.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
20.8 ms ± 2.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
29.2 ms ± 456 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
190 ms ± 1.48 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
268 ms ± 35.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
226 ms ± 8.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
285 ms ± 5.97 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.76

In [None]:
# Display the accumulated benchmark results (one row per shape and operation).
df