In [37]:
import sys
import os
import numpy as np
import numexpr as ne
from numba import vectorize
import math
import pandas as pd
import bokeh
from utils import (get_number_processors, get_ram_memory, get_total_gpu_memory, 
                   get_gpu_name, get_cuda_version, get_cudnn_version)

# Print the interpreter and library versions next to the results so the
# timings below can be tied to a specific software stack.
print("System version: {}".format(sys.version))
print("Numpy version: {}".format(np.__version__))
print("Pandas version: {}".format(pd.__version__))
print("Numexpr version: {}".format(ne.__version__))


%load_ext autoreload
%autoreload 2

System version: 3.6.7 |Anaconda, Inc.| (default, Oct 23 2018, 19:16:44) 
[GCC 7.3.0]
Numpy version: 1.15.4
Pandas version: 0.23.4
Numexpr version: 2.6.9
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Helper functions with numpy

In [4]:
def multiply(a, b):
    """Return the product ``a * b``.

    With numpy arrays the ``*`` operator multiplies element by element
    (it is not a matrix product).
    """
    product = a * b
    return product

def exponential(a, b):
    """Return ``a * exp(b)``, with the exponential computed by ``np.exp``."""
    exp_b = np.exp(b)
    return a * exp_b

def sine(a, b):
    """Return ``a * sin(b)``, with the sine computed by ``np.sin``."""
    sin_b = np.sin(b)
    return a * sin_b

## Helper functions with numexpr

In [5]:
def ne_multiply(a, b):
    """Return ``a * b`` evaluated by numexpr."""
    # The names used inside the expression are looked up in this function's
    # frame, so they have to match the parameter names ``a`` and ``b``.
    expression = "a*b"
    return ne.evaluate(expression)

def ne_exponential(a, b):
    """Return ``a * exp(b)`` evaluated by numexpr."""
    # The names used inside the expression are looked up in this function's
    # frame, so they have to match the parameter names ``a`` and ``b``.
    expression = "a*exp(b)"
    return ne.evaluate(expression)

def ne_sine(a, b):
    """Return ``a * sin(b)`` evaluated by numexpr."""
    # The names used inside the expression are looked up in this function's
    # frame, so they have to match the parameter names ``a`` and ``b``.
    expression = "a*sin(b)"
    return ne.evaluate(expression)

## Helper functions for numba
NOTE: For the numba solutions, preallocating an empty result array speeds things up by around 10%:
```
r0 = np.empty((S1, S2), dtype=np.int16)
r0 = multicpu(a, b)
```
source: https://devblogs.nvidia.com/numba-python-cuda-acceleration/

In [6]:
@vectorize(["int16(int16, int16)"], target="cpu")
def multicpu(a, b):
    """Scalar kernel ``a * b`` for int16 operands, vectorized by numba for the ``cpu`` target."""
    return a * b

@vectorize(["int16(int16, int16)"], target="cuda")
def multicuda(a, b):
    """Scalar kernel ``a * b`` for int16 operands, vectorized by numba for the ``cuda`` target."""
    return a * b

@vectorize(["float32(float32, float32)"], target="cpu")
def multfcpu(a, b):
    """Scalar kernel ``a * b`` for float32 operands, vectorized by numba for the ``cpu`` target."""
    return a * b

@vectorize(["float32(float32, float32)"], target="cuda")
def multfcuda(a, b):
    """Scalar kernel ``a * b`` for float32 operands, vectorized by numba for the ``cuda`` target."""
    return a * b

@vectorize(["float32(float32, float32)"], target="cpu")
def expcpu(a, b):
    """Scalar kernel ``a * exp(b)`` for float32 operands, vectorized by numba for the ``cpu`` target."""
    # math.exp (not np.exp) is used because the kernel body receives one scalar element at a time.
    return a*math.exp(b)

@vectorize(["float32(float32, float32)"], target="cuda")
def expcuda(a, b):
    """Scalar kernel ``a * exp(b)`` for float32 operands, vectorized by numba for the ``cuda`` target."""
    # math.exp (not np.exp) is used because the kernel body receives one scalar element at a time.
    return a*math.exp(b)

@vectorize(["float32(float32, float32)"], target="cpu")
def sincpu(a, b):
    """Scalar kernel ``a * sin(b)`` for float32 operands, vectorized by numba for the ``cpu`` target."""
    # math.sin (not np.sin) is used because the kernel body receives one scalar element at a time.
    return a*math.sin(b)

@vectorize(["float32(float32, float32)"], target="cuda")
def sincuda(a, b):
    """Scalar kernel ``a * sin(b)`` for float32 operands, vectorized by numba for the ``cuda`` target."""
    # math.sin (not np.sin) is used because the kernel body receives one scalar element at a time.
    return a*math.sin(b)

## Data

In [7]:
def factors_int(S1=100, S2=100):
    """Create two random int16 matrices of shape ``(S1, S2)``.

    Returns:
        tuple: ``(a, b)`` where ``a`` holds integers from 1 to 4 and ``b``
        holds integers from 1 to 9 (the upper bound passed to ``randint``
        is exclusive).
    """
    shape = (S1, S2)
    # ``a`` is drawn before ``b`` from numpy's global random state.
    a = np.random.randint(low=1, high=5, size=shape, dtype=np.int16)
    b = np.random.randint(low=1, high=10, size=shape, dtype=np.int16)
    return a, b

def factor_float(S1=100, S2=100):
    """Create two random float32 matrices of shape ``(S1, S2)``.

    Both matrices are sampled with ``np.random.randn`` (standard normal)
    and then cast to float32.

    Returns:
        tuple: ``(a, b)``, drawn in that order from numpy's global random state.
    """
    shape = (S1, S2)
    a = np.random.randn(*shape).astype(np.float32)
    b = np.random.randn(*shape).astype(np.float32)
    return a, b

## Benchmark

In [20]:
# (rows, columns) of the matrices to benchmark, smallest first.
size_combinations = [
    (100, 100),
    (1_000, 1_000),
    (10_000, 10_000),
    (100_000, 10_000),
    # (100_000, 100_000) is disabled -- presumably too large to fit in memory; TODO confirm.
]

In [35]:
# Layout of the result table: one row per (data type, size, operation) run.
columns = (
    # machine description
    ["n_processors", "cpu_memory", "gpu_name", "gpu_memory"]
    # experiment description
    + ["data_type", "size1", "size2", "operation"]
    # one timing column per back end
    + ["numpy", "numexpr", "numba_cpu", "numba_gpu"]
)
# Start from an empty table; the benchmark cells add the rows.
df = pd.DataFrame(columns=columns)


In [87]:
# Machine description that prefixes every row of the result table.
# Calls run left to right: processors, RAM, GPU name, GPU memory.
n_processors, cpu_memory = get_number_processors(), get_ram_memory(units="Gb")
# The GPU helpers are indexed with [0]: only the first entry is recorded
# (presumably one entry per GPU -- TODO confirm against utils).
gpu_name, gpu_memory = get_gpu_name()[0], get_total_gpu_memory(units="Gb")[0]
header = [n_processors, cpu_memory, gpu_name, gpu_memory]

#### Integer element-wise matrix multiplication

In [88]:
for S1, S2 in size_combinations:
    a, b = factors_int(S1, S2)
    operation = "a * b"
    r1 = %timeit -o multiply(a,b)
    r2 = %timeit -o ne_multiply(a,b)
    r3 = %timeit -o multicpu(a,b)
    r4 = %timeit -o multicuda(a,b)
    row = header + [type(a[0,0]), S1, S2, operation, r1.average, r2.average, r3.average, r4.average]
    df.loc[len(df)] = row


2.2 µs ± 3.42 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
464 µs ± 27.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
2.04 µs ± 16.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
1.33 ms ± 2.29 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
213 µs ± 654 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
583 µs ± 21.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
211 µs ± 581 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
4.35 ms ± 28.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [89]:
df

Unnamed: 0,n_processors,cpu_memory,gpu_name,gpu_memory,data_type,size1,size2,operation,numpy,numexpr,numba_cpu,numba_gpu
0,24,451491.589844,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.int16'>,100,100,a * b,2e-06,0.000397,2e-06,0.001322
1,24,451491.589844,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.int16'>,1000,1000,a * b,0.000212,0.000591,0.000211,0.004364
2,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.int16'>,100,100,a * b,2e-06,0.000464,2e-06,0.00133
3,24,440.909756,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.int16'>,1000,1000,a * b,0.000213,0.000583,0.000211,0.004353
