In [3]:
import sys
import os
import numpy as np
import numexpr as ne
from numba import vectorize
import math
import pandas as pd
import bokeh

print("System version: {}".format(sys.version))
print("Numpy version: {}".format(np.__version__))
print("Pandas version: {}".format(pd.__version__))
print("Numexpr version: {}".format(ne.__version__))


%load_ext autoreload
%autoreload 2

System version: 3.6.7 |Anaconda, Inc.| (default, Oct 23 2018, 19:16:44) 
[GCC 7.3.0]
Numpy version: 1.15.4
Pandas version: 0.23.4
Numexpr version: 2.6.9


## Helper functions with NumPy

In [4]:
def multiply(a,b):
    """Return the product ``a * b`` (element-wise for NumPy arrays)."""
    product = a * b
    return product

def exponential(a, b):
    """Return ``a`` scaled by ``np.exp(b)``."""
    growth = np.exp(b)
    return a * growth

def sine(a, b):
    """Return ``a`` scaled by ``np.sin(b)``."""
    wave = np.sin(b)
    return a * wave

## Helper functions with numexpr

In [5]:
def ne_multiply(a,b):
    """Evaluate the expression ``a*b`` with numexpr and return the result."""
    # Pass the operands explicitly rather than relying on numexpr
    # looking them up in this function's frame.
    operands = {"a": a, "b": b}
    return ne.evaluate("a*b", local_dict=operands)

def ne_exponential(a, b):
    """Evaluate the expression ``a*exp(b)`` with numexpr and return the result."""
    # Pass the operands explicitly rather than relying on numexpr
    # looking them up in this function's frame.
    operands = {"a": a, "b": b}
    return ne.evaluate("a*exp(b)", local_dict=operands)

def ne_sine(a, b):
    """Evaluate the expression ``a*sin(b)`` with numexpr and return the result."""
    # Pass the operands explicitly rather than relying on numexpr
    # looking them up in this function's frame.
    operands = {"a": a, "b": b}
    return ne.evaluate("a*sin(b)", local_dict=operands)

## Helper functions for numba
NOTE: For the numba solutions, preallocating an empty result array speeds things up by around 10%:
```
r0 = np.empty((S1, S2), dtype=np.int16)
r0 = multicpu(a, b)
```
source: https://devblogs.nvidia.com/numba-python-cuda-acceleration/

In [6]:
@vectorize(["int16(int16, int16)"], target="cpu")
def multicpu(a, b):
    """int16 product of two scalars, vectorized by numba for the ``cpu`` target."""
    product = a * b
    return product

@vectorize(["int16(int16, int16)"], target="cuda")
def multicuda(a, b):
    """int16 product of two scalars, vectorized by numba for the ``cuda`` target."""
    product = a * b
    return product

@vectorize(["float32(float32, float32)"], target="cpu")
def multfcpu(a, b):
    """float32 product of two scalars, vectorized by numba for the ``cpu`` target."""
    product = a * b
    return product

@vectorize(["float32(float32, float32)"], target="cuda")
def multfcuda(a, b):
    """float32 product of two scalars, vectorized by numba for the ``cuda`` target."""
    product = a * b
    return product

@vectorize(["float32(float32, float32)"], target="cpu")
def expcpu(a, b):
    """``a * exp(b)`` on float32 scalars, vectorized by numba for the ``cpu`` target."""
    growth = math.exp(b)
    return a*growth

@vectorize(["float32(float32, float32)"], target="cuda")
def expcuda(a, b):
    """``a * exp(b)`` on float32 scalars, vectorized by numba for the ``cuda`` target."""
    growth = math.exp(b)
    return a*growth

@vectorize(["float32(float32, float32)"], target="cpu")
def sincpu(a, b):
    """``a * sin(b)`` on float32 scalars, vectorized by numba for the ``cpu`` target."""
    wave = math.sin(b)
    return a*wave

@vectorize(["float32(float32, float32)"], target="cuda")
def sincuda(a, b):
    """``a * sin(b)`` on float32 scalars, vectorized by numba for the ``cuda`` target."""
    wave = math.sin(b)
    return a*wave

## Data

In [7]:
def factors_int(S1=100, S2=100, seed=None):
    """Generate two random int16 matrices to use as benchmark operands.

    Args:
        S1: Number of rows of each matrix.
        S2: Number of columns of each matrix.
        seed: Optional seed. When given, a private ``np.random.RandomState``
            seeded with it is used, so repeated calls return identical
            operands. When ``None`` (default) the global NumPy random state
            is used, exactly as before.

    Returns:
        Tuple ``(a, b)`` of ``(S1, S2)`` int16 arrays. ``a`` holds values in
        ``[1, 5)`` and ``b`` values in ``[1, 10)`` (upper bounds exclusive).
    """
    # np.random (module) and RandomState expose the same randint API.
    rng = np.random if seed is None else np.random.RandomState(seed)
    shape = (S1, S2)
    a = rng.randint(1, 5, shape, dtype=np.int16)
    b = rng.randint(1, 10, shape, dtype=np.int16)
    return a, b

def factor_float(S1=100, S2=100, seed=None):
    """Generate two random float32 matrices to use as benchmark operands.

    Args:
        S1: Number of rows of each matrix.
        S2: Number of columns of each matrix.
        seed: Optional seed. When given, a private ``np.random.RandomState``
            seeded with it is used, so repeated calls return identical
            operands. When ``None`` (default) the global NumPy random state
            is used, exactly as before.

    Returns:
        Tuple ``(a, b)`` of ``(S1, S2)`` float32 arrays drawn with ``randn``
        (standard normal samples) and cast to float32.
    """
    # np.random (module) and RandomState expose the same randn API.
    rng = np.random if seed is None else np.random.RandomState(seed)
    a = rng.randn(S1, S2).astype(np.float32)
    b = rng.randn(S1, S2).astype(np.float32)
    return a, b

## Benchmark

In [20]:
# (S1, S2) matrix shapes fed to the data generators, smallest to largest.
size_combinations = [
    (10**2, 10**2),
    (10**3, 10**3),
    (10**4, 10**4),
    (10**5, 10**4),
    (10**5, 10**5),
]

In [24]:

for S1, S2 in size_combinations:
    a, b = factors_int(S1, S2)
    r1 = %timeit -o multiply(a,b)
    r2 = %timeit -o ne_multiply(a,b)
    r3 = %timeit -o multicpu(a,b)
    r4 = %timeit -o multicuda(a,b)


2.04 µs ± 3.02 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
455 µs ± 7.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
2.0385756357011684e-06
0.00045541241242868374
213 µs ± 440 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
618 µs ± 36.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
0.00021329077428578917
0.0006175262951422025


In [15]:
# Average time of the NumPy (r1) and numexpr (r2) runs, one per line.
print(r1.average, r2.average, sep="\n")

2.16484145999857e-06
0.0005055294695713591


In [17]:
# Inspect the `repeat` attribute of the timing result held in r1.
r1.repeat

7

In [26]:
# print() applies str() to its argument, so this shows the class string of np.int16.
print(np.int16)

<class 'numpy.int16'>


In [27]:
# Display the np.int16 type object itself (notebook repr).
np.int16

numpy.int16

In [28]:
# Display the string form of the np.int16 type.
str(np.int16)

"<class 'numpy.int16'>"

In [31]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to GiB.
BYTES_PER_GIB = 1024 ** 3
mem = virtual_memory()
mem.total / BYTES_PER_GIB

440.9097557067871