In [5]:
import numpy as np
import pandas as pd
import math
from numba import jit

In [6]:
values = np.random.random(100000000)
values

array([0.22731521, 0.59162628, 0.21776349, ..., 0.73022621, 0.31880917,
       0.44050632])

In [7]:
xs = pd.Series(values)

In [8]:
%%timeit 

xs.mean()

292 ms ± 776 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
%%timeit
np.mean(values)

37.9 ms ± 661 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [10]:
%%timeit
values.sum()/len(values)

38 ms ± 528 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
@jit(nopython=True)
def calc_mean(a):
    return np.mean(values)

In [12]:
%%timeit
calc_mean(values)

91.4 ms ± 764 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
@jit(nopython=True)
def calc_mean2(a):
    total = 0
    for i in range(values.shape[0]):
        total += a[i]
    return total / values.shape[0]

In [14]:
%%timeit
calc_mean2(values)

91.6 ms ± 1.01 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
%%timeit
calc_mean2(values)

93.7 ms ± 955 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
@jit(parallel=True)
def calc_mean3(a):
    total = 0
    for i in range(values.shape[0]):
        total += a[i]
    return total / values.shape[0]

In [17]:
%%timeit
calc_mean3(values)

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "<ipython-input-16-71654a2836c4>", line 2:
@jit(parallel=True)
def calc_mean3(a):
^



93.4 ms ± 959 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
calc_mean3.parallel_diagnostics(level=4)

 
 Parallel Accelerator Optimizing:  Function calc_mean3, <ipython-
input-16-71654a2836c4> (1)  
No source available
--------------------------------- Fusing loops ---------------------------------
Attempting fusion of parallel loops (combines loops with similar properties)...
----------------------------- Before Optimisation ------------------------------
--------------------------------------------------------------------------------
------------------------------ After Optimisation ------------------------------
Parallel structure is already optimal.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
 
---------------------------Loop invariant code motion---------------------------
Allocation hoisting:
No allocation hoisting found

Instruction hoisting:
No instruction hoisting found
--------------------------------------------------------------------------------


In [19]:
# rapids-0.18 env

In [20]:
import cupy as cp

In [23]:
cp_values = cp.array(values)

In [24]:
cp_values

array([0.22731521, 0.59162628, 0.21776349, ..., 0.73022621, 0.31880917,
       0.44050632])

In [26]:
%%timeit
cp.mean(cp_values)

57.9 ms ± 46 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [28]:
cu_xs = cudf.Series(cp_values)

In [30]:
%%timeit
cu_xs.mean()

5.6 ms ± 10 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
