# Shared memory vs array passing

**Conclusions:**
- Data to subprocess:
  - The forking mechanism on Linux allows much faster passing of read-only data to subprocesses (not possible on Windows)
- Data from subprocess:
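  - Sharing a numpy array via multiprocessing.Array seems to be much slower than passing the array blocks to the subprocesses and returning the results (about 16-23 s vs. 1.1 s in the runs below)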



In [1]:
from multiprocessing import Array, Pool, cpu_count
# Report how many CPUs this machine has; the timings below depend on it.
print(cpu_count())

32


## Data to subprocess
On Linux, the fork mechanism lets subprocesses access (read-only) data that already exists in the main process, which is much faster than passing the data along with each task.

Example:
```
main process:
  create array
subprocesses:
  return array[i]*2
main process:
  collect doubled array 
```

In [2]:
%%writefile tmp.py
def f_arr_forked(i):
    """Return twice the `i`-th element of the module-level array `a`.

    `a` is not defined in this module; the caller has to assign it
    (e.g. `tmp.a = ...`) before this function is called.
    """
    element = a[i]
    return element * 2

def f_arr_passed(args):
    """Return twice one element of an array that arrives with the task.

    Parameters
    ----------
    args : tuple
        `(arr, i)` - an indexable sequence and the index to read from it.
    """
    values, index = args
    element = values[index]
    return element * 2


Overwriting tmp.py


In [3]:
from multiprocessing import Pool
import numpy as np

import tmp
import importlib
importlib.reload(tmp)

<module 'tmp' from '/home/mmpe/github/python_examples/tmp.py'>

In [11]:
N = 10 ** 7  # number of array elements (ten million)

In [12]:
%%timeit -n 1 -r 1
# Forked variant: publish the array as a module global of `tmp` before the pool
# is created, so that only the index `i` has to be sent with each task.
tmp.a = np.arange(N) # tmp.a is accessible from subprocesses on linux due to forking
with Pool() as p:
    # Every call returns a[i]*2; the N results are gathered into one array.
    b = np.array(p.map(tmp.f_arr_forked, range(N)))
# Sanity check: shape and the first few doubled values.
print (b.shape)
print (b[:4])

(10000000,)
[0 2 4 6]
4.26 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [13]:
%%timeit -n 1 -r 1
# Passed variant: the array itself is part of every task's argument tuple.
a = np.arange(N)
# Generator expression, so the N (array, index) tuples are produced lazily.
args_lst = ((a,i) for i in range(N))
with Pool() as p:
    # chunksize is chosen so that the work splits into about two chunks per CPU.
    b = np.array(list(p.imap(tmp.f_arr_passed, args_lst, chunksize=N//(cpu_count()*2))))
# Sanity check: shape and the first few doubled values.
print (b.shape)
print (b[:4])

(10000000,)
[0 2 4 6]
11.5 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## Data from subprocess

Example:
```
main process:
  create array
subprocesses:
  add `i` to subset of array
```

In [2]:
%%writefile tmp.py
import numpy as np

class MyData:
    """Plain holder for the data object handed to a worker process."""

    # Stays None until `initProcess` stores the real data here.
    data = None


# Module-level holder: `initProcess` stores the data here and `f_shared` reads it.
myData = MyData()


def initProcess(data):
    """Store `data` on the module-level `myData` holder.

    Intended as the `initializer` of a `multiprocessing.Pool`, with the
    shared array passed through `initargs`, so that tasks executed later
    in the same process can reach the array via `myData.data`.
    """
    holder = myData
    holder.data = data


def f_shared(args):
    """Add the constant `i` to the `i`-th block of the shared data, in place.

    Parameters
    ----------
    args : tuple
        `(i, N2, N3)` - block index and the two block dimensions. Block `i`
        covers the flat index range `[i * N2 * N3, (i + 1) * N2 * N3)`.

    Notes
    -----
    The result is written into `myData.data`; nothing is returned.
    """
    i, n2, n3 = args
    block_len = n2 * n3
    start = i * block_len
    stop = start + block_len
    # Flat block of length N2 * N3 in which every entry equals i.
    increment = np.full((n2, n3), i).flatten()
    myData.data[start:stop] += increment


def f_passing(args):
    """Return `data` with the constant `i` added to it.

    Parameters
    ----------
    args : tuple
        `(data, i, N2, N3)` - the array to update, the value to add, and the
        shape `(N2, N3)` of the constant array that is added to `data`.

    Returns
    -------
    The sum of `data` and an `(N2, N3)` array filled with `i`.
    """
    block, value, n2, n3 = args
    offset = np.full((n2, n3), value)
    return block + offset


Overwriting tmp.py


In [3]:
import tmp
import importlib
importlib.reload(tmp)
import numpy as np
from multiprocessing import Array, Pool

def run_shared_lock(arr, args_lst):
    """Update a lock-protected shared copy of `arr` from a pool of workers.

    The values of `arr` are copied into a `multiprocessing.Array` of C floats
    created with `lock=True`. The shared array is handed to every worker
    through the pool initializer (`tmp.initProcess`), the workers modify it
    in place (`tmp.f_shared`), and the shape of the result is printed.

    Parameters
    ----------
    arr : numpy.ndarray
        Input data; its values are stored as 32-bit floats in shared memory.
    args_lst : iterable of tuple
        One `(i, N2, N3)` task per block, forwarded to `tmp.f_shared`.
    """
    # ravel() avoids the extra temporary copy flatten() would create; the
    # values are copied into the shared buffer by Array() in either case.
    shared = Array('f', arr.ravel(), lock=True)

    with Pool(initializer=tmp.initProcess, initargs=(shared,)) as pool:
        pool.map(tmp.f_shared, args_lst)

    # With lock=True the ctypes buffer sits behind a synchronizing wrapper;
    # get_obj() returns the underlying buffer for numpy to view.
    result = np.frombuffer(shared.get_obj(), dtype=np.float32).reshape(arr.shape)
    print(result.shape)

def run_shared_nolock(arr, args_lst):
    """Update an unsynchronized shared copy of `arr` from a pool of workers.

    The values of `arr` are copied into a `multiprocessing.Array` of C floats
    created with `lock=False`. The shared array is handed to every worker
    through the pool initializer (`tmp.initProcess`), the workers modify it
    in place (`tmp.f_shared`), and the shape of the result is printed.

    Parameters
    ----------
    arr : numpy.ndarray
        Input data; its values are stored as 32-bit floats in shared memory.
    args_lst : iterable of tuple
        One `(i, N2, N3)` task per block, forwarded to `tmp.f_shared`.
    """
    # ravel() avoids the extra temporary copy flatten() would create; the
    # values are copied into the shared buffer by Array() in either case.
    shared = Array('f', arr.ravel(), lock=False)

    with Pool(initializer=tmp.initProcess, initargs=(shared,)) as pool:
        pool.map(tmp.f_shared, args_lst)

    # With lock=False the Array is the raw ctypes buffer, so numpy can view
    # it directly.
    result = np.frombuffer(shared, dtype=np.float32).reshape(arr.shape)
    print(result.shape)


def run_passing(arr, args_lst):
    """Process every block in a worker pool by passing it in and out.

    Each task carries its own block `arr[i]` to the worker; the worker
    returns the updated block (`tmp.f_passing`) and the returned blocks are
    stacked into one array whose shape is printed.

    Parameters
    ----------
    arr : indexable
        Source data; `arr[i]` is the block belonging to task `i`.
    args_lst : iterable of tuple
        One `(i, N2, N3)` entry per block.
    """
    tasks = [(arr[idx], idx, n2, n3) for idx, n2, n3 in args_lst]
    with Pool() as pool:
        blocks = pool.map(tmp.f_passing, tasks)
        result = np.array(blocks)
    print(result.shape)


def run_seq(arr, args_lst):
    """Sequential baseline: process every block in the calling process.

    Builds the same `(arr[i], i, N2, N3)` tasks as the pool-based variant,
    applies `tmp.f_passing` to them one after another, stacks the results
    into one array and prints its shape.

    Parameters
    ----------
    arr : indexable
        Source data; `arr[i]` is the block belonging to task `i`.
    args_lst : iterable of tuple
        One `(i, N2, N3)` entry per block.
    """
    tasks = [(arr[idx], idx, n2, n3) for idx, n2, n3 in args_lst]
    blocks = [tmp.f_passing(task) for task in tasks]
    result = np.array(blocks)
    print(result.shape)

In [4]:
# Problem size: N1 = 16384 blocks of N2 x N3 = 64 x 64 values each.
N1, N2, N3 = 8192 *2, 64, 64
# NOTE(review): the trailing `+0` looks redundant for the values; presumably it
# forces the zero array to be materialised before the timings start - confirm.
arr = np.zeros((N1,N2,N3), dtype=np.float32)+0
# One task per block: (block index, N2, N3).
args_lst = [(i, N2, N3) for i in range(N1)]

In [5]:
%%timeit -n 1 -r 1
# Shared-memory variant using a multiprocessing.Array created with lock=True.
run_shared_lock(arr, args_lst)

(16384, 64, 64)
22.9 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [6]:
%%timeit -n 1 -r 1
# Shared-memory variant using a multiprocessing.Array created with lock=False.
run_shared_nolock(arr, args_lst)

(16384, 64, 64)
16.1 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [7]:
%%timeit -n 1 -r 1
# Variant that passes each block to a pool worker and collects the returned blocks.
run_passing(arr, args_lst)

(16384, 64, 64)
1.1 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [8]:
%%timeit -n 1 -r 1
# Baseline: the same per-block work done sequentially, without a pool.
run_seq(arr, args_lst)

(16384, 64, 64)
472 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
