# Parallelism

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
[x * 2 for x in range(10)]

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]

In [5]:
i = 0
for j in range(10):
    print(i, j)
    i += j
    print(i)

0 0
0
0 1
1
1 2
3
3 3
6
6 4
10
10 5
15
15 6
21
21 7
28
28 8
36
36 9
45


# Silly example

In [6]:
def my_sleep(x):
    '''
    Pause for `x` seconds, then give `x` back to the caller.

    A message is printed right before the pause starts and another one
    right before returning, so the order of execution is visible.

    Parameters
    ----------
    x : int or float
        Number of seconds to sleep.

    Returns
    -------
    The same `x` that was passed in.
    '''
    # Local import, as in the original; presumably kept so the function is
    # self-contained when it is sent to a worker process - TODO confirm.
    from time import sleep

    print(f'Sleeping for {x} seconds.')
    sleep(x)
    print(f'Returning {x}')
    return x

In [7]:
my_sleep(5)

Sleeping for 5 seconds.
Returning 5


5

In [8]:
my_list = [1,2,3,4,5,6]

In [9]:
sum(my_list)

21

In [10]:
from tqdm.auto import tqdm

In [11]:
for i in my_list:
    print(my_sleep(i))

Sleeping for 1 seconds.
Returning 1
1
Sleeping for 2 seconds.
Returning 2
2
Sleeping for 3 seconds.
Returning 3
3
Sleeping for 4 seconds.
Returning 4
4
Sleeping for 5 seconds.
Returning 5
5
Sleeping for 6 seconds.
Returning 6
6


In [12]:
map(my_sleep,my_list) # lazy evaluation

<map at 0x226c02b4340>

In [13]:
list(map(my_sleep,my_list))

Sleeping for 1 seconds.
Returning 1
Sleeping for 2 seconds.
Returning 2
Sleeping for 3 seconds.
Returning 3
Sleeping for 4 seconds.
Returning 4
Sleeping for 5 seconds.
Returning 5
Sleeping for 6 seconds.
Returning 6


[1, 2, 3, 4, 5, 6]

## Serial code

In [14]:
for item in tqdm(my_list):
    my_sleep(item)

  0%|          | 0/6 [00:00<?, ?it/s]

Sleeping for 1 seconds.
Returning 1
Sleeping for 2 seconds.
Returning 2
Sleeping for 3 seconds.
Returning 3
Sleeping for 4 seconds.
Returning 4
Sleeping for 5 seconds.
Returning 5
Sleeping for 6 seconds.
Returning 6


In [15]:
list(map(my_sleep, tqdm(my_list)))

  0%|          | 0/6 [00:00<?, ?it/s]

Sleeping for 1 seconds.
Returning 1
Sleeping for 2 seconds.
Returning 2
Sleeping for 3 seconds.
Returning 3
Sleeping for 4 seconds.
Returning 4
Sleeping for 5 seconds.
Returning 5
Sleeping for 6 seconds.
Returning 6


[1, 2, 3, 4, 5, 6]

In [16]:
%%time
list(map(my_sleep, my_list))

Sleeping for 1 seconds.
Returning 1
Sleeping for 2 seconds.
Returning 2
Sleeping for 3 seconds.
Returning 3
Sleeping for 4 seconds.
Returning 4
Sleeping for 5 seconds.
Returning 5
Sleeping for 6 seconds.
Returning 6
Wall time: 21 s


[1, 2, 3, 4, 5, 6]

In [17]:
%%prun
list(map(my_sleep, [1,2,3]))

Sleeping for 1 seconds.
Returning 1
Sleeping for 2 seconds.
Returning 2
Sleeping for 3 seconds.
Returning 3
 

## Parallel code

In [18]:
from multiprocessing import Pool, cpu_count # How does the pool work?

cpu_count()

8

## You have to create a pool of `n` (or `n-1`) processes.

In [19]:
pool = Pool(processes=cpu_count()-1)

In [20]:
pool

<multiprocessing.pool.Pool state=RUN pool_size=7>

### We'll use `%%time` here to measure how fast this code runs in parallel.

However, if you run this code, watch what happens:

In [None]:
#%%time

#result = pool.map(my_sleep, my_list)
#pool.terminate()

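## This happens because multiprocessing doesn't always work in Jupyter Notebooks.

_Some Linux or macOS setups may handle it well (yay Unix)_: when worker processes are forked, they inherit the functions already defined in the notebook. On Windows it certainly doesn't work: each worker is started as a fresh interpreter that has to import the function from a module, and a function defined in a notebook cell cannot be imported.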

### What should we do then? Two solutions.

1. We have to write our functions inside a `.py` file.

2. Install `multiprocess` (note it is different from Python's `multiprocessing` module)

In [21]:
import sleeper

In [22]:
sleeper.my_sleep_from_file

<function sleeper.my_sleep_from_file(x)>

In [23]:
from sleeper import my_sleep_from_file

In [24]:
%%time
list(map(my_sleep_from_file, tqdm([1,2,3,4,5,6,7,8])))

  0%|          | 0/8 [00:00<?, ?it/s]

Sleeping for 1 seconds.
Returning 1
Sleeping for 2 seconds.
Returning 2
Sleeping for 3 seconds.
Returning 3
Sleeping for 4 seconds.
Returning 4
Sleeping for 5 seconds.
Returning 5
Sleeping for 6 seconds.
Returning 6
Sleeping for 7 seconds.
Returning 7
Sleeping for 8 seconds.
Returning 8
Wall time: 36.1 s


[1, 2, 3, 4, 5, 6, 7, 8]

In [26]:
pool.terminate()
pool = Pool(processes=cpu_count())
pool

<multiprocessing.pool.Pool state=RUN pool_size=8>

In [27]:
%%time
result = pool.map(my_sleep_from_file, [8,2,3,4,5,6,7,1])
my_sleep_from_file(1)
my_sleep_from_file(2)
my_sleep_from_file(3)


Wall time: 8.02 s


In [28]:
result

[1, 2, 3, 4, 5, 6, 7, 8]

In [29]:
pool.terminate()

## Using multiprocess


In [None]:
!pip3 install multiprocess

In [30]:
# using multiprocess instead of multiprocessing
from multiprocess import Pool, cpu_count

In [36]:
pool = Pool(processes=cpu_count())

In [32]:
%%time
result = list(map(lambda x:x**10000000, [1,2,3,4,5,6]))

Wall time: 14.3 s


In [37]:
%%prun
result = pool.map(lambda x:x**10000000, [1,2,3,4,5,6])

6.3 s ± 96.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [38]:
pool.terminate()

# Running Asynchronous code

## What is asynchrony?

- `result.ready()`
- `result.wait()`
- `result.get()`

In [49]:
pool = Pool(processes=cpu_count()-1)

In [50]:
result = pool.map_async(my_sleep_from_file, [10, 10, 10, 10, 10, 10])

In [51]:
print('Do something that doesn"t depend on result')
print('...')
print('Now the time came when the result is needed.')
result.wait()

result_list = result.get()
pool.terminate()
print(f'Now go on and use the results obtained - {result_list}')

Do something that doesn"t depend on result
...
Now the time came when the result is needed.
Now go on and use the results obtained - [10, 10, 10, 10, 10, 10]


In [48]:
result.ready()

True

# CPU intensive computations

In [52]:
def square(x):
    '''
    Return `x` raised to the power of two.
    '''
    return pow(x, 2)

In [53]:
%%time
square(1249415165)

Wall time: 0 ns


1561038254531977225

In [54]:
n = 1000000

In [55]:
np.random.random(size=n)

array([0.42009881, 0.49036755, 0.40821077, ..., 0.74452849, 0.43506181,
       0.62043001])

In [56]:
%%timeit
    
result = [square(item) for item in np.random.random(size=n)]

426 ms ± 5.67 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [57]:
%%time
    
result = [square(item) for item in np.random.random(size=n)]

Wall time: 443 ms


In [58]:
pool = Pool(processes=4)

In [59]:
random_numbers = np.random.random(size=n)

In [60]:
random_numbers

array([0.94854775, 0.10676648, 0.59214525, ..., 0.27745973, 0.32699503,
       0.40524897])

In [61]:
%%time

result = pool.map(square, random_numbers)

Wall time: 21.5 s


In [62]:
%%time
result = [square(item) for item in random_numbers]

Wall time: 438 ms


In [63]:
pool.terminate()

In [None]:
# GIL - global interpreter lock

## profiling tools

In [None]:
%%prun

result = [square(item) for item in np.random.random(size=n)]

## Usually, for CPU intensive computations made of many tiny tasks, Pool.map won't speed up your code.

Why? Each call to `square` is so cheap that the pool spends more time managing processes, copying data, and sending it back and forth between processes than actually computing.



# VOLTAMOS 21H15

In [None]:
!pip3 install Cython

In [None]:
## Cython - CPython

In [64]:
%load_ext Cython

In [65]:
%%cython -a
def square_c(x):
    '''
    Return `x` raised to the power of two (version compiled by the Cython cell magic).
    '''
    return pow(x, 2)

In [66]:
random_numbers = np.random.random(size=n)

In [67]:
%%time
result = [square_c(item) for item in random_numbers]

Wall time: 361 ms


In [68]:
%%time
result = [square(item) for item in random_numbers]

Wall time: 422 ms


In [69]:
%%time
# The context manager terminates the 4 worker processes when the block ends.
# Without it this pool is never shut down: the name `pool` is simply rebound
# by a later cell and the old workers keep running in the background.
with Pool(processes=4) as pool:
    result = pool.map(square, random_numbers)

Wall time: 21.5 s


In [None]:
#https://numba.pydata.org/

# When is multiprocess useful then? 


## I/O bound computations

In [70]:
import pandas as pd

In [71]:
import requests
from bs4 import BeautifulSoup

In [73]:
n_max = 51
my_range = range(1,n_max)

In [74]:
%%time

# Create the output folder up front so the cell also runs on a fresh checkout.
os.makedirs('tmp', exist_ok=True)

for i in tqdm(my_range):
    # A timeout keeps one stalled request from hanging the whole loop.
    response = requests.get(f'http://books.toscrape.com/catalogue/page-{i}.html', timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    html = response.content
    # Name the parser explicitly: otherwise bs4 picks whichever one happens to
    # be installed (and warns), so results could differ between machines.
    soup = BeautifulSoup(html, 'html.parser')
    titles = [s.find_all('a')[0]['title'] for s in soup.find_all('h3')]
    prices = [s.text for s in soup.find_all('p', attrs={'class':'price_color'})]
    # The comparison already yields a bool; no `True if ... else False` needed.
    stocks = [s.text.strip() == 'In stock' for s in soup.find_all('p', attrs={'class':'instock availability'})]
    df_temp = pd.DataFrame({'Title':titles,'Price':prices,'Stock Availability':stocks})
    df_temp.to_csv(f'tmp/results_{i}.csv', index=False, sep=',')

  0%|          | 0/50 [00:00<?, ?it/s]

Wall time: 26.4 s


In [77]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def download(i):
    '''
    Scrape one catalogue page of books.toscrape.com and save it as a CSV file.

    The title, price text and stock availability of every book on the page
    are collected into a DataFrame, which is written to
    `tmp_par/results_{i}.csv`. The folder is created if it does not exist.

    Parameters
    ----------
    i : int
        Page number; used both in the page URL and in the output file name.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.Timeout
        If the server does not answer within 30 seconds.
    '''
    # Local import so this helper does not depend on `os` having been imported
    # by some other notebook cell.
    import os

    # A timeout keeps one stalled request from blocking a worker forever.
    response = requests.get(f'http://books.toscrape.com/catalogue/page-{i}.html', timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    # Name the parser explicitly: otherwise bs4 picks whichever one happens to
    # be installed (and warns), so results could differ between machines.
    soup = BeautifulSoup(response.content, 'html.parser')

    titles = [s.find_all('a')[0]['title'] for s in soup.find_all('h3')]
    prices = [s.text for s in soup.find_all('p', attrs={'class':'price_color'})]
    # The comparison already yields a bool; no `True if ... else False` needed.
    stocks = [s.text.strip() == 'In stock' for s in soup.find_all('p', attrs={'class':'instock availability'})]

    df_temp = pd.DataFrame({'Title':titles,'Price':prices,'Stock Availability':stocks})
    # exist_ok=True makes this safe when several workers reach it at once.
    os.makedirs('tmp_par', exist_ok=True)
    df_temp.to_csv(f'tmp_par/results_{i}.csv', index=False, sep=',')

In [78]:
pool = Pool(cpu_count())

In [79]:
%%time

results = pool.map(download, tqdm(my_range))

  0%|          | 0/50 [00:00<?, ?it/s]

Wall time: 5.8 s


In [80]:
pool.terminate()

In [None]:
def download_html(i):
    '''
    Download one catalogue page of books.toscrape.com and save the raw HTML.

    The response body is written, as bytes, to `html_books_{i}.html` in the
    current working directory.

    Parameters
    ----------
    i : int
        Page number; used both in the page URL and in the output file name.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.Timeout
        If the server does not answer within 30 seconds.
    '''
    # Imported inside the function, as in the original; presumably so the
    # function is self-contained when sent to a worker process - TODO confirm.
    import requests

    # A timeout keeps one stalled request from blocking a worker forever.
    response = requests.get(f'http://books.toscrape.com/catalogue/page-{i}.html', timeout=30)
    # Fail loudly on 4xx/5xx instead of saving an error page to disk.
    response.raise_for_status()
    # `with` closes the file even if the write fails; the original never closed it.
    with open(f'html_books_{i}.html','wb') as file:
        file.write(response.content)

In [None]:
import os

In [None]:
# exist_ok=True lets this cell be re-run: os.mkdir raises FileExistsError
# as soon as the folder is already there.
os.makedirs('tmp_2', exist_ok=True)

In [None]:
os.getcwd()

In [None]:
os.chdir('tmp_2')

In [None]:
os.getcwd()

In [None]:
# The context manager terminates the worker processes once all downloads are
# done; the original left this pool running.
with Pool(cpu_count()) as pool:
    # tqdm around the *input* of pool.map() only counts tasks being handed to
    # the pool, not tasks finishing. imap() yields each result, in input
    # order, as soon as it is ready, so wrapping it shows real progress.
    results = list(tqdm(pool.imap(download_html, my_range), total=len(my_range)))