# Parallelism

In [None]:
import pandas as pd
import numpy as np

# Silly example

In [None]:
def my_sleep(x):
    '''
    Pause for x seconds, announcing the start and the end of the pause.

    Parameters
    ----------
    x : number of seconds to sleep (passed straight to time.sleep).

    Returns
    -------
    The same value x that was passed in.
    '''
    # Local import: the function carries its own dependency instead of
    # relying on a module-level name.
    from time import sleep

    start_message = f'Sleeping for {x} seconds.'
    print(start_message)
    sleep(x)
    end_message = f'Returning {x}'
    print(end_message)
    return x

In [None]:
my_sleep(5)

In [None]:
my_list = [1,2,3,4,5,6]

In [None]:
sum(my_list)

In [None]:
from tqdm.auto import tqdm

In [None]:
for i in my_list:
    print(my_sleep(i))

In [None]:
map(my_sleep,my_list) # lazy evaluation

In [None]:
list(map(my_sleep,my_list))

## Serial code

In [None]:
for item in tqdm(my_list):
    my_sleep(item)

In [None]:
list(map(my_sleep, tqdm(my_list)))

In [None]:
%%time
list(map(my_sleep, my_list))

In [None]:
%%prun
list(map(my_sleep, [1,2,3]))

## Parallel code

In [None]:
from multiprocessing import Pool, cpu_count # How does the pool work?

cpu_count()

## You have to create a pool of `n` (or `n-1`) processes.

In [None]:
pool = Pool(processes=cpu_count()-1)

In [None]:
pool

### We'll use `%%time` here to measure how fast this code runs in parallel.

However, if you run this code, watch what happens:

In [None]:
#%%time

#result = pool.map(my_sleep, my_list)
#pool.terminate()

## This happens because multiprocessing doesn't always work in Jupyter Notebooks: worker processes may be unable to import functions that were defined interactively in the notebook.

_Some Linux and macOS setups may handle it well (yay Unix)_, but it certainly doesn't work on Windows.

### What should we do then? Two solutions.

1. We have to write our functions inside a `.py` file.

2. Install `multiprocess` (note it is different from Python's `multiprocessing` module)

In [None]:
from sleeper import my_sleep_from_file

In [None]:
%%time
list(map(my_sleep_from_file, tqdm([1,2,3,4,5,6,7,8])))

In [None]:
pool.terminate()
pool = Pool(processes=cpu_count())
pool

In [None]:
%%time

result = pool.map(my_sleep_from_file, [1,2,3,4,5,6,7,8])

In [None]:
result

In [None]:
pool.terminate()

## Using multiprocess


In [None]:
!pip3 install multiprocess

In [None]:
# using multiprocess instead of multiprocessing
from multiprocess import Pool, cpu_count

In [None]:
pool = Pool(processes=cpu_count())

In [None]:
%%time

result = list(map(lambda x:x**10000000, [1,2,3,4,5,6]))

In [None]:
%%time
result = pool.map(lambda x:x**10000000, [1,2,3,4,5,6])

In [None]:
pool.terminate()

# Running Asynchronous code

## What is asynchrony?

- `result.ready()`
- `result.wait()`
- `result.get()`

In [None]:
pool = Pool(processes=cpu_count()-1)

In [None]:
result = pool.map_async(my_sleep_from_file, [10, 10, 10, 10, 10, 10])

In [None]:
result.ready()

In [None]:
# Work that does not need the pool's output can run here while the
# map_async tasks are still executing in the worker processes.
print("Do something that doesn't depend on result")
print('...')
print('Now the time came when the result is needed.')
# Block until every task submitted through map_async has finished.
result.wait()

# get() returns the list of per-task return values.
result_list = result.get()
pool.terminate()
print(f'Now go on and use the results obtained - {result_list}')

# CPU intensive computations

In [None]:
def square(x):
    '''
    Return x raised to the power of two.
    '''
    return pow(x, 2)

In [None]:
%%time
square(1249415165)

In [None]:
n = 1000000

In [None]:
%%timeit
    
result = [square(item) for item in np.random.random(size=n)]

In [None]:
%%time
    
result = [square(item) for item in np.random.random(size=n)]

In [None]:
pool = Pool(processes=4)

In [None]:
random_numbers = np.random.random(size=n)

In [None]:
random_numbers

In [None]:
%%time

result = pool.map(square, random_numbers)

In [None]:
%%time
result = [square(item) for item in random_numbers]

In [None]:
pool.terminate()

In [None]:
# GIL - global interpreter lock

## profiling tools

In [None]:
%%prun

result = [square(item) for item in np.random.random(size=n)]

## Usually, for CPU intensive computations, Pool.map won't speed up your code.

Why? When each individual task is as cheap as `square`, it will spend more time managing processes, replicating data and sending data to other processes than actually computing the result.



In [None]:
!pip3 install Cython

In [None]:
## Cython - CPython

In [None]:
%load_ext Cython

In [None]:
%%cython -a
def square_c(x):
    '''
    Return x raised to the power of two (same contract as square).
    '''
    squared = x ** 2
    return squared

In [None]:
random_numbers = np.random.random(size=n)

In [None]:
%%timeit
result = [square_c(item) for item in random_numbers]

In [None]:
%%timeit
result = [square(item) for item in random_numbers]

In [None]:
%%timeit
# %%timeit runs this cell body many times; using the pool as a context
# manager terminates the 4 workers at the end of every run instead of
# leaving a fresh set of orphaned processes behind on each iteration.
with Pool(processes=4) as pool:
    result = pool.map(square, random_numbers)

In [None]:
#https://numba.pydata.org/

# When is multiprocess useful then? 


## I/O bound computations

In [None]:
import pandas as pd

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
n_max = 51
my_range = range(1,n_max)

In [None]:
%%time

import os

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first page is written.
os.makedirs('tmp', exist_ok=True)

for i in tqdm(my_range):
    # A timeout keeps one stalled connection from hanging the whole loop.
    response = requests.get(f'http://books.toscrape.com/catalogue/page-{i}.html', timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page into an empty CSV.
    response.raise_for_status()
    html = response.content
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(html, 'html.parser')
    titles = [s.find_all('a')[0]['title'] for s in soup.find_all('h3')]
    prices = [s.text for s in soup.find_all('p', attrs={'class': 'price_color'})]
    # The comparison already yields a bool; no conditional expression needed.
    stocks = [s.text.strip() == 'In stock' for s in soup.find_all('p', attrs={'class': 'instock availability'})]
    df_temp = pd.DataFrame({'Title': titles, 'Price': prices, 'Stock Availability': stocks})
    df_temp.to_csv(f'tmp/results_{i}.csv', index=False, sep=',')

In [None]:
def download(i):
    '''
    Scrape catalogue page i of books.toscrape.com and save it as a CSV file.

    The title, price and stock availability of every book listed on the page
    are written to tmp_par/results_{i}.csv (the folder is created if needed).

    Parameters
    ----------
    i : page number used to build the catalogue URL and the output file name.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError if the server answers with an error status.
    '''
    # Imports are kept inside the function so it carries everything it needs.
    import os
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd

    # A timeout keeps one stalled connection from blocking a worker forever.
    response = requests.get(f'http://books.toscrape.com/catalogue/page-{i}.html', timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page into an empty CSV.
    response.raise_for_status()
    html = response.content
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(html, 'html.parser')
    titles = [s.find_all('a')[0]['title'] for s in soup.find_all('h3')]
    prices = [s.text for s in soup.find_all('p', attrs={'class': 'price_color'})]
    # The comparison already yields a bool; no conditional expression needed.
    stocks = [s.text.strip() == 'In stock' for s in soup.find_all('p', attrs={'class': 'instock availability'})]
    df_temp = pd.DataFrame({'Title': titles, 'Price': prices, 'Stock Availability': stocks})
    # to_csv does not create missing directories; exist_ok makes this safe
    # when several workers reach this line at the same time.
    os.makedirs('tmp_par', exist_ok=True)
    df_temp.to_csv(f'tmp_par/results_{i}.csv', index=False, sep=',')

In [None]:
pool = Pool(cpu_count())

In [None]:
%%time

# Wrapping the *input* in tqdm only tracks how fast tasks are handed to the
# pool, so the bar finishes long before the downloads do. Wrapping imap's
# output advances the bar as each page's result comes back; imap preserves
# the input order, so `results` is the same list pool.map would return.
results = list(tqdm(pool.imap(download, my_range), total=len(my_range)))

In [None]:
pool.terminate()

In [None]:
def download_html(i):
    '''
    Download catalogue page i of books.toscrape.com and save the raw HTML.

    The response body is written unchanged to html_books_{i}.html in the
    current working directory.

    Parameters
    ----------
    i : page number used to build the catalogue URL and the output file name.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError if the server answers with an error status.
    '''
    # Imported inside the function so it carries everything it needs.
    import requests

    # A timeout keeps one stalled connection from blocking a worker forever.
    response = requests.get(f'http://books.toscrape.com/catalogue/page-{i}.html', timeout=30)
    # Do not save an error page as if it were the catalogue page.
    response.raise_for_status()
    # The with-block closes the file, flushing the data to disk, even if the
    # write fails; the original left the handle open.
    with open(f'html_books_{i}.html', 'wb') as file:
        file.write(response.content)

In [None]:
import os

In [None]:
# exist_ok lets this cell be re-run without raising FileExistsError.
os.makedirs('tmp_2', exist_ok=True)

In [None]:
os.getcwd()

In [None]:
os.chdir('tmp_2')

In [None]:
os.getcwd()

In [None]:
pool = Pool(cpu_count())
# Wrapping the *input* in tqdm only tracks how fast tasks are handed to the
# pool. Wrapping imap's output advances the bar as each page's result comes
# back; imap preserves the input order, so `results` matches pool.map.
results = list(tqdm(pool.imap(download_html, my_range), total=len(my_range)))