In [None]:
import time
import threading
# import multiprocessing
import multiprocess 
import itertools
import os
import logging
import random
import string
import requests
from functools import partial
# from multiprocessing import Queue
# from multiprocessing.pool import ThreadPool

# Definitions

In [None]:
# Processes are OS-level resource used to run and manage applications
# Processes have pid, list of open files and sockets, isolated memory,

In [None]:
# Special part of OS (scheduler) iterates over processes and give them time slot to run
TIME_SLOT = 0.5/(1000*1000)
while True:
    for process in processes:
        process.run_for(TIME_SLOT)

In [None]:
# Process has 1 or more threads
# Threads have common memory

In [None]:
# 2 kinds of tasks:
- parallel execution: speedup
- background execution: 
    - daemon process: OS services
    - multithreaded process: notifications, monitor, watcher, etc

In [None]:
Parallel execution can be done in 2 ways:
    - threads
    - processes
    
Which one is better?

# Threads

In [None]:
class timer():
    def __init__(self, message):
        self.message = message

    def __enter__(self):
        self.start = time.time()
        return None

    def __exit__(self, type, value, traceback):
        elapsed_time = (time.time() - self.start)
        print(self.message.format(elapsed_time))

TIME_TO_SLEEP = 4
        
def long_running_task(n=TIME_TO_SLEEP):
    print(threading.current_thread())
#     with timer('Elapsed 2: {}s'):
    time.sleep(n)

# what is it?
with timer('Executed in: {}s'):
#     long_running_task()
    long_running_task()


In [None]:
with timer('Executed in: {}s'):
    t1 = threading.Thread(target=long_running_task, args=(TIME_TO_SLEEP/2,))
    t2 = threading.Thread(target=long_running_task, args=(TIME_TO_SLEEP/2,))
    t1.start()
    t2.start()
    print(threading.current_thread())
    print('BEFORE T1 JOIN')
    t1.join() # be carefull
    print('BEFORE T2 JOIN')
    t2.join() 
    
print('next step')
# work with date

In [None]:
def run_threads(func, data, workers):
    threads = [
        threading.Thread(target=func, args=(data / workers, ))
        for _ in range(workers)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

In [None]:
workers = 10
DATA_SIZE = 10

with timer('Elapsed: {}s'):
    run_threads(long_running_task, DATA_SIZE, workers)

In [None]:
def generate_task_queue(total_tasks, queue_size):
    task_queue = []
    remainder = total_tasks
    while remainder > 0:
        value = min(remainder, random.randint(1, total_tasks//queue_size))
        remainder -= value
        task_queue.append(value)
    return task_queue

assert sum(generate_task_queue(1000, 20)) == 1000
assert sum(generate_task_queue(1000, 1)) == 1000
assert sum(generate_task_queue(1000, 1000)) == 1000
assert sum(generate_task_queue(1000, 42)) == 1000

In [None]:
# input_data = [DATA_SIZE / workers for _ in range(workers)]
input_data = [1, 2, 2, 1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
print(input_data)

with timer('Elapsed: {}s'):
    with ThreadPool(workers) as pool:
        pool.map(long_running_task, input_data)
#         for chunk in input_data:
#             long_running_task(chunk)

<img src="https://www.nginx.com/wp-content/uploads/2016/07/thread-pools-worker-process-event-cycle.png">

# Real world task

In [None]:
def fetch_pic(num_pic):
# def fetch_pic(num_pic, path):
    url = 'https://picsum.photos/400/600'
    path = os.path.join(os.getcwd(), 'img')
    for _ in range(num_pic):
#         print(f"current image number: {i}")
        random_name = ''.join(random.choices(string.ascii_letters + string.digits, k=5))
        response = requests.get(url)
        if response.status_code == 200:
            with open(f'{path}/{random_name}.jpg', 'wb') as f:
                f.write(response.content)
                #print(f"Fetched pic [{os.getpid()}]: {f.name}")

In [None]:
workers = 64
DATA_SIZE = 100

with timer('Elapsed: {}s'):
#     with multiprocessing.Pool(workers) as pool:
    with ThreadPool(workers) as pool:
        input_data = [DATA_SIZE // workers for _ in range(workers)]
        # input_data = [(DATA_SIZE // workers, './pics') for _ in range(workers)]
        pool.map(fetch_pic, input_data)
# 30 - 1 
# 6 - 8 

# IO vs CPU bound tasks

In [None]:
DATA_SIZE = 1_000_000
lst2 = []
lst1 = []


def fill_data2(n):
    # print(threading.current_thread())
    while n > 0:
        n -= 1
        lst2.append(random.randint(1, 100))

def fill_data1(n):
    # print(threading.current_thread())
    while n > 0:
        n -= 1
        lst1.append(random.randint(1, 100))        
        
with timer('Elapsed: {}s'):
#     fill_data(DATA_SIZE, lst)
    fill_data(DATA_SIZE)

In [None]:
1  2  M
   |  |
|     |
   |  |
|     |
   |
______
GIL

In [None]:
with timer('Elapsed: {}s'):
#     t1 = threading.Thread(target=fill_data, args=(DATA_SIZE // 2, lst))
#     t2 = threading.Thread(target=fill_data, args=(DATA_SIZE // 2, lst))
    t1 = threading.Thread(target=fill_data2, args=(DATA_SIZE // 2,))
    t2 = threading.Thread(target=fill_data1, args=(DATA_SIZE // 2,))
    t1.start()
    t2.start()
    t1.join()
    t2.join()
#     lst = lst2 + lst1
    
print(len(lst), lst[:100])    

In [20]:
lst = []
workers = 8
with timer('Elapsed: {}s'):
    with ThreadPool(workers) as pool:
        input_data = [DATA_SIZE // workers for _ in range(workers)]
#         pool.map(partial(fill_data, lst=lst), input_data)
        result = pool.map(fill_data, input_data)
        
print(len(lst), lst[:100])

Elapsed: 0.0s


NameError: name 'ThreadPool' is not defined

In [None]:
lst = []
workers = 8

DATA_SIZE = 1_000_000
lst = []

def fill_data(n):
    # print(threading.current_thread())
    while n > 0:
        n -= 1
        lst.append(random.randint(1, 100))
        
with timer('Elapsed: {}s'):
    with multiprocess.Pool(workers) as pool:
        input_data = [DATA_SIZE // workers for _ in range(workers)]
        pool.map(fill_data, input_data)
        
print(len(lst), lst[:100])

In [None]:
def factorize_naive(n):
    """ A naive factorization method. Take integer 'n', return list of
        factors.
    """
    if n < 2:
        return []
    factors = []
    p = 2

    while True:
        if n == 1:
            return factors

        r = n % p
        if r == 0:
            factors.append(p)
            n = n / p
        elif p * p >= n:
            factors.append(n)
            return factors
        elif p > 2:
            # Advance in steps of 2 over odd numbers
            p += 2
        else:
            # If p == 2, get to 3
            p += 1

    assert False, "unreachable"

In [None]:
DATA_SIZE = 1_000

In [None]:
result = {}
workers = 16
with timer('Elapsed: {}s'):
    with multiprocessing.Pool(workers) as pool:
        input_data = (i for i in range(1, DATA_SIZE+1))
        result = [
            (n, factors)
            for n, factors in enumerate(pool.map(factorize_naive, input_data), 1)
        ]
    
print(len(result), result[:100])

In [None]:
result = {}
workers = 16
with timer('Elapsed: {}s'):
    with ThreadPool(workers) as pool:
        input_data = (i for i in range(1, DATA_SIZE+1))
        result = [
            (n, factors)
            for n, factors in enumerate(pool.map(factorize_naive, input_data), 1)
        ]
    
print(len(result), result[:100])