# Thread

In [None]:
import time

# no threads

def f(name, n):
    """Print a countdown from n down to 1, one line per second.

    Each line shows `name` followed by the remaining count; the function
    sleeps for one second after every line.
    """
    for remaining in range(n, 0, -1):
        print(name, remaining)
        time.sleep(1)

# the two calls run one after the other, so this takes 3 + 5 = 8 seconds
f("A", 3)
f("B", 5)

In [None]:
# with threads

from threading import Thread

# the two countdowns overlap, so this takes about 5 seconds
# (the longer of the two) rather than 3 + 5
t1, t2 = (Thread(target=f, args=job) for job in (("A", 3), ("B", 5)))

# start both threads before waiting on either of them
for worker in (t1, t2):
    worker.start()
# join() blocks until the given thread has finished
for worker in (t1, t2):
    worker.join()

print("done")

# Process

In [None]:
def double(x):
    """Return `x` multiplied by 2."""
    doubled = x * 2
    return doubled

In [None]:
# map applies the double function to each element of the list
# (similar terminology to the "apply" function of pandas)
list(map(double, [7,8,9]))

In [None]:
# a pool of available processes
from multiprocessing import Pool

In [None]:
# use the with keyword so the pool is shut down when the block exits
with Pool() as p:
    # this is probably a bad idea: creating processes is much slower than multiplying by 2
    # why? it takes extra time to create copies of the current process and to wait for all of them to return their results
    results = p.map(double, [7,8,9])
results

In [None]:
# a good utilization of multiprocessing:
# create different processes to scrape different web pages

# 17 pages in total 
import requests

def get_page(page_num, timeout=10):
    """Download one practice page and return its HTML as text.

    Parameters:
        page_num: number substituted into the page URL.
        timeout: seconds to wait for the server before requests gives up.
            Without a timeout a stalled connection blocks forever, which
            would also hang any Pool.map call waiting on this function.

    Returns:
        The body of the response as a string.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within `timeout`.
    """
    url = f"https://www.msyamkumar.com/cs320/crawl/practice7/{page_num}.html"
    r = requests.get(url, timeout=timeout)
    # turn error statuses (e.g. 404 for a missing page) into an exception
    r.raise_for_status()
    return r.text

# quick check in the current process: fetch page 2 and show its HTML
get_page(2)

In [None]:
# Pool() with no argument sizes itself from the machine's CPU count;
# your VM has two CPUs
# perf_counter() is a monotonic clock meant for measuring durations
# (time.time() can jump if the system clock is adjusted)
t0 = time.perf_counter()
with Pool() as p:
    results = p.map(get_page, range(18))  # page numbers 0 through 17
t1 = time.perf_counter()
print((t1-t0) * 1000) # how long it takes to scrape all the pages, in ms

In [None]:
# gets slower with a single worker process: the pages are fetched one at a time
# perf_counter() is a monotonic clock meant for measuring durations
t0 = time.perf_counter()
with Pool(1) as p:
    results = p.map(get_page, range(18))
t1 = time.perf_counter()
print((t1-t0) * 1000) # elapsed time in ms

In [None]:
# same measurement with 8 worker processes
# perf_counter() is a monotonic clock meant for measuring durations
t0 = time.perf_counter()
with Pool(8) as p:
    results = p.map(get_page, range(18))
t1 = time.perf_counter()
print((t1-t0) * 1000) # elapsed time in ms

In [None]:
# bug 1: what happens if there is an exception in one of our worker processes?
with Pool(8) as p:
    results = p.map(get_page, range(20)) # we don't actually have this many pages, so get_page raises in a worker

RemoteTraceback: this traceback comes from a different process than the one this Jupyter notebook is running in, \
hence the name "remote".

In [None]:
import os
os.getpid() # the unique process ID of the current process (the one running this cell)

In [None]:
# module-level list that get_page appends to
status_codes = []

def get_page(page_num, timeout=10):
    """Download one practice page, record its status code, return its HTML.

    Parameters:
        page_num: number substituted into the page URL.
        timeout: seconds to wait for the server before requests gives up.
            Without a timeout a stalled connection blocks forever, which
            would also hang any Pool.map call waiting on this function.

    Returns:
        The body of the response as a string.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within `timeout`.
    """
    url = f"https://www.msyamkumar.com/cs320/crawl/practice7/{page_num}.html"
    r = requests.get(url, timeout=timeout)
    # bug 2: when this runs inside a Pool worker, the append changes the
    # worker's own copy of the list, so status_codes in the notebook
    # process stays empty
    status_codes.append(r.status_code)
    r.raise_for_status()
    return r.text

In [None]:
with Pool(8) as p:
    results = p.map(get_page, range(18))
# expected to show []: the workers appended to their own copies of the list,
# not to the one in this process
status_codes

In [None]:
# module-level list that get_page appends to
status_codes = []

def get_page(page_num, timeout=10):
    """Download one practice page and return its HTML, printing debug info.

    Appends the response status code to the module-level status_codes
    list and prints the current process ID together with that list, so
    the output shows which process handled the page and what its copy of
    the list contains.

    Parameters:
        page_num: number substituted into the page URL.
        timeout: seconds to wait for the server before requests gives up.
            Without a timeout a stalled connection blocks forever, which
            would also hang any Pool.map call waiting on this function.

    Returns:
        The body of the response as a string.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within `timeout`.
    """
    url = f"https://www.msyamkumar.com/cs320/crawl/practice7/{page_num}.html"
    r = requests.get(url, timeout=timeout)
    status_codes.append(r.status_code)
    # show which process ran this call and what its own list looks like
    print(os.getpid(), status_codes)
    r.raise_for_status()
    # ADVICE: return everything you need when using a process in a pool
    # global variables aren't shared between processes
    return r.text

In [None]:
# each line printed by get_page shows a process ID followed by the
# status_codes list as seen by that process
with Pool(8) as p:
    results = p.map(get_page, range(18))