# Thread

In [1]:
import time

# no threads

def f(name, n):
    """Print a countdown from ``n`` down to 1, labelled with ``name``.

    One line is printed per step, followed by a one-second pause, so a
    call takes roughly ``n`` seconds.
    """
    for remaining in range(n, 0, -1):
        print(name, remaining)
        time.sleep(1)

# Sequential baseline: the second countdown starts only after the first one
# has finished, so this takes 3 + 5 = 8 seconds to run.
for label, count in (("A", 3), ("B", 5)):
    f(label, count)

A 3
A 2
A 1
B 5
B 4
B 3
B 2
B 1


In [2]:
# with threads

from threading import Thread

# The two countdowns overlap, so this takes about 5 seconds (the longer one).
t1, t2 = (Thread(target=f, args=spec) for spec in (("A", 3), ("B", 5)))
# start both threads before waiting on either of them
for worker in (t1, t2):
    worker.start()
# join() blocks until that thread has finished
for worker in (t1, t2):
    worker.join()

print("done")

AB 5
 3
BA 2
 4
A 1
B 3
B 2
B 1
done


# Process

In [3]:
def double(x):
    """Return ``x`` multiplied by 2."""
    doubled = x * 2
    return doubled

In [4]:
# map() applies the double function to each element of the list
# (similar terminology to the "apply" function of pandas)
list(map(double, [7,8,9]))

[14, 16, 18]

In [5]:
# a pool of available processes
from multiprocessing import Pool

In [6]:
# create the pool with the "with" keyword
with Pool() as pool:
    # This is probably a bad idea: creating processes is much slower than multiplying by 2.
    # Why? Extra time is needed to create copies of the current process and to wait
    # for all of the processes to return their results.
    results = pool.map(double, [7, 8, 9])
results

[14, 16, 18]

In [7]:
# a good utilization of multiprocessing:
# create different processes to scrape different web pages

# 18 pages in total (0.html through 17.html)
import requests

def get_page(page_num, timeout=10):
    """Download one practice page and return its HTML.

    Args:
        page_num: Number of the page; it is substituted into the URL as
            ``{page_num}.html``.
        timeout: Seconds handed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot block the caller indefinitely.

    Returns:
        The body of the response as text (``r.text``).

    Raises:
        requests.HTTPError: If the response has an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the request exceeds ``timeout``.
    """
    url = f"https://www.msyamkumar.com/cs320/crawl/practice7/{page_num}.html"
    # requests has no default timeout; without one, a hung connection would
    # keep a pool worker (and therefore Pool.map) waiting forever.
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.text

# fetch a single page first to see what get_page returns
get_page(2)

'<html><body>\n<h1>Node: 2</h1>\n<h3>Children:</h3>\n<ul>\n<li><a href="0.html">0</a><li><a href="1.html">1</a></ul>\n</body></html>\n'

In [8]:
# your VM has two CPUs; Pool() with no argument picks its size from the CPU count
# perf_counter() is the clock intended for measuring short durations
# (time.time() is wall-clock time and can jump if the system clock is adjusted)
t0 = time.perf_counter()
with Pool() as p:
    results = p.map(get_page, range(18))
t1 = time.perf_counter()
print((t1-t0) * 1000) # how long it takes to scrape all 18 pages (0-17), in ms

1211.5576267242432


In [9]:
# gets slower with only one worker process: Pool(1) handles the pages one at a time
# perf_counter() is the clock intended for measuring short durations
t0 = time.perf_counter()
with Pool(1) as p:
    results = p.map(get_page, range(18))
t1 = time.perf_counter()
print((t1-t0) * 1000) # elapsed time in ms

1944.6773529052734


In [10]:
# same measurement with 8 worker processes
# perf_counter() is the clock intended for measuring short durations
t0 = time.perf_counter()
with Pool(8) as p:
    results = p.map(get_page, range(18))
t1 = time.perf_counter()
print((t1-t0) * 1000) # elapsed time in ms

1004.1356086730957


In [11]:
# bug 1: what happens when one of the worker processes raises an exception
with Pool(8) as pool:
    # we don't actually have this many pages, so one of the requests fails
    results = pool.map(get_page, range(20))

HTTPError: 404 Client Error: Not Found for url: https://www.msyamkumar.com/cs320/crawl/practice7/18.html

RemoteTraceback: this traceback comes from a different process than the one this Jupyter notebook is running in, \
hence the name "remote".

In [12]:
import os
os.getpid() # the unique ID of the current process (the one running this notebook)

975761

In [13]:
status_codes = []

def get_page(page_num, timeout=10):
    """Download one practice page, recording its status code in ``status_codes``.

    Args:
        page_num: Number of the page; it is substituted into the URL as
            ``{page_num}.html``.
        timeout: Seconds handed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot block the caller indefinitely.

    Returns:
        The body of the response as text (``r.text``).

    Raises:
        requests.HTTPError: If the response has an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the request exceeds ``timeout``.
    """
    url = f"https://www.msyamkumar.com/cs320/crawl/practice7/{page_num}.html"
    # requests has no default timeout; without one, a hung connection would
    # keep a pool worker (and therefore Pool.map) waiting forever.
    r = requests.get(url, timeout=timeout)
    # bug 2: when this runs inside a pool worker, the append happens in that
    # worker process, so status_codes in the notebook process stays empty
    status_codes.append(r.status_code)
    r.raise_for_status()
    return r.text

In [14]:
with Pool(8) as pool:
    results = pool.map(get_page, range(18))
# the list in this process was never appended to by the workers
status_codes

[]

In [15]:
status_codes = []

def get_page(page_num, timeout=10):
    """Download one practice page, printing the caller's PID and ``status_codes``.

    Args:
        page_num: Number of the page; it is substituted into the URL as
            ``{page_num}.html``.
        timeout: Seconds handed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot block the caller indefinitely.

    Returns:
        The body of the response as text (``r.text``).

    Raises:
        requests.HTTPError: If the response has an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the request exceeds ``timeout``.
    """
    url = f"https://www.msyamkumar.com/cs320/crawl/practice7/{page_num}.html"
    # requests has no default timeout; without one, a hung connection would
    # keep a pool worker (and therefore Pool.map) waiting forever.
    r = requests.get(url, timeout=timeout)
    status_codes.append(r.status_code)
    # show which process ran this call and what its own copy of the list holds
    print(os.getpid(), status_codes)
    r.raise_for_status()
    # ADVICE: return everything you need when using a process in a pool;
    # global variables aren't shared between processes
    return r.text

In [16]:
# each worker prints its own PID next to its own copy of status_codes
with Pool(8) as pool:
    results = pool.map(get_page, range(18))

975834 975837975840[200] [200]
975838975839975835975836
     975841[200] [200]
[200]
[200][200][200]



975840 [200, 200]975836 975834
975841975838 [200, 200]975839 975837975835[200, 200] 

 [200, 200]  [200, 200][200, 200]
[200, 200][200, 200]



975840975836 [200, 200, 200] 
[200, 200, 200]
