## Basics

In [1]:
import multiprocessing

def spawn():
    """Print a fixed marker line so each launched process shows up in the output."""
    marker = 'Spawned'
    print(marker)

# Launch five processes strictly one after another: each process is joined
# before the next one is created.
for i in range(5):
    p = multiprocessing.Process(target=spawn)
    p.start()
    p.join() # wait for the process to end

Spawned
Spawned
Spawned
Spawned
Spawned


In [2]:
def spawn(num):
    """Print a line identifying this worker by the number it was given."""
    print(f'Spawn # {num}')

# Start all five workers first so they run concurrently; the order in which
# their lines are printed is therefore not guaranteed.
procs = []
for i in range(5):
    p = multiprocessing.Process(target=spawn, args=(i,))
    p.start()
    procs.append(p)

# Join only after every worker has been started: this keeps the workers
# concurrent, but makes sure each child is reaped and has finished printing
# before execution moves on.
for p in procs:
    p.join()


Spawn # 0
Spawn # 2
Spawn # 1
Spawn # 4
Spawn # 3


Notice that these spawns don't appear in any particular order: the processes run concurrently and may complete at different times. If you need them in order, call `.join()` on each process before starting the next one, as in the first example.

## Getting values

In [3]:
from multiprocessing import Pool

def job(num):
    """Return ``num`` multiplied by two."""
    doubled = num * 2
    return doubled

# Use the pool as a context manager so its worker processes are always
# cleaned up, even if map() raises. map() blocks until every result has been
# collected, so nothing is lost when the pool is torn down on leaving the block.
with Pool(processes=20) as p:
    data = p.map(job, range(20))
print(data)

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38]


## Spider

In [1]:
import random
import string
from multiprocessing import Pool
from urllib.parse import urljoin

import bs4 as bs
import requests

In [None]:
def random_starting_url():
    """Build a random ``http://xxx.com`` URL to start crawling from.

    Returns:
        str: ``'http://'`` followed by three random lowercase ASCII letters
        and ``'.com'``.
    """
    # One OS-entropy-backed generator is enough for all three picks; there is
    # no need to construct a new SystemRandom for every character.
    rng = random.SystemRandom()
    starting = ''.join(rng.choice(string.ascii_lowercase) for _ in range(3))
    return f'http://{starting}.com'

def handle_local_links(url, link):
    """Turn a site-relative link into an absolute URL.

    Args:
        url: Absolute URL (scheme and host) of the page the link was found on.
        link: Raw ``href`` value. May be a non-string such as ``None``
            (e.g. when an anchor has no ``href``).

    Returns:
        For a link starting with ``'/'``, the link resolved against ``url``.
        Any other value (absolute URLs, other relative forms, ``None``) is
        returned unchanged.
    """
    # Non-string values have no .startswith(); hand them back untouched
    # instead of raising AttributeError.
    if not isinstance(link, str):
        return link
    if link.startswith('/'):
        # urljoin resolves against the scheme and host of `url`, so a base URL
        # that already carries a path, and protocol-relative links such as
        # '//cdn.example.com/x', come out right. Plain concatenation gets
        # both of those cases wrong.
        return urljoin(url, link)
    return link
    
def get_links(url, timeout=10):
    """Fetch a page and return the links found in its ``<body>``.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        list of str: The ``href`` of every ``<a>`` tag in the body that has a
        non-empty ``href``, with site-relative links made absolute by
        ``handle_local_links``. On any failure the error is printed and an
        empty list is returned, so one bad page never aborts a whole crawl.
    """
    try:
        # Without a timeout a single unresponsive host would block the caller
        # indefinitely.
        resp = requests.get(url, timeout=timeout)
        soup = bs.BeautifulSoup(resp.text, 'lxml')
        body = soup.body
        hrefs = (anchor.get('href') for anchor in body.find_all('a'))
        # Anchors with a missing or empty href are skipped, so one of them no
        # longer throws away every other link on the page. Links are kept as
        # plain str: on Python 3, str(link.encode('ascii')) would produce
        # "b'...'" text and fail outright on non-ASCII links.
        return [handle_local_links(url, href) for href in hrefs if href]

    except TypeError as e:
        print(e)
        print('Got a TypeError, probably got a None that we tried to iterate over')
        return []
    except IndexError as e:
        print(e)
        print('We probably did not find any useful links, return empty list')
        return []
    except AttributeError as e:
        # Reached, for example, when the document has no <body> (soup.body is None).
        print(e)
        print('Likely got None for the links, so we are throwing this')
        return []
    except Exception as e:
        # Deliberate catch-all (network errors, timeouts, parser problems):
        # the crawl is best-effort.
        print(str(e))
        # log this error 
        return []

def main(how_many=50, out_path='urls.txt'):
    """Crawl random start URLs in parallel and save every link found.

    Args:
        how_many: Number of random start URLs to fetch; the same number of
            worker processes is used.
        out_path: File that receives the result. It is overwritten with the
            ``str()`` of the flat list of links.
    """
    parse_us = [random_starting_url() for _ in range(how_many)]

    # map() blocks until every page has been processed. The with-block then
    # tears the pool down, which also happens if map() raises, so worker
    # processes are never left behind.
    with Pool(processes=how_many) as p:
        data = p.map(get_links, parse_us)

    # Flatten the per-page lists into one list of URLs.
    data = [url for url_list in data for url in url_list]

    with open(out_path, 'w') as f:
        f.write(str(data))

# Run the spider: fetch links from the random start URLs and write them to urls.txt.
main()