### Multi-threaded crawling


In [None]:
import _thread
import time


def print_time(threadName: str, delay: int, iterations: int) -> None:
    """Print the elapsed whole seconds and ``threadName`` after every delay.

    Sleeps ``delay`` seconds ``iterations`` times in a row. After each sleep
    one line of the form ``"<elapsed> <threadName>"`` is printed, where
    ``<elapsed>`` is the number of whole seconds since the call started.
    """
    began_at = int(time.time())
    for _ in range(iterations):
        time.sleep(delay)
        elapsed = int(time.time()) - began_at
        print(f"{elapsed} {threadName}")


# Run the three timers concurrently, each on its own low-level thread.
try:
    _thread.start_new_thread(print_time, ('Fizz', 3, 33))
    _thread.start_new_thread(print_time, ('Buzz', 5, 20))
    _thread.start_new_thread(print_time, ('Counter', 1, 100))
except _thread.error:
    # Only a failure to start a thread is expected here; anything else
    # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
    print('Error: unable to start thread')

# Keep the main thread alive so the workers can keep running. Sleeping
# rather than spinning on ``pass`` avoids pinning a CPU core and competing
# with the worker threads for the interpreter.
while True:
    time.sleep(1)

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup, ResultSet
from typing import Any
import re
import random

import _thread
import time


def get_links(thread_name: str, bs: BeautifulSoup) -> ResultSet[Any]:
    """Return the article anchors inside the page's ``bodyContent`` div.

    Announces which thread is running, then collects every ``<a>`` element
    whose ``href`` starts with ``/wiki/`` and contains no colon.
    """
    print('Getting links in {}'.format(thread_name))

    body = bs.find('div', {'id': 'bodyContent'})
    article_href = re.compile('^(/wiki/)((?!:).)*$')
    return body.find_all('a', href=article_href)


# Define a function for the thread


def scrape_article(thread_name: str, path: str) -> None:
    """Random-walk Wikipedia starting at ``path``, printing each article.

    For every page: fetch it, pause five seconds, print its ``<h1>`` title
    together with ``thread_name``, then jump to a randomly chosen article
    link found by ``get_links``. Returns only when a page has no such links.

    The walk is an explicit loop rather than self-recursion, so a long crawl
    cannot exhaust the interpreter's recursion limit.

    Args:
        thread_name: Label included in the progress output.
        path: Site-relative article path, e.g. ``/wiki/Kevin_Bacon``.
    """
    while True:
        # Read the body up front so the HTTP response is closed promptly
        # instead of staying open for the lifetime of the crawl.
        with urlopen('http://en.wikipedia.org{}'.format(path)) as response:
            html = response.read()
        time.sleep(5)  # fixed pause after every request
        bs = BeautifulSoup(html, 'html.parser')
        title = bs.find('h1').get_text()
        print('Scraping {} in thread {}'.format(title, thread_name))
        links = get_links(thread_name, bs)

        if not links:
            return
        path = random.choice(links).attrs['href']
        print(path)


# Create two threads as defined below
# Start one crawler thread per seed article.
try:
    _thread.start_new_thread(scrape_article, (
        'Thread 1',
        '/wiki/Kevin_Bacon',
    ))
    _thread.start_new_thread(scrape_article, (
        'Thread 2',
        '/wiki/Monty_Python',
    ))
except _thread.error:
    # Only a failure to start a thread is expected here; anything else
    # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
    print('Error: unable to start thread')

# Keep the main thread alive so the crawlers can keep running. Sleeping
# rather than spinning on ``pass`` avoids pinning a CPU core and competing
# with the crawler threads for the interpreter.
while True:
    time.sleep(1)

### Queues and Thread


In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
import _thread
from queue import Queue
import time
import pymysql


def storage(queue: Queue) -> None:
    """Consume scraped articles from ``queue`` and store new ones in MySQL.

    Each queue item is a dict with ``"title"`` and ``"path"`` keys. An
    article is inserted into the ``pages`` table of the ``wiki_threads``
    database only if no row with the same path exists yet. The function
    loops forever and is meant to run on its own thread.

    Args:
        queue: Queue the scraper threads put article dicts on.
    """
    conn = pymysql.connect(
        host='127.0.0.1',
        #    unix_socket='/tmp/mysql.sock',
        user='root',
        passwd='',
        db='mysql',
        charset='utf8')
    try:
        cur = conn.cursor()
        cur.execute('USE wiki_threads')

        while True:
            # Blocking get(): wait for the next article instead of polling.
            # The duplicate check below must run exactly once per dequeued
            # article; running it on an empty queue would touch an unbound
            # ``article`` or re-process the previous one.
            article = queue.get()
            cur.execute('SELECT * FROM pages WHERE path = %s',
                        (article["path"],))
            if cur.rowcount == 0:
                print("Storing article {}".format(article["title"]))
                cur.execute('INSERT INTO pages (title, path) VALUES (%s, %s)',
                            (article["title"], article["path"]))
                conn.commit()
            else:
                print("Article already exists: {}".format(article['title']))
    finally:
        # Release the connection if the loop is ever left via an exception.
        conn.close()


# Article paths already requested; scrape_article appends to it and
# getLinks reads it when filtering candidate links.
visited = []


def getLinks(thread_name: str, bs: BeautifulSoup) -> list:
    """Return the not-yet-visited article anchors of a parsed page.

    Looks inside the ``bodyContent`` div for ``<a>`` elements whose ``href``
    starts with ``/wiki/`` and contains no colon, and drops those whose
    ``href`` is already recorded in the module-level ``visited`` list.

    Args:
        thread_name: Label included in the progress output.
        bs: Parsed article page.

    Returns:
        A list of anchor elements (not strings); callers read the target
        path from each element's ``attrs['href']``.
    """
    print('Getting links in {}'.format(thread_name))
    links = bs.find('div', {
        'id': 'bodyContent'
    }).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))

    # ``visited`` holds path strings, so the anchor's href must be compared;
    # comparing the element itself never matches and filters nothing.
    return [link for link in links if link.attrs['href'] not in visited]


def scrape_article(thread_name: str, path: str, queue: Queue) -> None:
    """Random-walk Wikipedia from ``path``, queueing each article for storage.

    For every page: record the path in ``visited``, fetch it, pause five
    seconds, put ``{"title": ..., "path": ...}`` on ``queue``, then jump to a
    randomly chosen link returned by ``getLinks``. Returns only when a page
    yields no links.

    The walk is an explicit loop rather than self-recursion, so a long crawl
    cannot exhaust the interpreter's recursion limit.

    Args:
        thread_name: Label included in the progress output.
        path: Site-relative article path, e.g. ``/wiki/Kevin_Bacon``.
        queue: Queue that receives one dict per scraped article.
    """
    while True:
        visited.append(path)
        # Read the body up front so the HTTP response is closed promptly
        # instead of staying open for the lifetime of the crawl.
        with urlopen('http://en.wikipedia.org{}'.format(path)) as response:
            html = response.read()
        time.sleep(5)  # fixed pause after every request
        bs = BeautifulSoup(html, 'html.parser')
        title = bs.find('h1').get_text()
        print('Added {} for storage in thread {}'.format(title, thread_name))
        queue.put({"title": title, "path": path})
        links = getLinks(thread_name, bs)
        if not links:
            return
        path = random.choice(links).attrs['href']


# One queue carries scraped articles from the two crawlers to the storage
# thread.
queue = Queue()
try:
    _thread.start_new_thread(scrape_article, (
        'Thread 1',
        '/wiki/Kevin_Bacon',
        queue,
    ))
    _thread.start_new_thread(scrape_article, (
        'Thread 2',
        '/wiki/Monty_Python',
        queue,
    ))
    _thread.start_new_thread(storage, (queue, ))
except _thread.error:
    # Only a failure to start a thread is expected here; anything else
    # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
    print('Error: unable to start threads')

# Keep the main thread alive so the workers can keep running. Sleeping
# rather than spinning on ``pass`` avoids pinning a CPU core and competing
# with the worker threads for the interpreter.
while True:
    time.sleep(1)

### Threads


In [None]:
import threading
import time


def print_time(threadName: str, delay: int, iterations: int) -> None:
    """Report elapsed whole seconds next to ``threadName`` after each pause.

    Performs ``iterations`` rounds; each round sleeps ``delay`` seconds and
    then prints ``"<elapsed> <threadName>"``, with ``<elapsed>`` measured in
    whole seconds from the moment the function was entered.
    """
    t0 = int(time.time())
    for _ in range(iterations):
        time.sleep(delay)
        elapsed = int(time.time()) - t0
        print(f'{elapsed} {threadName}')


# Launch one thread per (name, delay, iterations) schedule, in this order.
for timer_args in (('Fizz', 3, 33), ('Buzz', 5, 20), ('Counter', 1, 100)):
    worker = threading.Thread(target=print_time, args=timer_args)
    worker.start()

In [None]:
import threading


def crawler(url: str) -> None:
    """Create a thread-local holder with an empty ``visited`` list.

    ``url`` is accepted but not used by the body.
    """
    thread_state = threading.local()
    thread_state.visited = list()


# Keep a handle on the thread so the loop below can watch it. ``args`` must
# be a tuple: a bare string would be unpacked character by character into
# separate positional arguments.
t = threading.Thread(target=crawler, args=('http://brookings.edu',))
t.start()

# Restart the crawler whenever its thread has finished.
while True:
    time.sleep(1)
    # Thread.isAlive() was removed in Python 3.9; is_alive() is the
    # supported spelling.
    if not t.is_alive():
        t = threading.Thread(target=crawler, args=('http://brookings.edu',))
        t.start()

In [None]:
class Crawler(threading.Thread):
    """Thread that marks itself done after five seconds and then raises."""

    def __init__(self):
        # Let Thread set itself up before adding our own state.
        super().__init__()
        self.done = False

    def isDone(self) -> bool:
        """Return the current value of the ``done`` flag."""
        return self.done

    def run(self) -> None:
        """Sleep five seconds, set ``done``, then raise an exception."""
        time.sleep(5)
        self.done = True
        raise Exception('Something bad happened!')


t = Crawler()
t.start()

# Poll once a second: stop when the crawler reports completion, and start a
# fresh one if its thread ended without setting the done flag.
while True:
    time.sleep(1)
    if t.isDone():
        print('Done')
        break
    # Thread.isAlive() was removed in Python 3.9; is_alive() is the
    # supported spelling.
    if not t.is_alive():
        t = Crawler()
        t.start()

### Multiprocessing


In [2]:
from multiprocessing import Process
import time


def print_time(threadName: str, delay: int, iterations: int) -> None:
    """Print ``threadName`` after each of ``iterations`` pauses.

    Every round sleeps ``delay`` seconds and then prints ``threadName``. If
    ``threadName`` is falsy (e.g. an empty string) the number of whole
    seconds elapsed since the call started is printed instead.
    """
    began_at = int(time.time())
    for _ in range(iterations):
        time.sleep(delay)
        elapsed = str(int(time.time()) - began_at)
        # A falsy name falls back to the elapsed-seconds string.
        print(threadName or elapsed)


# One process per timer schedule.
processes = [
    Process(target=print_time, args=('Counter', 1, 100)),
    Process(target=print_time, args=('Fizz', 3, 33)),
    Process(target=print_time, args=('Buzz', 5, 20)),
]

# Start them all first so they run side by side ...
for p in processes:
    p.start()

# ... then wait for every one of them to finish.
for p in processes:
    p.join()

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random

from multiprocessing import Process
import os
import time

# Article paths already requested; scrape_article appends to it and
# get_links reads it when filtering candidate links.
# NOTE: processes do not share memory, so each Process works on its own
# copy of this list.
visited = []


def get_links(bs: BeautifulSoup) -> list:
    """Return the not-yet-visited article anchors of a parsed page.

    Looks inside the ``bodyContent`` div for ``<a>`` elements whose ``href``
    starts with ``/wiki/`` and contains no colon, and drops those whose
    ``href`` is already recorded in the module-level ``visited`` list.

    Args:
        bs: Parsed article page.

    Returns:
        A list of anchor elements (not strings); callers read the target
        path from each element's ``attrs['href']``.
    """
    print('Getting links in {}'.format(os.getpid()))
    links = bs.find('div', {
        'id': 'bodyContent'
    }).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))

    # ``visited`` holds path strings, so the anchor's href must be compared;
    # comparing the element itself never matches and filters nothing.
    return [link for link in links if link.attrs['href'] not in visited]


def scrape_article(path: str) -> None:
    """Random-walk Wikipedia starting at ``path``, printing each article.

    For every page: record the path in ``visited``, fetch it, pause five
    seconds, print its ``<h1>`` title with the current process id, then jump
    to a randomly chosen link returned by ``get_links``. Returns only when a
    page yields no links.

    The walk is an explicit loop rather than self-recursion, so a long crawl
    cannot exhaust the interpreter's recursion limit.

    Args:
        path: Site-relative article path, e.g. ``/wiki/Kevin_Bacon``.
    """
    while True:
        visited.append(path)
        # Read the body up front so the HTTP response is closed promptly
        # instead of staying open for the lifetime of the crawl.
        with urlopen('http://en.wikipedia.org{}'.format(path)) as response:
            html = response.read()
        time.sleep(5)  # fixed pause after every request

        bs = BeautifulSoup(html, 'html.parser')
        title = bs.find('h1').get_text()
        print('Scraping {} in process {}'.format(title, os.getpid()))
        links = get_links(bs)

        if not links:
            return
        path = random.choice(links).attrs['href']
        print(path)


# One crawler process per seed article.
processes = [
    Process(target=scrape_article, args=('/wiki/Kevin_Bacon', )),
    Process(target=scrape_article, args=('/wiki/Monty_Python', )),
]

for p in processes:
    p.start()

# page 284

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
from multiprocessing import Process, Queue
import os
import time


def task_delegator(taskQueue: Queue, urlsQueue: Queue) -> None:
    """Feed scraper processes with article paths that were not queued before.

    Seeds ``taskQueue`` with the two starting articles, then loops forever:
    each item taken from ``urlsQueue`` is an iterable of link paths, and every
    path not seen before is put on ``taskQueue`` exactly once.

    Args:
        taskQueue: Queue of paths waiting to be scraped.
        urlsQueue: Queue on which scrapers put the links found on a page.
    """
    # Start with one task for each scraper process.
    seeds = ['/wiki/Kevin_Bacon', '/wiki/Monty_Python']
    visited = set(seeds)
    for seed in seeds:
        taskQueue.put(seed)

    while True:
        # Blocking get(): wait for the next batch of links instead of
        # spinning on empty().
        for link in urlsQueue.get():
            if link in visited:
                continue
            # Remember the link so it is never handed out a second time,
            # whether it repeats within this batch or in a later one.
            visited.add(link)
            taskQueue.put(link)


def get_links(bs: BeautifulSoup) -> list[str]:
    """Return the ``href`` of every article link in the ``bodyContent`` div.

    An article link is an ``<a>`` whose ``href`` starts with ``/wiki/`` and
    contains no colon.
    """
    body = bs.find('div', {'id': 'bodyContent'})
    anchors = body.find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))

    hrefs = []
    for anchor in anchors:
        hrefs.append(anchor.attrs['href'])
    return hrefs


def scrape_article(taskQueue: Queue, urlsQueue: Queue) -> None:
    """Scrape articles taken from ``taskQueue`` and report their links.

    Loops forever: take the next path from ``taskQueue``, fetch the page,
    pause five seconds, print its ``<h1>`` title with the current process id,
    and put the list of links found by ``get_links`` on ``urlsQueue``.

    Args:
        taskQueue: Queue of site-relative article paths to scrape.
        urlsQueue: Queue that receives one list of link paths per page.
    """
    while True:
        # Blocking get(): wait for a task without polling empty(). The
        # scraping steps have to live inside this loop; placed after it they
        # are never reached.
        path = taskQueue.get()
        # Read the body up front so the HTTP response is closed promptly.
        with urlopen('http://en.wikipedia.org{}'.format(path)) as response:
            html = response.read()
        time.sleep(5)  # fixed pause after every request
        bs = BeautifulSoup(html, 'html.parser')
        title = bs.find('h1').get_text()
        print('Scraping {} in process {}'.format(title, os.getpid()))
        links = get_links(bs)
        # Send the links to the delegator for processing.
        urlsQueue.put(links)


# Queues shared by the delegator and the scraper processes.
taskQueue = Queue()
urlsQueue = Queue()

# One delegator followed by two scrapers, all wired to the same two queues.
processes = [
    Process(target=task_delegator, args=(taskQueue, urlsQueue)),
    Process(target=scrape_article, args=(taskQueue, urlsQueue)),
    Process(target=scrape_article, args=(taskQueue, urlsQueue)),
]

for p in processes:
    p.start()

# page 286