In [97]:
from collections import deque
from urllib.robotparser import RobotFileParser
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed, wait, FIRST_COMPLETED

In [205]:
def is_not_allowed(rp, url):
    return not rp.can_fetch('*', url)

In [206]:
def process_url(base_url, rp, url, depth):
    with lock:
        if url in visited_urls or depth > max_depth or is_not_allowed(rp, url):
            return []

        visited_urls.add(url)

    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        new_texts = []
        paragraphs = soup.find_all('p')
        for p in paragraphs:
            text = p.get_text(strip=True)
            if len(text) > 150:
                new_texts.append(text)

        with lock:
            for text in new_texts:
                all_text.add(text)

        new_links = []
        links = soup.find_all('a')
        for link in links:
            href = link.get('href')
            if href and (
                  href.startswith("/") or href.startswith(url)
            ):
                next_url = urljoin(url, href)
                with lock:
                    if next_url not in visited_urls:
                        new_links.append((next_url, depth + 1))

        return new_links

    except Exception as e:
        print(f"Error on {url}: {e}")
        return []

In [218]:
max_depth = 5
max_count = int(1e5)
max_workers = 16

speed experiments for crawling 10k links
* max_workers = 8: 327.23 seconds
* max_workers = 16: 234.79 seconds

In [219]:
text_local_path = "/content/text_output.json"

In [220]:
import os
if os.path.exists(text_local_path):
    with open(text_local_path, 'r') as f:
        output_dict = json.load(f)
    print("Loaded existing text_output.json")
else:
  output_dict = {}

In [221]:
urls = [
    "https://www.fotech.cl/",
    "https://www.portalnet.cl/",
    "https://www.gamba.cl/",
    "https://www.soychile.cl/",
    "https://www.df.cl/",
    "https://www.ed.cl/",
    "https://www.wwf.cl/",
    "https://www.elmostrador.cl/",
    "https://www.biobiochile.cl/",
    "https://fundacionsol.cl/",
    "https://observatoriodesigualdades.udp.cl/",
]

In [222]:
import threading
import json
import time
lock = threading.Lock()

In [None]:
visited_urls = set()
multi_level_queue = {}
for url in urls:
  multi_level_queue[url] = [(url, 0)]

start = time.time()
for base_url, queue in multi_level_queue.items():
    print("Crawling:", base_url)

    robots_url = urljoin(base_url, '/robots.txt')
    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()

    queue = deque(queue)
    count = 0

    all_text = set()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = set()

        while queue and count < max_count:
            while queue and len(futures) < max_workers:
                url, depth = queue.popleft()
                future = executor.submit(process_url, base_url, rp, url, depth)
                futures.add(future)
                count += 1
                if count % 1000 == 0:
                    print("log: ", count, len(queue))

            done, futures = wait(futures, return_when='FIRST_COMPLETED')
            for future in done:
                result = future.result()
                if result:
                    queue.extend(result)
    output_dict[base_url] = list(all_text)

    with open(text_local_path, 'w') as f:
      json.dump(output_dict, f)

print(f"Time taken: {time.time() - start:.2f} seconds")

Crawling: https://www.fotech.cl/
Crawling: https://www.portalnet.cl/
Crawling: https://www.gamba.cl/
Error on https://www.gamba.cl/: 403 Client Error: Forbidden for url: https://www.gamba.cl/
Crawling: https://www.soychile.cl/
Error on https://www.soychile.cl/: 403 Client Error: Forbidden for url: https://www.soychile.cl/
Crawling: https://www.df.cl/
log:  1000 28261
log:  2000 28199
log:  3000 27870
log:  4000 28952
Error on https://www.df.cl/legales/site/tax/port/all/taxport_24___1.html: 404 Client Error: Not Found for url: https://www.df.cl/legales/site/tax/port/all/taxport_24___1.html
log:  5000 28630
log:  6000 28839
log:  7000 28322
log:  8000 27684
log:  9000 27459
log:  10000 27160
log:  11000 26842
log:  12000 26516
log:  13000 26163
log:  14000 25510
log:  15000 25116
log:  16000 24311
log:  17000 23574
log:  18000 22574
log:  19000 21599
log:  20000 20770
log:  21000 20001
log:  22000 19001
log:  23000 18178
log:  24000 17182
