In [None]:
from collections import deque
from concurrent.futures import ThreadPoolExecutor, as_completed, wait, FIRST_COMPLETED
from urllib.parse import urldefrag, urljoin, urlparse
from urllib.robotparser import RobotFileParser

import requests
from bs4 import BeautifulSoup

In [None]:
# Browser-like HTTP request headers (Chrome on Windows user agent).
# NOTE(review): the visible call to requests.get() does not pass this dict,
# so these headers are currently not sent -- confirm whether that is intended.
headers = {
    # Identify as a desktop Chrome browser.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    # Content negotiation preferences.
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
    # Ask the server to keep the connection open.
    'Connection': 'keep-alive',
}

In [None]:
def is_not_allowed(rp, url):
    """Report whether ``url`` must be skipped by the crawler.

    The robots.txt check is currently disabled: both arguments are ignored
    and every URL is treated as allowed. The disabled check was
    ``not rp.can_fetch('*', url)``.

    Args:
        rp: Robots parser object (unused while the check is disabled).
        url: URL that is about to be fetched (unused while the check is disabled).

    Returns:
        Always ``False``.
    """
    return False

In [None]:
from requests.exceptions import RequestException

In [None]:
def process_url(base_url, rp, url, depth):
    """Fetch one page, record its article text and return the links to crawl next.

    Relies on notebook-level globals: ``lock``, ``visited_urls``, ``all_text``,
    ``max_depth`` and ``is_not_allowed``.

    Args:
        base_url: Root URL of the crawl; only links on the same host are followed.
        rp: Robots parser forwarded to ``is_not_allowed``.
        url: Page to fetch.
        depth: Depth of ``url`` in the crawl (the seed is passed depth 0).

    Returns:
        A list of ``(next_url, depth + 1)`` tuples for same-host http(s) links
        found on the page that were not yet in ``visited_urls``. Returns ``[]``
        when the page is skipped or when fetching/parsing fails.

    Side effects:
        Adds ``url`` to ``visited_urls``; when the page contains an
        ``<h1 class="jeg_post_title">``, stores its title and long paragraphs
        in ``all_text[url]``. Prints a message on HTTP 403 and on unexpected
        errors.
    """
    # Claim the URL under the lock so two workers never fetch the same page.
    with lock:
        if url in visited_urls or depth > max_depth or is_not_allowed(rp, url):
            return []
        visited_urls.add(url)

    try:
        # NOTE(review): the notebook-level `headers` dict is not passed here.
        response = requests.get(url, timeout=5)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        title = soup.find('h1', class_='jeg_post_title')
        if title:  # only record text when the page has an article title
            title_text = title.get_text(strip=True)
            paragraph_texts = (p.get_text(strip=True) for p in soup.find_all('p'))
            # Keep only paragraphs longer than 150 characters.
            new_texts = [text for text in paragraph_texts if len(text) > 150]

            with lock:
                all_text[url] = {
                    "title": title_text,
                    "text": new_texts
                }

        # Resolve every href against the current page, then keep only http(s)
        # URLs on the crawl's own host. This rejects mailto:/tel:/javascript:
        # links and off-site (including protocol-relative) links.
        base_host = urlparse(base_url).netloc.lower()
        candidates = []
        seen_on_page = set()
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                continue
            try:
                # Drop the #fragment so in-page anchors do not look like new pages.
                next_url = urldefrag(urljoin(url, href)).url
                parts = urlparse(next_url)
            except ValueError:
                # Malformed href; skip it without losing the page's other links.
                continue
            if parts.scheme not in ('http', 'https') or parts.netloc.lower() != base_host:
                continue
            if next_url in seen_on_page:
                continue
            seen_on_page.add(next_url)
            candidates.append(next_url)

        # Take the lock once for the whole batch instead of once per link.
        with lock:
            return [
                (next_url, depth + 1)
                for next_url in candidates
                if next_url not in visited_urls
            ]

    except RequestException as e:
        # A requests.Response is falsy for 4xx/5xx statuses, so the response
        # must be compared against None rather than tested for truthiness.
        if e.response is not None and e.response.status_code == 403:
            print(f"403 Forbidden on {url}")
        return []

    except Exception as e:
        print(f"Unhandled error on {url}: {e}")
        return []

In [None]:
# Crawl limits and concurrency settings.
max_depth = 10        # pages deeper than this are skipped by process_url
max_count = 100_000   # upper bound on the number of URLs submitted to the pool
max_workers = 16      # size of the thread pool

Speed experiments for crawling 10k links:
* max_workers = 8: 327.23 seconds
* max_workers = 16: 234.79 seconds

In [None]:
# JSON file that a later cell loads if it exists and that the crawl cell
# overwrites with its results.
text_local_path = "/content/fotech_output.json"

In [None]:
# `json` is imported here because this cell runs before the cell that
# imports it further down; without it a fresh kernel raises NameError.
import json
import os

# Load the output of a previous crawl when the file exists, otherwise start empty.
# NOTE(review): `output_dict` is not read again in the visible cells, and the
# message below names text_output.json rather than `text_local_path`.
if os.path.exists(text_local_path):
    with open(text_local_path, 'r') as f:
        output_dict = json.load(f)
    print("Loaded existing text_output.json")
else:
    output_dict = {}

Loaded existing text_output.json


In [None]:
# Seed URL of the crawl; the crawl cell also derives the robots.txt URL from it.
base_url = "https://www.fotech.cl/"

In [None]:
import threading
import json
import time
# Lock held by process_url whenever it reads or writes visited_urls / all_text.
lock = threading.Lock()

In [None]:
# Shared crawl state; process_url reads and writes these under `lock`.
visited_urls = set()
all_text = {}

# Breadth-first frontier of (url, depth) pairs, seeded with the start page.
queue = deque([(base_url, 0)])

# Fetch robots.txt once; the parser is handed to every process_url call.
robots_url = urljoin(base_url, '/robots.txt')
rp = RobotFileParser()
rp.set_url(robots_url)
rp.read()

count = 0  # number of URLs submitted to the pool so far
MAX_CONSECUTIVE_TIMEOUTS = 10
consecutive_timeouts = 0

executor = ThreadPoolExecutor(max_workers=max_workers)
futures = set()
try:
    # Keep going while work is in flight, not only while the queue is
    # non-empty: a momentarily empty queue must not discard the links and
    # text of pages that are still being fetched.
    while futures or (queue and count < max_count):
        # Top the pool up to max_workers without exceeding the max_count budget.
        while queue and count < max_count and len(futures) < max_workers:
            url, depth = queue.popleft()
            future = executor.submit(process_url, base_url, rp, url, depth)
            futures.add(future)
            count += 1
            if count % 1000 == 0:
                print("log: ", count, len(queue))

        # Block until at least one page finishes, or 10 seconds pass.
        done, futures = wait(futures, timeout=10, return_when=FIRST_COMPLETED)
        if not done:
            consecutive_timeouts += 1
            print(f"Timeout #{consecutive_timeouts}: no futures completed this cycle.")
            if consecutive_timeouts >= MAX_CONSECUTIVE_TIMEOUTS:
                print("Too many consecutive timeouts! Exiting crawl early.")
                break
        else:
            consecutive_timeouts = 0

        # Feed newly discovered links back into the frontier.
        for future in done:
            result = future.result()
            if result:
                queue.extend(result)

finally:
    # Cancel any unfinished futures before shutdown
    for f in futures:
        if not f.done():
            f.cancel()
    executor.shutdown(wait=False)  # Safe to call here

# Workers that could not be cancelled may still be writing to all_text, so
# copy it under the lock before serialising.
with lock:
    all_text_snapshot = dict(all_text)

with open(text_local_path, 'w') as f:
    json.dump(all_text_snapshot, f)
print(len(all_text_snapshot))

log:  1000 32
log:  2000 1285
log:  3000 2151
log:  4000 2351
log:  5000 2822
log:  6000 2990
log:  7000 3311
log:  8000 3231
log:  9000 3139
log:  10000 3064
log:  11000 2668
log:  12000 1978
log:  13000 979
117


In [None]:
# Inspect the container type of the crawl results.
type(all_text)

dict