# Scraping granted patents from EPO publication server and processing

Similar to `scraping/scraping-epo-pub-server.ipynb`, except that each downloaded document is parsed straight into a JSONL record (publication number plus English claims) instead of being saved as raw XML.

In [1]:
# Inclusive publication-date window, formatted YYYYMMDD (parsed with '%Y%m%d').
START_DATE = "20050812"
END_DATE   = "20210914"
ENDS_WITH  = "B1"                     # href suffix to keep (B1 = granted patent)
BASE_URL   = "https://data.epo.org"      # host prefix prepended to every relative href
# Holds failed-urls.csv and finished.csv.
LOG_DIRECTORY = "../data-test/logs"
OUTPUT_DIR = "../data-test/ep-b1-claims"  # one <date>.jsonl file per publication date
MAX_WORKERS = 6                       # thread-pool size for listing and document fetches

# Retry / requests
# RETRIES, DELAY, BACKOFF and JITTER are passed to the `retry` decorator as
# tries / delay / backoff / jitter; TIMEOUT is the `requests.get` timeout (seconds).
RETRIES = 10
DELAY   = 1
BACKOFF = 1
JITTER  = (1, 3)
TIMEOUT = 5

In [2]:
import os, csv, json, threading
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from retry import retry
from bs4 import BeautifulSoup
from bs4.filter import SoupStrainer
from tqdm import tqdm
from lxml import etree

In [3]:
# Create the log and output directories up front; exist_ok makes re-running this cell safe.
os.makedirs(LOG_DIRECTORY, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

def _append_failed(url):
    """Record a URL that could not be fetched.

    Appends one row ``[ISO timestamp, url]`` to ``failed-urls.csv`` inside
    ``LOG_DIRECTORY``.
    """
    log_path = os.path.join(LOG_DIRECTORY, 'failed-urls.csv')
    with open(log_path, 'a', newline='') as handle:
        writer = csv.writer(handle)
        writer.writerow([datetime.now().isoformat(), url])

def _read_finished_dates():
    """Return the set of dates already listed in ``finished.csv``.

    Only the first column of each non-empty row is used. If the file does not
    exist yet, an empty set is returned.
    """
    finished_path = os.path.join(LOG_DIRECTORY, 'finished.csv')
    if os.path.exists(finished_path):
        seen = set()
        with open(finished_path, 'r') as handle:
            for record in csv.reader(handle):
                if record:
                    seen.add(record[0])
        return seen
    return set()

def _append_finished_dates(dates):
    """Append dates to ``finished.csv``, skipping ones already recorded.

    Each new date is written as a single-column row. Nothing is opened or
    written when every date in ``dates`` is already present in the file.
    """
    finished_path = os.path.join(LOG_DIRECTORY, 'finished.csv')
    already_recorded = _read_finished_dates()
    pending_rows = [[day] for day in dates if day not in already_recorded]
    if pending_rows:
        with open(finished_path, 'a', newline='') as handle:
            csv.writer(handle).writerows(pending_rows)

In [4]:
@retry(exceptions=requests.RequestException, tries=RETRIES, delay=DELAY,
       backoff=BACKOFF, jitter=JITTER)
def _get_response_with_retry(link):
    """GET ``link`` and raise on any request failure so ``@retry`` can retry it.

    Raises:
        requests.RequestException: the last error, once all tries are used up.
    """
    r = requests.get(link, timeout=TIMEOUT)
    r.raise_for_status()
    return r


def get_response(link):
    """Fetch ``link`` with retries and return the response, or None on failure.

    The retry decorator only retries when the wrapped function *raises*, so
    the request itself lives in ``_get_response_with_retry`` and the
    exception is converted to ``None`` here, after the retries have run.
    Catching inside the decorated function would hide every error from the
    decorator and disable retrying entirely.

    Args:
        link: URL to request.

    Returns:
        The ``requests.Response`` for a successful (non-error status)
        request, or ``None`` if every attempt failed.
    """
    try:
        return _get_response_with_retry(link)
    except requests.RequestException:
        # HTTPError, ConnectionError and Timeout are all RequestException subclasses.
        return None

In [5]:
def extract_all_links_from_response(response_content):
    """Return the ``href`` value of every ``<a>`` tag found in the given HTML.

    Only anchor tags are parsed (via ``SoupStrainer('a')``); falsy nodes and
    anchors without a non-empty ``href`` are skipped.
    """
    anchors = BeautifulSoup(response_content, 'html.parser', parse_only=SoupStrainer('a'))
    hrefs = []
    for node in anchors:
        if not node:
            continue
        target = node.get('href')
        if target:
            hrefs.append(target)
    return hrefs

def extract_links_ending_with(page_url, ends_with):
    """Fetch ``page_url`` and return the hrefs on it that end with ``ends_with``.

    Anchor extraction is delegated to ``extract_all_links_from_response`` so
    both listing parsers share one implementation (including its guard
    against falsy nodes and missing hrefs).

    Args:
        page_url: URL of the listing page to fetch.
        ends_with: Suffix an href must have to be kept (e.g. ``"B1"``).

    Returns:
        A list of matching href strings, in page order. An empty list is
        returned when the page could not be fetched.
    """
    resp = get_response(page_url)
    if resp is None:
        return []
    # The original author notes EPO listing pages usually carry links whose
    # last path component is the kind code; only those are wanted here.
    return [
        href
        for href in extract_all_links_from_response(resp.content)
        if href.endswith(ends_with)
    ]

def get_filtered_links(start_date_str, end_date_str):
    """List publication-date page URLs inside a date window.

    Fetches the publication-dates index, reads the date from the
    second-to-last path segment of each href, and keeps the links whose date
    lies in ``[start_date_str, end_date_str]`` (both ``YYYYMMDD``, inclusive)
    and is not already recorded as finished. Hrefs whose segment is not a
    valid date are ignored.

    Returns:
        A list of absolute URLs, or ``None`` if the index could not be fetched.
    """
    already_done = _read_finished_dates()
    index_url = BASE_URL + "/publication-server/rest/v1.2/publication-dates/"
    index_resp = get_response(index_url)
    if index_resp is None:
        return None

    hrefs = extract_all_links_from_response(index_resp.content)
    window_start = datetime.strptime(start_date_str, '%Y%m%d')
    window_end = datetime.strptime(end_date_str, '%Y%m%d')

    selected = []
    for href in hrefs:
        segments = href.split('/')
        if len(segments) < 2:
            # No second-to-last segment to read a date from.
            continue
        stamp = segments[-2]
        try:
            when = datetime.strptime(stamp, '%Y%m%d')
        except ValueError:
            continue
        if stamp in already_done:
            continue
        if window_start <= when <= window_end:
            selected.append(BASE_URL + href)
    return selected

def get_date_from_url(url):
    """Return the second-to-last path segment of ``url``.

    Trailing slashes are ignored. For the date-page URLs built in this
    notebook that segment is the ``YYYYMMDD`` publication date.
    """
    trimmed = url.rstrip('/')
    pieces = trimmed.rsplit('/', 2)
    return pieces[-2]

def extract_all_links(date_links):
    """Collect document URLs for each date page, fetching pages concurrently.

    Every URL in ``date_links`` is scanned for hrefs ending in ``ENDS_WITH``;
    each hit is turned into ``BASE_URL + href + "/document.xml"``.

    Returns:
        ``{date: [document_xml_urls]}``. A page that raised or produced no
        links still gets an entry, with an empty list.
    """
    by_date = {}
    with ThreadPoolExecutor(MAX_WORKERS) as pool:
        pending = {}
        for page_url in date_links:
            job = pool.submit(extract_links_ending_with, page_url, ENDS_WITH)
            pending[job] = page_url

        for job in as_completed(pending):
            page_url = pending[job]
            try:
                hrefs = job.result() or []
            except Exception as exc:
                print(f'{page_url!r} generated an exception: {exc}')
                hrefs = []
            day = get_date_from_url(page_url)
            doc_urls = [BASE_URL + href + "/document.xml" for href in hrefs]
            by_date.setdefault(day, []).extend(doc_urls)
    return by_date

In [6]:
def extract_json_from_xml_bytes(xml_bytes):
    """Parse one patent document XML into a ``{"pn": ..., "c": {...}}`` record.

    The publication number ``pn`` is built from the root element's
    ``country``, ``doc-number`` and ``kind`` attributes. ``c`` maps each
    English claim number (leading zeros stripped) to its text.

    Args:
        xml_bytes: Raw XML document content.

    Returns:
        The record dict, or ``None`` when the input cannot be parsed or the
        root element carries no publication-number attributes. ``c`` is an
        empty dict when no English claims are found.
    """
    parser = etree.XMLParser(recover=True)
    try:
        root = etree.fromstring(xml_bytes, parser=parser)
    except Exception:
        return None
    # In recover mode lxml may hand back None instead of raising when nothing
    # usable could be parsed; treat that the same as a parse failure.
    if root is None:
        return None

    country = root.get('country', '') or ''
    number  = root.get('doc-number', '') or ''
    kind    = root.get('kind', '') or ''
    pn = f"{country}{number}{kind}".strip()
    if not pn:
        return None

    # Claims (EN only)
    claims_dict = {}
    for claim in root.xpath('//claims[@lang="en"]//claim'):
        num = (claim.get('num') or '').lstrip('0')
        if not num:
            continue
        texts = []
        # Take only the outermost <claim-text> elements: './/text()' below
        # already descends into nested <claim-text> children, so visiting the
        # nested elements as well would emit their text a second time.
        for ctext in claim.xpath('.//claim-text[not(ancestor::claim-text)]'):
            texts.append(" ".join(s.strip() for s in ctext.xpath('.//text()') if s and s.strip()))
        claim_text = "\n".join(t for t in texts if t)
        if claim_text:
            claims_dict[num] = claim_text.strip()

    return {"pn": pn, "c": claims_dict}

In [7]:
_file_locks = {}
_global_lock = threading.Lock()

def _get_file_lock(date):
    with _global_lock:
        if date not in _file_locks:
            _file_locks[date] = threading.Lock()
        return _file_locks[date]

def process_link_to_jsonl(url, date):
    """Download one document, parse it, and append it to ``<date>.jsonl``.

    A failed download is logged through ``_append_failed``. A document that
    parses to ``None`` is skipped without logging. Writes to the same date
    file are serialised with that date's lock.

    Returns:
        1 if a record was written, otherwise 0.
    """
    resp = get_response(url)
    if resp is not None:
        record = extract_json_from_xml_bytes(resp.content)
        if record is None:
            # Unparseable document or no publication number: nothing to write.
            return 0
        target = os.path.join(OUTPUT_DIR, f"{date}.jsonl")
        with _get_file_lock(date):
            with open(target, "a", encoding="utf-8") as sink:
                sink.write(json.dumps(record, ensure_ascii=False) + "\n")
        return 1

    _append_failed(url)
    return 0

def process_all_to_jsonl(extracted_links_dict):
    """Fetch and parse every document URL concurrently, writing JSONL records.

    Args:
        extracted_links_dict: ``{date: [document_url, ...]}``.

    Returns:
        The number of JSONL records written.

    A worker that raises is no longer ignored: the error is printed and the
    URL is appended to the failed-URL log so it can be retried later. If the
    loop is interrupted (e.g. KeyboardInterrupt), all not-yet-started futures
    are cancelled so leaving the executor only waits for in-flight requests
    rather than the whole remaining queue. The progress bar is closed on
    every exit path.
    """
    total = sum(len(v) for v in extracted_links_dict.values())
    processed = 0  # number of JSONL records written

    # NOTE(review): the "(CPC-only)" label is kept as-is, but no CPC filtering
    # is visible in this notebook's parsing code; consider renaming.
    with tqdm(total=total, desc="Fetching + parsing (CPC-only)") as pbar, \
            ThreadPoolExecutor(MAX_WORKERS) as ex:
        fut2url = {
            ex.submit(process_link_to_jsonl, link, date): link
            for date, links in extracted_links_dict.items()
            for link in links
        }
        try:
            for fut in as_completed(fut2url):
                try:
                    processed += fut.result() or 0
                except Exception as exc:
                    url = fut2url[fut]
                    tqdm.write(f'{url!r} generated an exception: {exc}')
                    _append_failed(url)
                pbar.update()
        except BaseException:
            # Drop queued work before the executor's __exit__ blocks on it.
            for pending in fut2url:
                pending.cancel()
            raise

    return processed

In [8]:
def run_cpc_batch(start_date=START_DATE, end_date=END_DATE):
    """Run the whole pipeline for one date window.

    Lists the unfinished publication-date pages between ``start_date`` and
    ``end_date`` (``YYYYMMDD``, inclusive), collects their document URLs,
    writes the parsed records to JSONL, and records the processed dates in
    ``finished.csv``.

    Dates for which no document links were found are NOT marked finished.
    An empty list is also what a failed listing-page fetch produces, and
    marking such a date finished would exclude it from every later run.
    """
    pages = get_filtered_links(start_date, end_date)
    if not pages:
        print("No date pages to fetch.")
        return
    links = extract_all_links(pages)
    print(f"Found {sum(len(v) for v in links.values())} document.xml links across {len(links)} dates.")
    process_all_to_jsonl(links)

    completed = [date for date, urls in links.items() if urls]
    skipped = len(links) - len(completed)
    if skipped:
        print(f"{skipped} date(s) returned no document links and were not marked finished.")
    _append_finished_dates(completed)
    print("Done.")

In [None]:
# Full run over the default START_DATE..END_DATE window. The saved output below
# comes from a run that was interrupted by hand (KeyboardInterrupt).
run_cpc_batch()

Found 1318364 document.xml links across 839 dates.


Fetching + parsing (CPC-only):   0%|          | 0/1318364 [00:00<?, ?it/s]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f4d1b952f90>>
Traceback (most recent call last):
  File "/home/mjh/patent-llms/.venv/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 781, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 
Fetching + parsing (CPC-only):   0%|          | 3663/1318364 [03:24<15:47:39, 23.12it/s]