# Extracting Claims from EPO Patent XML Files

This notebook processes XML files downloaded from the EPO publication server. For each file it:
1. Builds the patent number (country code + document number + kind code)
2. Extracts the English-language claims
3. Saves the output as JSONL files grouped by publication date (YYYYMMDD)

Each line of an output file is one JSON object of the following form (pretty-printed here for readability):
```json
{
  "pn": "EP1234567B1",
  "c": {
    "1": "text of claim 1",
    "2": "text of claim 2"
  }
}
```

*Notebook process*
1. Find all XML files in date directories under `INPUT_DIR`
2. Extract patent numbers and claims
3. Save results as JSONL files in `OUTPUT_DIR` directory named by date (`YYYYMMDD.jsonl`)
4. Skip any date that already has a JSONL output file


In [None]:
import os
import json
import glob
from lxml import etree # pyright: ignore[reportAttributeAccessIssue]
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed

# Input/Output directories (relative paths, resolved against the current
# working directory).
# INPUT_DIR is scanned for one sub-directory per date, each holding *.xml files;
# OUTPUT_DIR receives one "<date>.jsonl" file per processed date.
INPUT_DIR = "../rawdata/ep-b1"
OUTPUT_DIR = "../processed"

# Create output directory if it doesn't exist (no error if it is already there)
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [2]:
def extract_patent_info(xml_file):
    """Extract the patent number and English claims from one EPO XML file.

    The patent number is the concatenation of the root element's ``country``,
    ``doc-number`` and ``kind`` attributes. Claims are read from the first
    ``<claims lang="en">`` element; every ``<claim>`` with a non-empty ``num``
    attribute is stored under that number with leading zeros removed.

    Args:
        xml_file: Path of the XML file to parse.

    Returns:
        A dict ``{"pn": <patent number>, "c": {<claim number>: <claim text>}}``,
        or ``None`` when the file cannot be processed, the patent number is
        empty, or no English claims were found. Processing errors are printed,
        not raised, so one bad file does not abort a whole batch.
    """
    try:
        # Recovery mode lets lxml salvage what it can from malformed XML
        parser = etree.XMLParser(recover=True)
        root = etree.parse(xml_file, parser).getroot()
        if root is None:
            # Nothing could be recovered; report it through the common handler
            raise ValueError("document has no root element")

        # Patent number: country + doc-number + kind (missing parts become '')
        patent_number = ''.join(
            root.get(attr, '') for attr in ('country', 'doc-number', 'kind')
        )

        # Extract claims (only English)
        claims_dict = {}
        english_claims = root.xpath('//claims[@lang="en"]')
        if english_claims:
            for claim in english_claims[0].xpath('.//claim'):
                num = claim.get('num', '').lstrip('0')  # Remove leading zeros
                if not num:
                    continue

                # Only the outermost <claim-text> elements are selected: each
                # one already yields the text of any <claim-text> nested inside
                # it, so selecting the nested ones too would duplicate text.
                paragraphs = []
                for claim_text in claim.xpath(
                    './/claim-text[not(ancestor::claim-text)]'
                ):
                    # './/text()' covers element text and tail text of all
                    # descendants; whitespace-only fragments are dropped.
                    parts = (node.strip() for node in claim_text.xpath('.//text()'))
                    paragraphs.append(' '.join(filter(None, parts)))

                claims_dict[num] = '\n'.join(filter(None, paragraphs)).strip()

        if patent_number and claims_dict:
            return {
                "pn": patent_number,
                "c": claims_dict
            }

    except Exception as e:
        # Deliberately broad: this is the per-file best-effort boundary
        print(f"Error processing {xml_file}: {str(e)}")

    return None

In [3]:
def process_patent_files():
    """Process all XML files and save the results as JSONL grouped by date.

    Every entry directly under ``INPUT_DIR`` is treated as a date directory;
    its base name becomes the output file name (``<date>.jsonl`` in
    ``OUTPUT_DIR``). Dates whose output file already exists are skipped, as
    are directories that contain no ``*.xml`` files.

    The XML files of one date are parsed in parallel worker processes and one
    JSON object is written per successfully extracted patent. Lines are
    written in completion order, so their order within a file is not fixed.

    Results are first written to ``<date>.jsonl.tmp`` and renamed to the final
    name only after the whole date has been processed. An interrupted or
    failed run therefore never leaves a partial ``<date>.jsonl`` behind that a
    later run would mistake for a finished date.

    Raises:
        Any exception raised while writing or by the process pool (e.g. a
        crashed worker) is re-raised after the temporary file is removed.
    """
    # Get all date directories
    date_dirs = sorted(glob.glob(os.path.join(INPUT_DIR, '*')))

    for date_dir in date_dirs:
        date = os.path.basename(date_dir)
        output_file = os.path.join(OUTPUT_DIR, f"{date}.jsonl")

        # Skip if output file already exists
        if os.path.exists(output_file):
            print(f"Skipping {date} - output file already exists")
            continue

        # Get all XML files in this date directory
        xml_files = glob.glob(os.path.join(date_dir, '*.xml'))
        if not xml_files:
            continue

        print(f"Processing {len(xml_files)} files from {date}")

        # Write to a temporary file so the final name only ever appears for a
        # completely processed date.
        temp_file = f"{output_file}.tmp"
        try:
            with open(temp_file, 'w', encoding='utf-8') as f, \
                    ProcessPoolExecutor() as executor:
                futures = [
                    executor.submit(extract_patent_info, xml_file)
                    for xml_file in xml_files
                ]
                for future in tqdm(as_completed(futures), total=len(futures),
                                   desc=f"Processing {date}"):
                    result = future.result()
                    if result:
                        f.write(json.dumps(result, ensure_ascii=False) + '\n')
        except BaseException:
            # Also covers KeyboardInterrupt: discard the partial output so the
            # date is processed again on the next run, then propagate.
            if os.path.exists(temp_file):
                os.remove(temp_file)
            raise

        # Atomically publish the finished file under its final name
        os.replace(temp_file, output_file)