In [21]:
import requests
import json
from pathlib import Path
from lxml import etree as ET  # Replace the existing xml.etree.ElementTree import
import re
import unicodedata
import html

In [22]:
doc_ids = [32763, 32692, 13525, 32616, 19421]  # extend as needed

In [23]:
def clean_text_for_ai(text: str) -> str:
    # Normalize unicode
    text = unicodedata.normalize("NFKC", text)
    # Unescape HTML entities
    text = html.unescape(text)
    # Remove non-printable/control characters
    text = ''.join(c for c in text if c.isprintable())
    # Collapse whitespace
    text = re.sub(r"\s+", " ", text)
    # Remove leading/trailing whitespace again
    text = text.strip()
    return text

def fetch_xml(doc_id: int, lang: str) -> str:
    """
    Fetches the XML section for the given document ID and language code.
    """
    url = f"https://www.tbs-sct.canada.ca/pol/doc-{lang}.aspx?id={doc_id}&section=xml"
    resp = requests.get(url)
    resp.raise_for_status()
    return resp.content

def extract_nodes_xml(xml_bytes: bytes) -> list[str]:
    """
    Parses XML bytes, finds all <p> and <li> within the main body,
    excluding content in appendices sections.
    Returns their text contents as a list.
    """
    parser = ET.XMLParser(encoding='utf-8')
    root = ET.fromstring(xml_bytes, parser=parser)
    items = []
    
    # Find all appendices sections first
    appendices = root.findall(".//appendices")
    
    def is_in_appendices(elem):
        """Check if element is inside any appendices section"""
        parent = elem.getparent()
        while parent is not None:
            if parent in appendices:
                return True
            parent = parent.getparent()
        return False

    # Find all p and li elements
    for elem in root.findall(".//p") + root.findall(".//li") + root.findall(".//clause"):
        if is_in_appendices(elem):
            continue
            
        # Get all text content, including tail text
        text_parts = []
        if elem.text:
            text_parts.append(elem.text)
        for child in elem:
            if child.tail:
                text_parts.append(child.tail)
        
        text = " ".join(text_parts)
        clean = clean_text_for_ai(text)
        if len(clean) > 2:  # skip empty/very short
            items.append(clean)
            
    return items

def scrape_doc_bilingual(doc_id: int) -> dict[str, list[str]]:
    data = {}
    for lang_code, key in [("eng", "en"), ("fra", "fr")]:
        xml = fetch_xml(doc_id, lang_code)
        items = extract_nodes_xml(xml)
        data[key] = items
    # Align by index, truncate to shortest length
    min_len = min(len(data["en"]), len(data["fr"]))
    pairs = [{"en": data["en"][i], "fr": data["fr"][i]} for i in range(min_len)]
    return pairs
    

In [24]:
print("Starting new scraping run...\n")
all_docs = {}

# Ensure output directory exists
out_path = Path("output/docs")
out_path.mkdir(exist_ok=True)

for did in doc_ids:
    out_file = out_path / f"{did}.json"
    if out_file.exists():
        print(f"Skipping doc_id {did} (output exists)")
        continue
    try:
        all_docs[str(did)] = scrape_doc_bilingual(did)
        print("doc_id: ", did,  " - Pairs: ", len(all_docs[str(did)]))
        with open(out_path / f"{did}.json", "w", encoding="utf-8") as f:
            json.dump(all_docs, f, ensure_ascii=False, indent=2)
    except Exception as e:
        print(f"Error on ID {did}: {e}")

Starting new scraping run...

Skipping doc_id 32763 (output exists)
Skipping doc_id 32692 (output exists)
Skipping doc_id 13525 (output exists)
Skipping doc_id 32616 (output exists)
doc_id:  19421  - Pairs:  166


In [32]:
# Collect all en/fr pairs from each JSON file in output/docs
all_pairs = []
docs_path = Path("output/docs")

def strip_leading_non_capital(text, lang="fr"):
    # For English: A-Z; For French: A-Z plus accented capitals
    if lang == "fr":
        pattern = r"^([^A-ZÀÂÄÇÉÈÊËÎÏÔÖÙÛÜŸ]*)([A-ZÀÂÄÇÉÈÊËÎÏÔÖÙÛÜŸ].*)"
    else:
        pattern = r"^([^A-Z]*)([A-Z].*)"
    m = re.match(pattern, text)
    if m:
        return m.group(2)
    return text


for file in docs_path.glob("*.json"):
    with open(file, "r", encoding="utf-8") as f:
        doc_pairs = json.load(f)
        # If the file contains a dict (old format), flatten its values
        if isinstance(doc_pairs, dict):
            for pairs in doc_pairs.values():
                all_pairs.extend(pairs)
        else:
            all_pairs.extend(doc_pairs)

# Capitalize French if English starts with a capital letter
for pair in all_pairs:
    en = pair.get("en", "")
    fr = pair.get("fr", "")
    # Clean English: remove leading non-capital-letter chars
    if en:
        en_clean = strip_leading_non_capital(en, lang="en")
        pair["en"] = en_clean
    # Clean French: remove leading non-capital-letter chars
    if fr:
        fr_clean = strip_leading_non_capital(fr, lang="fr")
        pair["fr"] = fr_clean
    # Capitalize French if English starts with a capital letter
    if en and en[0].isupper() and fr:
        if fr and not fr[0].isupper():
            pair["fr"] = fr[0].upper() + fr[1:]

# Write the flattened list to a new JSON file
with open(Path("output") / "all_pairs.json", "w", encoding="utf-8") as f:
    json.dump(all_pairs, f, ensure_ascii=False, indent=2)

print(f"Wrote {len(all_pairs)} pairs to {Path('output') / 'all_pairs.json'}")

Wrote 1569 pairs to output\all_pairs.json
