In [15]:
import requests
import json
from pathlib import Path
import xml.etree.ElementTree as ET
import re
import unicodedata
import html

In [16]:
def clean_text_for_ai(text: str) -> str:
    # Normalize unicode
    text = unicodedata.normalize("NFKC", text)
    # Unescape HTML entities
    text = html.unescape(text)
    # Remove non-printable/control characters
    text = ''.join(c for c in text if c.isprintable())
    # Collapse whitespace
    text = re.sub(r"\s+", " ", text)
    # Remove leading/trailing whitespace again
    text = text.strip()
    return text

def fetch_xml(doc_id: int, lang: str) -> str:
    """
    Fetches the XML section for the given document ID and language code.
    """
    url = f"https://www.tbs-sct.canada.ca/pol/doc-{lang}.aspx?id={doc_id}&section=xml"
    resp = requests.get(url)
    resp.raise_for_status()
    return resp.text

def extract_nodes_xml(xml_str: str) -> list[str]:
    """
    Parses XML string, finds all <p> and <li> within the main body,
    and returns their text contents as a list.
    """
    root = ET.fromstring(xml_str)  # parse the XML string :contentReference[oaicite:3]{index=3}
    items = []
    # The main content is typically under the <body> or <div> tags;
    # adjust the XPath as needed for your XML structure.
    for elem in root.findall(".//p") + root.findall(".//li"):
        text = (elem.text or "")
        clean = clean_text_for_ai(text)
        if len(clean) > 2:  # skip empty/very short
            items.append(clean)
    return items

def scrape_doc_bilingual(doc_id: int) -> dict[str, list[str]]:
    data = {}
    for lang_code, key in [("eng", "en"), ("fra", "fr")]:
        xml = fetch_xml(doc_id, lang_code)
        items = extract_nodes_xml(xml)
        data[key] = items
    # Align by index, truncate to shortest length
    min_len = min(len(data["en"]), len(data["fr"]))
    pairs = [{"en": data["en"][i], "fr": data["fr"][i]} for i in range(min_len)]
    return pairs
    

In [17]:
doc_ids = [32763, 32692, 13525]  # extend as needed
all_docs = {}

for did in doc_ids:
    try:
        all_docs[str(did)] = scrape_doc_bilingual(did)
    except Exception as e:
        print(f"Error on ID {did}: {e}")

# Ensure output directory exists
out_path = Path("output")
out_path.mkdir(exist_ok=True)

# Write one combined JSON
with open(out_path / "all_docs.json", "w", encoding="utf-8") as f:
    json.dump(all_docs, f, ensure_ascii=False, indent=2)

print(f"Wrote {len(all_docs)} documents to {out_path/'all_docs.json'}")

Wrote 3 documents to output/all_docs.json


In [18]:
with open(Path("output") / "all_docs.json", "r", encoding="utf-8") as f:
    all_docs = json.load(f)

# Flatten all pairs into a single list
all_pairs = []
for pairs in all_docs.values():
    all_pairs.extend(pairs)

# Write the flattened list to a new JSON file
with open(Path("output") / "all_pairs.json", "w", encoding="utf-8") as f:
    json.dump(all_pairs, f, ensure_ascii=False, indent=2)

print(f"Wrote {len(all_pairs)} pairs to {Path('output') / 'all_pairs.json'}")

Wrote 538 pairs to output/all_pairs.json
