In [1]:
import requests
import json
from pathlib import Path
from lxml import etree as ET  # Replace the existing xml.etree.ElementTree import
import re
import unicodedata
import html
import requests
from bs4 import BeautifulSoup
import time

In [2]:
# Get IDs
# Scrape the policy-instrument A-Z index page and collect every document ID
# that appears in a "doc-eng.aspx?id=<digits>" link.

# URL of the policy instruments page
url = 'https://www.tbs-sct.canada.ca/pol/a-z-eng.aspx'

# Set headers to mimic a browser visit
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
}

# Fetch the page content. An explicit timeout is passed because requests
# waits indefinitely by default, which would hang the notebook if the
# server stops responding.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()  # Raise an error for bad status codes

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# Find all anchor tags with the document links and extract IDs.
# A set is used so that a document linked more than once is only kept once.
doc_ids_set = set()
for tag in soup.find_all('a', href=True):
    match = re.search(r'doc-eng\.aspx\?id=(\d+)', tag['href'])
    if match:
        doc_ids_set.add(int(match.group(1)))

print("Extracted document IDs:")
# Sorted so that later cells process documents in a stable, ascending order.
doc_ids = sorted(doc_ids_set)
print(doc_ids)

Extracted document IDs:
[12084, 12111, 12129, 12139, 12141, 12143, 12160, 12182, 12323, 12453, 12510, 12522, 12553, 12563, 12583, 12588, 12595, 12601, 12602, 12607, 12610, 12614, 13342, 13525, 13583, 13589, 13593, 13602, 13603, 13616, 13663, 13685, 13697, 13832, 13848, 13890, 13937, 13953, 13954, 14208, 14219, 14265, 15772, 15773, 15774, 15796, 16484, 16553, 16557, 16577, 16578, 17065, 17067, 17151, 17280, 17284, 17590, 18309, 18310, 19061, 19420, 19421, 19422, 20008, 20930, 21104, 22370, 22379, 23601, 24227, 24970, 25049, 25583, 25593, 25600, 25748, 25761, 25845, 25857, 25867, 25868, 25875, 26160, 26163, 26164, 26168, 26262, 26295, 26332, 26952, 26953, 26954, 27088, 27146, 27228, 27256, 27807, 28108, 28203, 28305, 28699, 28700, 30656, 30678, 30682, 30683, 31300, 31306, 32495, 32499, 32502, 32503, 32504, 32505, 32509, 32510, 32511, 32512, 32513, 32514, 32515, 32516, 32517, 32518, 32519, 32520, 32521, 32522, 32523, 32524, 32525, 32526, 32527, 32528, 32529, 32530, 32533, 32563, 32573, 32

In [3]:
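# Optional: uncomment the next line to override the scraped list with a few
# document IDs (useful for a quick test run); extend as needed.
#doc_ids = [32763, 32692, 13525, 32616, 19421]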

In [4]:
def clean_text_for_ai(text: str) -> str:
    """
    Normalise a text fragment into a single clean line.

    Steps, in order:
      1. Unescape HTML entities (so that e.g. ``&nbsp;`` becomes a real
         character before normalisation sees it).
      2. Apply Unicode NFKC normalisation.
      3. Replace every run of whitespace (including newlines, tabs and
         no-break spaces) with one ASCII space.
      4. Drop remaining non-printable/control characters.
      5. Collapse any double spaces left behind by step 4 and strip the ends.

    Returns the cleaned string (possibly empty).
    """
    # Unescape first: entities such as &nbsp; expand to characters that the
    # following steps must still get a chance to normalise.
    text = html.unescape(text)
    # Normalize unicode
    text = unicodedata.normalize("NFKC", text)
    # Whitespace must become a plain space BEFORE the printable filter:
    # str.isprintable() is False for "\n", "\t" and "\xa0", so filtering first
    # would delete them and glue neighbouring words together.
    text = re.sub(r"\s+", " ", text)
    # Remove non-printable/control characters
    text = "".join(c for c in text if c.isprintable())
    # Removing a control character that sat between two spaces can leave a
    # double space, so collapse once more and trim.
    text = re.sub(r" {2,}", " ", text).strip()
    return text

def fetch_xml(doc_id: int, lang: str, timeout: float = 30.0) -> bytes:
    """
    Fetches the XML section for the given document ID and language code.

    Args:
        doc_id: Numeric document ID, inserted into the ``id`` query parameter.
        lang: Language code inserted into the page name (``doc-{lang}.aspx``);
            callers in this notebook pass "eng" or "fra".
        timeout: Seconds to wait for the server before giving up. requests
            has no default timeout, so without this a stalled connection
            would block the whole scraping loop.

    Returns:
        The raw response body as bytes (``resp.content``), not decoded text.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    url = f"https://www.tbs-sct.canada.ca/pol/doc-{lang}.aspx?id={doc_id}&section=xml"
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    return resp.content

def extract_nodes_xml(xml_bytes: bytes) -> list[str]:
    """
    Parse XML bytes and return the cleaned text of every <p>, <li> and
    <clause> element that is not nested inside an <appendices> element.

    Results are grouped by tag: all <p> items first, then all <li> items,
    then all <clause> items. Items whose cleaned text is 2 characters or
    shorter are skipped.

    NOTE(review): only an element's own leading text and the tail text that
    follows each of its direct children are collected; text located inside
    child elements is not included.
    """
    root = ET.fromstring(xml_bytes, parser=ET.XMLParser(encoding='utf-8'))

    # Every <appendices> element below the root; anything under one is ignored.
    appendix_sections = root.findall(".//appendices")

    def inside_appendix(node) -> bool:
        """True when any ancestor of `node` is one of the appendices elements."""
        return any(ancestor in appendix_sections for ancestor in node.iterancestors())

    def direct_text(node) -> str:
        """Join the node's own text and its children's tail text with spaces."""
        fragments = [node.text] + [child.tail for child in node]
        return " ".join(fragment for fragment in fragments if fragment)

    collected = []
    for tag_name in ("p", "li", "clause"):
        for node in root.findall(f".//{tag_name}"):
            if inside_appendix(node):
                continue
            cleaned = clean_text_for_ai(direct_text(node))
            if len(cleaned) > 2:  # skip empty/very short
                collected.append(cleaned)

    return collected

def scrape_doc_bilingual(doc_id: int) -> list[dict[str, str]]:
    """
    Fetch the English and French XML of one document and pair their text
    items by position.

    Args:
        doc_id: Numeric document ID passed to ``fetch_xml``.

    Returns:
        A list of ``{"en": ..., "fr": ...}`` dicts, one per index, truncated
        to the length of the shorter language list.

    Pairing is purely positional. When the two languages yield a different
    number of items the pairs may be misaligned, so a warning is printed
    instead of truncating silently.
    """
    segments = {
        key: extract_nodes_xml(fetch_xml(doc_id, lang_code))
        for lang_code, key in (("eng", "en"), ("fra", "fr"))
    }

    n_en, n_fr = len(segments["en"]), len(segments["fr"])
    if n_en != n_fr:
        print(
            f"Warning: doc_id {doc_id} has {n_en} EN vs {n_fr} FR items; "
            "truncating to the shorter list (pairs may be misaligned)"
        )

    # zip() stops at the shorter list, i.e. align by index and truncate.
    return [{"en": en, "fr": fr} for en, fr in zip(segments["en"], segments["fr"])]
    

In [5]:
# Scrape every document in `doc_ids` and write one JSON file per document.
print("Starting new scraping run...\n")
all_docs = {}

# Ensure output directory exists. parents=True is required because the
# parent "output" directory may not exist yet (fresh checkout / clean run);
# without it mkdir raises FileNotFoundError.
out_path = Path("output/docs")
out_path.mkdir(parents=True, exist_ok=True)

for did in doc_ids:
    out_file = out_path / f"{did}.json"
    # Resume support: a document whose output file already exists is skipped,
    # so an interrupted run can be restarted without re-downloading.
    if out_file.exists():
        print(f"Skipping doc_id {did} (output exists)")
        continue
    try:
        all_docs[str(did)] = scrape_doc_bilingual(did)
        print("doc_id: ", did,  " - Pairs: ", len(all_docs[str(did)]))
        with open(out_file, "w", encoding="utf-8") as f:
            json.dump(all_docs[str(did)], f, ensure_ascii=False, indent=2)
    except Exception as e:
        # Deliberately best-effort: report the failure and continue so one
        # bad document does not abort the whole run.
        print(f"Error on ID {did}: {e}")

Starting new scraping run...

doc_id:  12084  - Pairs:  31
doc_id:  12111  - Pairs:  37
doc_id:  12129  - Pairs:  14
doc_id:  12139  - Pairs:  14
doc_id:  12141  - Pairs:  10
doc_id:  12143  - Pairs:  26
doc_id:  12160  - Pairs:  23
doc_id:  12182  - Pairs:  4654
doc_id:  12323  - Pairs:  135
doc_id:  12453  - Pairs:  106
doc_id:  12510  - Pairs:  111
doc_id:  12522  - Pairs:  2
doc_id:  12553  - Pairs:  71
doc_id:  12563  - Pairs:  11
doc_id:  12583  - Pairs:  26
doc_id:  12588  - Pairs:  16
doc_id:  12595  - Pairs:  62
doc_id:  12601  - Pairs:  50
doc_id:  12602  - Pairs:  55
doc_id:  12607  - Pairs:  174
doc_id:  12610  - Pairs:  18
doc_id:  12614  - Pairs:  45
doc_id:  13342  - Pairs:  49
doc_id:  13525  - Pairs:  88
doc_id:  13583  - Pairs:  22
doc_id:  13589  - Pairs:  39
doc_id:  13593  - Pairs:  2
doc_id:  13602  - Pairs:  283
doc_id:  13603  - Pairs:  84
doc_id:  13616  - Pairs:  76
doc_id:  13663  - Pairs:  2
doc_id:  13685  - Pairs:  8
doc_id:  13697  - Pairs:  54
doc_id:  1

In [7]:
# Gather the en/fr pairs stored in the per-document JSON files under output/docs.
docs_path = Path("output/docs")
# all_pairs: flat list of pairs; all_pairs_indexed: same pairs plus a "did" key.
all_pairs, all_pairs_indexed = [], []

def strip_leading_non_capital(text, lang="fr"):
    """
    Drop everything that precedes the first capital letter in `text`.

    Args:
        text: The string to trim.
        lang: "fr" treats A-Z plus accented French capitals (including the
            ligatures Æ and Œ) as capitals; any other value uses A-Z only.

    Returns:
        `text` starting at its first capital letter, or `text` unchanged when
        it contains no capital letter at all.
    """
    # For English: A-Z; For French: A-Z plus accented capitals
    if lang == "fr":
        capital = r"[A-ZÀÂÄÆÇÉÈÊËÎÏÔÖŒÙÛÜŸ]"
    else:
        capital = r"[A-Z]"
    # Search + slice instead of a "(prefix)(capital.*)" match: "." does not
    # match newlines, so the old pattern silently dropped everything after
    # the first line break of a multi-line string.
    first_capital = re.search(capital, text)
    if first_capital is None:
        return text
    return text[first_capital.start():]


# Load every per-document JSON file. Sorted so that the order of the combined
# output does not depend on the file system's listing order.
for file in sorted(docs_path.glob("*.json")):
    # Derive the document ID from the file name ("<did>.json"). The ID must
    # not be read from a `did` variable left over from another cell's loop:
    # that would stamp the same, unrelated ID on every record.
    file_did = int(file.stem) if file.stem.isdigit() else file.stem
    with open(file, "r", encoding="utf-8") as f:
        doc_pairs = json.load(f)

    # If the file contains a dict (old format), flatten its values.
    # Presumably the dict is keyed by document ID (as `all_docs` is) - TODO confirm.
    if isinstance(doc_pairs, dict):
        groups = [
            (int(key) if str(key).isdigit() else key, pairs)
            for key, pairs in doc_pairs.items()
        ]
    else:
        groups = [(file_did, doc_pairs)]

    for pair_did, pairs in groups:
        for pair in pairs:
            all_pairs.append(pair)
            # NOTE(review): this copy is taken before the cleaning pass below,
            # so all_pairs_indexed keeps the uncleaned text - confirm intended.
            indexed = dict(pair)
            indexed["did"] = pair_did
            all_pairs_indexed.append(indexed)

# Clean both sides, then capitalize French if English starts with a capital letter
for pair in all_pairs:
    en = pair.get("en", "")
    fr = pair.get("fr", "")
    # Clean English: remove leading non-capital-letter chars
    if en:
        en = strip_leading_non_capital(en, lang="en")
        pair["en"] = en
    # Clean French: remove leading non-capital-letter chars
    if fr:
        fr = strip_leading_non_capital(fr, lang="fr")
        pair["fr"] = fr
    # Work on the cleaned strings here; using the pre-cleaning values would
    # overwrite the cleaned French text with the uncleaned one.
    if en and fr and en[0].isupper() and not fr[0].isupper():
        pair["fr"] = fr[0].upper() + fr[1:]

# Make sure the target directory exists even when no per-document files were found.
Path("output").mkdir(parents=True, exist_ok=True)

# Write the flattened list to a new JSON file
with open(Path("output") / "all_pairs.json", "w", encoding="utf-8") as f:
    json.dump(all_pairs, f, ensure_ascii=False, indent=2)

# Write the indexed list to a new JSON file
with open(Path("output") / "all_pairs_indexed.json", "w", encoding="utf-8") as f:
    json.dump(all_pairs_indexed, f, ensure_ascii=False, indent=2)

print(f"Wrote {len(all_pairs)} pairs to {Path('output') / 'all_pairs.json'}")

Wrote 22663 pairs to output\all_pairs.json
