In [1]:
# Load dotenv
from dotenv import load_dotenv
import os

# Run python-dotenv's loader before reading any configuration from the environment.
load_dotenv()

# Settings without a default: each is None when the variable is not set.
# Later cells use both unconditionally (URL building / opening the URI list).
LEGISLATION_URL_PREFIX = os.environ.get('LEGISLATION_URL_PREFIX')
LEGISLATION_URI_LIST_FILE = os.environ.get('LEGISLATION_URI_LIST_FILE')

# Settings with defaults: output directory for JSON files and the maximum
# citation depth handed to the crawler (converted to int).
JSON_OUTPUT_DIR = os.environ.get('JSON_OUTPUT_DIR', 'json_out')
DEPTH_LIMIT = int(os.environ.get('DEPTH_LIMIT', 2))

In [2]:

import os
import requests
from bs4 import BeautifulSoup
import json
from collections import deque

class LegislationCrawler:
    def __init__(self, max_depth=2):
        self.max_depth = max_depth
        self.visited_urls = set()
        # Queue stores tuples of: (url, current_depth)
        self.queue = deque() 
        
        # Ensure directories exist
        os.makedirs('.cache', exist_ok=True)
        os.makedirs(JSON_OUTPUT_DIR, exist_ok=True)

    def get_safe_filename(self, url):
        clean_url = url.split('://')[-1].replace(LEGISLATION_URL_PREFIX + '/', '')
        return clean_url.replace('/', '_')

    def normalize_url(self, uri):
        """
        Converts a citation URI (e.g., /id/ukpga/2025/8) into the actual 
        data XML endpoint (e.g., /ukpga/2025/8/data.xml).
        """
        if not uri:
            return None
        # Remove the '/id/' namespace often used in citation URIs
        clean_uri = uri.replace('/id/', '/')
        clean_uri = clean_uri.rstrip('/')
        if not clean_uri.endswith('data.xml'):
            return f"{clean_uri}/data.xml"
        return clean_uri

    def extract_identifier(self, soup):
        """
        Extracts the main identifier information from the XML soup.
        """
        identifier = {}
        title = soup.find('dc:title')
        identifier['title'] = title.text.strip() if title else None
        description = soup.find('dc:description')
        identifier['description'] = description.text.strip() if description else None
        publisher = soup.find('dc:publisher')
        identifier['publisher'] = publisher.text.strip() if publisher else None
        modified = soup.find('dc:modified')
        identifier['modified'] = modified.text.strip() if modified else None
        identifier_tag = soup.find('dc:identifier')
        identifier['uri'] = identifier_tag.text.strip() if identifier_tag else None
        valid = soup.find('dct:valid')
        identifier['valid_date'] = valid.text.strip() if valid else None
        return identifier
        
    def extract_primary_metadata(self, soup):
        """
        Extracts the primary metadata fields and unapplied effects from the XML soup.
        """
        metadata = {}
        primary_metadata = soup.find('ukm:PrimaryMetadata')
        if not primary_metadata: return metadata
            
        year = primary_metadata.find('ukm:Year')
        metadata['year'] = year.get('Value') if year else None
        number = primary_metadata.find('ukm:Number')
        metadata['number'] = number.get('Value') if number else None
        enactment = primary_metadata.find('ukm:EnactmentDate')
        metadata['enactment_date'] = enactment.get('Date') if enactment else None
        isbn = primary_metadata.find('ukm:ISBN')
        metadata['isbn'] = isbn.get('Value') if isbn else None
        
        unapplied_effects_list = []
        for effect in primary_metadata.find_all('ukm:UnappliedEffect'):
            effect_data = {
                "effect_id": effect.get('EffectId'),
                "type": effect.get('Type'), 
                "affected_provisions": effect.get('AffectedProvisions'),
                "affecting_provisions": effect.get('AffectingProvisions'),
                "requires_applied": effect.get('RequiresApplied') == 'true',
                "notes": effect.get('Notes'),
                "modified_date": effect.get('Modified'),
                "affecting_title": None,
                "in_force_date": None,
                "in_force_qualification": None
            }
            affecting_title = effect.find('ukm:AffectingTitle')
            if affecting_title: effect_data['affecting_title'] = affecting_title.text.strip()
            in_force = effect.find('ukm:InForce')
            if in_force:
                if in_force.get('Date'): effect_data['in_force_date'] = in_force.get('Date')
                elif in_force.get('Prospective') == 'true': effect_data['in_force_date'] = 'Prospective'
                effect_data['in_force_qualification'] = in_force.get('Qualification') or in_force.get('OtherQualification')
            unapplied_effects_list.append(effect_data)
            
        metadata['unapplied_effects'] = unapplied_effects_list
        return metadata

    def fetch_and_parse(self, target_url, current_depth):
        """Core parsing logic adapted for the class structure."""
        xml_url = self.normalize_url(target_url)
        if not xml_url: return None
        
        # Prevent duplicate processing (especially important for base URLs vs data.xml variations)
        base_identifying_url = xml_url.replace('/data.xml', '')
        if base_identifying_url in self.visited_urls:
            return None
            
        self.visited_urls.add(base_identifying_url)
        
        safe_name = self.get_safe_filename(xml_url)
        cache_filepath = os.path.join('.cache', safe_name)
        
        # Cache Check
        if os.path.exists(cache_filepath):
            print(f"[{current_depth}] Loading from cache: {cache_filepath}")
            with open(cache_filepath, 'rb') as f:
                xml_content = f.read()
        else:
            print(f"[{current_depth}] Fetching network data: {xml_url}")
            try:
                response = requests.get(xml_url)
                response.raise_for_status()
                xml_content = response.content
                # Before writing to cache, let's beautify the XML for consistent formatting
                temp_soup = BeautifulSoup(xml_content, 'xml')
                xml_content = temp_soup.prettify(encoding='utf-8')
                with open(cache_filepath, 'wb') as f:
                    f.write(xml_content)
            except requests.exceptions.RequestException as e:
                print(f"Failed to fetch {xml_url}: {e}")
                return None
            
        soup = BeautifulSoup(xml_content, 'xml')
        identifier = self.extract_identifier(soup)
        metadata = self.extract_primary_metadata(soup)
        
        # Dynamically create the filepath based on the extracted year
        doc_year = metadata.get('year') or "unknown_year"
        year_dir = os.path.join(JSON_OUTPUT_DIR, str(doc_year))
        os.makedirs(year_dir, exist_ok=True)
        json_filepath = os.path.join(year_dir, safe_name.replace('.xml', '.json'))
        
        commentaries_map = {}
        found_citations = set() # Track new citations to queue
        
        for comm in soup.find_all('Commentary'):
            comm_id = comm.get('id')
            if not comm_id: continue
                
            full_text = comm.get_text(separator=" ", strip=True)
            citations = []
            
            for cit in comm.find_all('Citation'):
                cit_uri = cit.get('URI')
                citations.append({
                    "id": cit.get('id'),
                    "uri": cit_uri,
                    "title": cit.get('Title'),
                    "class": cit.get('Class'),
                    "year": cit.get('Year'),
                    "number": cit.get('Number'),
                    "text": cit.text.strip()
                })
                # If we haven't hit max depth, prepare to queue this citation
                if cit_uri and current_depth < self.max_depth:
                    found_citations.add(cit_uri)
                
            citation_subrefs = []
            for subref in comm.find_all('CitationSubRef'):
                citation_subrefs.append({
                    "id": subref.get('id'),
                    "uri": subref.get('URI'),
                    "citation_ref": subref.get('CitationRef'), 
                    "section_ref": subref.get('SectionRef'),
                    "text": subref.text.strip()
                })
                
            commentaries_map[comm_id] = {
                "type": comm.get('Type'),
                "text": full_text,
                "citations": citations,
                "citation_subrefs": citation_subrefs
            }

        # Document Tree Building (main body)
        document_tree = {}
        
        # Filter: Only grab P1s that are NOT inside the <Schedules> block
        body_p1s = [p1 for p1 in soup.find_all('P1') if not p1.find_parent('Schedules')]
        
        for section in body_p1s:
            part = section.find_parent('Part')
            chapter = section.find_parent('Chapter')
            p1group = section.find_parent('P1group') 
            
            part_num = part.find('Number').text.strip() if part and part.find('Number') else "No Part"
            part_title = part.find('Title').text.strip() if part and part.find('Title') else None
            chapter_uri = chapter.get('DocumentURI') if chapter else None
            chapter_num = chapter.find('Number').text.strip() if chapter and chapter.find('Number') else "No Chapter"
            chapter_title = chapter.find('Title').text.strip() if chapter and chapter.find('Title') else None
            
            chap_dict_key = chapter_uri or chapter_num
            
            if part_num not in document_tree:
                document_tree[part_num] = {"title": part_title, "chapters": {}}
                
            if chap_dict_key not in document_tree[part_num]["chapters"]:
                document_tree[part_num]["chapters"][chap_dict_key] = {
                    "chapter_uri": chapter_uri,
                    "chapter_number": chapter_num if chapter_num != "No Chapter" else None,
                    "title": chapter_title, 
                    "sections": []
                }
                
            section_num = section.find('Pnumber').text.strip() if section.find('Pnumber') else None
            section_title = p1group.find('Title').text.strip() if p1group and p1group.find('Title') else None
            
            section_commentaries = []
            for cref in section.find_all('CommentaryRef'):
                if cref.find_parent('P2') is None:
                    ref_id = cref.get('Ref')
                    if ref_id in commentaries_map:
                        section_commentaries.append({"ref_id": ref_id, **commentaries_map[ref_id]})

            section_data = {
                "section_number": section_num,
                "title": section_title,
                "uri": section.get('DocumentURI') or section.get('id'),
                "commentaries": section_commentaries,
                "paragraphs": []
            }
            
            paragraphs = section.find_all('P2')
            if not paragraphs:
                text_nodes = section.find_all('Text')
                section_data["text"] = " ".join([t.get_text(separator=" ", strip=True) for t in text_nodes])
            else:
                for para in paragraphs:
                    para_num = para.find('Pnumber').text.strip() if para.find('Pnumber') else None
                    para_text = " ".join([t.get_text(separator=" ", strip=True) for t in para.find_all('Text')])
                    
                    para_commentaries = []
                    for cref in para.find_all('CommentaryRef'):
                        ref_id = cref.get('Ref')
                        if ref_id in commentaries_map:
                            para_commentaries.append({"ref_id": ref_id, **commentaries_map[ref_id]})
                    
                    section_data["paragraphs"].append({
                        "paragraph_number": para_num,
                        "text": para_text,
                        "uri": para.get('DocumentURI') or para.get('id'),
                        "commentaries": para_commentaries
                    })
            
            document_tree[part_num]["chapters"][chap_dict_key]["sections"].append(section_data)

        # Document Tree Building (schedules)
        schedules_list = []
        schedules_root = soup.find('Schedules')
        
        if schedules_root:
            for schedule in schedules_root.find_all('Schedule'):
                sched_num = schedule.find('Number').text.strip() if schedule.find('Number') else None
                # Get the first Title tag (handles both <TitleBlock><Title> and direct <Title>)
                sched_title_node = schedule.find('Title')
                sched_title = sched_title_node.get_text(strip=True) if sched_title_node else None
                sched_ref = schedule.find('Reference').text.strip() if schedule.find('Reference') else None
                
                sched_obj = {
                    "schedule_number": sched_num,
                    "title": sched_title,
                    "reference": sched_ref,
                    "uri": schedule.get('DocumentURI') or schedule.get('id'),
                    "paragraphs": []
                }
                
                # In schedules, <P1> represents a paragraph
                for p1 in schedule.find_all('P1'):
                    p1_num = p1.find('Pnumber').text.strip() if p1.find('Pnumber') else None
                    
                    # Check for an enclosing <Pblock> (crossheading title)
                    pblock = p1.find_parent('Pblock')
                    pblock_title = pblock.find('Title').get_text(strip=True) if pblock and pblock.find('Title') else None
                    
                    p1_commentaries = []
                    for cref in p1.find_all('CommentaryRef'):
                        if cref.find_parent('P2') is None:
                            ref_id = cref.get('Ref')
                            if ref_id in commentaries_map:
                                p1_commentaries.append({"ref_id": ref_id, **commentaries_map[ref_id]})
                                
                    p1_data = {
                        "paragraph_number": p1_num,
                        "crossheading": pblock_title,
                        "uri": p1.get('DocumentURI') or p1.get('id'),
                        "commentaries": p1_commentaries,
                        "subparagraphs": []
                    }
                    
                    # In schedules, <P2> represents a sub-paragraph
                    p2s = p1.find_all('P2')
                    if not p2s:
                        p1_data["text"] = " ".join([t.get_text(separator=" ", strip=True) for t in p1.find_all('Text')])
                    else:
                        for p2 in p2s:
                            p2_num = p2.find('Pnumber').text.strip() if p2.find('Pnumber') else None
                            p2_text = " ".join([t.get_text(separator=" ", strip=True) for t in p2.find_all('Text')])
                            
                            p2_commentaries = []
                            for cref in p2.find_all('CommentaryRef'):
                                ref_id = cref.get('Ref')
                                if ref_id in commentaries_map:
                                    p2_commentaries.append({"ref_id": ref_id, **commentaries_map[ref_id]})
                            
                            p1_data["subparagraphs"].append({
                                "subparagraph_number": p2_num,
                                "text": p2_text,
                                "uri": p2.get('DocumentURI') or p2.get('id'),
                                "commentaries": p2_commentaries
                            })
                            
                    sched_obj["paragraphs"].append(p1_data)
                    
                schedules_list.append(sched_obj)

        # Assemble Final JSON
        final_json_structure = {
            "legislation_url": identifier.get('uri'),
            "identifier": identifier,
            "metadata": metadata, 
            "parts": [],
            "schedules": schedules_list  # Newly injected root array
        }
        
        # Assembly loop
        for p_num, p_data in document_tree.items():
            part_obj = {
                "part_number": p_num if p_num != "No Part" else None, 
                "title": p_data["title"], 
                "chapters": []
            }
            
            for chap_key, c_data in p_data["chapters"].items():
                chapter_obj = {
                    "uri": c_data["chapter_uri"], 
                    "chapter_number": c_data["chapter_number"], 
                    "title": c_data["title"], 
                    "sections": c_data["sections"]
                }
                part_obj["chapters"].append(chapter_obj)
                
            final_json_structure["parts"].append(part_obj)

        with open(json_filepath, 'w', encoding='utf-8') as f:
            f.write(json.dumps(final_json_structure, indent=4))
            
        # Add the found citations to our queue for the next loop
        for cit_url in found_citations:
            # Check if we've already seen the base identifying URL
            normalized = self.normalize_url(cit_url)
            if normalized and normalized.replace('/data.xml', '') not in self.visited_urls:
                self.queue.append((cit_url, current_depth + 1))

    def crawl(self, start_url):
        """
        Starts the queue processor.
        """
        self.queue.append((start_url, 0)) # Start at depth 0
        
        while self.queue:
            current_url, depth = self.queue.popleft()
            self.fetch_and_parse(current_url, depth)
            
        print(f"\nCrawl complete. Processed {len(self.visited_urls)} unique pieces of legislation.")

In [3]:
crawler = LegislationCrawler(max_depth=DEPTH_LIMIT)

# Read LEGISLATION_URI_LIST_FILE: one URI per line, blank lines ignored
with open(LEGISLATION_URI_LIST_FILE, 'r') as f:
    legislation_uris = [stripped for stripped in map(str.strip, f) if stripped]

# Crawl each listed URI in file order, reusing the same crawler
# (and therefore the same visited set) for all of them
for uri in legislation_uris:
    full_url = f"{LEGISLATION_URL_PREFIX}{uri}"
    print(f"\nProcessing legislation: {full_url}")
    crawler.crawl(full_url)


Processing legislation: http://legislation.gov.uk/ukpga/2020/14
[0] Fetching network data: http://legislation.gov.uk/ukpga/2020/14/data.xml
[1] Fetching network data: http://www.legislation.gov.uk/uksi/2022/531/data.xml
[1] Fetching network data: http://www.legislation.gov.uk/uksi/2025/1130/data.xml
[1] Fetching network data: http://www.legislation.gov.uk/ukpga/2024/3/data.xml
[1] Fetching network data: http://www.legislation.gov.uk/uksi/2021/92/data.xml
[1] Fetching network data: http://www.legislation.gov.uk/ukpga/2022/3/data.xml
[1] Fetching network data: http://www.legislation.gov.uk/uksi/2023/113/data.xml
[1] Fetching network data: http://www.legislation.gov.uk/uksi/2025/434/data.xml
[1] Fetching network data: http://www.legislation.gov.uk/ukpga/2023/30/data.xml
[1] Fetching network data: http://www.legislation.gov.uk/uksi/2021/740/data.xml
[1] Fetching network data: http://www.legislation.gov.uk/uksi/2022/500/data.xml
[1] Fetching network data: http://www.legislation.gov.uk/ukpg