In [0]:
%pip install --quiet requests beautifulsoup4 tqdm

In [0]:
import requests
from bs4 import BeautifulSoup
from typing import List, Dict
import pyspark.sql.functions as F

In [0]:
# Notebook configuration.
# CATALOG: catalog name; created below if missing and used as the first part
# of every three-level object name this notebook touches.
CATALOG = "marcell"
# SCHEMA: schema inside CATALOG that holds the `data` volume and the
# `document_reference` output table.
SCHEMA = "marine_ai_poc"
# PROJECT_ID: project reference interpolated into the documents-listing URL
# and stored in the `project_id` column of the output table.
PROJECT_ID = "EN010098"

In [0]:
# Make sure the target catalog, schema and `data` volume exist before anything
# is written. Every statement is IF NOT EXISTS, so re-running the cell is safe.
schema_path = f"{CATALOG}.{SCHEMA}"

spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG}")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_path}")
spark.sql(f"CREATE VOLUME IF NOT EXISTS {schema_path}.data")

In [0]:
def scrape_document_details(url: str, timeout: float = 30.0) -> List[Dict]:
    """Scrape document metadata from a single listing page.

    Downloads ``url`` and inspects every ``<li class="section-results__result">``
    element, collecting whichever of the following fields are present in it.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection,
            which would hang the caller's pagination loop.

    Returns:
        One dict per result element that yielded at least one field. Keys are
        only present when the corresponding element was found:

        * ``link_text`` / ``pdf_url`` - text and ``href`` of the first anchor
          whose ``href`` ends with ``.pdf``
        * ``subtitle`` - text of the ``section-results__result-copy`` div
        * ``date``, ``stage``, ``title`` - text of the elements tagged
          ``data-cy="published-date"`` / ``"published-stage"`` /
          ``"published-title"``

        An empty list is returned when the page has no results or when the
        HTTP request fails (including on timeout); the error is printed, not
        raised.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    # (output key, value of the element's data-cy attribute), in output order.
    data_cy_fields = (
        ('date', 'published-date'),
        ('stage', 'published-stage'),
        ('title', 'published-title'),
    )

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        documents = []

        for result in soup.find_all('li', class_='section-results__result'):
            document_info = {}

            # First anchor pointing at a PDF; `x and ...` guards anchors
            # whose href is missing (None).
            pdf_link = result.find('a', href=lambda x: x and x.endswith('.pdf'))
            if pdf_link:
                document_info['link_text'] = pdf_link.get_text(strip=True)
                document_info['pdf_url'] = pdf_link['href']

            subtitle = result.find('div', class_='section-results__result-copy')
            if subtitle:
                document_info['subtitle'] = subtitle.get_text(strip=True)

            for key, data_cy in data_cy_fields:
                element = result.find(attrs={'data-cy': data_cy})
                if element:
                    document_info[key] = element.get_text(strip=True)

            # Skip result elements from which nothing could be extracted.
            if document_info:
                documents.append(document_info)

        return documents

    except requests.RequestException as e:
        # Covers connection errors, HTTP error statuses and timeouts.
        print(f"Error fetching the webpage: {e}")
        return []



In [0]:
# Walk the paginated document listing for PROJECT_ID, starting at page 1, and
# accumulate every scraped entry. The loop ends at the first page that yields
# no documents. scrape_document_details also returns [] on a request error,
# so a failed request ends the crawl in the same way.
all_docs = []
page = 1

while True:
    url = (
        "https://national-infrastructure-consenting.planninginspectorate.gov.uk"
        f"/projects/{PROJECT_ID}/documents?page={page}"
    )
    documents = scrape_document_details(url)
    if not documents:
        break
    all_docs += documents
    page += 1

In [0]:
# Keep only the entries for which a PDF link was found; the trailing
# expression shows how many remain.
non_null_docs = [doc for doc in all_docs if "pdf_url" in doc]
len(non_null_docs)

In [0]:
# Turn the scraped entries into a Spark DataFrame:
#   * date       - parsed from text using the pattern "d MMMM yyyy"
#   * project_id - constant column holding PROJECT_ID
#   * title      - link_text with every parenthesised group (and the
#                  whitespace before it) removed, then right-trimmed. This
#                  replaces any `title` value scraped from the page.
# The trim uses F.rtrim because a Column has no `.rstrip()` method; calling
# it raises "TypeError: 'Column' object is not callable".
df = (
    spark.createDataFrame(non_null_docs)
    .withColumn("date", F.to_timestamp(F.col("date"), "d MMMM yyyy"))
    .withColumn("project_id", F.lit(PROJECT_ID))
    .withColumn(
        "title",
        F.rtrim(F.regexp_replace(F.col("link_text"), r"\s*\([^)]*\)", "")),
    )
)

# NOTE(review): mode("append") adds these rows on every run, so re-running the
# notebook for the same PROJECT_ID presumably duplicates them - confirm this
# is intended.
df.write.mode("append").saveAsTable(f"{CATALOG}.{SCHEMA}.document_reference")