In [None]:
import requests
import pandas as pd
from tqdm.notebook import tqdm
from markitdown import MarkItDown
import openai
import bs4
import re
from pydantic import BaseModel
from datetime import date, datetime
from typing import Optional
from bs4 import BeautifulSoup
import dotenv
from utils import convert_pdf
import os

import logging
import colorlog
from humanize import filesize

In [None]:
# --- Logging -----------------------------------------------------------------
# Console output is colourised via colorlog; the same records are also appended
# (without colour codes) to ./logs/scrape.log.
logger = colorlog.getLogger(__name__)
logger.setLevel(logging.DEBUG)  # Set the desired logging level

formatter = colorlog.ColoredFormatter(
    "%(log_color)s%(asctime)s %(levelname)s: %(message)s",
    log_colors={
        "DEBUG": "cyan",
        "INFO": "green",
        "WARNING": "yellow",
        "ERROR": "red",
        "CRITICAL": "red,bg_white",
    },
    datefmt="%Y-%m-%d %H:%M:%S",
)
handler = colorlog.StreamHandler()
handler.setFormatter(formatter)
logger.addHandler(handler)

# Add file log output to ./logs/
# FileHandler does not create missing directories, so make sure it exists.
os.makedirs("logs", exist_ok=True)
# UTF-8 is set explicitly because the log messages contain emoji, which the
# platform default encoding may not be able to represent.
file_handler = logging.FileHandler("logs/scrape.log", mode="a", encoding="utf-8")
# Plain formatter: ANSI colour escapes would only add noise to the log file.
file_handler.setFormatter(
    logging.Formatter(
        "%(asctime)s %(levelname)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
)
logger.addHandler(file_handler)

# --- Configuration -----------------------------------------------------------
# Values are read from the .env file only (dotenv_values does not consult the
# process environment); a missing key raises KeyError below.
ENV = dotenv.dotenv_values()

client = openai.AzureOpenAI(
    api_key=ENV["AZURE_OPENAI_KEY"],
    api_version=ENV["OPENAI_API_VERSION"],
    azure_endpoint=ENV["AZURE_OPENAI_ENDPOINT"],
)

In [None]:
def slugify(text):
    """Turn *text* into a lowercase, hyphen-separated slug.

    Every character that is neither a word character nor whitespace is
    dropped, surrounding whitespace is trimmed, the result is lowercased and
    each remaining run of whitespace becomes a single hyphen.
    """
    without_punctuation = re.sub(r"[^\w\s]", "", text)
    normalised = without_punctuation.strip().lower()
    return re.sub(r"\s+", "-", normalised)

In [None]:
# Collect all bills from https://www.barbadosparliament.com/bills/
# (bills are fetched one at a time with blocking requests, not asynchronously).
# Save raw HTML to ./data/legislature/bills/html
# Save parsed data to ./data/legislature/bills/json
# The bill PDF and its Markdown conversion are also saved, under
# ./data/legislature/bills/pdf and ./data/legislature/bills/md.


# Base URL of the bills section of the Parliament website.
# NOTE(review): not referenced by the code visible in this notebook — confirm
# whether it is still needed.
base_bill_url = "https://www.barbadosparliament.com/bills/"


class Bill(BaseModel):
    """Parsed metadata, and optionally the Markdown text, of a single bill.

    Only ``bill_id``, ``slug`` and ``title`` are required. Every other field
    defaults to ``None`` so a ``Bill`` can be built from a page that lacks the
    corresponding information, and so ``markdown`` can be filled in later.
    """

    # ID used in the bill's detail-page URL.
    bill_id: int
    # Filesystem-friendly form of the title (see slugify).
    slug: str
    # Title text taken from the page's <h2>; may be an empty string.
    title: str
    # Link to the bill's PDF, when the page provides one.
    pdf_url: Optional[str] = None
    # Value of the "Current Stage" table row.
    current_stage: Optional[str] = None
    # Text of the table row that spans both columns.
    chamber: Optional[str] = None
    # Dates parsed from the metadata table (DD/MM/YYYY on the page).
    notice_date: Optional[date] = None
    first_reading: Optional[date] = None
    gazette_date: Optional[date] = None
    # Markdown conversion of the PDF; set after the PDF has been converted.
    markdown: Optional[str] = None


# Root that relative PDF links on a bill page are resolved against.
_SITE_ROOT = "https://www.barbadosparliament.com/"

# (row-label prefix, Bill field) pairs for the table rows that hold a date.
_DATE_ROW_PREFIXES = (
    ("Notice Date", "notice_date"),
    ("First reading", "first_reading"),
    ("First appeared in the Official Gazette", "gazette_date"),
)


def _parse_bill_date(date_str: str) -> Optional[date]:
    """Parse a ``DD/MM/YYYY`` string; return ``None`` if it does not match."""
    try:
        return datetime.strptime(date_str, "%d/%m/%Y").date()
    except ValueError:
        # Blank or unexpected cell contents: treat the date as unknown.
        return None


def extract_bill_metadata(html: str, bill_id: int) -> Bill:
    """Build a ``Bill`` from the HTML of a bill detail page.

    Args:
        html: Raw HTML of the bill's detail page.
        bill_id: ID of the bill; stored on the result unchanged.

    Returns:
        A ``Bill`` whose ``markdown`` is ``None``. Any piece of metadata that
        cannot be found or parsed is left as ``None`` (the title falls back to
        an empty string).
    """
    # Imported locally so this function does not depend on the import cell.
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, "html.parser")

    # The title is the first text node that is a direct child of the <h2>.
    # There may be no such node (e.g. the <h2> only wraps a link), in which
    # case find() returns None and the title is left empty.
    h2 = soup.find("h2")
    title_node = h2.find(string=True, recursive=False) if h2 else None
    title = title_node.strip() if title_node else ""

    # The PDF link is the first <a> inside the <h2>. urljoin resolves relative
    # hrefs against the site root exactly as plain concatenation would, and
    # additionally copes with root-relative ("/...") and absolute hrefs.
    pdf_anchor = h2.find("a") if h2 else None
    href = pdf_anchor.get("href") if pdf_anchor is not None else None
    pdf_url = urljoin(_SITE_ROOT, href) if href else None

    metadata = {
        "current_stage": None,
        "chamber": None,
        "notice_date": None,
        "first_reading": None,
        "gazette_date": None,
    }

    table = soup.find("table")
    rows = table.find_all("tr") if table else []
    for tr in rows:
        th = tr.find("th")
        if th is None:
            # A header-less row may contain the chamber information, in a cell
            # spanning both columns.
            td = tr.find("td", attrs={"colspan": "2"})
            if td is not None:
                metadata["chamber"] = td.get_text(strip=True)
            continue

        tds = tr.find_all("td")
        if not tds:
            continue

        key = th.get_text(strip=True)
        value = tds[0].get_text(strip=True)

        if key == "Current Stage":
            metadata["current_stage"] = value
            continue

        for prefix, field_name in _DATE_ROW_PREFIXES:
            if key.startswith(prefix):
                parsed = _parse_bill_date(value)
                # An unparseable cell must not wipe a value set by an earlier row.
                if parsed is not None:
                    metadata[field_name] = parsed
                break

    return Bill(
        bill_id=bill_id,
        title=title,
        slug=slugify(title),
        pdf_url=pdf_url,
        markdown=None,
        **metadata,
    )


class MissingBillIDException(Exception):
    """Raised when the site has no bill for the requested bill ID."""


def scrape_bill(bill_id: int) -> Bill:
    """
    Scrapes a bill from the Barbados Parliament website.

    Fetches the bill's detail page, parses its metadata and writes four
    artefacts under ./data/legislature/bills/: the raw HTML, the bill PDF, a
    Markdown conversion of the PDF, and the parsed metadata as JSON. The JSON
    file is written last, so its presence marks the bill as fully scraped.

    If the page has no PDF link, the PDF and Markdown steps are skipped and
    the bill is saved with ``markdown`` left as ``None``.

    Args:
        bill_id (int): The ID of the bill to scrape.

    Returns:
        Bill: The parsed bill, including its Markdown text when available.

    Raises:
        MissingBillIDException: The site served its PHP error page for this
            ID. A ``{bill_id}_not_found.json`` placeholder is written first so
            the ID is not requested again.
        requests.HTTPError: The detail page or the PDF returned an error status.
    """
    data_dir = "./data/legislature/bills"
    request_timeout = 60  # seconds; without one a stalled connection hangs forever

    # open() does not create missing directories.
    for kind in ("html", "pdf", "md", "json"):
        os.makedirs(f"{data_dir}/{kind}", exist_ok=True)

    bill_url = f"https://www.barbadosparliament.com/bills/details/{bill_id}"

    logger.debug("🌐 Fetching %s", bill_url)
    bill_webpage = requests.get(bill_url, timeout=request_timeout)
    bill_webpage.raise_for_status()

    # Check if the page is an error page
    if "A PHP Error was encountered" in bill_webpage.text:
        logger.warning("❌ Bill %s not found", bill_id)

        # Add a dud JSON file to stop us re-scraping this bill
        json_path = f"{data_dir}/json/{bill_id}_not_found.json"
        with open(json_path, "w", encoding="utf-8") as f:
            f.write("{}")

        raise MissingBillIDException(f"Bill {bill_id} not found")

    # Parse the bill metadata
    logger.debug("🔎 Extracting bill metadata for bill_id %s", str(bill_id))
    bill = extract_bill_metadata(html=bill_webpage.text, bill_id=bill_id)

    # Save raw HTML to ./data/legislature/bills/html
    html_path = f"{data_dir}/html/{bill.bill_id}_{bill.slug}.html"
    with open(html_path, "w", encoding="utf-8") as f:
        f.write(bill_webpage.text)
    logger.debug("✅ Saved %s", html_path)

    if bill.pdf_url:
        # Save pdf to ./data/legislature/bills/pdf
        logger.debug("💾 Downloading PDF: %s", bill.pdf_url)
        pdf_resp = requests.get(url=bill.pdf_url, timeout=request_timeout)
        pdf_resp.raise_for_status()

        pdf_path = f"{data_dir}/pdf/{bill.bill_id}_{bill.slug}.pdf"
        with open(pdf_path, "wb") as f:
            f.write(pdf_resp.content)
        # Measure only after the file is closed: while it is open, buffered
        # bytes may not have reached disk and the reported size can be short.
        pdf_size_str = filesize.naturalsize(os.path.getsize(pdf_path))
        logger.debug("✅ Saved (%s) %s", pdf_size_str, pdf_path)

        # Save markdown to ./data/legislature/bills/md
        logger.debug("📖 Converting %s to Markdown", pdf_path)
        bill.markdown = convert_pdf(file_path=pdf_path)
        markdown_path = f"{data_dir}/md/{bill.bill_id}_{bill.slug}.md"
        with open(markdown_path, "w", encoding="utf-8") as f:
            f.write(bill.markdown)
        logger.debug("✅ Saved %s", markdown_path)
    else:
        # Without a link there is nothing to download or convert.
        logger.warning(
            "⚠️ Bill %s has no PDF link; skipping PDF and Markdown", bill_id
        )

    # Save parsed data to ./data/legislature/bills/json
    json_path = f"{data_dir}/json/{bill.bill_id}_{bill.slug}.json"
    with open(json_path, "w", encoding="utf-8") as f:
        f.write(bill.model_dump_json(indent=2))

    return bill


In [None]:
# Work out which bills still need scraping. A bill counts as done when the
# JSON directory holds a file whose name starts with "<bill_id>_" — either the
# parsed bill or a "<bill_id>_not_found.json" placeholder.
json_dir = "./data/legislature/bills/json/"
os.makedirs(json_dir, exist_ok=True)  # first run: nothing scraped yet

completed_bill_ids = []
for filename in os.listdir(json_dir):
    try:
        completed_bill_ids.append(int(filename.split("_")[0]))
    except ValueError:
        # Not one of our files (e.g. .DS_Store, .gitkeep) — ignore it.
        continue

target_bill_id_min = 21
target_bill_id_max = 824
target_bill_ids = set(range(target_bill_id_min, target_bill_id_max + 1))
target_bill_ids = target_bill_ids - set(completed_bill_ids)

logger.info("📜 Scraping %s target bills", len(target_bill_ids))

# Sorted so the run proceeds in a predictable, ascending order.
for bill_id in tqdm(sorted(target_bill_ids), unit="bill"):
    try:
        bill = scrape_bill(bill_id=bill_id)
    except MissingBillIDException:
        # Already logged, and a placeholder file was written, by scrape_bill.
        continue
    except requests.RequestException:
        # One failed request should not abort the whole run. No JSON file has
        # been written for this bill, so the next run picks it up again.
        logger.exception("💥 Failed to scrape bill %s; skipping for now", bill_id)
        continue
    logger.info("📜 Scraped bill: %s", bill.title)
