In [14]:
import datetime
from functools import lru_cache
from pathlib import Path

import requests
from bs4 import BeautifulSoup


@lru_cache
def list_files_from_http(url: str) -> list[str]:
    """Return the ``href`` value of every link on the HTML page at *url*.

    The page is fetched with ``requests`` and parsed with BeautifulSoup's
    ``html.parser``. Every ``<a>`` tag with a non-empty ``href`` contributes
    one entry, in document order. No further filtering is applied, so
    navigation entries such as ``'../'`` are included in the result.

    Results are memoised per URL by ``lru_cache``: repeated calls return the
    *same* list object, so callers should copy it before mutating.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond in time.
    """
    # A timeout keeps a stalled server from hanging the caller indefinitely.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of parsing (and caching) an HTTP error page as if
    # it were a directory listing. Exceptions are not stored by lru_cache.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Keep only anchors that carry a non-empty href attribute.
    return [href for link in soup.find_all("a") if (href := link.get("href"))]


def get_latest_for_date(
    date: datetime.date, folder="debates", prepattern="debates", letter="a"
) -> Path:
    """Return the local path of the file for *date* that is marked latest.

    Files are named ``{prepattern}{YYYY-MM-DD}{letter}.xml``. Starting at
    *letter*, each candidate is looked up in the local cache under
    ``../data/pwdata/scrapedxml/{folder}/`` (relative to the current working
    directory) and downloaded from theyworkforyou.com when missing. The first
    candidate whose content contains ``latest="yes"`` is returned; otherwise
    the next letter is tried.

    Args:
        date: Day whose file is wanted.
        folder: Sub-folder of ``scrapedxml`` on both the server and the cache.
        prepattern: File name prefix placed before the ISO date.
        letter: Suffix letter to start the search from.

    Returns:
        Path of the cached file that carries the ``latest="yes"`` marker.

    Raises:
        FileNotFoundError: if a candidate is not cached and the server
            answers 404 (i.e. the letters ran out before a latest file
            was found).
        requests.HTTPError: on any other 4xx/5xx answer.
        requests.Timeout: if the server does not respond in time.
    """
    cache_dir = Path("..", "data", "pwdata", "scrapedxml") / folder

    # Iterate over suffix letters instead of recursing once per letter.
    while True:
        file_name = f"{prepattern}{date.isoformat()}{letter}.xml"
        url = f"https://www.theyworkforyou.com/pwdata/scrapedxml/{folder}/{file_name}"
        local_path = cache_dir / file_name

        if local_path.exists():
            content = local_path.read_bytes()
        else:
            response = requests.get(url, timeout=30)
            if response.status_code == 404:
                raise FileNotFoundError(f"File {url} not found")
            # Never cache the body of some other HTTP error as if it were
            # the requested XML file.
            response.raise_for_status()
            # Store the bytes exactly as served so the file's own XML
            # encoding declaration stays correct; decoding via
            # ``response.text`` and re-encoding with the platform default
            # could corrupt non-ASCII content.
            content = response.content
            local_path.parent.mkdir(parents=True, exist_ok=True)
            local_path.write_bytes(content)

        # The marker is plain ASCII, so a byte-level search is sufficient.
        if b'latest="yes"' in content:
            return local_path
        letter = chr(ord(letter) + 1)


# Example: resolve the latest debates file for 23 May 2022.
get_latest_for_date(datetime.date(2022, 5, 23))

PosixPath('../data/pwdata/scrapedxml/debates/debates2022-05-23b.xml')