In [35]:
from typing import Literal, NewType

import httpx
import pandas as pd
from bs4 import BeautifulSoup

URL = NewType("URL", str)


def get_release_urls() -> list[URL]:
    base_url = "https://www.gov.uk/government/collections/register-of-ministers-gifts-and-hospitality"
    index_page = httpx.get(base_url)
    index_soup = BeautifulSoup(index_page.text, "html.parser")

    # get all a links with a class of govuk-link

    links = index_soup.find_all("a", class_="govuk-link")
    link_urls = [link["href"] for link in links]
    # reduce to links that start '/government/publications/register-of-ministers-gifts-and-hospitality'

    link_urls = [
        "https://www.gov.uk" + link
        for link in link_urls
        if link.startswith(
            "/government/publications/register-of-ministers-gifts-and-hospitality"
        )
    ]

    return link_urls


def get_csv_links_from_page(page_url: URL) -> list[URL]:
    page = httpx.get(page_url)
    page_soup = BeautifulSoup(page.text, "html.parser")

    # get all links that are .csvs
    csv_links = page_soup.find_all("a")
    csv_links = [link["href"] for link in csv_links if link["href"].endswith(".csv")]

    lower_links = [link.lower() for link in csv_links]

    # check that all links contain either 'gifts' or 'hospitality'

    for link in lower_links:
        if "gifts" not in link and "hospitality" not in link:
            raise ValueError(f"Link {link} does not contain 'gifts' or 'hospitality'")

    return csv_links


def get_dept(s: str):
    # get first part seperated by __
    s = s.split("__")[0]
    return s.replace("_", " ").title()


def get_csv(csv_url: URL):
    df = pd.read_csv(csv_url)
    df.columns = [x.strip() for x in df.columns]
    # drop unnamed columns
    df = df.loc[:, ~df.columns.str.contains("^Unnamed")]
    df["Department"] = get_dept(get_final_part_of_url(csv_url))
    df["source_slug"] = get_final_part_of_url(csv_url)
    return df


def get_final_part_of_url(s: str):
    s = s.split("/")[-1]
    if s.endswith(".csv"):
        s = s[:-4]
    return s


def get_all_csvs(csv_type: Literal["gifts", "hospitality"]):
    dfs: list[pd.DataFrame] = []
    releases = get_release_urls()
    for r in releases:
        csv_links = [
            link for link in get_csv_links_from_page(r) if csv_type in link.lower()
        ]
        csv_links = sorted(list(set(csv_links)))
        for csv in csv_links:
            df = get_csv(csv)
            df["release_slug"] = get_final_part_of_url(r)
            dfs.append(df)
    return pd.concat(dfs)


gift_df = get_all_csvs("gifts")
hospitality_df = get_all_csvs("hospitality")

In [57]:
hospitality_df

Unnamed: 0,Minister,Date,Individual or Organisation that offered hospitality,Type of Hospitality Received,Accompanied by Guest,Value of Hospitality (£),Department,source_slug,release_slug
0,Yvette Cooper,Nil Return,Nil Return,Nil Return,Nil Return,Nil Return,Home Office,Home_Office__Ministers__Hospitality_-_January_...,register-of-ministers-gifts-and-hospitality-ja...
1,Dan Jarvis,Nil Return,Nil Return,Nil Return,Nil Return,Nil Return,Home Office,Home_Office__Ministers__Hospitality_-_January_...,register-of-ministers-gifts-and-hospitality-ja...
2,Dame Angela Eagle,Nil Return,Nil Return,Nil Return,Nil Return,Nil Return,Home Office,Home_Office__Ministers__Hospitality_-_January_...,register-of-ministers-gifts-and-hospitality-ja...
3,Dame Diana Johnson,Nil Return,Nil Return,Nil Return,Nil Return,Nil Return,Home Office,Home_Office__Ministers__Hospitality_-_January_...,register-of-ministers-gifts-and-hospitality-ja...
4,Lord Hanson of Flint,Nil Return,Nil Return,Nil Return,Nil Return,Nil Return,Home Office,Home_Office__Ministers__Hospitality_-_January_...,register-of-ministers-gifts-and-hospitality-ja...
...,...,...,...,...,...,...,...,...,...
6,Lord Vallance,16/10/2024,Gates Foundation,Dinner,No,£100,Department For Science,Department_for_Science__Innovation___Technolog...,register-of-ministers-gifts-and-hospitality-ju...
7,Lord Vallance,23/10/2024,University of Edinburgh,Lunch,No,£20,Department For Science,Department_for_Science__Innovation___Technolog...,register-of-ministers-gifts-and-hospitality-ju...
8,Sir Chris Bryant,Nil Return,Nil Return,Nil Return,Nil Return,Nil Return,Department For Science,Department_for_Science__Innovation___Technolog...,register-of-ministers-gifts-and-hospitality-ju...
9,Feryal Clark,15/08/2024,Dell Technologies,"Hospitality ticket for Taylor Swift concert, f...",Yes,"£1,050.00",Department For Science,Department_for_Science__Innovation___Technolog...,register-of-ministers-gifts-and-hospitality-ju...
