<a href="https://colab.research.google.com/github/markyf801/dfe_speeches/blob/main/Python_Speech_Scraper_for_Streamlit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import streamlit as st
import requests
from bs4 import BeautifulSoup

def get_speech_details(url):
    """
    Scrape a GOV.UK listing page and return the speeches announced on it.

    Every ``li.gem-c-document-list__item`` element whose document-type label
    contains "speech" (case-insensitive) is reported. Items that lack a title
    link, an ``href`` on that link, or a ``<time>`` element are skipped.

    Failures are not raised to the caller: they are reported through
    ``st.error`` and whatever was collected up to that point (possibly an
    empty list) is returned.

    Args:
        url (str): The URL of the page to scrape.

    Returns:
        list: One dictionary per speech with the keys ``'title'``, ``'url'``
        (an absolute URL) and ``'date'`` (the stripped text of the item's
        ``<time>`` element).
    """
    # Local import keeps this function self-contained; the file's top-level
    # imports do not include urllib.
    from urllib.parse import urljoin

    base_url = "https://www.gov.uk"
    timeout_seconds = 10
    speech_details = []
    try:
        # Without a timeout, requests waits indefinitely on a stalled server,
        # which would freeze the app. A timeout raises
        # requests.exceptions.Timeout, handled below as a RequestException.
        response = requests.get(url, timeout=timeout_seconds)
        response.raise_for_status()  # Raise an exception for bad status codes

        soup = BeautifulSoup(response.text, 'html.parser')

        # Each announcement is rendered as one document-list item.
        announcements = soup.find_all('li', class_='gem-c-document-list__item')

        for item in announcements:
            # Only keep items whose document-type label mentions "speech".
            doc_type_tag = item.find(class_='gem-c-document-list__item-type')
            if doc_type_tag is None or 'speech' not in doc_type_tag.text.lower():
                continue

            title_tag = item.find('a', class_='govuk-link')
            date_tag = item.find('time')
            if title_tag is None or date_tag is None:
                continue

            # An anchor without an href cannot be linked to; skip just this
            # item rather than raising and losing all remaining items.
            href = title_tag.get('href')
            if not href:
                continue

            speech_details.append({
                'title': title_tag.text.strip(),
                # urljoin leaves absolute URLs untouched and resolves
                # root-relative, path-relative and protocol-relative links.
                'url': urljoin(base_url, href),
                'date': date_tag.text.strip()
            })

    except requests.exceptions.RequestException as e:
        # Display errors in the Streamlit app instead of the console
        st.error(f"Error fetching the URL: {e}")
    except Exception as e:
        # Last-resort boundary so a parsing problem shows up in the app
        # instead of crashing the script run.
        st.error(f"An error occurred: {e}")

    return speech_details

# --- Streamlit App Interface ---

st.title("GOV.UK Speech Scraper")

# Page whose announcement list is scanned for speeches.
target_url = 'https://www.gov.uk/government/people/jacqui-smith#announcements'

st.write("This app scans the following page for speeches:")
st.markdown(f"[{target_url}]({target_url})")

# Nothing is fetched until the user presses the button.
if st.button("Find Speeches"):
    # The spinner stays visible while the page is fetched and results are rendered.
    with st.spinner("Scanning for speeches..."):
        speeches = get_speech_details(target_url)

        if not speeches:
            # Nothing matched (or the fetch failed): warn in the app.
            st.warning("No speech links were found on the page.")
        else:
            st.success(f"Found {len(speeches)} speech(es):")
            # One markdown line per speech: bold date followed by a titled link.
            for entry in speeches:
                st.markdown(f"**{entry['date']}**: [{entry['title']}]({entry['url']})")