In [66]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime


In [32]:
def get_url(section):
    """returns the url to be scraped for the given section
        Args:
            section (str): the section to be searched, these work for a specific set of values.
        Returns:
            URL (str): the URL to be scraped for the given section.
    """
    url = ''
    if section == 'safety':
        url = 'https://www.europol.europa.eu/'
    elif section == 'health' or section == 'environment' or section == 'transport':
        url = f'https://{section}.ec.europa.eu/index_en'
    else:
        url = f'https://{section}.ec.europa.eu/'

    return url
    


In [33]:
print(get_url('safety'))
print(get_url('health'))
print(get_url('transport'))
print(get_url('culture'))
print(get_url('education'))
print(get_url('environment'))

https://www.europol.europa.eu/
https://health.ec.europa.eu/index_en
https://transport.ec.europa.eu/index_en
https://culture.ec.europa.eu/
https://education.ec.europa.eu/
https://environment.ec.europa.eu/index_en


In [34]:
def get_html(url):
    response = requests.get(url).text
    return BeautifulSoup(response)

In [35]:
safety_URL = get_url('safety')
health_URL = get_url('health')
education_URL = get_url('education')
transport_URL = get_url('transport')
culture_URL = get_url('culture')
environment_URL = get_url('environment')

culture = get_html(culture_URL)
health = get_html(health_URL)
environment = get_html(environment_URL)
education = get_html(education_URL)
transport = get_html(transport_URL)
safety = get_html(safety_URL)

print(safety)

<!DOCTYPE html>
Timings (ms):
bootstrapDrupal   :    18.0230
loadReactIndex    :     0.0541
renderLinkHeaders :     0.4189
loadMetadata      :     0.5269
loadLanguages     :     2.4688
loadMenus         :    13.3960
loadTerms         :    10.3290
loadNodeData      :   373.9879
shutdownDrupal    :     0.0401
processReactIndex :     2.6970
renderReactIndex  :     0.0191
total             :   424.0270
-->


In [81]:
def _top_three_culture(html):
    """ Returns a list containing the title and hyperlinks to the three most recent news items for the safety section.
        Args:
            html (str): beautiful soup html object
        Returns:
            top_three (list of soup objects): list containing top three news headlines and their links.
    """
    top_stories = html.find_all('eac-whatsnew-card-news')
    top_three = top_stories[0:3]
    story_attrs = [{'link': get_url('culture')+story.attrs['link'], 
                    'title': story.attrs['title'].strip(), 
                    'date': datetime.strptime(story.attrs['summary'].strip(), "%d %B %Y")} for story in top_three]

    return story_attrs


In [82]:
print(_top_three_culture(culture))

[{'link': 'https://culture.ec.europa.eu//news/here-is-the-laureate-of-the-2025-european-union-prize-for-literature', 'title': 'Here is the laureate of the 2025 European Union Prize for Literature', 'date': datetime.datetime(2025, 5, 19, 0, 0)}, {'link': 'https://culture.ec.europa.eu//news/eumies-awards-young-talent-2025-finalists-announced', 'title': 'EUmies Awards Young Talent 2025 finalists announced', 'date': datetime.datetime(2025, 5, 12, 0, 0)}, {'link': 'https://culture.ec.europa.eu//news/eurobarometer-publishes-findings-on-europeans-attitudes-towards-culture', 'title': 'Eurobarometer publishes findings on Europeans’ attitudes towards culture', 'date': datetime.datetime(2025, 5, 8, 0, 0)}]


In [71]:
def _top_three_health(html):
    """ Returns a list containing the title and hyperlinks to the three most recent news items for the safety section.
        Args:
            html (str): beautiful soup html object
        Returns:
            top_three (list of soup objects): list containing top three news headlines and their links.
    """
    top_stories = html.find_all('a', class_='ecl-link ecl-link--standalone')
    top_dates = html.find_all('time')
    top_three = top_stories[0:3]
    #(href, title, date)
    story_attrs = [{'link': top_stories[story].attrs['href'], 
                    'title': top_stories[story].text.strip(), 
                    'date': datetime.strptime(top_dates[story].text.strip(), "%d %B %Y")} for story in range(len(top_three))]

    return story_attrs

In [72]:
print(_top_three_health(health))

[{'link': 'https://ec.europa.eu/newsroom/sante/newsletter-archives/63609', 'title': 'The European Commission Hosts Key One Health Meeting', 'date': datetime.datetime(2025, 5, 21, 0, 0)}, {'link': 'https://ec.europa.eu/newsroom/sante/newsletter-archives/63497', 'title': 'Commission adopts temporary restriction on use of Chikungunya vaccine for older patients', 'date': datetime.datetime(2025, 5, 16, 0, 0)}, {'link': 'https://ec.europa.eu/newsroom/sante/newsletter-archives/63127', 'title': 'Register now: EU webinar on non-communicable diseases (16 May 2025, 10.00-13.00 CEST)', 'date': datetime.datetime(2025, 5, 7, 0, 0)}]


In [73]:
def _top_three_environment(html):
    """ Returns a list containing the title and hyperlinks to the three most recent news items for the safety section.
        Args:
            html (str): beautiful soup html object
        Returns:
            top_three (list of soup objects): list containing top three news headlines and their links.
    """
    news_section_str = html.find_all('a', class_='ecl-link ecl-link--standalone')
    top_three_stories = news_section_str[20:23]
    top_dates = html.find_all('time')
    top_three_dates = top_dates[0:3]
    #news_section_soup = BeautifulSoup(news_section_str)

    #top_stories = news_section_soup.find_all('a', class_ = 'ecl-link ecl-link--standalone')
    story_attrs = [{'link': get_url('environment')[0:-8]+top_three_stories[story].attrs['href'], 
                    'title': top_three_stories[story].text.strip(), 
                    'date': datetime.strptime(top_three_dates[story].text.strip(), "%d %B %Y")} for story in range(len(top_three_stories))]

    return story_attrs

In [74]:
print(_top_three_environment(environment))

[{'link': 'https://environment.ec.europa.eu//news/celebrate-33-years-protecting-eus-nature-and-join-bioblitz-near-you-2025-05-21_en', 'title': 'Celebrate 33 years of protecting the EU’s nature and join a Bioblitz near you', 'date': datetime.datetime(2025, 5, 21, 0, 0)}, {'link': 'https://environment.ec.europa.eu//news/video-how-can-we-balance-industrial-growth-and-reducing-emissions-2025-04-30_en', 'title': 'VIDEO: How can we balance industrial growth and reducing emissions?', 'date': datetime.datetime(2025, 4, 30, 0, 0)}, {'link': 'https://environment.ec.europa.eu//news/eu-green-week-2025-conference-register-now-2025-04-14_en', 'title': 'EU Green Week 2025 conference – register now!', 'date': datetime.datetime(2025, 4, 14, 0, 0)}]


In [75]:
def _top_three_education(html):
    
    stories = html.find_all('a', 'stripe-recent-content-item')

    top_three = stories[0:3]
    link_list = [get_url('education')+story.attrs['href'] for story in top_three]

    titles = html.find_all('div', class_='stripe-recent-content-item--title')
    title_list = [title.text.strip() for title in titles]

    dates = html.find_all('time')
    date_list = [date.text.strip() for date in dates[0:3]]

    list = [{'link': link_list[story], 
             'title': title_list[story], 
             'date': datetime.strptime(date_list[story], "%d %B %Y")} for story in range(len(top_three))]

    return list



In [76]:
_top_three_education(education)

[{'link': 'https://education.ec.europa.eu//event/digital-education-stakeholder-forum-2025',
  'title': 'Digital Education Stakeholder Forum 2025',
  'date': datetime.datetime(2025, 6, 24, 0, 0)},
 {'link': 'https://education.ec.europa.eu//event/study-visit-on-entrepreneurship-education',
  'title': 'Study visit on entrepreneurship education',
  'date': datetime.datetime(2025, 6, 18, 0, 0)},
 {'link': 'https://education.ec.europa.eu//event/heinnovate-train-the-trainers-workshop',
  'title': 'HEInnovate Train the Trainers workshop',
  'date': datetime.datetime(2025, 6, 18, 0, 0)}]

In [77]:
def _top_three_transport(html):

    stories = html.find_all('a', class_ = 'ecl-link ecl-link--standalone')
    top_three_stories = stories[3:6]

    dates = html.find_all('time')
    top_three_dates = dates[0:3]

    top_three = [{'link' : get_url('transport')[0:-8]+top_three_stories[story].attrs['href'], 
                  'title' : top_three_stories[story].text.strip(), 
                  'date' : datetime.strptime(top_three_dates[story].text.strip(), "%d %B %Y")} for story in range(len(top_three_stories))]

    return top_three

In [78]:
_top_three_transport(transport)

[{'link': 'https://transport.ec.europa.eu//news-events/news/deadline-extended-call-applications-selection-experts-new-performance-review-board-single-european-2025-05-22_en',
  'title': 'Deadline extended: Call for applications for the selection of experts for the new Performance Review Board of the Single European Sky',
  'date': datetime.datetime(2025, 5, 22, 0, 0)},
 {'link': 'https://transport.ec.europa.eu//news-events/news/recommendations-member-states-help-tackle-transport-poverty-and-promote-fair-sustainable-mobility-2025-05-22_en',
  'title': 'Recommendations for Member States to help tackle transport poverty and promote fair, sustainable mobility',
  'date': datetime.datetime(2025, 5, 22, 0, 0)},
 {'link': 'https://transport.ec.europa.eu//news-events/news/commission-welcomes-provisional-agreement-new-european-maritime-safety-agency-mandate-2025-05-20_en',
  'title': 'The Commission welcomes provisional agreement on new European Maritime Safety Agency mandate',
  'date': dateti

In [79]:
def find_top_three(soup, section):
    """ Returns a list containing the title and hyperlinks to the three most recent news items on each section
        Args:
            soup (str): beautiful soup html object
        Returns:
            top_three (list of soup objects): list containing top three news headlines and their links.
    """
    top_three = None
    if section == 'education':
        top_three = _top_three_education(soup)
    elif section == 'health':
        top_three = _top_three_health(soup)
    elif section == 'culture':
        top_three = _top_three_culture(soup)
    elif section == 'environment':
        top_three = _top_three_environment(soup)
    elif section == 'transport':
        top_three = _top_three_transport(soup)

    return top_three