In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
def get_url(section):
    """Build the landing-page URL that is scraped for a section.

        Args:
            section (str): name of the section, e.g. 'safety', 'health' or 'transport'.
        Returns:
            URL (str): the URL to be scraped for the given section.
    """
    # 'safety' is the one section that is not served from an ec.europa.eu subdomain.
    if section == 'safety':
        return 'https://www.europol.europa.eu/'

    # All other sections map to '<section>.ec.europa.eu'; only 'health' gets the
    # explicit 'index_en' page appended.
    page = 'index_en' if section == 'health' else ''
    return f'https://{section}.ec.europa.eu/{page}'
    


In [3]:
# Sanity check: show the URL built for each section, one per line.
print('\n'.join(
    get_url(name)
    for name in ('safety', 'health', 'transport', 'culture', 'education', 'environment')
))

https://www.europol.europa.eu/
https://health.ec.europa.eu/index_en
https://transport.ec.europa.eu/
https://culture.ec.europa.eu/
https://education.ec.europa.eu/
https://environment.ec.europa.eu/


In [4]:
def get_html(url, timeout=10, parser='html.parser'):
    """Download a page and parse it into a BeautifulSoup document.

        Args:
            url (str): address of the page to download.
            timeout (float): seconds to wait for the server before giving up.
            parser (str): name of the parser BeautifulSoup should use.
        Returns:
            soup (BeautifulSoup): the parsed HTML of the response body.
        Raises:
            requests.exceptions.RequestException: if the request fails, times out,
                or the server answers with a 4xx/5xx status.
    """
    # Without a timeout, requests can wait indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    # Naming the parser avoids bs4's GuessedAtParserWarning and keeps the parse
    # tree the same regardless of which optional parsers are installed.
    return BeautifulSoup(response.text, parser)

In [5]:
# Only the health and culture pages are fetched here; the other sections are
# left commented out.
#safety_URL = get_url('safety')
health_URL = get_url('health')
#education_URL = get_url('education')
#transport_URL = get_url('transport')
culture_URL = get_url('culture')

culture = get_html(culture_URL)
health = get_html(health_URL)

# Show the page that was just downloaded instead of requesting it a second time.
print(health)

<!DOCTYPE html>
<html dir="ltr" lang="en" prefix="og: https://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<meta content="Public health" name="description"/>
<meta content="public health" name="keywords"/>
<meta content="en" http-equiv="content-language"/>
<link href="https://health.ec.europa.eu/index_en" rel="canonical"/>
<meta content="auto" property="og:determiner"/>
<meta content="Public Health" property="og:site_name"/>
<meta content="website" property="og:type"/>
<meta content="https://health.ec.europa.eu/index_en" property="og:url"/>
<meta content="Public Health" property="og:title"/>
<meta content="Public health" property="og:description"/>
<meta content="summary_large_image" name="twitter:card"/>
<meta content="Public Health" name="twitter:title"/>
<meta content="Public health" name="twitter:description"/>
<meta content="https://health.ec.europa.eu/index_en" name="twitter:url"/>
<meta content="https://health.ec.europa.eu/profiles/contrib/ewcms/modules/ewcms_seo/assets/images/ec

In [None]:
def _top_three_culture(html):
    """ Returns the attributes of the first three news cards found on the culture page.
        Args:
            html (BeautifulSoup): parsed HTML of the page to search.
        Returns:
            story_attrs (list of dict): the `attrs` dictionary of each of the first three
                'eac-whatsnew-card-news' elements returned by find_all (fewer if the
                page contains fewer than three).
    """
    news_cards = html.find_all('eac-whatsnew-card-news')

    story_attrs = []
    for card in news_cards[:3]:
        story_attrs.append(card.attrs)
    return story_attrs


In [7]:
# Try the culture scraper on the `culture` page parsed earlier in the notebook.
print(_top_three_culture(culture))

[{'title': 'Here is the laureate of the 2025 European Union Prize for Literature', 'link': '/news/here-is-the-laureate-of-the-2025-european-union-prize-for-literature', 'summary': '19 May 2025', 'image': 'https://culture.ec.europa.eu/sites/default/files/styles/eac_ratio_16_9_w_480/public/2025-05/3023_EUPL_NI_Verna_pic.png?h=d1cb525d&itok=2j8CF4Ek', 'parent-background-color': 'white', 'variant': 'crecul'}, {'title': 'EUmies Awards Young Talent 2025 finalists announced', 'link': '/news/eumies-awards-young-talent-2025-finalists-announced', 'summary': '12 May 2025', 'image': 'https://culture.ec.europa.eu/sites/default/files/styles/eac_ratio_16_9_w_480/public/2025-01/2025-eumies-young-visual-Maria_de%20_la%20_O%20_Molina_Perez-Tome_Winner-YT2023-16-9.jpg?h=c673cd1c&itok=AeHW5ZgT', 'parent-background-color': 'white', 'variant': 'crecul'}, {'title': 'Eurobarometer publishes findings on Europeans’ attitudes towards culture', 'link': '/news/eurobarometer-publishes-findings-on-europeans-attitude

In [12]:
def _top_three_health(html):
    """ Returns the attributes and link text of the first three standalone links on the health page.
        Args:
            html (BeautifulSoup): parsed HTML of the page to search.
        Returns:
            story_attrs (list of tuple): one (attrs, text) pair per link, where attrs is the
                anchor's attribute dictionary and text is its text with surrounding
                whitespace stripped. Holds fewer than three pairs if fewer links match.
    """
    standalone_links = html.find_all('a', class_='ecl-link ecl-link--standalone')

    story_attrs = []
    for link in standalone_links[:3]:
        link_attrs = link.attrs
        headline = link.text.strip()
        story_attrs.append((link_attrs, headline))
    return story_attrs

In [13]:
# Try the health scraper on the `health` page parsed earlier in the notebook.
print(_top_three_health(health))

[({'href': 'https://ec.europa.eu/newsroom/sante/newsletter-archives/63609', 'class': ['ecl-link', 'ecl-link--standalone']}, 'The European Commission Hosts Key One Health Meeting'), ({'href': 'https://ec.europa.eu/newsroom/sante/newsletter-archives/63497', 'class': ['ecl-link', 'ecl-link--standalone']}, 'Commission adopts temporary restriction on use of Chikungunya vaccine for older patients'), ({'href': 'https://ec.europa.eu/newsroom/sante/newsletter-archives/63127', 'class': ['ecl-link', 'ecl-link--standalone']}, 'Register now: EU webinar on non-communicable diseases (16 May 2025, 10.00-13.00 CEST)')]


In [None]:
def _top_three_environment(html):
    """ Returns the attributes and link text of the first three standalone links, intended for the environment page.
        Args:
            html (BeautifulSoup): parsed HTML of the page to search.
        Returns:
            story_attrs (list of tuple): one (attrs, text) pair per link, where attrs is the
                anchor's attribute dictionary and text is its text with surrounding
                whitespace stripped. Holds fewer than three pairs if fewer links match.
    """
    # NOTE(review): same selector as _top_three_health, and this cell has no recorded
    # run -- confirm it matches the markup of the environment page.
    first_links = html.find_all('a', class_='ecl-link ecl-link--standalone')[:3]
    return [(anchor.attrs, anchor.text.strip()) for anchor in first_links]

In [None]:
# def find_top_three(soup, section):
#     """ Returns a list containing the title and hyperlinks to the three most recent news items on each section
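#         Args:
#             soup (BeautifulSoup): parsed HTML of the section's page.
#             section (str): name of the section the page belongs to, e.g. 'health' or 'culture'.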
#         Returns:
#             top_three (list of soup objects): list containing top three news headlines and their links.
#     """
#     if section == 'safety':

        
#     elif section == 'education':

#     elif section == 'health':

#     elif section == 'culture':

#     elif section == 'environment':

#     elif section == 'transport':
