In [17]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from datetime import datetime
from unidecode import unidecode
import re

In [18]:
if __name__ == '__main__':

    # add your user agent 
    HEADERS = ({'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'})

    # The webpage URL
    URL = "https://www.ecb.europa.eu/press/pr/activities/mopo/html/index.en.html"
    
    # HTTP Request
    webpage = requests.get(URL, headers=HEADERS)

    # Soup Object containing all data
    soup = BeautifulSoup(webpage.content, "html.parser")

    date_list = []
    title_list = []
    link_list = []
    article_list = []

    # Fetch dates, names and links as lists
    dates = soup.find_all("div", attrs={'class':'date'})
    titles = soup.find_all("div", attrs={'class':'category'})
    links = soup.find_all('dd')

    for date in dates:
        date_list.append(date.text)
    
    for title in titles:
        title_list.append(title.text)

    for link in links:
        link_list.append('https://www.ecb.europa.eu/' + link.find('div', class_='title').find('a').get('href'))        

In [19]:
# Create a dataframe with the lists
df = pd.DataFrame(list(zip(date_list, title_list, link_list)), columns=['Date', 'Title', 'Link'])

# Filter to keep only those with Title MONETARY POLICY DECISION and Date from 2021 onward
df = df[df['Title'] == 'MONETARY POLICY DECISION']
df = df[df['Date'].apply(lambda x: datetime.strptime(x, '%d %B %Y').year >= 2021)]

df.head()

Unnamed: 0,Date,Title,Link
4,11 April 2024,MONETARY POLICY DECISION,https://www.ecb.europa.eu//press/pr/date/2024/...
12,7 March 2024,MONETARY POLICY DECISION,https://www.ecb.europa.eu//press/pr/date/2024/...
15,25 January 2024,MONETARY POLICY DECISION,https://www.ecb.europa.eu//press/pr/date/2024/...
19,14 December 2023,MONETARY POLICY DECISION,https://www.ecb.europa.eu//press/pr/date/2023/...
23,26 October 2023,MONETARY POLICY DECISION,https://www.ecb.europa.eu//press/pr/date/2023/...


In [20]:
if __name__ == '__main__':

    # add your user agent 
    HEADERS = ({'User-Agent':'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; ru) Opera 8.0', 'Accept-Language': 'en-US, en;q=0.5'})

    # The base webpage URL
    url_list = df['Link'].tolist()
    article_list = []

    for link in url_list:
        try:
            # HTTP Request
            webpage = requests.get(link, headers=HEADERS)

            # Soup Object containing all data
            soup = BeautifulSoup(webpage.content, "html.parser")
            a_tag = soup.find('div', id='main-wrapper')
            text = a_tag.find('div', class_='section').text.strip()

        except: text = np.nan

        try: text = unidecode(text)
        except:
            text = np.nan
        article_list.append(text)

In [21]:
# Define a regular expression pattern to match the date at the beginning of each text
date_pattern = r'^\d+\s+\w+\s+\d+'

# Remove the date from each text using re.sub()
cleaned_articles = [re.sub(date_pattern, '', text).lstrip() for text in article_list]

In [22]:
# Add the cleaned articles to the DataFrame
df['Article'] = cleaned_articles

In [24]:
# Change date format 11 April 2024 to 2024-04-11
df['Date'] = df['Date'].apply(lambda x: datetime.strptime(x, '%d %B %Y').strftime('%Y-%m-%d'))

Unnamed: 0,Date,Title,Link,Article
4,2024-04-11,MONETARY POLICY DECISION,https://www.ecb.europa.eu//press/pr/date/2024/...,The Governing Council today decided to keep th...
12,2024-03-07,MONETARY POLICY DECISION,https://www.ecb.europa.eu//press/pr/date/2024/...,The Governing Council today decided to keep th...
15,2024-01-25,MONETARY POLICY DECISION,https://www.ecb.europa.eu//press/pr/date/2024/...,The Governing Council today decided to keep th...
19,2023-12-14,MONETARY POLICY DECISION,https://www.ecb.europa.eu//press/pr/date/2023/...,The Governing Council today decided to keep th...
23,2023-10-26,MONETARY POLICY DECISION,https://www.ecb.europa.eu//press/pr/date/2023/...,The Governing Council today decided to keep th...


In [28]:
# Filter to keep only those between July 2022 and March 2024
df = df[df['Date'].apply(lambda x: '2022-07-01' <= x <= '2024-03-31')]

# Remove duplicates
df = df.drop_duplicates()
df = df.reset_index(drop=True)
print(df.shape)
df.head()

(14, 4)


Unnamed: 0,Date,Title,Link,Article
0,2024-03-07,MONETARY POLICY DECISION,https://www.ecb.europa.eu//press/pr/date/2024/...,The Governing Council today decided to keep th...
1,2024-01-25,MONETARY POLICY DECISION,https://www.ecb.europa.eu//press/pr/date/2024/...,The Governing Council today decided to keep th...
2,2023-12-14,MONETARY POLICY DECISION,https://www.ecb.europa.eu//press/pr/date/2023/...,The Governing Council today decided to keep th...
3,2023-10-26,MONETARY POLICY DECISION,https://www.ecb.europa.eu//press/pr/date/2023/...,The Governing Council today decided to keep th...
4,2023-09-14,MONETARY POLICY DECISION,https://www.ecb.europa.eu//press/pr/date/2023/...,Inflation continues to decline but is still ex...


In [30]:
df.to_csv('C:/Users/joaqu/OneDrive/Desktop/Masters/ECB_Perceived_Cacophony/Scraped_Data/ecb_monetary_policy_decisions.csv', index=False)