In [1]:
# In this section, we build a web crawler for the ECB press conference transcripts
# The original link is: https://www.ecb.europa.eu/press/html/index.en.html

# Import the necessary modules
from bs4 import BeautifulSoup
import requests
import re
import urllib.request
import os

# Step 1: collect the link of every transcript before downloading anything.
# final_links maps str(year) -> list of absolute transcript URLs for that year.
final_links = {}

# The ECB disclosures are available from 1998 onwards; each year has its own index page.
base_url = "https://www.ecb.europa.eu/press/pressconf/"
# hrefs on the index pages are prefixed with the site root to make them absolute.
link_base_url = "https://www.ecb.europa.eu"

for year in range(1998, 2020):    # from 1998 to 2019
    path = str(year) + "/html/index.en.html"
    # Fetch the yearly index page. The timeout keeps a stalled connection from
    # hanging the notebook, and raise_for_status() stops an HTTP error page from
    # being parsed as if it were a (link-less) index.
    html = requests.get(base_url + path, timeout=30)
    html.raise_for_status()
    parsing_html = BeautifulSoup(html.content, "html.parser")

    # Each meeting transcript lives behind its own link. Two page layouts are handled:
    #   1. a <div> whose text is "ENGLISH" wrapping the <a> of the transcript;
    #   2. otherwise, "ecb-langSelector" divs holding an "offeredLanguage" span
    #      with an <a class="arrow"> inside.
    links = parsing_html.find_all("div", string="ENGLISH")
    if links:
        anchors = [link.find("a") for link in links]
    else:
        # The first language selector is skipped, as in the original crawl
        # (presumably it belongs to the index page itself -- TODO confirm).
        selectors = parsing_html.find_all("div", {"class": "ecb-langSelector"})[1:]
        spans = [sel.find("span", {"class": "offeredLanguage"}) for sel in selectors]
        # A selector without the expected span is ignored instead of crashing on None.
        anchors = [span.find("a", {"class": "arrow"}) for span in spans if span is not None]

    # Keep only anchors that exist and actually carry an href.
    transcript_link = [
        link_base_url + anchor.get("href")
        for anchor in anchors
        if anchor is not None and anchor.get("href")
    ]
    final_links[str(year)] = transcript_link
    print("Completed: " + str(year))

Completed: 1998
Completed: 1999
Completed: 2000
Completed: 2001
Completed: 2002
Completed: 2003
Completed: 2004
Completed: 2005
Completed: 2006
Completed: 2007
Completed: 2008
Completed: 2009
Completed: 2010
Completed: 2011
Completed: 2012
Completed: 2013
Completed: 2014
Completed: 2015
Completed: 2016
Completed: 2017
Completed: 2018
Completed: 2019


In [2]:
# Step 2: download every collected transcript page.
# Files are written to ./ECB PC transcripts/<year>/<date stamp>.txt, where the
# date stamp is the two-digit year followed by four digits, as found in the link.
for year in final_links.keys():
    year_dir = "./ECB PC transcripts/" + year
    # exist_ok avoids the separate exists() check and makes the cell re-runnable.
    os.makedirs(year_dir, exist_ok=True)

    # The pattern only depends on the year, so compile it once per year.
    tail_year = year[2:]
    p = re.compile(tail_year + "[0-9][0-9][0-9][0-9]")

    # Counts how often each date stamp was seen in this run, so that two links
    # with the same stamp get distinct files (<stamp>.txt, <stamp>_2.txt, ...).
    # The earlier download-then-os.rename approach silently replaced the first
    # file on POSIX, where os.rename never raises FileExistsError for files.
    seen = {}
    for link in final_links[year]:
        name = p.search(link)
        if name is None:
            # No date stamp in the link: there is no file name to derive, so
            # report the link and continue rather than failing on None.group().
            print("Skipped (no date stamp found): " + str(link))
            continue
        stem = name.group()
        seen[stem] = seen.get(stem, 0) + 1
        if seen[stem] > 1:
            stem = stem + "_" + str(seen[stem])
        # Download straight into the target folder; no temporary file in the cwd.
        urllib.request.urlretrieve(str(link), year_dir + "/" + stem + ".txt")
    print("Download completed: " + year)

Download completed: 1998
Download completed: 1999
Download completed: 2000
Download completed: 2001
Download completed: 2002
Download completed: 2003
Download completed: 2004
Download completed: 2005
Download completed: 2006
Download completed: 2007
Download completed: 2008
Download completed: 2009
Download completed: 2010
Download completed: 2011
Download completed: 2012
Download completed: 2013
Download completed: 2014
Download completed: 2015
Download completed: 2016
Download completed: 2017
Download completed: 2018
Download completed: 2019
