In [None]:
# Collecting the links to the FOMC statements
from bs4 import BeautifulSoup
import requests
import re
import urllib.request
import os

# Before 1994, the "Record of Policy Actions" was not released
# immediately after each meeting, so we are not interested in
# that period.

# From 1994 onward, a statement is published immediately after
# each meeting; those statements are the target of our research.
l_state = len("monetary")
statement_links = {}

# Again, we split our code into 3 subparts, before 2014 ...
# 2014 and 2015, and from 2016 until now
for year in range(1994, 2020): # from 1994 - 2019
    if year < 2014:
        base_url = "https://www.federalreserve.gov/monetarypolicy/"
        path = "fomchistorical" + str(year) + ".htm"
        html_doc = requests.get(base_url + path)
        soup = BeautifulSoup(html_doc.content, 'html.parser')
        links = soup.find_all("a", string = "Statement")
        link_base_url = "https://www.federalreserve.gov"
        statement_links[str(year)] = [link_base_url + link["href"] for link in links]
        print("Year Completed: ", year)
    elif year in [2014, 2015]:
        base_url = "https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm"
        html_doc = requests.get(base_url)
        soup = BeautifulSoup(html_doc.content, 'html.parser')
        links = soup.find_all("a", string = "Statement")
        link_base_url = "https://www.federalreserve.gov"
        final_link = []
        for link in links:
            p = re.compile("[^/]*$")
            if p.search(link.get("href")).group()[:(l_state+4)] == 'monetary' + str(year):
                final_link.append(link_base_url + link["href"])
        statement_links[str(year)] = final_link
        print("Year Completed: ", year)
    else:
        # After 2014, since all years are located in only one URL, we try to extract them out
        base_url = "https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm"
        html_doc = requests.get(base_url)
        soup = BeautifulSoup(html_doc.content, 'html.parser')
        links = soup.find_all("a", string = re.compile("PDF.*"))
        link_base_url = "https://www.federalreserve.gov"
        final_link = []
        for link in links:
            p = re.compile("[^/]*$")
            if p.search(link.get("href")).group()[:(l_state+4)] == 'monetary' + str(year):
                final_link.append(link_base_url + link["href"])
        statement_links[str(year)] = final_link
        print("Year Completed: ", year)

In [None]:
# Download every collected statement into ./FOMCstatements/<year>/.
for year in statement_links:
    year_dir = "./FOMCstatements/" + year
    # exist_ok avoids the check-then-create race and makes re-runs safe.
    os.makedirs(year_dir, exist_ok=True)
    if int(year) < 2016:
        # Links before 2016 are saved as <YYYYMMDD>.txt, with the date
        # taken from the URL itself.
        # NOTE: two links that share the same date map to the same file
        # name, so the later download overwrites the earlier one.
        date_pattern = re.compile(year + "[0-9][0-9][0-9][0-9]")
        for link in statement_links[year]:
            url = str(link)
            date_match = date_pattern.search(url)
            if date_match is None:
                # No date means no file name can be derived; report the
                # link and carry on instead of aborting the whole run.
                print("Skipped (no date found in URL): " + url)
                continue
            # Write straight to the destination rather than downloading
            # into the working directory and renaming afterwards.
            urllib.request.urlretrieve(
                url, year_dir + "/" + date_match.group() + ".txt"
            )
    else:
        # From 2016 on, each file keeps the name it has in the URL
        # (the text after the last "/").
        for link in statement_links[year]:
            url = str(link)
            file_name = url.rsplit("/", 1)[-1]
            # Context managers close both the HTTP response and the file.
            with urllib.request.urlopen(url) as response, \
                    open(year_dir + "/" + file_name, 'wb') as f:
                f.write(response.read())
    print("Download completed: " + year)