In [None]:
# Scraping the minutes
from bs4 import BeautifulSoup
import requests
import re
import urllib.request
import os

# Set-up for collecting the links to the FOMC minutes.
# NOTE(review): this flag is presumably meant to be switched off when the text
# data is already available locally - confirm; it is not read in this cell.
link_to_file_on_website = True

# Maps a year (as a string) to the list of minutes URLs found for that year;
# it is filled in by the scraping loop below.
minute_links = dict()

# Only minutes are of interest in this kernel. `l_min` is the length of the
# file-name prefix that identifies a minutes document.
l_min = len("fomcminutes")

# Before storing the links, note two pitfalls:
# 1. Up to and including 2006, the minutes are published as HTML pages.
# 2. Up to and including 2013, the minutes and other disclosures are located
#    at a different URL from the one used for minutes from 2014 onwards.

# Accordingly, the scraper below handles each period separately.
# Collect the links to the minutes for every year from 1982 through 2019.
#
# Three anchor layouts are handled:
#   * up to 1992   - anchor text matches r"\bMinutes\b";
#   * 1993 to 2007 - anchor text is exactly "Minutes";
#   * 2008 onwards - anchor text matches "PDF.*" and the file name at the end
#                    of the href starts with "fomcminutes<year>".
# Years up to 2013 are read from the per-year "fomchistorical<year>.htm" page;
# later years are read from the shared "fomccalendars.htm" page.
#
# NOTE(review): the calendar page presumably lists only a limited window of
# years, so older years may come back empty from it - confirm against the site.

FED_SITE_ROOT = "https://www.federalreserve.gov"
HISTORICAL_PAGE = FED_SITE_ROOT + "/monetarypolicy/fomchistorical{year}.htm"
CALENDAR_PAGE = FED_SITE_ROOT + "/monetarypolicy/fomccalendars.htm"
REQUEST_TIMEOUT_SECONDS = 30


def _fetch_soup(url):
    """Download `url` and return the page parsed with BeautifulSoup.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never mistaken for a year without minutes.
        requests.Timeout: if the server does not answer within
            REQUEST_TIMEOUT_SECONDS.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


def _absolute_url(href):
    """Return `href` unchanged if it is already absolute, else prefix the site root."""
    if href.startswith("http"):
        return href
    return FED_SITE_ROOT + href


def _minutes_links(year, soup):
    """Return the absolute URLs of the minutes for `year` found in `soup`.

    Anchors without an `href` attribute are ignored (`href=True`), so they can
    no longer raise KeyError/TypeError while the links are being built.
    """
    if year < 1993:
        anchors = soup.find_all("a", string=re.compile(r"\bMinutes\b"), href=True)
    elif year < 2008:
        anchors = soup.find_all("a", string="Minutes", href=True)
    else:
        # These pages link several kinds of PDF; keep only those whose file
        # name (the part after the last "/") is a minutes file for this year.
        wanted_prefix = "fomcminutes" + str(year)
        anchors = [
            anchor
            for anchor in soup.find_all("a", string=re.compile("PDF.*"), href=True)
            if anchor["href"].rsplit("/", 1)[-1].startswith(wanted_prefix)
        ]
    return [_absolute_url(anchor["href"]) for anchor in anchors]


# The calendar page is the same for every year from 2014 on, so it is
# downloaded and parsed only once, the first time it is needed.
calendar_soup = None

for year in range(1982, 2020):  # 1982 through 2019
    if year < 2014:
        soup = _fetch_soup(HISTORICAL_PAGE.format(year=year))
    else:
        if calendar_soup is None:
            calendar_soup = _fetch_soup(CALENDAR_PAGE)
        soup = calendar_soup
    minute_links[str(year)] = _minutes_links(year, soup)
    print("Year Complete: ", year)

In [None]:
# Flatten the per-year link lists into one list of document URLs.
# The order follows the iteration order of `minute_links` (and the order of
# the links within each year); no explicit sort is applied.
sorted_transcripts = [
    url
    for year_links in minute_links.values()
    for url in year_links
]
print("Number of Documents", len(sorted_transcripts))

In [None]:
# Download every collected document into ./FOMCminutes/<year>/, using the last
# path segment of the URL as the file name.
for year, year_links in minute_links.items():
    year_dir = "./FOMCminutes/" + year
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(year_dir, exist_ok=True)
    for link in year_links:
        file_name = re.search("[^/]*$", str(link)).group()
        if not file_name:
            # A URL ending in "/" has no file name; opening the directory path
            # for writing would fail, so report it and move on.
            print("skipped (no file name in URL):", link)
            continue
        print(link)
        # Both the HTTP response and the output file are closed on exit, even
        # if the download fails half-way.
        with urllib.request.urlopen(str(link)) as response, \
                open(year_dir + "/" + file_name, 'wb') as f:
            f.write(response.read())
        print("file downloaded")