In [None]:
import lxml
import lxml.html
import requests
import cssselect
from bs4 import BeautifulSoup
import time
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import re
import os
import nltk

This notebook documents the text-gathering process for analyzing the Federal Reserve's use of language.

In [None]:
# Template for the per-year FOMC historical-materials page; the "{}" placeholder
# is filled with a year through url.format(year) in the cells below.
url = "https://www.federalreserve.gov/monetarypolicy/fomchistorical{}.htm"
# Site root passed to urljoin to turn hrefs found on those pages into absolute URLs.
base_url = "https://www.federalreserve.gov"

In [None]:
# Quick look at the 1936 index page: print the text of every anchor that
# mentions "Minutes".
r = requests.get(url.format(1936))
soup = BeautifulSoup(r.text, "html.parser")
links = soup.find_all("a")
for link in filter(lambda anchor: "Minutes" in anchor.text, links):
    print(link.text)

In [None]:
def get_html_min(year):
    """Return the text of every "Minutes" page linked from one year's index.

    Fetches the index page produced by formatting the module-level ``url``
    template with ``year``, follows every anchor whose text contains
    "Minutes", and extracts the text of each linked page.

    Parameters
    ----------
    year : int or str
        Value substituted into the ``url`` template.

    Returns
    -------
    list of str
        One string per "Minutes" link, in page order, with runs of line
        breaks replaced by a single space.
    """
    index_page = requests.get(url.format(year))
    index_soup = BeautifulSoup(index_page.text, "html.parser")

    # Resolve each matching href against the site root so it can be requested directly.
    minutes_links = [
        requests.compat.urljoin(base_url, anchor.get('href'))
        for anchor in index_soup.find_all("a")
        if "Minutes" in anchor.text
    ]

    year_text = []
    for meeting in minutes_links:
        meeting_page = requests.get(meeting)
        meeting_soup = BeautifulSoup(meeting_page.text, "html.parser")
        # A response that parses without a <body> tag would make .body None;
        # fall back to the whole document instead of raising AttributeError.
        container = meeting_soup.body if meeting_soup.body is not None else meeting_soup
        # Replace line breaks with a space instead of deleting them: deletion
        # glued together words that were separated only by a newline.
        min_text = re.sub(r"[\r\n]+", " ", container.get_text())
        year_text.append(min_text)

    return year_text

In [None]:
# Preview a slice of each 2005 minutes URL.
year = "2005"
r = requests.get(url.format(year))
soup = BeautifulSoup(r.text, "html.parser")
links = soup.find_all("a")

minutes = []
for link in links:
    if "Minutes" in link.text:
        minutes.append(link)

# Build and print the link list once, after the scan above has finished.
# These steps were previously nested inside the scan loop, so the list was
# rebuilt and reprinted for every anchor on the page.
minutes_links = []
for link in minutes:
    minutes_links.append(requests.compat.urljoin(base_url, link.get('href')))

for mine in minutes_links:
    # Eight characters ending just before the last four of the URL;
    # presumably the date stamp in the file name - verify against real links.
    print(mine[-12:-4])

In [None]:
# Scrape the text of every "Minutes" link on the 1994 index page
# (one HTTP request for the index plus one per linked page).
text_1994 = get_html_min(1994)

In [None]:
# Scrape 1994 through 2007; each element is the list of documents
# get_html_min returned for one year.
total_text = [get_html_min(year) for year in range(1994, 2008)]

In [None]:
# Show how many documents were collected per year instead of printing the
# whole corpus, which floods the output with the full text of every document.
print([len(year_docs) for year_docs in total_text])

In [None]:
# Flatten the per-year lists into one list of documents, keeping their order.
flattext = []
for year_docs in total_text:
    flattext.extend(year_docs)

In [None]:
# Display the first scraped document in full (the output can be very long).
flattext[0]

In [None]:
# Label the first three documents for a small write-to-disk test.
# NOTE(review): despite the name, `df` is a plain dict, not a DataFrame.
df = {"First": flattext[0],
      "Second": flattext[1],
      "Third": flattext[2]}

In [None]:
# Write each labelled document to texts/<label>.txt.
# open() does not create missing directories, so make sure texts/ exists first.
os.makedirs("texts", exist_ok=True)
for i in df:
    with open("texts/" + i + ".txt", "w") as text_file:
        text_file.write(df[i])

In [None]:
# Write up to the first ten documents to test0.txt ... test9.txt in the
# working directory. Slicing instead of a fixed range(0, 10) avoids an
# IndexError when fewer than ten documents were scraped.
for i, doc_text in enumerate(flattext[:10]):
    with open("test" + str(i) + ".txt", "w") as text_file:
        text_file.write(doc_text)

In [None]:
# Collect the names of all .txt files in the current working directory.
names = os.listdir()
txt_list = [name for name in names if name.endswith(".txt")]

In [None]:
# Read every collected .txt file back into a dict keyed by the file name
# with its last four characters (the ".txt" suffix) removed.
test_df = {}
for txt_name in txt_list:
    with open(txt_name, "r") as handle:
        test_df[txt_name[:-4]] = handle.read()

In [None]:
# Check which file stems were loaded.
test_df.keys()

In [None]:
# Tokenize each 1994 document into a list of word tokens.
doc_tokens = [nltk.tokenize.word_tokenize(doc) for doc in text_1994]

# Report the token count per document instead of printing every token of
# every document, which buries the rest of the notebook in output.
print([len(tokens) for tokens in doc_tokens])

In [None]:
# Inspect the body text of a single 1994 minutes page.
# The address gets its own name: assigning it to the global `url` would
# replace the "{}" year template that get_html_min, get_pdfs and the other
# scraping cells format with a year, so they would all fetch this page instead.
sample_minutes_url = "https://www.federalreserve.gov/fomc/MINUTES/1994/19940204min.htm"
r = requests.get(sample_minutes_url)
soup = BeautifulSoup(r.text, "html.parser")
soup.body.get_text()

In [None]:
def get_pdfs(year):
    """Download every "Minutes" link on one year's index page.

    Formats the module-level ``url`` template with ``year``, collects every
    anchor whose text contains "Minutes", makes sure the per-year output
    directory exists, and hands each absolute link to ``download_pdf``.

    Parameters
    ----------
    year : int or str
        Value substituted into the ``url`` template and used as the name of
        the output sub-directory.

    Returns
    -------
    None
    """
    r = requests.get(url.format(year))
    soup = BeautifulSoup(r.text, "html.parser")

    # NOTE(review): links are not filtered by file type, so every "Minutes"
    # anchor is passed to download_pdf whatever it points at.
    minutes_links = [
        requests.compat.urljoin(base_url, link.get('href'))
        for link in soup.find_all("a")
        if "Minutes" in link.text
    ]

    directory = "C:/Users/mjcor/Documents/GitHub/fed_reserve_text_project/pdfs/" + str(year) + "/"
    # makedirs(exist_ok=True) replaces the os.stat / bare-except / os.mkdir
    # sequence: it also creates missing parent directories and no longer
    # swallows unrelated exceptions (including KeyboardInterrupt).
    os.makedirs(directory, exist_ok=True)
    for link in minutes_links:
        download_pdf(link, year)

In [None]:
def get_transcripts(year):
    """Download every "Transcript " link on one year's index page.

    Formats the module-level ``url`` template with ``year``, collects every
    anchor whose text contains "Transcript " (with the trailing space), makes
    sure the per-year output directory exists, and hands each absolute link
    to ``download_pdf``. Files go to the same pdfs/<year>/ directory that
    ``get_pdfs`` writes to.

    Parameters
    ----------
    year : int or str
        Value substituted into the ``url`` template and used as the name of
        the output sub-directory.

    Returns
    -------
    None
    """
    r = requests.get(url.format(year))
    soup = BeautifulSoup(r.text, "html.parser")

    # NOTE(review): the match string ends with a space; confirm that this is
    # intentional and not a typo.
    transcript_links = [
        requests.compat.urljoin(base_url, link.get('href'))
        for link in soup.find_all("a")
        if "Transcript " in link.text
    ]

    directory = "C:/Users/mjcor/Documents/GitHub/fed_reserve_text_project/pdfs/" + str(year) + "/"
    # makedirs(exist_ok=True) replaces the os.stat / bare-except / os.mkdir
    # sequence: it also creates missing parent directories and no longer
    # swallows unrelated exceptions (including KeyboardInterrupt).
    os.makedirs(directory, exist_ok=True)
    for link in transcript_links:
        download_pdf(link, year)

In [None]:
# List the anchors on the 2008 index page whose text mentions "Transcript".
r = requests.get(url.format("2008"))
soup = BeautifulSoup(r.text, "html.parser")
links = soup.find_all("a")
minutes = [anchor for anchor in links if "Transcript" in anchor.text]
print(minutes)

In [None]:
# Download transcripts for 2008 through 2012.
# NOTE(review): get_transcripts calls download_pdf, which is defined in a
# later cell; on a fresh kernel that cell must be run first.
for year in range(2008, 2013):
    get_transcripts(year)

In [None]:
# Display the site root used when resolving relative links.
base_url

In [None]:
# Download the minutes PDFs linked from the FOMC calendar page.
# The address gets its own name: assigning it to the global `url` would
# replace the "{}" year template that the get_* functions format, so every
# later call would silently fetch this calendar page instead.
calendar_url = "https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm"
r = requests.get(calendar_url)
soup = BeautifulSoup(r.text, "html.parser")
links = soup.find_all("a")
pdfs = []
for link in links:
    href = link.get("href")
    # Keep anchors whose children include the exact string "PDF"; skip any
    # without an href, since None would break the substring test below.
    if "PDF" in link.contents and href is not None:
        pdfs.append(href)

for pdf in pdfs:
    if "minutes" in pdf:
        # Four characters taken 12..9 from the end of the href; presumably the
        # year in a ...YYYYMMDD.pdf file name - verify against real links.
        yr = pdf[-12:-8]
        dl_pdf = requests.compat.urljoin(base_url, pdf)
        directory = "C:/Users/mjcor/Documents/GitHub/fed_reserve_text_project/pdfs/" + str(yr) + "/"
        # Creates the directory (and any missing parents) only when needed,
        # without a bare except that hides unrelated errors.
        os.makedirs(directory, exist_ok=True)
        download_pdf(dl_pdf, yr)

In [None]:
# Download the minutes for every year from 1954 through 2012.
# NOTE(review): get_pdfs calls download_pdf, which is defined in a later
# cell; on a fresh kernel that cell must be run before this one.
for i in range(1954, 2013):
    get_pdfs(i)

In [None]:
# Preview the slice download_pdf uses as a file name.
# NOTE(review): `min_link` is not defined anywhere in the visible notebook,
# so this raises NameError on a fresh kernel.
min_link[0][-19:-4]

In [None]:
def download_pdf(pdf_url, year, base_dir='C:/Users/mjcor/Documents/GitHub/fed_reserve_text_project/pdfs/'):
    """Download one file and save it as <base_dir><year>/<name>.pdf.

    Parameters
    ----------
    pdf_url : str
        Absolute URL of the file to download.
    year : int or str
        Name of the sub-directory under ``base_dir``; it must already exist.
    base_dir : str, optional
        Output root, including its trailing slash. Defaults to the location
        that was previously hard-coded.

    Returns
    -------
    None
        The file name stem is printed as a progress indicator. Responses with
        an error status are reported and skipped.
    """
    response = requests.get(pdf_url)
    # Do not store an error page under a .pdf name.
    if not response.ok:
        print("Skipping " + pdf_url + " (HTTP " + str(response.status_code) + ")")
        return
    # Same 15-character slice as before, trimmed to the part after the last
    # "/": for short file names the slice used to reach into the parent path
    # segment, putting a "/" inside the file name.
    filename = pdf_url[-19:-4].rsplit("/", 1)[-1]
    print(filename)
    with open(base_dir + str(year) + "/" + filename + ".pdf", "wb") as f:
        f.write(response.content)

In [None]:
# NOTE(review): `min_link` is not defined in the visible notebook, and
# download_pdf as defined above requires a `year` argument, so this cell
# fails on a fresh kernel.
for link in min_link:
    download_pdf(link)

In [None]:
# NOTE(review): same problem as the loop above - `min_link` is undefined
# here and the required `year` argument is missing.
download_pdf(min_link[1])

In [None]:
# IPython help lookup left over from development; it has no effect on the analysis.
?re.findall

In [None]:
# NOTE(review): `min_link` is not defined in the visible notebook.
min_link

In [None]:
# NOTE(review): `min_link` is not defined in the visible notebook.
min_link[1]

In [None]:
def get_statement_links(year):
    """Collect the absolute URLs of the "Statement" links for one year.

    Formats the module-level ``url`` template with ``year``, keeps the
    anchors whose text is exactly "Statement", and resolves each href
    against ``base_url``.

    Parameters
    ----------
    year : int or str
        Value substituted into the ``url`` template.

    Returns
    -------
    list of str
        Absolute statement URLs in page order.
    """
    page = requests.get(url.format(year))
    parsed = BeautifulSoup(page.text, "html.parser")
    return [
        requests.compat.urljoin(base_url, anchor.get('href'))
        for anchor in parsed.find_all("a")
        if anchor.text == "Statement"
    ]

In [None]:
def get_statement_texts(links):
    """Download statement pages and collect each one's date and body text.

    Parameters
    ----------
    links : iterable of str
        Absolute URLs of statement pages.

    Returns
    -------
    pandas.DataFrame
        One row per page that had the expected layout, with columns "Dates"
        (first child of the ``p.article__time`` element) and "Texts" (text of
        the content ``div``, with line breaks replaced by a space). Pages
        missing either element are reported and skipped.
    """
    dates = []
    texts = []

    for link in links:
        r = requests.get(link)
        soup = BeautifulSoup(r.text, "html.parser")
        date_tag = soup.find("p", {"class": "article__time"})
        body_tag = soup.find("div", {"class" : "col-xs-12 col-sm-8 col-md-8"})
        # find() returns None when the element is absent; skip that page
        # instead of letting an AttributeError abort the whole batch.
        if date_tag is None or body_tag is None or not date_tag.contents:
            print("Skipping (unexpected page layout): " + link)
            continue
        # Replace line breaks with a space instead of deleting them: deletion
        # glued the last word of one line onto the first word of the next.
        txt = re.sub(r"[\r\n]+", " ", body_tag.get_text())
        dates.append(date_tag.contents[0])
        texts.append(txt)

    statements_df = pd.DataFrame({"Dates": dates,
                                  "Texts": texts})
    return statements_df

In [None]:
# Scrape the statements for 2006 through 2012 into one DataFrame.
# The per-year frames are collected and concatenated once: DataFrame.append
# was removed in pandas 2.0, and appending in a loop re-copies the
# accumulated frame on every iteration.
years = range(2006, 2013)
yearly_frames = []
for year in years:
    print(year)
    yearly_frames.append(get_statement_texts(get_statement_links(year)))
statements_master = pd.concat(yearly_frames)

In [None]:
# Display the combined statements table.
statements_master

In [None]:
# Fetch and parse the 2012 index page.
# NOTE(review): this only works while the global `url` still holds the "{}"
# year template; earlier cells reassign `url` to fixed page addresses.
r = requests.get(url.format("2012"))
soup = BeautifulSoup(r.text, "html.parser")

In [None]:
# Keep only the anchors whose text is exactly "Statement".
links = soup.find_all("a")
statements = [anchor for anchor in links if anchor.text == "Statement"]

In [None]:
# Display the matched "Statement" anchors.
statements

In [None]:
# Resolve each statement href against the site root.
state_links = [
    requests.compat.urljoin(base_url, anchor.get('href'))
    for anchor in statements
]

In [None]:
# Fetch and parse the first statement page so its structure can be inspected.
r = requests.get(state_links[0])
soup = BeautifulSoup(r.text, "html.parser")

In [None]:
# IPython help lookup left over from development; it has no effect on the analysis.
?pd.DataFrame.from_records

In [None]:
# Inline version of the statement scrape for the links gathered above:
# collect each page's date element and body text into a two-column frame.
dates = []
texts = []

for link in state_links:
    r = requests.get(link)
    soup = BeautifulSoup(r.text, "html.parser")
    dte = soup.find("p", {"class": "article__time"}).contents
    txt = soup.find("div", {"class" : "col-xs-12 col-sm-8 col-md-8"}).get_text()
    # Replace line breaks with a space instead of deleting them: deletion
    # glued the last word of one line onto the first word of the next.
    txt = re.sub(r"[\r\n]+", " ", txt)
    dates.append(dte[0])
    texts.append(txt)

statements_df = pd.DataFrame({"Dates": dates,
                              "Texts": texts})

In [None]:
# Display the scraped statements table.
statements_df

In [None]:
# Inspect the children of the date element on the most recently parsed page.
soup.find("p", {"class": "article__time"}).contents

In [None]:
# Inspect the children of the content div on the most recently parsed page.
soup.find("div", {"class" : "col-xs-12 col-sm-8 col-md-8"}).contents

In [None]:
# Display the second statement URL.
state_links[1]