# Beige Books Data

This notebook does a few things:
- Downloads all the Beige Books summaries
- Validates that the downloaded data looks sane (row count, nulls, data types, date coverage)
- Cleans up the text

## Imports

In [83]:
from bs4 import BeautifulSoup
from collections import Counter
import nltk
import pandas as pd
import re
import requests
import string
import time

In [22]:
# Fetch the NLTK resources this notebook relies on. The "stopwords" corpus is
# read in the unigram/bigram cell; "punkt_tab" is presumably the tokenizer data
# that nltk.word_tokenize needs -- TODO confirm against the installed NLTK version.
nltk.download("stopwords")
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mihikabairathi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/mihikabairathi/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Download the data

In [2]:
def fetch_beigebook(url, year, month):
    """Download one Beige Book page and return its paragraph text as records.

    Args:
        url: Address of the page to request.
        year: Value stored under the "year" key of the returned record.
        month: Value stored under the "month" key of the returned record.

    Returns:
        A list containing one dict with the keys "year", "month", "url" and
        "text" when at least one non-empty paragraph was collected. An empty
        list when the response status is not 200, when no paragraph text was
        collected, or when any exception is raised (the error is printed
        rather than re-raised).
    """
    skipped_containers = {"header", "footer", "nav"}

    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return []

        page = BeautifulSoup(response.text, "html.parser")
        collected = []

        for paragraph in page.find_all("p"):
            ancestor_names = {ancestor.name for ancestor in paragraph.parents}
            # Ignore paragraphs nested inside header/footer/nav elements.
            if skipped_containers & ancestor_names:
                continue

            content = paragraph.get_text(strip=True)
            # Stop collecting at the first paragraph that starts with "full report".
            if content.lower().startswith("full report"):
                break
            if content:
                collected.append(content)

        records = []
        if collected:
            records.append({
                "year": year,
                "month": month,
                "url": url,
                "text": " ".join(collected),
            })
            print(f"Fetched {year}-{month}, {len(collected)} paragraphs")

        # The pause only happens after a 200 response; non-200 responses
        # return above without sleeping.
        time.sleep(0.5)
        return records

    except Exception as err:
        print(f"Failed to fetch {url}: {err}")
        return []

In [None]:
# Build the full list of (url, year, month) candidates first, in the same order
# they are requested, then fetch them in a single loop.

# 1996 - 2010: the URL embeds the full release date, so every day number 1-31
# of every month is tried.
beige_book_targets = [
    (f"https://www.federalreserve.gov/fomc/beigebook/{year}/{year}{month:02d}{day:02d}/default.htm", year, month)
    for year in range(1996, 2011)
    for month in range(1, 13)
    for day in range(1, 32)
]

# 2011 - 2016: one candidate URL per month.
beige_book_targets += [
    (f"https://www.federalreserve.gov/monetarypolicy/beigebook/beigebook{year}{month:02d}.htm?summary", year, month)
    for year in range(2011, 2017)
    for month in range(1, 13)
]

# 2017 - 2025: one candidate URL per month.
beige_book_targets += [
    (f"https://www.federalreserve.gov/monetarypolicy/beigebook{year}{month:02d}-summary.htm", year, month)
    for year in range(2017, 2026)
    for month in range(1, 13)
]

beige_books_data = []
for url, year, month in beige_book_targets:
    beige_books_data.extend(fetch_beigebook(url, year, month))

# Save to CSV
df = pd.DataFrame(beige_books_data)
df.to_csv("data/beige_book_1996_2025.csv", index=False)
print("Saved beige_book_1996_2025.csv")

## Data Validation

In [137]:
# Load the scraped records written by the download cell.
# NOTE(review): the last cell of this notebook writes the cleaned frame back to
# this same path, so after a full run this file no longer holds the raw download.
beige_df = pd.read_csv("data/beige_book_1996_2025.csv")

In [138]:
# The release day is not stored, so stamp every report with the first day of
# its month.
year_part = beige_df["year"].astype(str)
month_part = beige_df["month"].astype(str)
beige_df["timestamp"] = pd.to_datetime(year_part + "-" + month_part + "-01")

In [139]:
# check for nulls and data types
# Prints the row count, then the per-column non-null counts and dtypes.
print(f"Number of rows: {len(beige_df)}")
beige_df.info()

Number of rows: 231
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231 entries, 0 to 230
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   year       231 non-null    int64         
 1   month      231 non-null    int64         
 2   url        231 non-null    object        
 3   text       231 non-null    object        
 4   timestamp  231 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 9.2+ KB


In [140]:
# see samples of data: URL of the first row, text of the last row, then the
# leading rows. Positional indexing is used so the cell keeps working when the
# number of downloaded reports changes (a hard-coded label such as 230 only
# exists while the frame has exactly 231 rows).
print(beige_df["url"].iloc[0])
print(beige_df["text"].iloc[-1])
beige_df.head()

https://www.federalreserve.gov/fomc/beigebook/1996/19961030/default.htm
An official website of the United States Government Official websites use .govA.govwebsite belongs to an official government organization in the United States. Secure .gov websites use HTTPSAlock(LockLocked padlock icon) orhttps://means you've safely connected to the .gov website. Share sensitive information only on official, secure websites. Economic activity changed little on balance since the previous report, with three Districts reporting slight to modest growth in activity, five reporting no change, and four noting a slight softening. Overall consumer spending, particularly on retail goods, inched down in recent weeks, although auto sales were boosted in some Districts by strong demand for electric vehicles ahead of the expiration of a federal tax credit at the end of September. Demand for leisure and hospitality services by international travelers fell further over the reporting period, while demand by domest

Unnamed: 0,year,month,url,text,timestamp
0,1996,10,https://www.federalreserve.gov/fomc/beigebook/...,"October 30, 1996SummarySkip to contentSummaryD...",1996-10-01
1,1996,12,https://www.federalreserve.gov/fomc/beigebook/...,"December 4, 1996SummarySkip to contentSummaryD...",1996-12-01
2,1997,1,https://www.federalreserve.gov/fomc/beigebook/...,"January 22, 1997SummarySkip to contentSummaryD...",1997-01-01
3,1997,3,https://www.federalreserve.gov/fomc/beigebook/...,"March 12, 1997SummarySkip to contentSummaryDis...",1997-03-01
4,1997,5,https://www.federalreserve.gov/fomc/beigebook/...,"May 7, 1997SummarySkip to contentSummaryDistri...",1997-05-01


In [141]:
# make sure all the data has been collected: overall date range, then the
# number of reports and first/last timestamp per year.
# Single quotes are used inside the f-string replacement fields; reusing the
# enclosing double quotes is only valid syntax from Python 3.12 onwards.
print(f"earliest = {beige_df['timestamp'].min()}")
print(f"latest = {beige_df['timestamp'].max()}")
beige_df.groupby("year")["timestamp"].agg(["count", "min", "max"])

earliest = 1996-10-01 00:00:00
latest = 2025-10-01 00:00:00


Unnamed: 0_level_0,count,min,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1996,2,1996-10-01,1996-12-01
1997,8,1997-01-01,1997-12-01
1998,8,1998-01-01,1998-12-01
1999,8,1999-01-01,1999-12-01
2000,8,2000-01-01,2000-12-01
2001,8,2001-01-01,2001-11-01
2002,8,2002-01-01,2002-11-01
2003,8,2003-01-01,2003-11-01
2004,8,2004-01-01,2004-12-01
2005,8,2005-01-01,2005-11-01


## Text Cleaning

We perform basic case normalization, whitespace cleaning, and boilerplate removal. Notably, we do not perform the following:
- stopword removal (stopwords are needed for sentiment analysis down the road)
- punctuation cleaning (it would strip financial symbols such as currency signs, percentages, and ratios)
- lemmatization (it would remove the specificity of financial terms)

In [142]:
# case normalization
# Lowercasing happens before boilerplate removal; the prefix/suffix markers
# used there are written in lowercase.
beige_df["text"] = beige_df["text"].str.lower()

In [143]:
# Trim leading/trailing whitespace, then collapse every run of whitespace
# into a single space.
beige_df["text"] = (
    beige_df["text"]
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
)

In [144]:
# boilerplate removal
def remove_prefixes(text):
    """Drop everything up to and including the first boilerplate marker.

    The markers are joined into one regular expression without escaping, so a
    "." inside a marker matches any character. The earliest match in ``text``
    decides the cut point; the remainder is stripped of surrounding whitespace.

    Args:
        text: Report text (the markers are lowercase).

    Returns:
        The text after the first marker, or ``text`` unchanged when no marker
        is found. A warning is printed when more than 600 characters were
        removed.
    """
    markers = [
        "and is not a commentary on the views of federal reserve officials.",
        "and is not a comment on the views of federal reserve officials",
        "and is not a representation of the views of federal reserve officials.",
        "and is not a commentary on the views of the federal reserve officials.",
        "and is not a commentary of the views of federal reserve officials.",
        "share sensitive information only on official, secure websites.",
    ]
    pattern = re.compile("|".join(markers))

    hit = pattern.search(text)
    if hit:
        trimmed = text[hit.end():].strip()
    else:
        trimmed = text

    removed = len(text) - len(trimmed)
    if removed > 600:
        print(f"Warning: removed prefix length {removed} exceeds threshold")
    return trimmed

def remove_suffixes(text):
    """Drop everything from the first boilerplate marker to the end of the text.

    The markers are joined into one regular expression without escaping, so a
    "." inside a marker matches any character. The earliest match in ``text``
    decides the cut point; the kept part is stripped of surrounding whitespace.

    Args:
        text: Report text (the markers are lowercase).

    Returns:
        The text before the first marker, or ``text`` unchanged when no marker
        is found. A warning is printed when more than 400 characters were
        removed.
    """
    suffixes = [
        "return to topbostonhome", 
        "summary districtsbostonnew", 
        "return to top this page uses javascript.",
        "note: this report was prepared at the federal reserve bank of"
    ]
    suffix_regex = re.compile("|".join(suffixes))
    
    match = suffix_regex.search(text)
    new_text = text[:match.start()].strip() if match else text

    removed_length = len(text) - len(new_text)
    if removed_length > 400:
        # This function trims the tail of the text, so report a suffix
        # (the message previously said "prefix").
        print(f"Warning: removed suffix length {removed_length} exceeds threshold")
    return new_text

# Strip the leading boilerplate from every report first, then the trailing
# boilerplate.
beige_df["text"] = (
    beige_df["text"]
    .apply(remove_prefixes)
    .apply(remove_suffixes)
)

In [145]:
# Summarize whitespace-delimited word counts per report to judge whether
# chunking is needed.
# NOTE(review): the recorded output shows a maximum of 17,667 words and a mean
# well above the median, so some reports are much longer than the rest --
# confirm that skipping chunking is still appropriate for those.
print("Number of words:")
words_per_report = beige_df["text"].str.split().map(len)
words_per_report.agg(["min", "max", "mean", "median"])

Number of words:


min         979.00000
max       17667.00000
mean       4377.30303
median     1693.00000
Name: text, dtype: float64

In [146]:
# Most frequent unigrams and bigrams across all reports, ignoring English
# stopwords and tokens found in string.punctuation.
stopwords = set(nltk.corpus.stopwords.words("english"))

all_tokens = [
    token
    for document in beige_df["text"]
    for token in nltk.word_tokenize(document)
    if token not in stopwords and token not in string.punctuation
]

unigram_counts = Counter(all_tokens)
bigram_counts = Counter(nltk.ngrams(all_tokens, 2))

print(f"Top unigrams: {unigram_counts.most_common(10)}")
print(f"Top bigrams: {bigram_counts.most_common(10)}")

Top unigrams: [('reported', 10520), ('sales', 8944), ('activity', 8470), ('contacts', 7532), ('prices', 7448), ('demand', 7410), ('districts', 7190), ('district', 6742), ('new', 5737), ('continued', 5390)]
Top bigrams: [(('real', 'estate'), 3054), (('kansas', 'city'), 2669), (('new', 'york'), 2568), (('san', 'francisco'), 2485), (('st.', 'louis'), 1852), (('since', 'last'), 1450), (('commercial', 'real'), 1309), (('districts', 'reported'), 1238), (('last', 'report'), 1204), (('year', 'ago'), 1074)]


## Write Final Data

In [147]:
# Persist the cleaned frame (lowercased, whitespace-normalized, boilerplate
# removed, with the added timestamp column).
# NOTE(review): this is the same path the download cell writes and the
# validation section reads, so the raw scrape is overwritten here. Consider a
# separate output file if the raw text must stay recoverable -- check downstream
# consumers of this path before changing it.
beige_df.to_csv("data/beige_book_1996_2025.csv", index=False)