In [1]:
import re
import json
import time
import requests
import numpy as np
import pandas as pd

from random import randint
from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
pd.set_option("display.max_rows", 500)
pd.set_option("display.float_format", "{:.2f}".format)

In [3]:
def get_date_and_time(text: str):
    time_regex = r"([0-9]+:[0-9]+ [A-Za-z]+)"
    date_regex = r"([A-Za-z]+ [0-9]+, [0-9]+)"
    time_match = re.search(time_regex, text.replace(".", ""))
    date_match = re.search(date_regex, text.replace(".", ""))
    time = datetime.strptime(time_match.group(1), "%H:%M %p")
    date = datetime.strptime(date_match.group(1), "%B %d, %Y")
    
    return datetime.combine(date, time.time())

In [4]:
def get_bea_press_releases(params: dict):
    press_releases = list()
    start_date = datetime.strptime("2020", "%Y").date()
    # start_date = datetime.strptime("2008", "%Y").date()
    press_release_date = datetime.now().date()

    while press_release_date >= start_date:
        response = requests.get(url="https://www.bea.gov/news/archive?", params=params)
        soup = BeautifulSoup(response.content, "html.parser")
        press_release_elements = soup.find_all("tr", {"class": "release-row"})

        for element in press_release_elements:
            link_element = element.find("a")
            date_element = element.find("td", {"class": "views-field-created"})
            press_release_date = datetime.strptime(date_element.text.strip(), "%B %d, %Y").date()
            
            if press_release_date >= start_date:
                link = link_element["href"]
                press_releases.append(link)

        params["page"] += 1
    
    return press_releases

In [5]:
def get_gdp_from_press_releases(column, press_releases):
    output = list()

    for link in press_releases:
        try:
            url = f"https://www.bea.gov{link}"
            response = requests.get(url)
            soup = BeautifulSoup(response.text, "html.parser")
            header = soup.find("div", {"class": "field--name-field-release-date"}).text
            date = get_date_and_time(header)
            press_release = soup.find("div", {"class": "release-body"})
            press_release = " ".join(press_release.stripped_strings)
            press_release = " ".join(press_release.split())
            press_release = press_release.replace(",", "")
            pattern = r"current.dollar gdp.*?quarter.*?\$(\d+\.\d+).(trillion|billion)"
            result = re.search(pattern, press_release, re.IGNORECASE)
            value = float(result.group(1)) / 1000.0 if result.group(2) == "billion" else float(result.group(1))
        except Exception as e:
            value = np.nan

        output.append({"date": date, column: value})
        time.sleep(randint(0, 1))

    return pd.DataFrame(output)

In [6]:
def get_bls_press_releases(indicator: str):
    press_releases = list()
    start_date = datetime.strptime("2020", "%Y").date()
    # start_date = datetime.strptime("2008", "%Y").date()
    press_release_date = datetime.now().date()
    response = requests.get(f"https://www.bls.gov/bls/news-release/{indicator}.htm")
    soup = BeautifulSoup(response.text, "html.parser")
    news_releases = soup.find_all("a", href=re.compile(r"/news.release/archives/.*?.htm"))

    for item in news_releases:
        year_element = re.findall(r"\b\d{4}\b", item.text)[0]
        press_release_date = datetime.strptime(year_element.strip(), "%Y").date()

        if press_release_date >= start_date:
            link = item["href"]
            press_releases.append(link)

    return press_releases

In [7]:
def get_data_from_bls_press_releases(column, press_releases):
    output = list()

    for link in press_releases:
        try:
            url = f"https://www.bls.gov{link}"
            response = requests.get(url)
            soup = BeautifulSoup(response.text, "html.parser")
            press_release = soup.find("div", {"class": "normalnews"})
            press_release = " ".join(press_release.stripped_strings)
            press_release = " ".join(press_release.split("\n"))
            date = get_date_and_time(press_release)

            if column == "consumer_price_index":
                pattern = r"(\d+\.\d+).\(1982-84=100\)"
                result = re.search(pattern, press_release, re.IGNORECASE)
                value = result.group(1)
            elif column == "unemployment_rate":
                pattern = r"the unemployment rate.*?(\d+\.\d+).*?(\d+\.\d+)"
                result = re.search(pattern, press_release, re.IGNORECASE)
                value = result.group(2)
            elif column == "nonfarm_payroll":
                pattern = r"nonfarm payroll employment.*?(\d+[.,]\d+).(?:(million)?)"
                result = re.search(pattern, press_release, re.IGNORECASE)
                value = float(result.group(1)) if result.group(2) else float(result.group(1).replace(",", "")) / 1000000.0
            elif column == "producer_price_index":
                pattern = r"the Producer Price Index.*?(advanced|increased|rose|decreased|fell|declined|changed|unchanged).*?(\d+\.\d+)"
                result = re.search(pattern, press_release, re.IGNORECASE)
                value = f"-{result.group(2)}" if result.group(1) in ["decreased","fell","declined"] else result.group(2)
            else:
                raise ValueError("Invalid column name!")
                
        except Exception as e:
            print(e)
            value = np.nan

        output.append({"date": date, column: value})
        time.sleep(randint(0, 1))

    return pd.DataFrame(output)

In [8]:
params = {"page": 0, "created_1": "All", "field_related_product_target_id": "451"}
press_releases = get_bea_press_releases(params)
gross_domestic_product = get_gdp_from_press_releases("gross_domestic_product", press_releases)

In [9]:
press_releases = get_bls_press_releases(indicator="cpi")
consumer_price_index = get_data_from_bls_press_releases("consumer_price_index", press_releases)

In [10]:
press_releases = get_bls_press_releases(indicator="ppi")
producer_price_index = get_data_from_bls_press_releases("producer_price_index", press_releases)

In [11]:
press_releases = get_bls_press_releases(indicator="empsit")
unemployment_rate = get_data_from_bls_press_releases("unemployment_rate", press_releases)

In [12]:
press_releases = get_bls_press_releases(indicator="empsit")
nonfarm_payroll = get_data_from_bls_press_releases("nonfarm_payroll", press_releases)

In [13]:
df = gross_domestic_product \
    .merge(consumer_price_index, on="date", how="outer") \
    .merge(producer_price_index, on="date", how="outer") \
    .merge(unemployment_rate, on="date", how="outer") \
    .merge(nonfarm_payroll, on="date", how="outer")

df.sort_values(by="date", inplace=True, ascending=True, ignore_index=True)
# df.fillna(method="ffill", inplace=True)
# df.fillna(method="bfill", inplace=True)

df

Unnamed: 0,date,gross_domestic_product,consumer_price_index,producer_price_index,unemployment_rate,nonfarm_payroll
0,2020-01-30 08:30:00,21.73,,,,
1,2020-02-07 08:30:00,,,,3.6,0.23
2,2020-02-13 08:30:00,,257.971,,,
3,2020-02-19 08:30:00,,,0.5,,
4,2020-02-27 08:30:00,21.73,,,,
5,2020-03-06 08:30:00,,,,3.5,0.27
6,2020-03-11 08:30:00,,258.678,,,
7,2020-03-12 08:30:00,,,-0.6,,
8,2020-03-26 08:30:00,21.73,,,,
9,2020-04-03 08:30:00,,,,0.9,0.7
