In [1]:
import os
from datetime import date

import polars as pl
from dotenv import load_dotenv

from eco_stats import BLSClient
from eco_stats.vintage import scrape_range

# Load variables from a local .env file (if one exists) into the process
# environment; BLS_API_KEY is read from the environment further down.
# The return value is bound to `_` so the cell does not display it.
_ = load_dotenv()

In [2]:
# Client used for all BLS series requests below. The key is looked up in the
# environment; a missing BLS_API_KEY results in api_key=None being passed on.
bls = BLSClient(
    api_key=os.environ.get('BLS_API_KEY'),
)

In [3]:
# Release calendar for the national CES series: one row per distinct
# (ref_date, vintage_date) pair scraped for 2016-2026, limited to reference
# dates between 2016-01-12 and 2026-01-12 (inclusive).
vintage = (
    scrape_range(
        start_year=2016,
        end_year=2026,
        delay=1.0  # NOTE(review): presumably seconds between requests - confirm
    )
    .filter(
        pl.col('source') == 'ces_national'
    )
    .select(
        ref_date=pl.col('ref_date'),
        vintage_date=pl.col('release_date')
    )
    .unique(subset=['ref_date', 'vintage_date'], keep='last')
    # unique() gives no row-order guarantee and later cells rely on row order
    # (shift(-1)/shift(-2) and a ref_date -> vintage_date dict). Sorting on
    # both columns makes the order fully deterministic even if a ref_date ever
    # appears with more than one release date.
    .sort('ref_date', 'vintage_date')
    .filter(
        pl.col('ref_date').is_between(
            date(2016, 1, 12),
            date(2026, 1, 12)
        )
    )
)

  Scraping 2016... ces_national=13, ces_state=12, qcew=4
  Scraping 2017... ces_national=13, ces_state=12, qcew=4
  Scraping 2018... ces_national=13, ces_state=12, qcew=7
  Scraping 2019... ces_national=13, ces_state=12, qcew=4
  Scraping 2020... ces_national=13, ces_state=12, qcew=4
  Scraping 2021... ces_national=26, ces_state=24, qcew=8
  Scraping 2022... ces_national=26, ces_state=24, qcew=8
  Scraping 2023... ces_national=26, ces_state=24, qcew=8
  Scraping 2024... ces_national=26, ces_state=24, qcew=8
  Scraping 2025... ces_national=24, ces_state=20, qcew=10
  Scraping 2026... ces_national=24, ces_state=26, qcew=8


In [4]:
# Add one release-date column per horizon. With rows ordered by ref_date,
# shift(-k) pulls the release date found k rows further down, so
# vintage_date_1 / vintage_date_2 hold the release dates of the following one
# and two rows (null for the final rows of the frame).
vintage = vintage.with_columns(
    pl.col('vintage_date').alias('vintage_date_0'),
    pl.col('vintage_date').shift(-1).alias('vintage_date_1'),
    pl.col('vintage_date').shift(-2).alias('vintage_date_2'),
)

In [5]:
# Lookup table: reference date -> release date of that same row. If a
# reference date occurs on several rows, the last row wins.
vintage_dict = dict(
    vintage
    .select('ref_date', 'vintage_date')
    .iter_rows()
)

In [6]:
# Revision code assigned to every reference date that falls in one of the
# annual windows below.
# NOTE(review): presumably 9 marks the annual benchmark revision - confirm.
ANNUAL_REVISION = 9

# (first ref_date, last ref_date, lookup ref_date) - inclusive bounds. Rows in
# a window take the release date that `vintage_dict` holds for the lookup
# date. The first window is short (Jan-Mar 2016); every later window runs from
# April of `year` through March of `year + 1` and looks up January of
# `year + 2`.
annual_windows = [
    (date(2016, 1, 12), date(2016, 3, 12), date(2017, 1, 12)),
] + [
    (date(year, 4, 12), date(year + 1, 3, 12), date(year + 2, 1, 12))
    for year in range(2016, 2025)
]

# (first ref_date, last ref_date, revision) - inclusive bounds. Rows in these
# windows take revision n together with the row's own `vintage_date_<n>`.
recent_windows = [
    (date(2025, 4, 12), date(2025, 11, 12), 2),
    (date(2025, 12, 12), date(2025, 12, 12), 1),
    (date(2026, 1, 12), date(2026, 1, 12), 0),
]


def _resolve_revision(row):
    """Return ``(revision, vintage_date)`` for one named row of ``vintage``.

    Args:
        row: dict with the keys ``ref_date``, ``vintage_date_0``,
            ``vintage_date_1`` and ``vintage_date_2``.

    Returns:
        Tuple of the revision code and the release date to use for the row.

    Raises:
        ValueError: if ``ref_date`` lies in none of the configured windows.
            Without this, an unmatched row would silently reuse the values
            computed for the previous row.
        KeyError: if ``vintage_dict`` has no entry for a window's lookup date.
    """
    ref_date = row['ref_date']

    for first, last, lookup_date in annual_windows:
        if first <= ref_date <= last:
            return ANNUAL_REVISION, vintage_dict[lookup_date]

    for first, last, revision in recent_windows:
        if first <= ref_date <= last:
            return revision, row[f'vintage_date_{revision}']

    raise ValueError(f'no revision rule covers ref_date {ref_date!r}')


dict_list = []
for row in vintage.iter_rows(named=True):
    # Separate names for the input row and the resolved values; nothing is
    # carried over from one iteration to the next.
    revision, vintage_date = _resolve_revision(row)
    dict_list.append({
        'ref_date': row['ref_date'],
        'revision': revision,
        'vintage_date': vintage_date,
    })

revisions = (
    pl
    .DataFrame(
        dict_list,
        schema_overrides={
            'ref_date': pl.Date,
            'revision': pl.UInt8,
            'vintage_date': pl.Date,
        }
    )
)

In [9]:
# One tuple per published series:
#   (six-digit code embedded in the series id, two-character industry code,
#    description, aggregation level)
# Note that the two-character code is not unique across levels: '55' appears
# both as a supersector ('550000') and as a sector ('605500').
ces_national_series = [
    ('000000', '00', 'Total Non-Farm', 'national'),
    ('050000', '05', 'Total Private', 'domain'),
    ('060000', '06', 'Goods-Producing Industries', 'domain'),
    ('070000', '07', 'Service-Providing Industries', 'domain'),
    ('080000', '08', 'Private Service-Providing', 'domain'),
    ('100000', '10', 'Natural Resources and Mining', 'supersector'),
    ('200000', '20', 'Construction', 'supersector'),
    ('300000', '30', 'Manufacturing', 'supersector'),
    ('400000', '40', 'Trade, Transportation, and Utilities', 'supersector'),
    ('500000', '50', 'Information', 'supersector'),
    ('550000', '55', 'Financial Activities', 'supersector'),
    ('600000', '60', 'Professional and Business Services', 'supersector'),
    ('650000', '65', 'Education and Health Services', 'supersector'),
    ('700000', '70', 'Leisure and Hospitality', 'supersector'),
    ('800000', '80', 'Other Services', 'supersector'),
    ('900000', '90', 'Government', 'supersector'),
    ('102100', '21', 'Mining, quarrying, and oil and gas extraction', 'sector'),
    ('310000', '31', 'Durable goods', 'sector'),
    ('320000', '32', 'Nondurable goods', 'sector'),
    ('414200', '41', 'Wholesale trade', 'sector'),
    ('420000', '42', 'Retail trade', 'sector'),
    ('430000', '43', 'Transportation and warehousing', 'sector'),
    ('442200', '22', 'Utilities', 'sector'),
    ('555200', '52', 'Finance and insurance', 'sector'),
    ('555300', '53', 'Real estate and rental and leasing', 'sector'),
    ('605400', '54', 'Professional, scientific, and technical services', 'sector'),
    ('605500', '55', 'Management of companies and enterprises', 'sector'),
    ('605600', '56', 'Administrative and support and waste management and remediation services', 'sector'),
    ('656100', '61', 'Private educational services', 'sector'),
    ('656200', '62', 'Health care and social assistance', 'sector'),
    ('707100', '71', 'Arts, entertainment, and recreation', 'sector'),
    ('707200', '72', 'Accommodation and food services', 'sector'),
    ('909100', '91', 'Federal', 'sector'),
    ('909200', '92', 'State government', 'sector'),
    ('909300', '93', 'Local government', 'sector'),
]

# Series ids to request: for every entry, the 'CES…' id followed by the
# 'CEU…' id, in table order.
ces_national_series_ids = [
    f'CE{prefix}{series_code}0001'
    for series_code, *_ in ces_national_series
    for prefix in ('S', 'U')
]

# Description keyed by the 'CEU…' series id.
ces_national_series_dict = {
    f'CEU{entry[0]}0001': entry[2]
    for entry in ces_national_series
}

# Six-digit code -> two-character industry code.
ind_codes = dict((entry[0], entry[1]) for entry in ces_national_series)


def _codes_at_level(level):
    """Return the two-character codes of all entries at the given level, in table order."""
    return [entry[1] for entry in ces_national_series if entry[3] == level]


national = ['00']
domain = _codes_at_level('domain')
supersector = _codes_at_level('supersector')
sector = _codes_at_level('sector')

In [12]:
# Six-digit series code -> aggregation level. The level is looked up from the
# six-digit code because the two-character industry code is ambiguous: '55' is
# both a supersector ('550000') and a sector ('605500'). Classifying on the
# two-character code labelled the '605500' series as 'supersector', making it
# indistinguishable from '550000' in the output.
industry_types = {
    series_code: level
    for series_code, _, _, level in ces_national_series
}

# Download every requested series for 2016-2026, attach the revision number
# and vintage date for each reference date, and reshape to the output schema.
ces_national = (
    bls
    .get_series(
        series_ids=ces_national_series_ids,
        start_year='2016',
        end_year='2026'
    )
    .with_columns(
        # NOTE(review): the join below only matches when `date` equals the
        # ref_date values in `revisions` exactly - confirm get_series returns
        # dates on the same day-of-month convention.
        ref_date=pl.col('date'),
        ref_year=pl.col('date')
                   .dt.year()
    )
    .join(
        revisions,
        on='ref_date',
        how='left'  # keep observations that have no revision entry (nulls)
    )
    .select(
        # Third character of the id is 'S' for the 'CES…' ids and 'U' for the
        # 'CEU…' ids built from the series table.
        adjusted=pl.col('series_id')
                   .str.slice(2, 1)
                   .eq('S'),
        ref_date=pl.col('date'),
        ref_year=pl.col('ref_year'),
        ref_month=pl.col('date')
                    .dt.month(),
        revision=pl.col('revision'),
        vintage_date=pl.col('vintage_date'),
        geographic_type=pl.lit('national', pl.Utf8),
        geographic_code=pl.lit('00', pl.Utf8),
        # Characters 3-8 of the id are the six-digit code from the series
        # table; unknown codes map to null, as before.
        industry_type=pl.col('series_id')
                        .str.slice(3, 6)
                        .replace_strict(
                            industry_types,
                            default=None,
                            return_dtype=pl.Utf8
                        ),
        industry_code=pl.col('series_id')
                        .str.slice(3, 6)
                        .replace_strict(ind_codes, default=None),
        employment=pl.col('value')
    )
)

In [13]:
# Write the rows whose `adjusted` flag is False, without the flag column,
# ordered by the key columns below.
nsa_sort_keys = [
    'ref_date', 'revision',
    'geographic_type', 'geographic_code',
    'industry_type', 'industry_code',
]
ces_national_nsa = (
    ces_national
    .filter(~pl.col('adjusted'))
    .drop('adjusted')
    .sort(nsa_sort_keys)
)
# NOTE(review): absolute, machine-specific output path.
ces_national_nsa.write_parquet(
    '/Users/lowell/Projects/revisions/data/ces_national_nsa.parquet'
)

In [14]:
# Write the rows whose `adjusted` flag is True, without the flag column,
# ordered by the key columns below.
sa_sort_keys = [
    'ref_date', 'revision',
    'geographic_type', 'geographic_code',
    'industry_type', 'industry_code',
]
ces_national_sa = (
    ces_national
    .filter(pl.col('adjusted'))
    .drop('adjusted')
    .sort(sa_sort_keys)
)
# NOTE(review): absolute, machine-specific output path.
ces_national_sa.write_parquet(
    '/Users/lowell/Projects/revisions/data/ces_national_sa.parquet'
)