In [13]:
from datetime import date
import os

import polars as pl

from dotenv import load_dotenv

from eco_stats import BLSClient

# Load variables from a local .env file into the process environment
# (python-dotenv); the BLS API key is read from the environment when the
# client is constructed in a later cell.
load_dotenv()

True

In [2]:
# BLS API client; the key comes from the BLS_API_KEY environment variable
# (None is passed through when the variable is unset).
bls = BLSClient(api_key=os.environ.get('BLS_API_KEY'))

In [3]:
import shutil

# Delete the on-disk '.cache/bls' directory. ignore_errors=True makes this a
# no-op when the directory is missing instead of raising.
shutil.rmtree(path='.cache/bls', ignore_errors=True)

In [15]:
# Reference year -> vintage date: 12 March of the following year,
# for reference years 2016 through 2024 inclusive.
vintage_by_year = dict(
    (ref_year, date(ref_year + 1, 3, 12))
    for ref_year in range(2016, 2025)
)
vintage_by_year

{2016: datetime.date(2017, 3, 12),
 2017: datetime.date(2018, 3, 12),
 2018: datetime.date(2019, 3, 12),
 2019: datetime.date(2020, 3, 12),
 2020: datetime.date(2021, 3, 12),
 2021: datetime.date(2022, 3, 12),
 2022: datetime.date(2023, 3, 12),
 2023: datetime.date(2024, 3, 12),
 2024: datetime.date(2025, 3, 12)}

In [4]:
# One row per CES national series:
#   (6-character series prefix, 2-character industry code, title, hierarchy level)
# Note: industry code '55' appears twice, once as a supersector ('550000')
# and once as a sector ('605500'); only the 6-character prefix is unique.
ces_national_series = [
    # national total
    ('000000', '00', 'Total Non-Farm', 'national'),
    # domains
    ('050000', '05', 'Total Private', 'domain'),
    ('060000', '06', 'Goods-Producing Industries', 'domain'),
    ('070000', '07', 'Service-Providing Industries', 'domain'),
    ('080000', '08', 'Private Service-Providing', 'domain'),
    # supersectors
    ('100000', '10', 'Natural Resources and Mining', 'supersector'),
    ('200000', '20', 'Construction', 'supersector'),
    ('300000', '30', 'Manufacturing', 'supersector'),
    ('400000', '40', 'Trade, Transportation, and Utilities', 'supersector'),
    ('500000', '50', 'Information', 'supersector'),
    ('550000', '55', 'Financial Activities', 'supersector'),
    ('600000', '60', 'Professional and Business Services', 'supersector'),
    ('650000', '65', 'Education and Health Services', 'supersector'),
    ('700000', '70', 'Leisure and Hospitality', 'supersector'),
    ('800000', '80', 'Other Services', 'supersector'),
    ('900000', '90', 'Government', 'supersector'),
    # sectors
    ('102100', '21', 'Mining, quarrying, and oil and gas extraction', 'sector'),
    ('310000', '31', 'Durable goods', 'sector'),
    ('320000', '32', 'Nondurable goods', 'sector'),
    ('414200', '41', 'Wholesale trade', 'sector'),
    ('420000', '42', 'Retail trade', 'sector'),
    ('430000', '43', 'Transportation and warehousing', 'sector'),
    ('442200', '22', 'Utilities', 'sector'),
    ('555200', '52', 'Finance and insurance', 'sector'),
    ('555300', '53', 'Real estate and rental and leasing', 'sector'),
    ('605400', '54', 'Professional, scientific, and technical services', 'sector'),
    ('605500', '55', 'Management of companies and enterprises', 'sector'),
    ('605600', '56', 'Administrative and support and waste management and remediation services', 'sector'),
    ('656100', '61', 'Private educational services', 'sector'),
    ('656200', '62', 'Health care and social assistance', 'sector'),
    ('707100', '71', 'Arts, entertainment, and recreation', 'sector'),
    ('707200', '72', 'Accommodation and food services', 'sector'),
    ('909100', '91', 'Federal', 'sector'),
    ('909200', '92', 'State government', 'sector'),
    ('909300', '93', 'Local government', 'sector'),
]

# Full series ids: for every prefix, the 'S' variant followed by the 'U'
# variant, each ending in the fixed '0001' suffix.
ces_national_series_ids = [
    f'CE{flag}{row[0]}0001'
    for row in ces_national_series
    for flag in 'SU'
]

# 6-character series prefix -> 2-character industry code.
ind_codes = dict((row[0], row[1]) for row in ces_national_series)

# Industry codes grouped by hierarchy level, in table order.
national = ['00']
domain = [code for _, code, _, level in ces_national_series if level == 'domain']
supersector = [code for _, code, _, level in ces_national_series if level == 'supersector']
sector = [code for _, code, _, level in ces_national_series if level == 'sector']

In [None]:
# Hierarchy level keyed by the 6-character series prefix. The 2-character
# industry code cannot be used for this: '55' is both the Financial Activities
# supersector ('550000') and the Management of companies sector ('605500'),
# so classifying by industry_code labels the '605500' rows as 'supersector'.
industry_type_by_series = {
    prefix: level for prefix, _, _, level in ces_national_series
}

# Fetch every seasonally adjusted ('S') and unadjusted ('U') national series
# and reshape the response into one row per (series, reference month).
ces_national = (
    bls
    .get_series(
        series_ids=ces_national_series_ids,
        start_year='2016',
        end_year='2026'
    )
    .with_columns(
        ref_year=pl.col('date')
                   .dt.year(),
        # Characters 3..8 of the series id are the prefix listed in
        # ces_national_series (after 'CE' and the S/U flag).
        series_prefix=pl.col('series_id')
                        .str.slice(3, 6)
    )
    .select(
        # Third character of the series id: 'S' -> True, otherwise False.
        adjusted=pl.col('series_id')
                   .str.slice(2, 1)
                   .eq('S'),
        ref_date=pl.col('date'),
        ref_year=pl.col('ref_year'),
        ref_month=pl.col('date')
                    .dt.month(),
        # Reference years through 2024 get revision 9; later years are null.
        # NOTE(review): the meaning of 9 is not visible here - confirm.
        revision=pl.when(pl.col('ref_year').le(2024)).then(pl.lit(9, pl.UInt8))
                   .otherwise(pl.lit(None, pl.UInt8)),
        # Years without an entry in vintage_by_year get a null vintage date.
        vintage_date=pl.col('ref_year')
                       .replace_strict(vintage_by_year, default=None),
        geographic_type=pl.lit('national', pl.Utf8),
        geographic_code=pl.lit('00', pl.Utf8),
        # Both lookups use the unique series prefix; unknown prefixes -> null.
        industry_type=pl.col('series_prefix')
                        .replace_strict(industry_type_by_series, default=None),
        industry_code=pl.col('series_prefix')
                        .replace_strict(ind_codes, default=None),
        employment=pl.col('value')
    )
)

ces_national

adjusted,ref_date,ref_year,ref_month,revision,vintage_date,geographic_type,geographic_code,industry_type,industry_code,employment
bool,date,i32,i8,u8,date,str,str,str,str,f64
true,2016-01-12,2016,1,9,2017-03-12,"""national""","""00""","""national""","""00""",143210.0
false,2016-01-12,2016,1,9,2017-03-12,"""national""","""00""","""national""","""00""",141073.0
true,2016-01-12,2016,1,9,2017-03-12,"""national""","""00""","""domain""","""05""",121096.0
false,2016-01-12,2016,1,9,2017-03-12,"""national""","""00""","""domain""","""05""",119093.0
true,2016-01-12,2016,1,9,2017-03-12,"""national""","""00""","""domain""","""06""",19717.0
…,…,…,…,…,…,…,…,…,…,…
false,2026-01-12,2026,1,,,"""national""","""00""","""sector""","""22""",604.9
true,2026-01-12,2026,1,,,"""national""","""00""","""sector""","""52""",6721.5
false,2026-01-12,2026,1,,,"""national""","""00""","""sector""","""52""",6711.3
true,2026-01-12,2026,1,,,"""national""","""00""","""sector""","""53""",2440.6


In [21]:
# For reference years after 2024, show the highest revision number and the
# latest vintage date recorded for each reference date.
# NOTE(review): the path below is absolute and machine-specific; consider a
# configurable data directory so the notebook runs elsewhere.
(
    pl
    .read_parquet(
        '/Users/lowell/Projects/revisions/data/ces_national_nsa_revisions.parquet'
    )
    .filter(
        pl.col('ref_year').gt(2024)
    )
    .group_by('ref_date', maintain_order=True)
    .agg(
        revision=pl.col('revision').max(),
        # Output column was misspelled 'vintate_date'.
        vintage_date=pl.col('vintage_date').max()
    )
)

ref_date,revision,vintate_date
date,i32,date
2025-01-12,0,2025-02-07
