In [46]:
import os
from datetime import date

import polars as pl
from dotenv import load_dotenv

from eco_stats import BLSClient
from eco_stats.vintage import scrape_range

# Load variables from a local .env file into the environment before the API key
# is read below. The result is bound to `_` so the cell displays no output.
_ = load_dotenv()

In [47]:
# BLS API client; the key comes from the BLS_API_KEY environment variable
# and is passed as None when that variable is not set.
bls = BLSClient(api_key=os.environ.get('BLS_API_KEY'))

In [48]:
# Build the table of (ref_date, vintage_date) pairs for the 'ces_state' source:
# scrape the 2016-2026 range, keep only that source, rename release_date to
# vintage_date, de-duplicate, order the rows, and keep reference dates from
# 2016-01-12 through 2026-01-12 (both ends included).
#
# The sort uses vintage_date as a tie-breaker. `unique` does not promise any
# row order and a sort on ref_date alone leaves rows sharing a ref_date in
# unspecified order, which would make the later `shift(-1)` and the
# ref_date -> vintage_date dictionary vary from run to run.
vintage = (
    scrape_range(
        start_year=2016,
        end_year=2026,
        delay=1.0
    )
    .filter(
        pl.col('source') == 'ces_state'
    )
    .select(
        ref_date=pl.col('ref_date'),
        vintage_date=pl.col('release_date')
    )
    .unique(subset=['ref_date', 'vintage_date'], keep='last')
    .sort('ref_date', 'vintage_date')
    .filter(
        pl.col('ref_date').is_between(
            date(2016, 1, 12),
            date(2026, 1, 12)
        )
    )
)

  Scraping 2016... ces_national=13, ces_state=12, qcew=4
  Scraping 2017... ces_national=13, ces_state=12, qcew=4
  Scraping 2018... ces_national=13, ces_state=12, qcew=7
  Scraping 2019... ces_national=13, ces_state=12, qcew=4
  Scraping 2020... ces_national=13, ces_state=12, qcew=4
  Scraping 2021... ces_national=26, ces_state=24, qcew=8
  Scraping 2022... ces_national=26, ces_state=24, qcew=8
  Scraping 2023... ces_national=26, ces_state=24, qcew=8
  Scraping 2024... ces_national=26, ces_state=24, qcew=8
  Scraping 2025... ces_national=24, ces_state=20, qcew=10
  Scraping 2026... ces_national=24, ces_state=26, qcew=8


In [49]:
# Add two helper columns next to vintage_date:
#   vintage_date_0 - the row's own vintage_date
#   vintage_date_1 - the vintage_date of the following row (null on the last row)
vintage = vintage.with_columns(
    pl.col('vintage_date').alias('vintage_date_0'),
    pl.col('vintage_date').shift(-1).alias('vintage_date_1'),
)

In [50]:
# Lookup from ref_date to vintage_date; when a ref_date occurs on several rows
# the last row wins.
vintage_dict = dict(vintage.select('ref_date', 'vintage_date').iter_rows())

In [51]:
# Assign every reference date a revision number and the vintage date to use.
#
# Each window below is (first ref_date, last ref_date, lookup ref_date): a
# reference date inside the window gets revision 9 and the vintage date that
# `vintage_dict` holds for the lookup ref_date.
# NOTE(review): 9 presumably marks the benchmark-revised estimate and the lookup
# key is always the March reference month of the following year - confirm both.
benchmark_windows = [
    (date(2016, 1, 12), date(2016, 3, 12), date(2017, 3, 12)),
    (date(2016, 4, 12), date(2017, 3, 12), date(2018, 3, 12)),
    (date(2017, 4, 12), date(2018, 3, 12), date(2019, 3, 12)),
    (date(2018, 4, 12), date(2019, 3, 12), date(2020, 3, 12)),
    (date(2019, 4, 12), date(2020, 3, 12), date(2021, 3, 12)),
    (date(2020, 4, 12), date(2021, 3, 12), date(2022, 3, 12)),
    (date(2021, 4, 12), date(2022, 3, 12), date(2023, 3, 12)),
    (date(2022, 4, 12), date(2023, 3, 12), date(2024, 3, 12)),
    (date(2023, 4, 12), date(2024, 3, 12), date(2025, 3, 12)),
]

# Reference dates in this span get revision 1 and the next row's vintage date.
first_revision_window = (date(2024, 4, 12), date(2025, 3, 12))

dict_list = []
for row in vintage.iter_rows(named=True):
    ref_date = row['ref_date']
    own_vintage = row['vintage_date_0']
    next_vintage = row['vintage_date_1']

    # The windows do not overlap, so at most one lookup key can match.
    lookup_key = next(
        (key for start, end, key in benchmark_windows if start <= ref_date <= end),
        None,
    )

    if lookup_key is not None:
        revision, vintage_date = 9, vintage_dict[lookup_key]
    elif first_revision_window[0] <= ref_date <= first_revision_window[1]:
        revision, vintage_date = 1, next_vintage
    else:
        # Everything outside the windows keeps its own vintage date as revision 0.
        revision, vintage_date = 0, own_vintage

    dict_list.append({
        'ref_date': ref_date,
        'revision': revision,
        'vintage_date': vintage_date,
    })

# One row per input row, with explicit dtypes for the three columns.
revisions = pl.DataFrame(
    dict_list,
    schema_overrides={
        'ref_date': pl.Date,
        'revision': pl.UInt8,
        'vintage_date': pl.Date,
    },
)

In [53]:
# Two-digit state codes placed into the series IDs: every number from 01 to 56
# except 03, 07, 14, 43 and 52, followed by 72 and 78.
# NOTE(review): these look like state FIPS codes, with 72 and 78 being
# Puerto Rico and the U.S. Virgin Islands - confirm.
states = [
    f'{code:02d}'
    for code in range(1, 57)
    if code not in (3, 7, 14, 43, 52)
] + ['72', '78']

# Seasonal-adjustment letters that follow the 'SM' prefix of a series ID;
# 'S' is the value later treated as adjusted.
adjusted = ['S', 'U']
len(states)

53

In [54]:
# Industry catalogue. Each entry is
#   (8-digit industry field of the series ID, 2-digit short code, title, level)
# where level is one of 'national', 'domain', 'supersector' or 'sector'.
industries = [
    # --- national ---
    ('00000000', '00', 'Total Nonfarm', 'national'),

    # --- domain ---
    ('05000000', '05', 'Total Private', 'domain'),
    ('06000000', '06', 'Goods Producing', 'domain'),
    ('07000000', '07', 'Service-Providing', 'domain'),
    ('08000000', '08', 'Private Service Providing', 'domain'),

    # --- supersector ---
    ('10000000', '10', 'Mining and Logging', 'supersector'),
    ('20000000', '20', 'Construction', 'supersector'),
    ('30000000', '30', 'Manufacturing', 'supersector'),
    ('40000000', '40', 'Trade, Transportation, and Utilities', 'supersector'),
    ('50000000', '50', 'Information', 'supersector'),
    ('55000000', '55', 'Financial Activities', 'supersector'),
    ('60000000', '60', 'Professional and Business Services', 'supersector'),
    ('65000000', '65', 'Private Education and Health Services', 'supersector'),
    ('70000000', '70', 'Leisure and Hospitality', 'supersector'),
    ('80000000', '80', 'Other Services', 'supersector'),
    ('90000000', '90', 'Government', 'supersector'),

    # --- sector ---
    ('10210000', '21', 'Mining, Quarrying, and Oil and Gas Extraction', 'sector'),
    ('31000000', '31', 'Durable Goods', 'sector'),
    ('32000000', '32', 'Non-Durable Goods', 'sector'),
    ('41000000', '41', 'Wholesale Trade', 'sector'),
    ('42000000', '42', 'Retail Trade', 'sector'),
    ('43220000', '22', 'Utilities', 'sector'),
    ('43400089', '89', 'Transportation and Warehousing', 'sector'),
    ('55520000', '52', 'Finance and Insurance', 'sector'),
    ('55530000', '53', 'Real Estate and Rental and Leasing', 'sector'),
    ('60540000', '54', 'Professional, Scientific, and Technical Services', 'sector'),
    ('60550000', '55', 'Management of Companies and Enterprises', 'sector'),
    ('60560000', '56', 'Administrative and Support and Waste Management and Remediation Services', 'sector'),
    ('65610000', '61', 'Private Educational Services', 'sector'),
    ('65620000', '62', 'Health Care and Social Assistance', 'sector'),
    ('70710000', '71', 'Arts, Entertainment, and Recreation', 'sector'),
    ('70720000', '72', 'Accommodation and Food Services', 'sector'),
    ('90910000', '91', 'Federal Government', 'sector'),
    ('90920000', '92', 'State Government', 'sector'),
    ('90930000', '93', 'Local Government', 'sector'),
]

# 8-digit industry field -> 2-digit short code.
# NOTE(review): the short code '55' is shared by '55000000' (supersector) and
# '60550000' (sector), so the short code alone does not identify an industry
# and '55' ends up in both the supersector and sector lists below.
ind_dict = {entry[0]: entry[1] for entry in industries}

# Short codes grouped by level, in catalogue order.
domain, supersector, sector = (
    [short for _, short, _, level in industries if level == wanted]
    for wanted in ('domain', 'supersector', 'sector')
)

In [57]:
# One series ID per (adjustment letter, state code, industry) combination, with
# the adjustment letter varying slowest and the industry fastest. Layout:
# 'SM' + adjustment letter + state code + '00000' + 8-digit industry field + '01'.
ces_state_series_ids = [
    f'SM{flag}{state}00000{industry[0]}01'
    for flag in adjusted
    for state in states
    for industry in industries
]

In [43]:
# Download every requested series for 2016-2026 and reshape it into one tidy
# frame with revision metadata attached.
#
# Series IDs are read by position, matching how they were built:
#   characters 0-1   'SM'
#   character  2     adjustment letter ('S' -> adjusted is True)
#   characters 3-4   state code
#   characters 10-17 8-digit industry field
#
# industry_type is looked up from the full 8-digit industry field rather than
# from the 2-digit industry_code. The short code '55' belongs both to
# '55000000' (supersector) and to '60550000' (sector), so deriving the type
# from the short code labelled '60550000' as 'supersector' and made the two
# industries indistinguishable in the output.
industry_type_by_id = {full_id: level for full_id, _, _, level in industries}

ces_state = (
    bls
    .get_series(
        series_ids=ces_state_series_ids,
        start_year='2016',
        end_year='2026'
    )
    .with_columns(
        # Shift the observation date by 11 days so it lines up with the
        # ref_date values held in `revisions`.
        ref_date=pl.col('date')
                   .dt.offset_by('11d'),
        ref_year=pl.col('date')
                   .dt.year()
    )
    # Left join: rows without a matching ref_date keep null revision/vintage_date.
    .join(
        revisions,
        on='ref_date',
        how='left'
    )
    .select(
        adjusted=pl.col('series_id')
                   .str.slice(2, 1)
                   .eq('S'),
        ref_date=pl.col('ref_date'),
        ref_year=pl.col('ref_year'),
        ref_month=pl.col('ref_date')
                    .dt.month(),
        revision=pl.col('revision'),
        vintage_date=pl.col('vintage_date'),
        geographic_type=pl.lit('state', pl.Utf8),
        geographic_code=pl.col('series_id')
                          .str.slice(3, 2),
        # Unknown industry fields map to null, as they do for industry_code.
        industry_type=pl.col('series_id')
                        .str.slice(10, 8)
                        .replace_strict(
                            industry_type_by_id,
                            default=None,
                            return_dtype=pl.Utf8
                        ),
        industry_code=pl.col('series_id')
                        .str.slice(10, 8)
                        .replace_strict(ind_dict, default=None),
        employment=pl.col('value')
    )
    .sort(
        'ref_date', 'revision',
        'geographic_type', 'geographic_code',
        'industry_type', 'industry_code'
    )
)

In [44]:
# Write the rows whose `adjusted` flag is False, without the flag column,
# to the NSA parquet file.
# NOTE(review): the destination is an absolute path on one machine.
(
    ces_state
    .filter(pl.col('adjusted').not_())
    .drop('adjusted')
    .sort(
        by=[
            'ref_date', 'revision',
            'geographic_type', 'geographic_code',
            'industry_type', 'industry_code',
        ]
    )
    .write_parquet('/Users/lowell/Projects/revisions/data/ces_state_nsa.parquet')
)

In [45]:
# Write the rows whose `adjusted` flag is True, without the flag column,
# to the SA parquet file.
# NOTE(review): the destination is an absolute path on one machine.
(
    ces_state
    .filter(pl.col('adjusted').eq(True))
    .drop('adjusted')
    .sort(
        by=[
            'ref_date', 'revision',
            'geographic_type', 'geographic_code',
            'industry_type', 'industry_code',
        ]
    )
    .write_parquet('/Users/lowell/Projects/revisions/data/ces_state_sa.parquet')
)