In [14]:
import pandas as pd
import os

def clean_and_load_laus_data(file_path):
    """Load one LAUS county workbook and return a tidy county-year table.

    The sheet is read with its column names taken from the second row
    (``header=1``). Header text is whitespace-normalised and mapped to
    snake-style names, the count and rate columns are coerced to numbers,
    rows without a usable ``Year`` or ``Labor_Force`` are discarded, and a
    ``FIPS`` key is built by concatenating the zero-padded state (2 chars)
    and county (3 chars) codes.

    Parameters
    ----------
    file_path : str or path-like
        Location of the Excel workbook to read.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``LAUS_Code``, ``State_FIPS``, ``County_FIPS``,
        ``FIPS``, ``County_Name``, ``Year``, ``Labor_Force``, ``Employed``,
        ``Unemployed``, ``Unemployment_Rate``. The four count/year columns
        use the nullable ``Int64`` dtype; the FIPS columns hold strings.

    Raises
    ------
    KeyError
        If any expected column is absent after renaming.
    """
    # First try trimming the last four sheet rows (presumably source notes
    # under the table -- confirm against the raw files). If pandas rejects
    # that with ValueError, re-read the sheet untrimmed; leftover note rows
    # are removed further down because they have no numeric Year.
    try:
        raw = pd.read_excel(file_path, header=1, skipfooter=4)
    except ValueError:
        raw = pd.read_excel(file_path, header=1)

    df = raw.dropna(how='all')
    # str() guards against non-string header cells; split/join trims the ends
    # and collapses embedded runs of whitespace (including line breaks).
    df.columns = [' '.join(str(name).split()) for name in df.columns]

    df = df.rename(columns={
        'LAUS Code': 'LAUS_Code',
        'State FIPS Code': 'State_FIPS',
        'County FIPS Code': 'County_FIPS',
        'County Name/State Abbreviation': 'County_Name',
        'Labor Force': 'Labor_Force',
        'Unemployment Rate (%)': 'Unemployment_Rate'
    })

    required = ['LAUS_Code', 'State_FIPS', 'County_FIPS', 'County_Name',
                'Year', 'Labor_Force', 'Employed', 'Unemployed', 'Unemployment_Rate']
    missing = [name for name in required if name not in df.columns]
    if missing:
        # Report every absent column at once instead of failing on the first.
        raise KeyError(f"{file_path}: expected column(s) not found: {missing}")

    # Anything that cannot be parsed as a number becomes missing.
    numeric_cols = ['Year', 'Labor_Force', 'Employed', 'Unemployed', 'Unemployment_Rate']
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Nullable integers keep whole-number counts while still allowing gaps.
    int_cols = ['Year', 'Labor_Force', 'Employed', 'Unemployed']
    for col in int_cols:
        df[col] = df[col].astype('Int64')

    df = df.dropna(subset=['Year', 'Labor_Force']).copy()

    # Pad the codes via a nullable integer rather than astype(str): when the
    # sheet column was read as float (any blank cell causes that), str() gives
    # '1.0', which zfill cannot fix. Codes that are not numeric become missing.
    padded = {}
    for col, width in (('State_FIPS', 2), ('County_FIPS', 3)):
        codes = pd.to_numeric(df[col], errors='coerce').astype('Int64')
        padded[col] = codes.astype('string').str.zfill(width)

    df['State_FIPS'] = padded['State_FIPS'].astype(object)
    df['County_FIPS'] = padded['County_FIPS'].astype(object)
    df['FIPS'] = (padded['State_FIPS'] + padded['County_FIPS']).astype(object)

    return df[['LAUS_Code', 'State_FIPS', 'County_FIPS', 'FIPS', 'County_Name',
               'Year', 'Labor_Force', 'Employed', 'Unemployed', 'Unemployment_Rate']]



In [15]:
# Load every yearly county workbook, stack them, roll the counts up to
# state-year level, and persist both tables as parquet.
all_years_df = []
rawdata_dir = 'rawdata'

print("--- Starting to process raw BLS County data (2015-2023) ---")
# os.listdir gives entries in arbitrary order; sorting makes the load order,
# and therefore the row order of the saved county table, reproducible.
for filename in sorted(os.listdir(rawdata_dir)):
    if filename.startswith('laucnty') and filename.endswith('.xlsx'):
        file_path = os.path.join(rawdata_dir, filename)
        print(f"Loading and cleaning: {filename}")
        df_year = clean_and_load_laus_data(file_path)
        all_years_df.append(df_year)

# pd.concat on an empty list fails with an unhelpful message; say what is
# actually wrong instead.
if not all_years_df:
    raise FileNotFoundError(
        f"No 'laucnty*.xlsx' workbooks found in '{rawdata_dir}'."
    )

df_county_combined = pd.concat(all_years_df, ignore_index=True)
print(f"\nTotal county records after cleaning and combining: {len(df_county_combined)}")
print("Expected records (approx): 3223 * 9 = 29007")


print("\n--- Aggregating County data to State-Year level ---")
df_state_agg = df_county_combined.groupby(['Year', 'State_FIPS']).agg(
    Total_Labor_Force=('Labor_Force', 'sum'),
    Total_Employed=('Employed', 'sum'),
    Total_Unemployed=('Unemployed', 'sum')
).reset_index()

# The rate is recomputed from the summed counts (unemployed / labor force),
# expressed as a percentage, rather than averaging the county rates.
df_state_agg['State_Unemployment_Rate'] = (
    df_state_agg['Total_Unemployed'] / df_state_agg['Total_Labor_Force']
) * 100

print(f"Total state-year records created: {len(df_state_agg)}")

# Ensure the output folder exists so a fresh checkout can run this cell.
os.makedirs('data', exist_ok=True)
df_state_agg.to_parquet('data/bls_labor_force_state_level_2015_2023.parquet', index=False, compression='snappy')
df_county_combined.to_parquet('data/bls_labor_force_county_level_2015_2023.parquet', index=False, compression='snappy')
print("\nSuccessfully saved aggregated state-level data and county-level data.")

--- Starting to process raw BLS County data (2015-2023) ---
Loading and cleaning: laucnty17.xlsx
Loading and cleaning: laucnty21.xlsx
Loading and cleaning: laucnty20.xlsx
Loading and cleaning: laucnty16.xlsx
Loading and cleaning: laucnty15.xlsx
Loading and cleaning: laucnty19.xlsx
Loading and cleaning: laucnty23.xlsx
Loading and cleaning: laucnty22.xlsx
Loading and cleaning: laucnty18.xlsx

Total county records after cleaning and combining: 28898
Expected records (approx): 3223 * 9 = 29007

--- Aggregating County data to State-Year level ---
Total state-year records created: 467

Successfully saved aggregated state-level data and county-level data.


In [16]:

# Sanity-check the saved state-level table: show a small sample and its schema.
file_path = 'data/bls_labor_force_state_level_2015_2023.parquet'

if os.path.exists(file_path):
    df_bls_state = pd.read_parquet(file_path)

    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # so cap the request at the table size.
    sample_size = min(15, len(df_bls_state))

    print(f"--- BLS State-Year Aggregated Data ({len(df_bls_state)} records) Sample ({sample_size} Random Rows) ---")
    # Fixed seed so a Restart & Run All shows the same preview rows.
    print(df_bls_state.sample(sample_size, random_state=42).to_markdown(index=False))

    print("\n--- Data Structure Info ---")
    df_bls_state.info()

else:
    print(f"Error: Aggregated file not found at {file_path}. Please ensure '1_BLS_LAUCnty_Data_Aggregation.ipynb' ran successfully.")

--- BLS State-Year Aggregated Data (467 records) Sample (15 Random Rows) ---
|   Year |   State_FIPS |   Total_Labor_Force |   Total_Employed |   Total_Unemployed |   State_Unemployment_Rate |
|-------:|-------------:|--------------------:|-----------------:|-------------------:|--------------------------:|
|   2018 |           01 |             2236161 |          2148824 |              87337 |                   3.90567 |
|   2020 |           04 |             3473146 |          3202107 |             271039 |                   7.80385 |
|   2023 |           02 |              356835 |           341743 |              15092 |                   4.22941 |
|   2021 |           20 |             1499636 |          1448832 |              50804 |                   3.38776 |
|   2017 |           56 |              294017 |           281273 |              12744 |                   4.33444 |
|   2016 |           37 |             4781353 |          4538204 |             243149 |                   5.085