In [48]:
import pandas as pd
import os

In [49]:
raw_data_dir = 'rawdata'

In [50]:
all_data = []
# 0: LAUS Code
# 1: State FIPS Code
# 2: County FIPS Code
# 3: County Name/State Abbreviation
# 4: Year
# 5: Labor Force
# 6: Employed
# 7: Unemployed
# 8: Unemployment Rate (%)
column_indices_to_use = [0, 1, 2, 3, 4, 5, 6, 7, 8]


new_columns = ['LAUS_Code', 'State_FIPS', 'County_FIPS', 'County_Name', 
               'Year', 'Labor_Force', 'Employed', 'Unemployed', 'Unemployment_Rate']

for filename in os.listdir(raw_data_dir):
    if filename.startswith('laucnty') and filename.endswith('.xlsx'):
        file_path = os.path.join(raw_data_dir, filename)
        
        print(f"Loading {filename}...")
        
        try:
            # loading data，skip headers
            df = pd.read_excel(
                file_path, 
                header=2, 
                usecols=column_indices_to_use 
            )
            
            df.columns = new_columns
            
            all_data.append(df)
            print(f"Successfully loaded {len(df)} records from {filename}.")
            
        except Exception as e:
            print(f"CRITICAL ERROR loading {filename}: {e}")

Loading laucnty17.xlsx...
Successfully loaded 3222 records from laucnty17.xlsx.
Loading laucnty21.xlsx...
Successfully loaded 3223 records from laucnty21.xlsx.
Loading laucnty20.xlsx...
Successfully loaded 3223 records from laucnty20.xlsx.
Loading laucnty16.xlsx...
Successfully loaded 3222 records from laucnty16.xlsx.
Loading laucnty15.xlsx...
Successfully loaded 3222 records from laucnty15.xlsx.
Loading laucnty19.xlsx...
Successfully loaded 3222 records from laucnty19.xlsx.
Loading laucnty23.xlsx...
Successfully loaded 3223 records from laucnty23.xlsx.
Loading laucnty22.xlsx...
Successfully loaded 3223 records from laucnty22.xlsx.
Loading laucnty18.xlsx...
Successfully loaded 3222 records from laucnty18.xlsx.


In [55]:
if all_data:
    combined_df = pd.concat(all_data, ignore_index=True)
    print(f"\nTotal combined records: {len(combined_df)}")
    
    # step2: Data Cleaning and FIPS Standardization
    print("\nStarting basic cleaning and FIPS code standardization...")
    
    # --- FIPS FIX ---
    combined_df['State_FIPS'] = pd.to_numeric(combined_df['State_FIPS'], errors='coerce').fillna(0).astype(int)
    combined_df['County_FIPS'] = pd.to_numeric(combined_df['County_FIPS'], errors='coerce').fillna(0).astype(int)
    
    combined_df['State_FIPS'] = combined_df['State_FIPS'].astype(str).str.zfill(2)
    combined_df['County_FIPS'] = combined_df['County_FIPS'].astype(str).str.zfill(3)
    
    # combine FIPS:(State_FIPS + County_FIPS)
    combined_df['FIPS'] = combined_df['State_FIPS'] + combined_df['County_FIPS']
    
    #  Labor_Force, Employed, Unemployed：Int64 
    int_cols = ['Labor_Force', 'Employed', 'Unemployed']
    float_cols = ['Unemployment_Rate']
    
    all_numeric_cols = int_cols + float_cols
    for col in all_numeric_cols:
        combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce')
        
    for col in int_cols:
        combined_df[col] = combined_df[col].astype('Int64')

    #Year：int
    combined_df['Year'] = pd.to_numeric(combined_df['Year'], errors='coerce').astype('Int64') 
    # -------------------------------------------------------------------------
        
    # remove all the n/a
    combined_df.dropna(subset=['FIPS', 'Year'] + all_numeric_cols, inplace=True)
    
    print("FIPS codes standardized and data types fixed.")
    print(f"Final records after cleaning: {len(combined_df)}")

    # --- Verification Steps ---
    print("\n--- Verification of Combined and Cleaned Data ---")
    
    print("\nDataFrame Info (All Columns and Data Types):")
    combined_df.info()

    # (Unique Years)
    print(f"\nUnique Years (Expected 2015-2023): {sorted(combined_df['Year'].unique())}")

    print("\nFirst 5 Rows (Head - Check FIPS, Year, and Labor data):")
    print(combined_df.head())
    print("\nLast 5 Rows (Tail - Check FIPS, Year, and Labor data):")
    print(combined_df.tail())
    # --- End Verification Steps ---
    
    # save combined data to data directory
    output_data_dir = 'data'
    if not os.path.exists(output_data_dir):
        os.makedirs(output_data_dir)

    output_filename = 'bls_labor_force_2015_2023.parquet'
    output_path = os.path.join(output_data_dir, output_filename)

    combined_df.to_parquet(output_path, index=False)
    print(f"\n Successfully saved combined data to: {output_path}")

    # (Optional: check file size)
    print(f"File size: {os.path.getsize(output_path) / (1024*1024):.2f} MB")

else:
    print("\n FATAL: No data files were successfully loaded. Please check the 'rawdata' folder contents.")


Total combined records: 29002

Starting basic cleaning and FIPS code standardization...
FIPS codes standardized and data types fixed.
Final records after cleaning: 28897

--- Verification of Combined and Cleaned Data ---

DataFrame Info (All Columns and Data Types):
<class 'pandas.core.frame.DataFrame'>
Index: 28897 entries, 0 to 28998
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   LAUS_Code          28897 non-null  object 
 1   State_FIPS         28897 non-null  object 
 2   County_FIPS        28897 non-null  object 
 3   County_Name        28897 non-null  object 
 4   Year               28897 non-null  Int64  
 5   Labor_Force        28897 non-null  Int64  
 6   Employed           28897 non-null  Int64  
 7   Unemployed         28897 non-null  Int64  
 8   Unemployment_Rate  28897 non-null  float64
 9   FIPS               28897 non-null  object 
dtypes: Int64(4), float64(1), object(5)
memory usage