In [1]:
import pandas as pd
import os
import numpy as np 
import warnings 

warnings.filterwarnings(
    'ignore', 
    category=FutureWarning, 
    message="Downcasting behavior in `replace` is deprecated"
)
rawdata_dir = 'rawdata/oews'
output_file_path = 'data/oews_data_2015_2023_cleaned.csv'

VALID_STATE_FIPS = [
    '01', '02', '04', '05', '06', '08', '09', '10', '11', '12', '13', '15', '16', 
    '17', '18', '19', '20', '21', '22', '24', '25', '26', '27', '28', '29', '30', 
    '31', '32', '34', '35', '36', '37', '38', '39', '40', '41', '42', '44', '45', 
    '47', '48', '49', '50', '51', '53', '54', '55', '56', '72'
]

In [3]:
def clean_and_load_oews_data(file_path):
    
    filename = os.path.basename(file_path)
    year = int(filename.split('_')[-1].split('.')[0])
    
    try:
        df = pd.read_excel(file_path, header=0, skipfooter=4)
    except ValueError:
        df = pd.read_excel(file_path, header=0)

    df.columns = df.columns.str.strip().str.upper()

    if 'AREA_TITLE' in df.columns:
        df['AREA_TITLE'] = df['AREA_TITLE'].astype(str).str.strip()
        df = df[df['AREA_TITLE'] != 'U.S.'].copy()
    
    df = df.rename(columns={
        'OCC CODE': 'OCC_CODE',
        'OCC TITLE': 'OCC_TITLE',
        'NAICS TITLE': 'NAICS_TITLE',
        'TOT_EMP': 'TOT_EMP',
        'A_MEAN': 'A_MEAN',
        'A_MEDIAN': 'A_MEDIAN',
        'A_PCT10': 'A_PCT10',
        'A_PCT90': 'A_PCT90',
        'AREA': 'AREA',
        'AREA_TITLE': 'AREA_TITLE',
        'NAICS': 'NAICS'
    })
    
    df['Year'] = year
    
    required_cols = ['Year', 'AREA', 'AREA_TITLE', 'NAICS', 'NAICS_TITLE', 'OCC_CODE', 'OCC_TITLE', 
                     'TOT_EMP', 'A_MEAN', 'A_MEDIAN', 'A_PCT10', 'A_PCT90']
    
    df_cleaned = df[required_cols].copy()

    numeric_cols = ['TOT_EMP', 'A_MEAN', 'A_MEDIAN', 'A_PCT10', 'A_PCT90']
    for col in numeric_cols:
        df_cleaned[col] = df_cleaned[col].replace(['#', '*', 'nan'], np.nan)
        df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')
        
    return df_cleaned


In [4]:
all_years_df = []
if not os.path.exists(rawdata_dir):
    print(f"Error: Raw data directory '{rawdata_dir}' not found.")
else:
    print("--- Starting to process raw OEWS data (2015-2023) ---")
    
    for filename in sorted(os.listdir(rawdata_dir)):
        if filename.startswith('all_data_M_') and filename.endswith('.xlsx'):
            file_path = os.path.join(rawdata_dir, filename)
            print(f"Loading and cleaning: {filename}")
            
            try:
                df_year = clean_and_load_oews_data(file_path)
                all_years_df.append(df_year)
            except Exception as e:
                print(f"Error processing {filename}: {e}")
                
    if all_years_df:
        df_oews_combined = pd.concat(all_years_df, ignore_index=True)
        print(f"\nTotal OEWS records after cleaning and combining (before FIPS filter): {len(df_oews_combined)}")
        
        #create state fips columns
        df_oews_combined['State_FIPS'] = df_oews_combined['AREA'].astype(str).str.zfill(2)

        df_oews_state = df_oews_combined[
            (df_oews_combined['State_FIPS'].isin(VALID_STATE_FIPS))
        ].copy()

        print(f"Total OEWS records retained for State FIPS mapping: {len(df_oews_state)}")
        print(f"Sample of new State_FIPS codes: {df_oews_state['State_FIPS'].unique()[:10]}")

        
        df_oews_state.to_csv(output_file_path, index=False)
        print(f"\nSuccessfully saved cleaned OEWS state-level data to: {output_file_path}")
    else:
        print("No OEWS files were processed.")

--- Starting to process raw OEWS data (2015-2023) ---
Loading and cleaning: all_data_M_2015.xlsx
Loading and cleaning: all_data_M_2016.xlsx
Loading and cleaning: all_data_M_2017.xlsx
Loading and cleaning: all_data_M_2018.xlsx
Loading and cleaning: all_data_M_2019.xlsx
Loading and cleaning: all_data_M_2020.xlsx
Loading and cleaning: all_data_M_2021.xlsx
Loading and cleaning: all_data_M_2022.xlsx
Loading and cleaning: all_data_M_2023.xlsx

Total OEWS records after cleaning and combining (before FIPS filter): 2142823
Total OEWS records retained for State FIPS mapping: 313597
Sample of new State_FIPS codes: ['01' '02' '04' '05' '06' '08' '09' '10' '11' '12']

Successfully saved cleaned OEWS state-level data to: data/oews_data_2015_2023_cleaned.csv


In [5]:
output_file_path = 'data/oews_data_2015_2023_cleaned.csv'

df_cleaned_sample = pd.read_csv(output_file_path)

print("--- Cleaned OEWS Data Sample (Random 15 Rows) ---")
print(f"Total records in cleaned file: {len(df_cleaned_sample)}")

print(df_cleaned_sample.sample(n=15, random_state=42).to_markdown(index=False))

print("\n--- Key Column Check ---")
print(f"Sample of State_FIPS codes: {df_cleaned_sample['State_FIPS'].unique()[:10]}")
print(f"Sample of Years: {df_cleaned_sample['Year'].unique()}")

--- Cleaned OEWS Data Sample (Random 15 Rows) ---
Total records in cleaned file: 313597
|   Year |   AREA | AREA_TITLE   |   NAICS | NAICS_TITLE    | OCC_CODE   | OCC_TITLE                                                          |   TOT_EMP |   A_MEAN |   A_MEDIAN |   A_PCT10 |   A_PCT90 |   State_FIPS |
|-------:|-------:|:-------------|--------:|:---------------|:-----------|:-------------------------------------------------------------------|----------:|---------:|-----------:|----------:|----------:|-------------:|
|   2023 |     36 | New York     |       0 | Cross-industry | 17-3027    | Mechanical Engineering Technologists and Technicians               |      1260 |    68210 |      64310 |     45150 |     96250 |           36 |
|   2022 |     12 | Florida      |       0 | Cross-industry | 43-3061    | Procurement Clerks                                                 |      3940 |    44030 |      41000 |     31240 |     57240 |           12 |
|   2023 |      6 | California   |  