In [1]:
import pandas as pd
import numpy as np

In [2]:
DATASETS_OCCUPATIONS = "./datasets_occupations/"
DATASETS_LABOR = './datasets_labor_metropolitan_area/'

In [3]:
df_labor = pd.read_parquet(DATASETS_LABOR + "df_labor_usa_metropolitan_area_N2022.parquet",engine='fastparquet')
df_labor['NAICS_2022'] = df_labor['NAICS_2022'].astype(str) # Ensure NAICS is string*
df_labor['naics_level'] = df_labor['NAICS_2022'].str.len() # Add a column for NAICS code length
mask_5digit = df_labor["NAICS_2022"].str.len().eq(6) & df_labor["NAICS_2022"].str.isnumeric()
df_labor = df_labor[mask_5digit]
print(df_labor.shape)
df_labor.head()

(213397, 4)


Unnamed: 0,NAICS_2022,metropolitan statistical area/micropolitan statistical area,EMP,naics_level
0,113110,10780,7,6
1,113110,11660,7,6
2,113110,12060,50,6
3,113110,12260,12,6
4,113110,12620,31,6


In [4]:
df_occupations = pd.read_excel(DATASETS_OCCUPATIONS + "all_data_M_2024.xlsx", sheet_name="All May 2024 data")
df_occupations["NAICS"] = df_occupations["NAICS"].astype(str)
df_occupations.head()

Unnamed: 0,AREA,AREA_TITLE,AREA_TYPE,PRIM_STATE,NAICS,NAICS_TITLE,I_GROUP,OWN_CODE,OCC_CODE,OCC_TITLE,...,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY,NAICS_CLEAN
0,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,00-0000,All Occupations,...,37.89,60.44,29990,36730,49500,78810,125720,,,0
1,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-0000,Management Occupations,...,82.5,#,57010,79900,122090,171610,#,,,0
2,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-1000,Top Executives,...,81.01,#,47510,68800,104990,168490,#,,,0
3,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-1010,Chief Executives,...,#,#,73710,126080,206420,#,#,,,0
4,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-1011,Chief Executives,...,#,#,73710,126080,206420,#,#,,,0


In [5]:
naics_ref = pd.read_excel(DATASETS_LABOR + "NAICS_EQUIVALENCE.xlsx")
naics_ref['naics_code'] = naics_ref['naics_code'].astype(str)
naics_ref.head()

Unnamed: 0,naics_code,naics_description,level,parent_code,automotive_subsector,supply_chain_stage,sector_alignment,strategic_sector
0,11,"Agriculture, Forestry, Fishing and Hunting",2,,Others,Others,,
1,111,Crop Production,3,11.0,Others,Others,,
2,1111,Oilseed and Grain Farming,4,111.0,Others,Others,,
3,11111,Soybean Farming,5,111.0,Others,Others,,
4,111110,Soybean Farming,6,1111.0,Others,Others,,


In [6]:
df_occupations["NAICS_CLEAN"] = df_occupations["NAICS_CLEAN"].astype(str).str.upper().str.replace(r"[^0-9A-Z]", "", regex=True)
level_map = {"cross-industry": 0,"cross-industry, ownership": 0,"sector": 2,"3-digit": 3,"3-digit, ownership": 3,"4-digit": 4,"4-digit, ownership": 4,
    "5-digit": 5,"6-digit": 6,}
df_occupations["i_group_clean"] = df_occupations["I_GROUP"].astype(str).str.strip().str.lower()
df_occupations["naics_len"] = df_occupations["i_group_clean"].map(level_map)
df_occupations["naics_len"].unique()

array([0, 2, 3, 4, 6, 5], dtype=int64)

In [7]:
df_occupations = df_occupations.merge(naics_ref[['naics_code', 'naics_description']], left_on=['NAICS_CLEAN'], right_on=['naics_code'], how='left')

In [8]:
df_occupations.columns

Index(['AREA', 'AREA_TITLE', 'AREA_TYPE', 'PRIM_STATE', 'NAICS', 'NAICS_TITLE',
       'I_GROUP', 'OWN_CODE', 'OCC_CODE', 'OCC_TITLE', 'O_GROUP', 'TOT_EMP',
       'EMP_PRSE', 'JOBS_1000', 'LOC_QUOTIENT', 'PCT_TOTAL', 'PCT_RPT',
       'H_MEAN', 'A_MEAN', 'MEAN_PRSE', 'H_PCT10', 'H_PCT25', 'H_MEDIAN',
       'H_PCT75', 'H_PCT90', 'A_PCT10', 'A_PCT25', 'A_MEDIAN', 'A_PCT75',
       'A_PCT90', 'ANNUAL', 'HOURLY', 'NAICS_CLEAN', 'i_group_clean',
       'naics_len', 'naics_code', 'naics_description'],
      dtype='object')

### Build occupation matrix

In [19]:
df_nat = df_occupations.loc[(df_occupations["AREA"] == 99)].copy()
df_sd = df_occupations.loc[(df_occupations["AREA"] == 41740)].copy()

In [None]:
df_occupations.loc[(df_occupations["NAICS_CLEAN	"] == 99)].shape

Unnamed: 0,AREA,AREA_TITLE,AREA_TYPE,PRIM_STATE,NAICS,NAICS_TITLE,I_GROUP,OWN_CODE,OCC_CODE,OCC_TITLE,...,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY,NAICS_CLEAN,i_group_clean,naics_len,naics_code,naics_description
0,99,U.S.,1,US,000000,Cross-industry,cross-industry,1235,00-0000,All Occupations,...,49500,78810,125720,,,000000,cross-industry,0,,
1,99,U.S.,1,US,000000,Cross-industry,cross-industry,1235,11-0000,Management Occupations,...,122090,171610,#,,,000000,cross-industry,0,,
2,99,U.S.,1,US,000000,Cross-industry,cross-industry,1235,11-1000,Top Executives,...,104990,168490,#,,,000000,cross-industry,0,,
3,99,U.S.,1,US,000000,Cross-industry,cross-industry,1235,11-1010,Chief Executives,...,206420,#,#,,,000000,cross-industry,0,,
4,99,U.S.,1,US,000000,Cross-industry,cross-industry,1235,11-1011,Chief Executives,...,206420,#,#,,,000000,cross-industry,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416548,5500002,Northeastern Wisconsin nonmetropolitan area,6,WI,000000,Cross-industry,cross-industry,1235,53-7199,"Material Moving Workers, All Other",...,29210,34080,34080,,,000000,cross-industry,0,,
416549,5500003,South Central Wisconsin nonmetropolitan area,6,WI,000000,Cross-industry,cross-industry,1235,53-7199,"Material Moving Workers, All Other",...,30680,33800,33910,,,000000,cross-industry,0,,
416550,5500004,Western Wisconsin nonmetropolitan area,6,WI,000000,Cross-industry,cross-industry,1235,53-7199,"Material Moving Workers, All Other",...,31700,33910,61690,,,000000,cross-industry,0,,
416551,800001,Eastern and Southern Colorado nonmetropolitan ...,6,CO,000000,Cross-industry,cross-industry,1235,53-7199,"Material Moving Workers, All Other",...,39070,45650,54430,,,000000,cross-industry,0,,


In [20]:
def add_naics_hierarchy(df, naics_col="naics_code"):
    out = df.copy()

    # Make sure NAICS is a clean string of digits
    out[naics_col] = out[naics_col].astype(str).str.strip()
    out = out.loc[out[naics_col].str.fullmatch(r"\d+")]  # keep only numeric NAICS

    # NAICS levels to display (you can add 5 if you use it)
    levels = [2, 3, 4, 5, 6]

    rows = []
    for L in levels:
        tmp = out.loc[out[naics_col].str.len().ge(L)].copy()
        tmp["NAICS_LEVEL"] = L
        tmp["NAICS_HIER"] = tmp[naics_col].str.slice(0, L)
        rows.append(tmp)

    long = pd.concat(rows, ignore_index=True)
    return long

In [21]:
nat_long = add_naics_hierarchy(df_nat, naics_col="naics_code")
sd_long  = add_naics_hierarchy(df_sd,  naics_col="naics_code")

In [24]:
def build_naics_soc_matrix(long_df):
    # pick what you want to aggregate; TOT_EMP is the usual weight
    agg = (long_df
           .groupby(["AREA", "AREA_TITLE", "AREA_TYPE", "PRIM_STATE",
                     "NAICS_LEVEL", "NAICS_HIER",
                     "OCC_CODE", "OCC_TITLE"], as_index=False)
           .agg(
               TOT_EMP=("TOT_EMP", "sum"),
               # optional extras:
               # A_MEAN=("A_MEAN", "mean"),   # careful: means should be weighted if needed
               # H_MEAN=("H_MEAN", "mean"),
               N_ROWS=("OCC_CODE", "size")
           ))

    # Wide “matrix” (NAICS x SOC) using TOT_EMP
    mat = agg.pivot_table(
        index=["NAICS_LEVEL", "NAICS_HIER"],
        columns="OCC_CODE",
        values="TOT_EMP",
        aggfunc="sum",
        fill_value=0
    )

    return agg, mat

nat_agg, nat_matrix = build_naics_soc_matrix(nat_long)
sd_agg,  sd_matrix  = build_naics_soc_matrix(sd_long)

TypeError: unsupported operand type(s) for +: 'int' and 'str'

## Education