In [1]:
import pandas as pd
import numpy as np

In [2]:
DATASETS_OCCUPATIONS = "./datasets_occupations/"
DATASETS_LABOR = './datasets_labor_metropolitan_area/'

In [3]:
df_labor = pd.read_parquet(DATASETS_LABOR + "df_labor_usa_metropolitan_area_N2022.parquet",engine='fastparquet')
df_labor['NAICS_2022'] = df_labor['NAICS_2022'].astype(str) # Ensure NAICS is string*
df_labor['naics_level'] = df_labor['NAICS_2022'].str.len() # Add a column for NAICS code length
mask_5digit = df_labor["NAICS_2022"].str.len().eq(6) & df_labor["NAICS_2022"].str.isnumeric()
df_labor = df_labor[mask_5digit]
print(df_labor.shape)
df_labor.head()

(213397, 4)


Unnamed: 0,NAICS_2022,metropolitan statistical area/micropolitan statistical area,EMP,naics_level
0,113110,10780,7,6
1,113110,11660,7,6
2,113110,12060,50,6
3,113110,12260,12,6
4,113110,12620,31,6


In [18]:
df_occupations = pd.read_excel(DATASETS_OCCUPATIONS + "all_data_M_2024.xlsx", sheet_name="All May 2024 data")
df_occupations["NAICS"] = df_occupations["NAICS"].astype(str)
df_occupations.head()

Unnamed: 0,AREA,AREA_TITLE,AREA_TYPE,PRIM_STATE,NAICS,NAICS_TITLE,I_GROUP,OWN_CODE,OCC_CODE,OCC_TITLE,...,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY,NAICS_CLEAN
0,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,00-0000,All Occupations,...,37.89,60.44,29990,36730,49500,78810,125720,,,0
1,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-0000,Management Occupations,...,82.5,#,57010,79900,122090,171610,#,,,0
2,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-1000,Top Executives,...,81.01,#,47510,68800,104990,168490,#,,,0
3,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-1010,Chief Executives,...,#,#,73710,126080,206420,#,#,,,0
4,99,U.S.,1,US,0,Cross-industry,cross-industry,1235,11-1011,Chief Executives,...,#,#,73710,126080,206420,#,#,,,0


In [19]:
naics_ref = pd.read_excel(DATASETS_LABOR + "NAICS_EQUIVALENCE.xlsx")
naics_ref['naics_code'] = naics_ref['naics_code'].astype(str)
naics_ref.head()

Unnamed: 0,naics_code,naics_description,level,parent_code,automotive_subsector,supply_chain_stage,sector_alignment,strategic_sector
0,11,"Agriculture, Forestry, Fishing and Hunting",2,,Others,Others,,
1,111,Crop Production,3,11.0,Others,Others,,
2,1111,Oilseed and Grain Farming,4,111.0,Others,Others,,
3,11111,Soybean Farming,5,111.0,Others,Others,,
4,111110,Soybean Farming,6,1111.0,Others,Others,,


In [20]:
df_occupations["NAICS_CLEAN"] = df_occupations["NAICS_CLEAN"].astype(str).str.upper().str.replace(r"[^0-9A-Z]", "", regex=True)
level_map = {"cross-industry": 0,"cross-industry, ownership": 0,"sector": 2,"3-digit": 3,"3-digit, ownership": 3,"4-digit": 4,"4-digit, ownership": 4,
    "5-digit": 5,"6-digit": 6,}
df_occupations["i_group_clean"] = df_occupations["I_GROUP"].astype(str).str.strip().str.lower()
df_occupations["naics_len"] = df_occupations["i_group_clean"].map(level_map)
df_occupations["naics_len"].unique()

array([0, 2, 3, 4, 6, 5], dtype=int64)

In [21]:
df_occupations = df_occupations.merge(naics_ref[['naics_code', 'naics_description', 'parent_code']], left_on=['NAICS_CLEAN'], right_on=['naics_code'], how='left')

In [22]:
df_occupations.columns

Index(['AREA', 'AREA_TITLE', 'AREA_TYPE', 'PRIM_STATE', 'NAICS', 'NAICS_TITLE',
       'I_GROUP', 'OWN_CODE', 'OCC_CODE', 'OCC_TITLE', 'O_GROUP', 'TOT_EMP',
       'EMP_PRSE', 'JOBS_1000', 'LOC_QUOTIENT', 'PCT_TOTAL', 'PCT_RPT',
       'H_MEAN', 'A_MEAN', 'MEAN_PRSE', 'H_PCT10', 'H_PCT25', 'H_MEDIAN',
       'H_PCT75', 'H_PCT90', 'A_PCT10', 'A_PCT25', 'A_MEDIAN', 'A_PCT75',
       'A_PCT90', 'ANNUAL', 'HOURLY', 'NAICS_CLEAN', 'i_group_clean',
       'naics_len', 'naics_code', 'naics_description', 'parent_code'],
      dtype='object')

### Build occupation matrix

In [23]:
df_nat = df_occupations.loc[(df_occupations["AREA"] == 99)].copy()

In [30]:
# Some NAs in tot_emp and in average wage
bad = df_nat.loc[pd.to_numeric(df_nat["TOT_EMP"], errors="coerce").isna(), ["TOT_EMP"]].drop_duplicates()
print("Non-numeric TOT_EMP examples:\n", bad)
bad = df_nat.loc[pd.to_numeric(df_nat["H_MEAN"], errors="coerce").isna(), ["H_MEAN"]].drop_duplicates()
print("Non-numeric H_MEAN examples:\n", bad)

Non-numeric TOT_EMP examples:
      TOT_EMP
1678      **
Non-numeric H_MEAN examples:
      H_MEAN
7         *
8238      #


In [None]:
SPECIAL_NA = {"**": np.nan, "*": np.nan, "#": np.nan}

def clean_bls_numeric(df, cols):
    df = df.copy()
    for c in cols:
        if c in df.columns:
            df[c] = (df[c]
                     .astype(str).str.strip()
                     .replace(SPECIAL_NA)
                    )
            df[c] = pd.to_numeric(df[c], errors="coerce")  # keeps NaN for anything non-numeric
    return df

df_nat = clean_bls_numeric(df_nat, ["TOT_EMP", "H_MEAN", "A_MEAN", "JOBS_1000", "LOC_QUOTIENT", "PCT_TOTAL"])

In [139]:
def build_naics_soc_matrix(long_df, aggregation):
    # pick what you want to aggregate
    long_df = long_df.loc[long_df["O_GROUP"]==aggregation]
    agg = (long_df.groupby(["AREA", "AREA_TITLE",  "NAICS_CLEAN", "naics_len","OCC_CODE", "OCC_TITLE"], as_index=False)
           .agg(
               TOT_EMP=("TOT_EMP", "sum"),
               H_MEAN=("H_MEAN", "mean"),
               N_ROWS=("OCC_CODE", "size")
           ))
    # Wide “matrix” (NAICS x SOC) using TOT_EMP
    mat = agg.pivot_table(index=["NAICS_CLEAN", "naics_len"], columns="OCC_CODE",values="TOT_EMP",aggfunc="sum",fill_value=0)
    return agg, mat

nat_agg, nat_matrix = build_naics_soc_matrix(df_nat, "detailed")

In [38]:
nat_matrix.to_excel("./outputs_occupations/naics_soc_matrix_national_totemp.xlsx")

  nat_matrix.to_excel("./outputs_occupations/naics_soc_matrix_national_totemp.xlsx")


In [58]:
df_california = pd.read_excel("./datasets_occupations/oes_research_2023_california.xlsx")
df_california["NAICS_CLEAN"] = df_california["NAICS_CLEAN"].astype(str).str.upper().str.replace(r"[^0-9A-Z]", "", regex=True)
level_map = {"cross-industry": 0,"cross-industry, ownership": 0,"sector": 2,"3-digit": 3,"3-digit, ownership": 3,"4-digit": 4,"4-digit, ownership": 4,
    "5-digit": 5,"6-digit": 6,}
df_california["i_group_clean"] = df_california["I_GROUP"].astype(str).str.strip().str.lower()
df_california["naics_len"] = df_california["i_group_clean"].map(level_map)
df_california.head()

Unnamed: 0,AREA,AREA_TITLE,NAICS,NAICS_TITLE,I_GROUP,OCC_CODE,OCC_TITLE,O_GROUP,TOT_EMP,EMP_PRSE,...,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY,NAICS_CLEAN,i_group_clean,naics_len
0,6,California,11,"Agriculture, Forestry, Fishing and Hunting",sector,00-0000,All Occupations,total,231340,0.0,...,32720,33520,35110,38490,50800,,,11,sector,2
1,6,California,11,"Agriculture, Forestry, Fishing and Hunting",sector,11-0000,Management Occupations,major,3560,5.5,...,62630,84840,113520,156580,210150,,,11,sector,2
2,6,California,11,"Agriculture, Forestry, Fishing and Hunting",sector,11-1011,Chief Executives,detailed,30,17.1,...,76840,114450,#,#,#,,,11,sector,2
3,6,California,11,"Agriculture, Forestry, Fishing and Hunting",sector,11-1021,General and Operations Managers,detailed,1340,8.6,...,51460,68700,103570,154490,216470,,,11,sector,2
4,6,California,11,"Agriculture, Forestry, Fishing and Hunting",sector,11-2022,Sales Managers,detailed,110,12.9,...,96650,121640,188170,#,#,,,11,sector,2


In [63]:
df_california = clean_bls_numeric(df_california, ["TOT_EMP", "H_MEAN", "A_MEAN", "JOBS_1000", "LOC_QUOTIENT", "PCT_TOTAL"])
cal_agg, cal_matrix = build_naics_soc_matrix(df_california, "detailed")
cal_agg.head()

Unnamed: 0,AREA,AREA_TITLE,NAICS_CLEAN,naics_len,OCC_CODE,OCC_TITLE,TOT_EMP,H_MEAN,N_ROWS
0,6,California,11,2,11-1011,Chief Executives,30.0,154.93,1
1,6,California,11,2,11-1021,General and Operations Managers,1340.0,60.44,1
2,6,California,11,2,11-2022,Sales Managers,110.0,89.09,1
3,6,California,11,2,11-3012,Administrative Services Managers,100.0,55.09,1
4,6,California,11,2,11-3013,Facilities Managers,100.0,54.02,1


In [64]:
cal_agg.shape

(33191, 9)

In [115]:
# 1) Build P_CA(NAICS | SOC)
ca_soc_tot = (cal_agg.groupby("OCC_CODE", as_index=False)["TOT_EMP"]
                    .sum()
                    .rename(columns={"TOT_EMP":"TOT_CA_SOC"}))

ca_shares = cal_agg.merge(ca_soc_tot, on="OCC_CODE", how="left")
ca_shares["p_naics_given_soc"] = np.where(
    ca_shares["TOT_CA_SOC"] > 0,
    ca_shares["TOT_EMP"] / ca_shares["TOT_CA_SOC"],
    np.nan
)
ca_shares.head()

Unnamed: 0,AREA,AREA_TITLE,NAICS_CLEAN,naics_len,OCC_CODE,OCC_TITLE,TOT_EMP,H_MEAN,N_ROWS,TOT_CA_SOC,p_naics_given_soc
0,6,California,11,2,11-1011,Chief Executives,30.0,154.93,1,117750.0,0.000255
1,6,California,11,2,11-1021,General and Operations Managers,1340.0,60.44,1,872170.0,0.001536
2,6,California,11,2,11-2022,Sales Managers,110.0,89.09,1,349560.0,0.000315
3,6,California,11,2,11-3012,Administrative Services Managers,100.0,55.09,1,114930.0,0.00087
4,6,California,11,2,11-3013,Facilities Managers,100.0,54.02,1,54140.0,0.001847


In [158]:
# 2) Apply to SD SOC totals
df_sd = df_occupations.loc[df_occupations["AREA"]==41740]
df_sd = clean_bls_numeric(df_sd, ["TOT_EMP", "H_MEAN", "A_MEAN", "JOBS_1000", "LOC_QUOTIENT", "PCT_TOTAL"])
sd_hat = ca_shares.merge(df_sd[["OCC_CODE", "O_GROUP","TOT_EMP", "H_MEAN"]], on="OCC_CODE", how="inner")
sd_hat.rename(columns={"TOT_EMP_y": "TOT_EMP_SD", "H_MEAN_y":"H_MEAN_SD"}, inplace=True)
sd_hat["TOT_EMP_SD_hat"] = sd_hat["TOT_EMP_SD"] * sd_hat["p_naics_given_soc"]
sd_naics_soc = (sd_hat.groupby(["NAICS_CLEAN", "naics_len","OCC_CODE","OCC_TITLE","O_GROUP", "H_MEAN_SD"], dropna=False,as_index=False)["TOT_EMP_SD_hat"].sum())
sd_naics_soc.head()

Unnamed: 0,NAICS_CLEAN,naics_len,OCC_CODE,OCC_TITLE,O_GROUP,H_MEAN_SD,TOT_EMP_SD_hat
0,11,2,11-1011,Chief Executives,detailed,135.49,0.830573
1,11,2,11-1021,General and Operations Managers,detailed,,38.287031
2,11,2,11-2022,Sales Managers,detailed,70.29,2.596121
3,11,2,11-3012,Administrative Services Managers,detailed,62.65,2.38406
4,11,2,11-3013,Facilities Managers,detailed,57.59,3.084595


In [159]:
sd_naics_soc["AREA"] = 41740
sd_naics_soc["AREA_TITLE"] = "San Diego-Carlsbad, CA"

In [160]:
check = (sd_naics_soc.groupby("OCC_CODE")["TOT_EMP_SD_hat"].sum().reset_index().merge(df_sd[["OCC_CODE", "TOT_EMP"]], on="OCC_CODE", how="left"))
check["diff"] = round(check["TOT_EMP_SD_hat"] - check["TOT_EMP"], 2).abs()
check[["OCC_CODE", "TOT_EMP_SD_hat", "TOT_EMP", "diff"]].sort_values(by= "diff", ascending= False)

Unnamed: 0,OCC_CODE,TOT_EMP_SD_hat,TOT_EMP,diff
560,49-9092,0.0,90.0,90.0
0,11-1011,3260.0,3260.0,0.0
452,43-4121,830.0,830.0,0.0
445,43-4031,970.0,970.0,0.0
446,43-4041,50.0,50.0,0.0
...,...,...,...,...
502,47-2132,0.0,,
512,47-3012,0.0,,
515,47-3015,0.0,,
533,49-2096,0.0,,


In [161]:
# Manually distribute "49-9092"
sd_total = (df_sd.loc[(df_sd["OCC_CODE"] == "49-9092") & (df_sd["O_GROUP"] == "detailed"), "TOT_EMP"]
              .pipe(pd.to_numeric, errors="coerce")
              .sum(min_count=1))

mask_occ = sd_naics_soc["OCC_CODE"].eq("49-9092")
sd_naics_soc.loc[mask_occ, "naics_len"] = sd_naics_soc.loc[mask_occ, "NAICS_CLEAN"].astype(str).str.len()

half = sd_total / 2

def sector_leaf_codes(df, sector_prefix):
    s = df.loc[mask_occ, "NAICS_CLEAN"]
    in_sector = s.str.startswith(sector_prefix)
    if not in_sector.any():
        return []
    max_len = s[in_sector].str.len().max()
    leaves = s[in_sector & s.str.len().eq(max_len)].unique().tolist()
    return leaves

for sec in ["48", "56"]:
    leaves = sector_leaf_codes(sd_naics_soc, sec)
    alloc = half / len(leaves)
    sd_naics_soc.loc[mask_occ & sd_naics_soc["NAICS_CLEAN"].isin(leaves), "TOT_EMP_SD_hat"] = alloc

base = sd_naics_soc.loc[mask_occ, ["NAICS_CLEAN", "TOT_EMP_SD_hat"]].copy()

codes = sd_naics_soc.loc[mask_occ, "NAICS_CLEAN"].tolist()
for code in codes:
    sd_naics_soc.loc[mask_occ & sd_naics_soc["NAICS_CLEAN"].eq(code), "TOT_EMP_SD_hat"] = base.loc[
        base["NAICS_CLEAN"].str.startswith(code), "TOT_EMP_SD_hat"].sum()

sd_naics_soc.loc[sd_naics_soc.OCC_CODE=="49-9092"]

Unnamed: 0,NAICS_CLEAN,naics_len,OCC_CODE,OCC_TITLE,O_GROUP,H_MEAN_SD,TOT_EMP_SD_hat,AREA,AREA_TITLE
12157,4849,4,49-9092,Commercial Divers,detailed,38.3,22.5,41740,"San Diego-Carlsbad, CA"
12464,488,3,49-9092,Commercial Divers,detailed,38.3,22.5,41740,"San Diego-Carlsbad, CA"
12597,4883,4,49-9092,Commercial Divers,detailed,38.3,22.5,41740,"San Diego-Carlsbad, CA"
20041,56,2,49-9092,Commercial Divers,detailed,38.3,45.0,41740,"San Diego-Carlsbad, CA"
20428,561,3,49-9092,Commercial Divers,detailed,38.3,45.0,41740,"San Diego-Carlsbad, CA"
21731,5619,4,49-9092,Commercial Divers,detailed,38.3,45.0,41740,"San Diego-Carlsbad, CA"


In [164]:
sd_naics_soc.rename(columns={"TOT_EMP_SD_hat": "TOT_EMP", "H_MEAN_SD":"H_MEAN"}, inplace=True)
sd_naics_soc["TOT_EMP"] = round(sd_naics_soc["TOT_EMP"])
sd_agg, sd_matrix = build_naics_soc_matrix(sd_naics_soc, "detailed")

## Education

In [169]:
education_entry_level = pd.read_excel("./datasets_occupations/education_2023.xlsx", sheet_name= "educ_list")
education_entry_level.rename(columns={"OEWS May 2023 Code": "OCC_CODE", "OEWS May 2023 Title": "OCC_TITLE","Typical entry-level educational requirement": "ENTRY_LEVEL"}, inplace=True)
education_entry_level.head()

Unnamed: 0,OCC_CODE,OCC_TITLE,ENTRY_LEVEL
0,11-1011,Chief Executives,Bachelor's degree
1,11-1021,General and Operations Managers,Bachelor's degree
2,11-1031,Legislators,Bachelor's degree
3,11-2011,Advertising and Promotions Managers,Bachelor's degree
4,11-2021,Marketing Managers,Bachelor's degree


## Critical Occupations

In [180]:
ED_OK = {
    "Bachelor's degree",
    "Master's degree",
    "Associate's degree",
    "Doctoral or professional degree"
}

# 1) Keep only the "all NAICS" row if that's what your nat_agg uses (naics_len==0 / NAICS_CLEAN=='000000')
nat_base = nat_agg.loc[nat_agg["naics_len"].eq(0), ["OCC_CODE", "OCC_TITLE", "TOT_EMP", "H_MEAN"]].copy()
nat_base = nat_base.rename(columns={"TOT_EMP": "TOT_EMP_US", "H_MEAN": "H_MEAN_US"})

nat_base = (nat_base.groupby(["OCC_CODE", "OCC_TITLE"], as_index=False)
          .apply(lambda g: pd.Series({
              "TOT_EMP_US": g["TOT_EMP_US"].sum(min_count=1),
              "H_MEAN_US": (np.average(g.loc[g["H_MEAN_US"].notna() & (g["TOT_EMP_US"]>0), "H_MEAN_US"],
                                       weights=g.loc[g["H_MEAN_US"].notna() & (g["TOT_EMP_US"]>0), "TOT_EMP_US"])
                            if ((g["H_MEAN_US"].notna() & (g["TOT_EMP_US"]>0)).any()) else np.nan)
          }))
          .reset_index(drop=True))

nat_base

Unnamed: 0,OCC_CODE,OCC_TITLE,TOT_EMP_US,H_MEAN_US
0,11-1011,Chief Executives,462090.0,122.773096
1,11-1021,General and Operations Managers,7333160.0,64.042869
2,11-1031,Legislators,79530.0,
3,11-2011,Advertising and Promotions Managers,42590.0,71.614724
4,11-2021,Marketing Managers,776120.0,82.271482
...,...,...,...,...
826,53-7072,"Pump Operators, Except Wellhead Pumpers",25610.0,29.758665
827,53-7073,Wellhead Pumpers,34700.0,33.690000
828,53-7081,Refuse and Recyclable Material Collectors,322800.0,24.547422
829,53-7121,"Tank Car, Truck, and Ship Loaders",21800.0,28.825046


In [181]:
# 2) For San Diego: you likely have multiple NAICS rows; build an occupation-level SD wage.
#    Use employment-weighted mean wage across NAICS.
sd_base = sd_agg[["OCC_CODE", "OCC_TITLE", "TOT_EMP", "H_MEAN"]].copy()
sd_base["TOT_EMP"] = pd.to_numeric(sd_base["TOT_EMP"], errors="coerce")
sd_base["H_MEAN"] = pd.to_numeric(sd_base["H_MEAN"], errors="coerce")

sd_occ = (sd_base.groupby(["OCC_CODE", "OCC_TITLE"], as_index=False)
          .apply(lambda g: pd.Series({
              "TOT_EMP_SD": g["TOT_EMP"].sum(min_count=1),
              "H_MEAN_SD": (np.average(g.loc[g["H_MEAN"].notna() & (g["TOT_EMP"]>0), "H_MEAN"],
                                       weights=g.loc[g["H_MEAN"].notna() & (g["TOT_EMP"]>0), "TOT_EMP"])
                            if ((g["H_MEAN"].notna() & (g["TOT_EMP"]>0)).any()) else np.nan)
          }))
          .reset_index(drop=True))
sd_occ

Unnamed: 0,OCC_CODE,OCC_TITLE,TOT_EMP_SD,H_MEAN_SD
0,11-1011,Chief Executives,3254.0,135.49
1,11-1021,General and Operations Managers,24919.0,
2,11-1031,Legislators,210.0,
3,11-2011,Advertising and Promotions Managers,159.0,74.17
4,11-2021,Marketing Managers,4393.0,93.41
...,...,...,...,...
668,53-7063,Machine Feeders and Offbearers,329.0,19.40
669,53-7064,"Packers and Packagers, Hand",5162.0,18.81
670,53-7065,Stockers and Order Fillers,21023.0,20.47
671,53-7081,Refuse and Recyclable Material Collectors,949.0,31.76


In [182]:
# 3) Merge SD + US + education
crit = (sd_occ.merge(nat_base[["OCC_CODE", "TOT_EMP_US", "H_MEAN_US"]], on="OCC_CODE", how="left")
              .merge(education_entry_level[["OCC_CODE", "ENTRY_LEVEL"]], on="OCC_CODE", how="left"))
crit = crit.drop_duplicates()
crit.head()

Unnamed: 0,OCC_CODE,OCC_TITLE,TOT_EMP_SD,H_MEAN_SD,TOT_EMP_US,H_MEAN_US,ENTRY_LEVEL
0,11-1011,Chief Executives,3254.0,135.49,462090.0,122.773096,Bachelor's degree
1,11-1021,General and Operations Managers,24919.0,,7333160.0,64.042869,Bachelor's degree
2,11-1031,Legislators,210.0,,79530.0,,Bachelor's degree
3,11-2011,Advertising and Promotions Managers,159.0,74.17,42590.0,71.614724,Bachelor's degree
4,11-2021,Marketing Managers,4393.0,93.41,776120.0,82.271482,Bachelor's degree


In [183]:
nat_agg.loc[(nat_agg["OCC_CODE"]== "13-2052")]

Unnamed: 0,AREA,AREA_TITLE,NAICS_CLEAN,naics_len,OCC_CODE,OCC_TITLE,TOT_EMP,H_MEAN,N_ROWS
61,99,U.S.,000000,0,13-2052,Personal Financial Advisors,270480.0,77.02,1
890,99,U.S.,000001,0,13-2052,Personal Financial Advisors,270000.0,77.10,1
2239,99,U.S.,21,2,13-2052,Personal Financial Advisors,40.0,67.35,1
2480,99,U.S.,211,3,13-2052,Personal Financial Advisors,40.0,68.03,1
2630,99,U.S.,2111,4,13-2052,Personal Financial Advisors,40.0,68.03,1
...,...,...,...,...,...,...,...,...,...
72562,99,U.S.,999001,0,13-2052,Personal Financial Advisors,480.0,35.98,1
73979,99,U.S.,9992,4,13-2052,Personal Financial Advisors,130.0,38.20,1
74452,99,U.S.,999201,0,13-2052,Personal Financial Advisors,180.0,36.23,1
75077,99,U.S.,9993,4,13-2052,Personal Financial Advisors,200.0,42.06,1


In [185]:
# 4) Compute wage premium + apply your rules
crit["wage_premium_sd_vs_us"] = crit["H_MEAN_SD"] / crit["H_MEAN_US"]
crit["is_edu_ok"] = crit["ENTRY_LEVEL"].isin(ED_OK)

critical_occupations = crit.loc[
    crit["is_edu_ok"] &
    crit["H_MEAN_SD"].notna() &
    crit["H_MEAN_US"].notna() &
    (crit["H_MEAN_SD"] > crit["H_MEAN_US"])
].sort_values(["wage_premium_sd_vs_us", "TOT_EMP_SD"], ascending=[False, False])

critical_occupations.shape

(194, 9)

In [187]:
critical_occupations.head(20)

Unnamed: 0,OCC_CODE,OCC_TITLE,TOT_EMP_SD,H_MEAN_SD,TOT_EMP_US,H_MEAN_US,ENTRY_LEVEL,wage_premium_sd_vs_us,is_edu_ok
269,27-4032,Film and Video Editors,351.0,68.99,58210.0,40.046406,Bachelor's degree,1.722751,True
254,27-3011,Broadcast Announcers and Radio Disc Jockeys,151.0,63.09,48800.0,38.342328,Bachelor's degree,1.64544,True
153,19-4043,"Geological Technicians, Except Hydrologic Tech...",122.0,44.01,19690.0,28.007288,Associate's degree,1.571377,True
221,25-3011,"Adult Basic Education, Adult Secondary Educati...",179.0,46.72,99390.0,31.404579,Bachelor's degree,1.487681,True
309,29-2033,Nuclear Medicine Technologists,99.0,69.38,35570.0,47.874546,Associate's degree,1.449204,True
289,29-1141,Registered Nurses,27663.0,67.0,7097490.0,47.426724,Bachelor's degree,1.412706,True
92,17-1022,Surveyors,361.0,51.92,110370.0,37.216167,Bachelor's degree,1.395093,True
282,29-1124,Radiation Therapists,140.0,73.67,36550.0,53.162892,Associate's degree,1.385741,True
34,11-9161,Emergency Management Directors,50.0,62.99,34970.0,46.135019,Bachelor's degree,1.36534,True
486,45-2011,Agricultural Inspectors,81.0,36.71,32930.0,26.920255,Bachelor's degree,1.363657,True
