# Choose sectors, products and diversification paths
This notebook selects target products and sectors for diversification analysis. It focuses on identifying products based on strategic indicators like PCI, density, and proximity—both at the state and firm level.

In [1]:
import pandas as pd
import numpy as np
import requests
import re

In [2]:
# Select which data and aggregation to use.
# These settings are read by the cells below to pick the input/output
# directories and the location row that every indicator is extracted for.

data_choice = "labor" # one of: "trade", "labor"
labor_aggregation = "metropolitan_area" # one of: "state", "metropolitan_area"
loc = "41740" # Row label of the location to analyze (used as `.loc[loc]` below). NOTE(review): the original note mentions 33140 as "Michigan city" — presumably an alternative code; confirm.

## 1. Read Data

In [3]:
# Render floats with two decimals in every displayed table.
pd.options.display.float_format = '{:.2f}'.format

In [4]:
# Directory structure and read database
#
# Sets DATASETS_DIR / OUTPUTS_DIR (where inputs and computed indicators live)
# and `variable` (the value column name) from the configuration cell.
# Unknown configuration values fail here with a clear message instead of
# surfacing later as a NameError on an undefined directory variable.

if data_choice == "trade":
    DATASETS_DIR = './datasets_trade/'
    OUTPUTS_DIR = "./outputs_trade/"
    variable = "trade_value"
    code = "HS6"
    location = 'location'

elif data_choice == "labor":
    if labor_aggregation == "state":
        DATASETS_DIR = './datasets_labor_states/'
        OUTPUTS_DIR = "./outputs_labor_states/"
        variable = "EMP"
        location = "STATE_NAME"
        code = "NAICS2017"

    elif labor_aggregation == "metropolitan_area":
        DATASETS_DIR = './datasets_labor_metropolitan_area/'
        OUTPUTS_DIR = "./outputs_labor_metropolitan_area/"
        variable = "EMP"
        # NOTE(review): unlike the other branches, `location` and `code` are
        # not set here. Nothing visible in this notebook reads them, but
        # confirm whether they should be defined for consistency.

    else:
        raise ValueError(
            f"Unknown labor_aggregation {labor_aggregation!r}; "
            "expected 'state' or 'metropolitan_area'."
        )

else:
    raise ValueError(
        f"Unknown data_choice {data_choice!r}; expected 'trade' or 'labor'."
    )

In [5]:
def _read_output(file_name):
    """Load one parquet file from OUTPUTS_DIR using the fastparquet engine."""
    return pd.read_parquet(OUTPUTS_DIR + file_name, engine='fastparquet')


# Indicator tables produced by earlier steps of the pipeline.
relative_density = _read_output("relative_density.parquet")
pci = _read_output("pci.parquet")
relative_cog = _read_output("relative_cog.parquet")

# Code and location listings.
products = _read_output("codes.parquet")
locations = _read_output("locations.parquet")

# Specialization and level tables.
rca = _read_output("RCA.parquet")
mpa = _read_output("Mpa.parquet")
value_level = _read_output("value_level.parquet")

# Location complexity and product-to-product proximity.
eci = _read_output("eci.parquet")
proximity = _read_output("proximity.parquet")

## 2. Choose Sectors
Filter and rank products by sector, using PCI, export potential, and relevance to key industries like electric vehicles (EVs).

### Create files for states

In [6]:
# Weights applied to the three dimensions (density, COG, PCI) under each
# strategy. Density and COG weights are given explicitly; the PCI weight is
# whatever remains, so each strategy's weights add up to 1 (up to float rounding).

# Low-hanging fruits
dens_low, cog_low = 0.7, 0.1
pci_low = 1. - dens_low - cog_low

# Balanced strategy
dens_bal, cog_bal = 0.3, 0.1
pci_bal = 1. - dens_bal - cog_bal

# Long jump
dens_long, cog_long = 0.2, 0.4
pci_long = 1. - dens_long - cog_long

In [7]:
def top_criterios(x, idx_low, idx_bal, idx_long, idx_VCR):
    """Describe which selection criteria include the row `x`.

    `x.name` (the row label) is tested for membership in each of the four
    index collections. The labels of the matching criteria are returned as a
    single comma-separated string, in the fixed order low-hanging, balanced,
    long jump, productive trajectory. An empty string means no criterion matched.
    """
    labelled_criteria = (
        (idx_low, 'Low-hanging fruits'),
        (idx_bal, 'Balance strategy'),
        (idx_long, 'Long jump'),
        (idx_VCR, 'Productive trajectory'),
    )
    matched = [label for members, label in labelled_criteria if x.name in members]
    return ', '.join(matched)

In [8]:
# Define the criteria in a function.

def criterios_atlas(df_temp, criterio):
    """Compute the weighted strategy index for a single product row.

    Parameters
    ----------
    df_temp : mapping-like row
        Must provide 'mcp'; when 'mcp' is below 1 it must also provide
        'density', 'relative_cog' and 'pci'.
    criterio : str
        One of 'low_hanging', 'balanced' or 'long_jump'. Selects which set of
        module-level weights (dens_*, cog_*, pci_*) is applied.

    Returns
    -------
    float
        NaN when the row's 'mcp' is >= 1; otherwise
        dens * density + cog * relative_cog + pci * pci with the weights of
        the chosen strategy.

    Raises
    ------
    ValueError
        If `criterio` is not one of the three supported names (for a row
        with 'mcp' below 1).
    """
    # Note that only those with mcp < 1 are considered.
    if df_temp['mcp'] >= 1.:
        return np.nan

    if criterio == 'low_hanging':
        dens_mul, cog_mul, pci_mul = dens_low, cog_low, pci_low
    elif criterio == 'balanced':
        dens_mul, cog_mul, pci_mul = dens_bal, cog_bal, pci_bal
    elif criterio == 'long_jump':
        dens_mul, cog_mul, pci_mul = dens_long, cog_long, pci_long
    else:
        # Previously an unknown name fell through and failed below with an
        # UnboundLocalError on the weight variables.
        raise ValueError(
            f"Unknown criterio {criterio!r}; expected 'low_hanging', "
            "'balanced' or 'long_jump'."
        )

    index = dens_mul * df_temp['density'] + \
            cog_mul * df_temp['relative_cog'] + \
            pci_mul * df_temp['pci']

    return index

In [9]:
# NAICS 2017 -> 2022 concordance: rename the key columns and store both
# code columns as strings so they can be used as merge keys.
concordance_2017_2022 = (
    pd.read_excel(DATASETS_DIR + "2017_to_2022_NAICS.xlsx")
    .rename(
        columns={
            "2017 NAICS Code": "code",
            "2022 NAICS Code": "NAICS_2022",
            "2022 NAICS Title": "NAICS_Description",
        }
    )
    .astype({"code": str, "NAICS_2022": str})
)
concordance_2017_2022.head()

Unnamed: 0,code,2017 NAICS Title\n(and specific piece of the 2017 industry that is contained in the 2022 industry),NAICS_2022,NAICS_Description
0,111110,Soybean Farming,111110,Soybean Farming
1,111120,Oilseed (except Soybean) Farming,111120,Oilseed (except Soybean) Farming
2,111130,Dry Pea and Bean Farming,111130,Dry Pea and Bean Farming
3,111140,Wheat Farming,111140,Wheat Farming
4,111150,Corn Farming,111150,Corn Farming


In [10]:
codes = products.iloc[:, 0].astype(str)

# One row per product code; PCI values are attached positionally.
df_full = pd.DataFrame({"code": codes.values})
df_full["pci"] = pci.values

# Indicators taken from the row of the selected location, re-ordered to
# follow `codes` before being attached positionally.
location_indicators = (
    ("relative_cog", relative_cog),
    ("density", relative_density),
    ("rca", rca),
    ("mcp", mpa),
    (variable, value_level),
)
for column_name, indicator in location_indicators:
    df_full[column_name] = indicator.loc[loc].reindex(codes).values

# Attach the 2022 code and description, then discard the 2017 code.
# NOTE(review): a left merge yields one row per matching concordance entry, so
# a code listed more than once in the concordance would repeat here — confirm.
naics_labels = concordance_2017_2022[["code", "NAICS_2022", "NAICS_Description"]]
df_full = df_full.merge(naics_labels, how="left", on="code").drop(columns="code")
df_full.head()

Unnamed: 0,pci,relative_cog,density,rca,mcp,EMP,NAICS_2022,NAICS_Description
0,-1.32,-0.82,1.02,0.52,0,45.0,115112,"Soil Preparation, Planting, and Cultivating"
1,-0.87,-0.56,1.34,0.63,0,119.0,212321,Construction Sand and Gravel Mining
2,-1.47,-0.59,0.81,0.74,0,2545.0,221122,Electric Power Distribution
3,-1.67,-0.9,1.36,1.1,1,2759.0,236115,New Single-Family Housing Construction (except...
4,-1.56,-0.85,1.59,1.55,1,7904.0,236118,Residential Remodelers


In [11]:
# Score every row under each of the three strategies.
strategy_columns = (
    ('idx_low_hanging', 'low_hanging'),
    ('idx_balanced', 'balanced'),
    ('idx_long_jump', 'long_jump'),
)
for index_column, strategy in strategy_columns:
    df_full[index_column] = df_full.apply(criterios_atlas, axis=1, args=(strategy,))

N_largest = 25

# Mean PCI over the rows where mcp equals 1.
pci_mean = df_full.loc[df_full['mcp'] == 1., 'pci'].mean()

# Row labels of the N_largest best-scoring rows for each strategy.
idx_low, idx_bal, idx_long = (
    df_full[index_column].nlargest(N_largest).index
    for index_column, _ in strategy_columns
)

# Rows with rca below 1 and PCI above that mean, keeping the N_largest highest rca values.
vcr_candidates = (df_full['rca'] < 1.) & (df_full['pci'] > pci_mean)
idx_VCR = df_full.loc[vcr_candidates, 'rca'].nlargest(N_largest).index

# Text label listing every criterion each row was selected by.
df_full[f'Top {N_largest} criterios'] = df_full.apply(
    top_criterios, axis=1, args=(idx_low, idx_bal, idx_long, idx_VCR)
)

In [12]:
# Sector classification: keep only the rows marked "Core" and format the
# NAICS code as a zero-padded, six-character string.
sectors = (
    pd.read_excel(DATASETS_DIR + "Sector classification NAICS 11.25.xlsx", sheet_name='NAICS')
    .rename(columns={"2022 NAICS Codes": "NAICS_6"})
    .assign(NAICS_6=lambda frame: frame["NAICS_6"].apply(lambda value: f'{value:06}'))
    .loc[lambda frame: frame["Relevance"] == "Core"]
)

# Unique (code, sector) pairs; both selected columns form the duplicate key.
sectors_6digits = sectors[['NAICS_6', 'Strategic Sector']].drop_duplicates()
sectors_6digits.head()

Unnamed: 0,NAICS_6,Strategic Sector
0,336411,Aerospace
1,336412,Aerospace
2,336413,Aerospace
3,336414,Aerospace
4,336415,Aerospace


In [13]:
# Tag each product with its strategic sector; rows without a match keep NaN.
df_full = pd.merge(
    df_full,
    sectors_6digits[["NAICS_6", "Strategic Sector"]],
    how="left",
    left_on="NAICS_2022",
    right_on="NAICS_6",
)
df_full.head()

Unnamed: 0,pci,relative_cog,density,rca,mcp,EMP,NAICS_2022,NAICS_Description,idx_low_hanging,idx_balanced,idx_long_jump,Top 25 criterios,NAICS_6,Strategic Sector
0,-1.32,-0.82,1.02,0.52,0,45.0,115112,"Soil Preparation, Planting, and Cultivating",0.37,-0.57,-0.65,,,
1,-0.87,-0.56,1.34,0.63,0,119.0,212321,Construction Sand and Gravel Mining,0.71,-0.18,-0.3,,,
2,-1.47,-0.59,0.81,0.74,0,2545.0,221122,Electric Power Distribution,0.21,-0.7,-0.66,,221122.0,Clean Energies & Utilities
3,-1.67,-0.9,1.36,1.1,1,2759.0,236115,New Single-Family Housing Construction (except...,,,,,,
4,-1.56,-0.85,1.59,1.55,1,7904.0,236118,Residential Remodelers,,,,,,


## Other tables

In [14]:
# PCI per product, labelled with the 2022 NAICS code and description.
pci_labels = concordance_2017_2022[["code", "NAICS_2022", "NAICS_Description"]]

df_pci = pd.DataFrame({"code": codes.values})
df_pci["pci"] = pci.values
df_pci = df_pci.merge(pci_labels, how="left", on="code").drop(columns="code")

# Display the 15 rows with the highest PCI.
df_pci.sort_values("pci", ascending=False).head(15)

Unnamed: 0,pci,NAICS_2022,NAICS_Description
1046,2.67,336419,Other Guided Missile and Space Vehicle Parts a...
1016,2.48,332112,Nonferrous Forging
1025,2.48,523210,Securities and Commodity Exchanges
991,2.47,334112,Computer Storage Device Manufacturing
993,2.38,334610,Manufacturing and Reproducing Magnetic and Opt...
1004,2.25,325130,Synthetic Dye and Pigment Manufacturing
981,2.23,311225,Fats and Oils Refining and Blending
1048,2.23,212290,Other Metal Ore Mining
937,2.19,512120,Motion Picture and Video Distribution
1045,2.11,336415,Guided Missile and Space Vehicle Propulsion Un...


In [15]:
# ECI per location.
# The location codes get their own name: rebinding `codes` here would
# overwrite the product codes that the df_full / df_pci cells read, so
# re-running those cells afterwards would silently use location codes.
cbsa_codes = locations.iloc[:, 0].astype(str)

df_eci = pd.DataFrame({"cbsa": cbsa_codes.values})
df_eci["eci"] = (eci.values)
df_eci.head()

Unnamed: 0,cbsa,eci
0,10100,-0.43
1,10140,-0.46
2,10180,0.21
3,11360,-0.27
4,10220,-0.9


In [16]:
# For metropolitan areas, attach the CBSA title to each ECI row and rank by ECI.
if labor_aggregation == "metropolitan_area":
    msa_codes = (
        pd.read_excel(DATASETS_DIR + "list2_2023.xlsx")[["CBSA Code", "CBSA Title"]]
        .rename(columns={"CBSA Code": "cbsa"})
        .drop_duplicates(subset=["cbsa"], keep="first")
    )
    eci_with_titles = df_eci.merge(msa_codes, how="left", on="cbsa")
    df_eci = eci_with_titles.sort_values(by="eci", ascending=False)

## Proximity analysis

In [17]:
# Expose the index of the proximity matrix as a regular "code" column.
# Guarded so that re-running this cell does not reset the index a second
# time and add a duplicate "code" column.
if "code" not in proximity.columns:
    proximity = proximity.reset_index(drop=False).rename(columns={"index": "code"})
proximity.shape

(962, 963)

In [18]:
# Create a DataFrame with all proximity matches

def _matches_for_anchor(anchor):
    """Return each product's proximity to `anchor`, highest proximity first."""
    ranked = proximity[["code", anchor]].sort_values(by=anchor, ascending=False)
    ranked = ranked.rename(columns={anchor: "proximity"})
    ranked["anchor_product"] = anchor
    return ranked


# One ranked table per distinct product code, stacked into a single long table.
top_matches = [
    _matches_for_anchor(anchor)
    for anchor in proximity["code"].drop_duplicates()
]

df_all_targets = (
    pd.concat(top_matches, ignore_index=True)
    .drop_duplicates(subset=["code", "anchor_product"], keep="first")
)
df_all_targets.shape

(925444, 3)

In [19]:
# Translate both sides of every pair to 2022 NAICS codes: first the matched
# product ("code"), then the anchor product.
code_to_2022 = concordance_2017_2022[["code", "NAICS_2022"]]

df_all_targets = (
    df_all_targets
    .merge(code_to_2022, how="left", on="code")
    .drop(columns="code")
    # The second merge brings in another NAICS_2022 column, so pandas
    # suffixes the two with _x (matched product) and _y (anchor product).
    .merge(code_to_2022, how="left", left_on="anchor_product", right_on="code")
    .drop(columns=["code", "anchor_product"])
    .rename(columns={"NAICS_2022_x": "NAICS_2022", "NAICS_2022_y": "NAICS_2022_todiversify"})
)
df_all_targets.head()

Unnamed: 0,proximity,NAICS_2022,NAICS_2022_todiversify
0,1.0,115112,115112
1,0.35,424510,115112
2,0.3,115114,115112
3,0.29,311119,115112
4,0.28,811114,115112


In [20]:
# Indicator columns carried alongside the proximity matches, highest PCI first.
analysis_columns = [
    "NAICS_2022",
    "NAICS_Description",
    "pci",
    "relative_cog",
    "density",
    "idx_low_hanging",
    "idx_balanced",
    "idx_long_jump",
]
df_proximity_analysis = df_full[analysis_columns].sort_values(by="pci", ascending=False)

# Pre-filter for speed: keep only pairs with proximity strictly between 0.3 and 1.
df_firms = df_proximity_analysis[["NAICS_2022"]]
proximity_values = df_all_targets["proximity"]
df_all_targets = df_all_targets[proximity_values.gt(0.3) & proximity_values.lt(1)]

# Attach the remaining pairs to every product code.
df_merged = pd.merge(df_firms, df_all_targets, how="left", on="NAICS_2022")
print(df_merged.shape)
df_merged.head()

(88097, 3)


Unnamed: 0,NAICS_2022,proximity,NAICS_2022_todiversify
0,336419,0.33,316990
1,336419,0.33,483112
2,336419,0.33,334112
3,336419,0.33,332112
4,336419,0.5,336414


In [21]:
# Attach the indicators of the product to diversify into. Both frames carry a
# NAICS_2022 column, so the merge suffixes them with _x (origin) and _y (target).
merged_with_indicators = df_merged.merge(
    df_proximity_analysis,
    how="left",
    left_on="NAICS_2022_todiversify",
    right_on="NAICS_2022",
)
# The _y column repeats NAICS_2022_todiversify for matched rows, so it is dropped.
merged_with_indicators = merged_with_indicators.drop(columns=["NAICS_2022_y"])
df_merged = merged_with_indicators.rename(columns={"NAICS_2022_x": "NAICS_2022_origin"})
df_merged.head()

Unnamed: 0,NAICS_2022_origin,proximity,NAICS_2022_todiversify,NAICS_Description,pci,relative_cog,density,idx_low_hanging,idx_balanced,idx_long_jump
0,336419,0.33,316990,Other Leather and Allied Product Manufacturing,1.54,-0.6,-1.72,-0.96,0.35,0.03
1,336419,0.33,316990,Other Leather and Allied Product Manufacturing,0.42,-0.13,1.21,,,
2,336419,0.33,483112,Deep Sea Passenger Transportation,1.91,-0.62,-1.27,-0.57,0.7,0.26
3,336419,0.33,334112,Computer Storage Device Manufacturing,2.47,-1.46,-0.98,-0.34,1.04,0.21
4,336419,0.33,332112,Nonferrous Forging,2.48,0.5,-2.13,-0.95,0.9,0.77


## Export

In [22]:
# Write every table to one Excel workbook named after the analyzed location.

# Built from N_largest so it always matches the column created together with
# the top-N selection; a hard-coded "Top 25 criterios" raises KeyError as soon
# as N_largest is changed.
top_column = f"Top {N_largest} criterios"
has_sector = df_full["Strategic Sector"].notna()

with pd.ExcelWriter(OUTPUTS_DIR + f"outputs_{loc}.xlsx", engine="xlsxwriter") as writer:
    df_eci.to_excel(writer, sheet_name="ECI", index=False)
    df_pci.to_excel(writer, sheet_name="PCI", index=False)
    df_full.to_excel(writer, sheet_name="Full database", index=False)

    # Rows selected by at least one criterion (non-empty label).
    df_full.loc[df_full[top_column] != ""].to_excel(writer, sheet_name="Selected HS", index=False)

    # Strategic-sector rows, split by whether mcp equals 1 or is below 1.
    df_full.loc[(df_full["mcp"] == 1) & has_sector].to_excel(writer, sheet_name="Specialized Sectors", index=False)
    df_full.loc[(df_full["mcp"] < 1) & has_sector].to_excel(writer, sheet_name="Not Specialized Sectors", index=False)

    # Mean indicators per (NAICS code, sector) and per sector; the group keys
    # are written as the sheet index, hence no index=False here.
    df_full[["pci", "relative_cog", "density", "NAICS_2022", "Strategic Sector"]].groupby(["NAICS_2022", "Strategic Sector"]).mean().sort_values(by="Strategic Sector", ascending=False).to_excel(writer, sheet_name="NAICS Averages")
    df_full[["pci", "relative_cog", "density", "Strategic Sector"]].groupby(["Strategic Sector"]).mean().sort_values(by="Strategic Sector", ascending=False).to_excel(writer, sheet_name="Sector Averages")

    df_merged.to_excel(writer, sheet_name="Proximity Matches", index=False)