# Choose sectors, products and diversification paths
This notebook selects target products and sectors for diversification analysis. It focuses on identifying products based on strategic indicators like PCI, density, and proximity—both at the state and firm level.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Select which data and aggregation to use
# These three values drive the rest of the notebook: `data_choice` and
# `labor_aggregation` pick the dataset/output folders, and `loc` is the row
# label looked up in the per-location matrices and used in the export filename.

data_choice = "trade" # trade / labor
labor_aggregation = "state" # state / metropolitan_area
loc = "Michigan" # Location to analyze

## 1. Read Data

In [3]:
# Render floats with two decimals in every displayed table.
pd.options.display.float_format = '{:.2f}'.format

In [4]:
# Directory structure and read database
#
# Derive, from the configuration cell, the input/output folders and the names
# of the value, product-code and location columns of the selected dataset.
# Unknown options fail immediately instead of surfacing later as a NameError
# on an undefined directory.

if data_choice == "trade":
    DATASETS_DIR = './datasets_trade/'
    OUTPUTS_DIR = "./outputs_trade/"
    variable = "trade_value"
    code = "HS6"
    location = 'location'

elif data_choice == "labor":
    if labor_aggregation == "state":
        DATASETS_DIR = './datasets_labor_states/'
        OUTPUTS_DIR = "./outputs_labor_states/"
        variable = "EMP"
        location = "STATE_NAME"
        code = "NAICS2017"

    elif labor_aggregation == "metropolitan_area":
        DATASETS_DIR = './datasets_labor_metropolitan_area/'
        OUTPUTS_DIR = "./outputs_labor_metropolitan_area/"
        variable = "EMP"
        # NOTE(review): `location` and `code` are not set for this aggregation;
        # the right column names are not visible in this notebook -- confirm
        # and add them before relying on either variable in this mode.

    else:
        raise ValueError(
            f"Unknown labor_aggregation {labor_aggregation!r}; "
            "expected 'state' or 'metropolitan_area'"
        )

else:
    raise ValueError(
        f"Unknown data_choice {data_choice!r}; expected 'trade' or 'labor'"
    )

In [25]:
def _read_output(file_name):
    """Load one parquet file from OUTPUTS_DIR with the fastparquet engine."""
    return pd.read_parquet(OUTPUTS_DIR + file_name, engine='fastparquet')


# Pre-computed artefacts, one variable per file in the outputs folder.
relative_density = _read_output("relative_density.parquet")
pci = _read_output("pci.parquet")
relative_cog = _read_output("relative_cog.parquet")
products = _read_output("codes.parquet")
locations = _read_output("locations.parquet")
rca = _read_output("RCA.parquet")
mpa = _read_output("Mpa.parquet")
value_level = _read_output("value_level.parquet")
eci = _read_output("eci.parquet")
proximity = _read_output("proximity.parquet")

## 2. Choose Sectors
Filter and rank products by sector, using PCI, export potential, and relevance to key industries like electric vehicles (EVs).

### Create files for states

In [6]:
# Weights of the three dimensions (density, COG, PCI) for each strategy.
# Density and COG are set explicitly; PCI takes whatever remains so that each
# strategy's weights add up to one.

# Low-hanging fruits: density dominates.
dens_low, cog_low = 0.7, 0.1
pci_low = 1. - dens_low - cog_low

# Balanced strategy: PCI takes the largest share.
dens_bal, cog_bal = 0.3, 0.1
pci_bal = 1. - dens_bal - cog_bal

# Long jump: COG and PCI share most of the weight.
dens_long, cog_long = 0.2, 0.4
pci_long = 1. - dens_long - cog_long

In [7]:
def top_criterios(x, idx_low, idx_bal, idx_long, idx_VCR):
    """Describe which top-N selections a row belongs to.

    Parameters
    ----------
    x : object with a ``name`` attribute (a row passed by ``DataFrame.apply``)
        ``x.name`` is the label tested for membership.
    idx_low, idx_bal, idx_long, idx_VCR : containers of labels
        Labels selected by each strategy.

    Returns
    -------
    str
        Matching strategy names joined by ``', '`` in a fixed order, or an
        empty string when the row is in none of the selections.
    """
    etiquetas = (
        (idx_low, 'Low-hanging fruits'),
        (idx_bal, 'Balance strategy'),
        (idx_long, 'Long jump'),
        (idx_VCR, 'Productive trajectory'),
    )
    return ', '.join(nombre for seleccion, nombre in etiquetas if x.name in seleccion)

In [8]:
# Define the criteria in a function.

def criterios_atlas(df_temp, criterio):
    """Score one product row under a diversification strategy.

    Parameters
    ----------
    df_temp : mapping-like row
        Must provide ``'mcp'``, ``'density'``, ``'relative_cog'`` and ``'pci'``.
    criterio : str
        One of ``'low_hanging'``, ``'balanced'`` or ``'long_jump'``; selects
        which set of module-level weights is applied.

    Returns
    -------
    float
        Weighted sum of density, relative COG and PCI, or ``np.nan`` when
        ``mcp >= 1`` (such rows are excluded from the ranking).

    Raises
    ------
    ValueError
        If ``criterio`` is not a known strategy (previously this surfaced as
        an ``UnboundLocalError`` on the weight variables).
    """
    # Note that only those with mcp < 1 are considered.
    if df_temp['mcp'] >= 1.:
        return np.nan

    if criterio == 'low_hanging':
        dens_mul, cog_mul, pci_mul = dens_low, cog_low, pci_low
    elif criterio == 'balanced':
        dens_mul, cog_mul, pci_mul = dens_bal, cog_bal, pci_bal
    elif criterio == 'long_jump':
        dens_mul, cog_mul, pci_mul = dens_long, cog_long, pci_long
    else:
        raise ValueError(
            f"Unknown criterio {criterio!r}; expected 'low_hanging', "
            "'balanced' or 'long_jump'"
        )

    index = dens_mul * df_temp['density'] + \
            cog_mul * df_temp['relative_cog']  + \
            pci_mul * df_temp['pci']

    return index

In [9]:
# HS classification table: keep only the rows whose "Level" is 6 and expose
# the code/description columns under lower-case names.
hs_oec_class = (
    pd.read_excel(DATASETS_DIR  + 'BACI/HSCodeandDescription.xlsx')
    .loc[lambda table: table["Level"] == 6]
    .rename(columns={"Code": "code", "Description": "description"})
)
hs_oec_class.head()

Unnamed: 0,Classification,code,description,Parent Code,Level,IsBasicLevel
2,H6,10121,"Horses; live, pure-bred breeding animals",101,6,1
3,H6,10129,"Horses; live, other than pure-bred breeding an...",101,6,1
4,H6,10130,Asses; live,101,6,1
5,H6,10190,Mules and hinnies; live,101,6,1
7,H6,10221,"Cattle; live, pure-bred breeding animals",102,6,1


In [10]:
# Product codes (first column of the codes table), as strings.
codes = products.iloc[:, 0].astype(str)

# One row per product code; PCI is assigned positionally.
df_full = pd.DataFrame({"code": codes.values})
df_full["pci"] = (pci.values)

# Every remaining indicator is the row of `loc` in its matrix, re-ordered to
# follow `codes`.
for columna, matriz in (
    ("relative_cog", relative_cog),
    ("density", relative_density),
    ("rca", rca),
    ("mcp", mpa),
    (variable, value_level),
):
    df_full[columna] = (matriz.loc[loc].reindex(codes).values)

# Add the human-readable description of each code.
df_full = df_full.merge(hs_oec_class[["code", "description"]], how="left", on="code")
df_full.head()

Unnamed: 0,code,pci,relative_cog,density,rca,mcp,trade_value,description
0,10121,0.67,-0.81,0.22,0.0,0,10513.0,"Horses; live, pure-bred breeding animals"
1,10221,-0.07,0.08,1.32,0.26,0,794042.0,"Cattle; live, pure-bred breeding animals"
2,10619,-0.07,-0.1,1.05,0.12,0,101605.0,"Mammals; live, other than primates, whales, do..."
3,10641,-0.59,-0.13,0.58,16.49,1,3892315.0,"Insects; live, bees"
4,20120,-0.11,0.87,0.62,0.33,0,5547818.0,"Meat; of bovine animals, cuts with bone in (ex..."


In [11]:
# Score every product under each of the three strategies.
for criterio in ('low_hanging', 'balanced', 'long_jump'):
    df_full[f'idx_{criterio}'] = df_full.apply(lambda fila: criterios_atlas(fila, criterio), axis=1)

N_largest = 25

# Mean PCI of the products flagged with mcp == 1.
pci_mean = df_full.loc[df_full['mcp'] == 1., 'pci'].mean()

# Row labels of the N best-scored products per strategy.
idx_low = df_full['idx_low_hanging'].nlargest(N_largest).index
idx_bal = df_full['idx_balanced'].nlargest(N_largest).index
idx_long = df_full['idx_long_jump'].nlargest(N_largest).index

# Products with RCA below 1 and PCI above that mean, ranked by RCA.
above_mean_pci = (df_full['rca'] < 1.) & (df_full['pci'] > pci_mean)
idx_VCR = df_full.loc[above_mean_pci, 'rca'].nlargest(N_largest).index

# Text label listing the selections each row belongs to.
df_full[f'Top {N_largest} criterios'] = df_full.apply(
    top_criterios, axis=1, args=(idx_low, idx_bal, idx_long, idx_VCR)
)

In [12]:
# HS6 -> NAICS (5-digit) correspondence: HS6 zero-padded to six characters,
# NAICS_5 as string, one row per distinct (HS6, NAICS_5) pair.
hs_naics = (
    pd.read_excel(DATASETS_DIR + "HS_NAICS.xlsx")
    .assign(
        HS6=lambda tabla: tabla["HS6"].apply(lambda x: f'{x:06}'),
        NAICS_5=lambda tabla: tabla["NAICS_5"].astype(str),
    )
    .loc[:, ["HS6", "NAICS_5"]]
    .drop_duplicates(subset=["HS6", "NAICS_5"], keep="first")
)
hs_naics.head()

Unnamed: 0,HS6,NAICS_5
0,10121,11292
1,10129,11292
2,10130,11292
3,10190,11292
4,10221,11211


In [13]:
# Attach the NAICS 5-digit code to each product by matching `code` against
# the HS6 column of the correspondence table (left join: unmatched products
# are kept with missing NAICS_5).
# NOTE(review): hs_naics is de-duplicated on the (HS6, NAICS_5) pair, so an
# HS6 mapped to several NAICS_5 codes would repeat its row here -- confirm
# this is intended. Re-running this cell merges again on the already-merged
# frame.
df_full = df_full.merge(hs_naics, how="left", left_on="code", right_on="HS6")

In [14]:
# Sector classification: zero-pad the NAICS code to six characters, derive
# its 5-character prefix and keep only the rows marked as "Core".
sectors = (
    pd.read_excel(DATASETS_DIR + "Sector classification NAICS 11.25.xlsx", sheet_name='NAICS')
    .rename(columns={"2022 NAICS Codes": "NAICS_6"})
    .assign(NAICS_6=lambda tabla: tabla["NAICS_6"].apply(lambda x: f'{x:06}'))
    .assign(NAICS_5=lambda tabla: tabla["NAICS_6"].str[:5])
    .loc[lambda tabla: tabla["Relevance"] == "Core"]
)

# One row per distinct (NAICS_5, Strategic Sector) pair.
sectors_5digits = (
    sectors
    .loc[:, ['NAICS_5', 'Strategic Sector']]
    .drop_duplicates(subset=["NAICS_5", "Strategic Sector"], keep="first")
)
sectors_5digits.head()

Unnamed: 0,NAICS_5,Strategic Sector
0,33641,Aerospace
6,92711,Aerospace
7,33591,Batteries & Energy Storage
8,22111,Clean Energies & Utilities
15,22112,Clean Energies & Utilities


In [15]:
# Tag each product with the strategic sector of its NAICS_5 code. Only "Core"
# sectors are present in sectors_5digits, so with this left join every other
# product keeps a missing "Strategic Sector".
# NOTE(review): a NAICS_5 listed under more than one strategic sector would
# repeat its product rows -- confirm against the classification file.
df_full = df_full.merge(sectors_5digits[["NAICS_5", "Strategic Sector"]], how="left", on="NAICS_5")
df_full.head()

Unnamed: 0,code,pci,relative_cog,density,rca,mcp,trade_value,description,idx_low_hanging,idx_balanced,idx_long_jump,Top 25 criterios,HS6,NAICS_5,Strategic Sector
0,10121,0.67,-0.81,0.22,0.0,0,10513.0,"Horses; live, pure-bred breeding animals",0.21,0.39,-0.01,,10121,11292,
1,10221,-0.07,0.08,1.32,0.26,0,794042.0,"Cattle; live, pure-bred breeding animals",0.92,0.36,0.27,,10221,11211,
2,10619,-0.07,-0.1,1.05,0.12,0,101605.0,"Mammals; live, other than primates, whales, do...",0.71,0.27,0.14,,10619,11299,
3,10641,-0.59,-0.13,0.58,16.49,1,3892315.0,"Insects; live, bees",,,,,10641,11291,
4,20120,-0.11,0.87,0.62,0.33,0,5547818.0,"Meat; of bovine animals, cuts with bone in (ex...",0.5,0.21,0.43,,20120,31161,


In [16]:
# NAICS code -> description lookup table, read as-is from the datasets folder.
# NOTE(review): `naics_code` is later joined against the string column
# NAICS_5; its dtype is whatever read_excel infers -- confirm it is text.
naics_equivalence = pd.read_excel(DATASETS_DIR + "NAICS_EQUIVALENCE.xlsx")
naics_equivalence.head()

Unnamed: 0,naics_code,naics_description,level,parent_code
0,11,"Agriculture, Forestry, Fishing and Hunting",2,
1,111,Crop Production,3,11.0
2,1111,Oilseed and Grain Farming,4,111.0
3,11111,Soybean Farming,5,111.0
4,111110,Soybean Farming,6,1111.0


In [17]:
# Attach the NAICS description of each product's NAICS_5 code.
# NAICS_5 is a string column, while `naics_code` comes straight from
# read_excel with an inferred dtype. Cast the lookup key to str so the join
# compares like with like: this is a no-op if the codes are already text and
# avoids a dtype error or silent non-matches if they were read as numbers.
naics_labels = naics_equivalence[["naics_code", "naics_description"]].assign(
    naics_code=lambda tabla: tabla["naics_code"].astype(str)
)
df_full = df_full.merge(naics_labels, how="left", left_on="NAICS_5", right_on="naics_code")

## Other tables

In [18]:
# Stand-alone PCI table: product code, its PCI (assigned positionally) and
# the product description.
df_pci = (
    pd.DataFrame({"code": codes.values})
    .assign(pci=pci.values)
    .merge(hs_oec_class[["code", "description"]], how="left", on="code")
)
df_pci.head()

Unnamed: 0,code,pci,description
0,10121,0.67,"Horses; live, pure-bred breeding animals"
1,10221,-0.07,"Cattle; live, pure-bred breeding animals"
2,10619,-0.07,"Mammals; live, other than primates, whales, do..."
3,10641,-0.59,"Insects; live, bees"
4,20120,-0.11,"Meat; of bovine animals, cuts with bone in (ex..."


In [19]:
# Location names (first column of the locations table), as strings.
# A dedicated name is used instead of re-binding `codes`, which holds the
# product codes used by the product-level tables; overwriting it made those
# cells fail or misalign when re-run after this one.
location_names = locations.iloc[:, 0].astype(str)

# ECI table: one row per location. ECI values are assigned positionally, so
# `eci` is assumed to follow the same order as `locations`.
df_eci = pd.DataFrame({"code": location_names.values})
df_eci["eci"] = (eci.values)
df_eci.head()

Unnamed: 0,code,eci
0,Afghanistan,-1.18
1,Alabama,1.3
2,Alaska,0.08
3,Albania,-0.99
4,Algeria,-1.07


In [20]:
# Average PCI, relative COG and density per strategic sector, listed in
# reverse alphabetical order of the sector name.
(
    df_full
    .groupby("Strategic Sector")[["pci", "relative_cog", "density"]]
    .mean()
    .sort_index(ascending=False)
)

Unnamed: 0_level_0,pci,relative_cog,density
Strategic Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Telecommunications & Data Infrastructure,0.69,0.12,1.23
Shipbuilding,-0.2,-0.22,0.15
Medical Equipment,0.75,-0.07,1.09
Life Sciences,0.54,-0.34,0.21
Electronic Components & Devices,0.73,0.31,0.97
Defense,0.54,0.06,0.84
Batteries & Energy Storage,0.75,-0.03,0.94
Aerospace,0.46,-0.29,0.91


## Proximity matches

In [26]:
# Move the row labels of the proximity matrix into an explicit "HS6" column.
# Guarded and non-in-place so that re-running the cell is harmless: a second
# in-place reset_index + rename would otherwise add a duplicate "HS6" column.
if "HS6" not in proximity.columns:
    proximity = proximity.reset_index(drop=False).rename(columns={"index": "HS6"})
proximity.shape

(5605, 5606)

In [27]:
# Create a DataFrame with all proximity matches

# Each distinct value of the "HS6" column is used as an anchor product and
# must also be a column of `proximity`.
anchor_codes = proximity["HS6"].drop_duplicates(keep="first")

# For every anchor: its proximity to each product, highest first, in long
# format (HS6, proximity, anchor_product).
top_matches = [
    proximity[["HS6", anchor]]
    .sort_values(by=anchor, ascending=False)
    .rename(columns={anchor: "proximity"})
    .assign(anchor_product=anchor)
    for anchor in anchor_codes
]

df_all_targets = (
    pd.concat(top_matches, ignore_index=True)
    .drop_duplicates(subset=["HS6", "anchor_product"], keep="first")
)
df_all_targets.shape

(31416025, 3)

In [45]:
# Pre-filter for speed
df_firms = df_full[["NAICS_5", "HS6"]]
# Keep only pairs with proximity strictly between 0.3 and 1.
df_all_targets = df_all_targets[(df_all_targets.proximity > 0.3) & (df_all_targets.proximity < 1)]

# Merge all dataframes to create the final dataframe for the dashboard:
# origin NAICS -> its HS6 products (as anchors) -> proximate HS6 products ->
# NAICS of those proximate products.
df_merged = (
    df_firms
    .merge(df_all_targets, how="left", left_on="HS6", right_on="anchor_product")
    .drop(columns=["HS6_x"])
    .rename(columns={"HS6_y": "HS6"})
    .merge(hs_naics[["HS6", "NAICS_5"]], how="left", on="HS6")
    .rename(columns={"NAICS_5_x": "NAICS_5_origin", "NAICS_5_y": "NAICS_5_todiversify"})
)

print(df_merged.shape)

# Average indicators of each destination NAICS code.
# Grouping is by NAICS_5 only: the previous (NAICS_5, naics_description)
# grouping silently dropped every NAICS code with a missing description,
# while the description level itself was discarded by the merge anyway.
# The averages are merged on a real column rather than on an index level.
naics_means = (
    df_full
    .groupby("NAICS_5")[["pci", "relative_cog", "density"]]
    .mean()
    .reset_index()
    .rename(columns={"NAICS_5": "NAICS_5_todiversify"})
)
df_merged = df_merged.merge(naics_means, how="left", on="NAICS_5_todiversify")

# Collapse to the NAICS level: drop the product-level keys and repeated rows.
df_merged = df_merged.drop(columns=["HS6", "anchor_product"]).drop_duplicates()
df_merged.shape

(2025636, 5)


(682854, 6)

## Export

In [46]:
# Write every table to one workbook named after the analysed location.
# The selection column is referenced through N_largest, matching how it was
# created, so changing N_largest no longer breaks the "Selected HS" sheet.
top_column = f"Top {N_largest} criterios"

with pd.ExcelWriter(OUTPUTS_DIR + f"outputs_{loc}.xlsx", engine="xlsxwriter") as writer:
    df_eci.to_excel(writer, sheet_name="ECI", index=False)
    df_pci.to_excel(writer, sheet_name="PCI", index=False)
    df_full.to_excel(writer, sheet_name="Full database", index=False)
    # Products picked by at least one strategy (non-empty label).
    df_full.loc[df_full[top_column] != ""].to_excel(writer, sheet_name="Selected HS", index=False)
    # Strategic-sector products, split by whether mcp equals 1 or is below 1.
    df_full.loc[(df_full["mcp"] == 1) & (df_full["Strategic Sector"].notna())].to_excel(writer, sheet_name="Specialized Sectors", index=False)
    df_full.loc[(df_full["mcp"] < 1) & (df_full["Strategic Sector"].notna())].to_excel(writer, sheet_name="Not Specialized Sectors", index=False)
    # Indicator averages per NAICS code and per strategic sector (group keys
    # are written as index columns).
    df_full[["pci", "relative_cog", "density", "NAICS_5","naics_description","Strategic Sector"]].groupby(["NAICS_5","naics_description", "Strategic Sector"]).mean().sort_values(by="Strategic Sector", ascending=False).to_excel(writer, sheet_name="NAICS Averages")
    df_full[["pci", "relative_cog", "density","Strategic Sector"]].groupby(["Strategic Sector"]).mean().sort_values(by="Strategic Sector", ascending=False).to_excel(writer, sheet_name="Sector Averages")
    df_merged.to_excel(writer, sheet_name="Proximity Matches", index=False)