# Choose sectors, products and diversification paths
This notebook selects target products and sectors for diversification analysis. It focuses on identifying products based on strategic indicators like PCI, density, and proximity—both at the state and firm level.

In [101]:
import pandas as pd
import numpy as np
import requests
import re

In [102]:
# Analysis configuration: which dataset to use, how it is aggregated, and the target location.

# Location to analyze. A state name here; "33140" (Michigan City) is presumably the
# equivalent identifier when working with metropolitan areas -- TODO confirm.
loc = "Michigan"

# Dataset to analyze: "trade" or "labor".
data_choice = "labor"

# Geographic aggregation of the labor data: "state" or "metropolitan_area".
labor_aggregation = "state"

## 1. Read Data

In [103]:
# Render floats with two decimals in every displayed table.
pd.options.display.float_format = '{:.2f}'.format

In [104]:
# Directory structure and column names for the selected dataset.
#
# Sets DATASETS_DIR (raw inputs), OUTPUTS_DIR (precomputed indicator tables) and
# `variable` (the value column), plus `location` / `code` column names where defined.
# An unrecognized configuration raises immediately instead of leaving these names
# undefined and failing later with a NameError.

if data_choice == "trade":
    DATASETS_DIR = './datasets_trade/'
    OUTPUTS_DIR = "./outputs_trade/"
    variable = "trade_value"
    code = "HS6"
    location = 'location'

elif data_choice == "labor":
    if labor_aggregation == "state":
        DATASETS_DIR = './datasets_labor_states/'
        OUTPUTS_DIR = "./outputs_labor_states/"
        variable = "EMP"
        location = "STATE_NAME"
        code = "NAICS2017"

    elif labor_aggregation == "metropolitan_area":
        DATASETS_DIR = './datasets_labor_metropolitan_area/'
        OUTPUTS_DIR = "./outputs_labor_metropolitan_area/"
        variable = "EMP"
        # NOTE(review): `location` and `code` are not defined for this aggregation;
        # set them here if a later step needs them -- confirm the column names first.

    else:
        raise ValueError(
            f"Unknown labor_aggregation {labor_aggregation!r}; "
            "expected 'state' or 'metropolitan_area'."
        )

else:
    raise ValueError(f"Unknown data_choice {data_choice!r}; expected 'trade' or 'labor'.")

In [105]:
def _read_output(filename):
    """Read one precomputed parquet table from OUTPUTS_DIR using the fastparquet engine."""
    return pd.read_parquet(OUTPUTS_DIR + filename, engine='fastparquet')


# Tables that are later sliced by location (`.loc[loc]`) and aligned on product codes.
relative_density = _read_output("relative_density.parquet")
relative_cog = _read_output("relative_cog.parquet")
rca = _read_output("RCA.parquet")
mpa = _read_output("Mpa.parquet")
value_level = _read_output("value_level.parquet")

# Complexity indices.
pci = _read_output("pci.parquet")
eci = _read_output("eci.parquet")

# Lookup tables: product codes and locations.
products = _read_output("codes.parquet")
locations = _read_output("locations.parquet")

# Proximity matrix.
proximity = _read_output("proximity.parquet")

## 2. Choose Sectors
Filter and rank products by sector, using PCI, export potential, and relevance to key industries like electric vehicles (EVs).

### Create files for states

In [106]:
# Weights of the three dimensions (density, COG, PCI) for each strategy.
# PCI always receives the remaining weight, so the three weights of a strategy
# sum to 1 (up to float rounding).

# Low-hanging fruits: dominated by density.
dens_low, cog_low = 0.7, 0.1
pci_low = 1. - dens_low - cog_low

# Balanced strategy: dominated by PCI.
dens_bal, cog_bal = 0.3, 0.1
pci_bal = 1. - dens_bal - cog_bal

# Long jump: COG and PCI share most of the weight.
dens_long, cog_long = 0.2, 0.4
pci_long = 1. - dens_long - cog_long

In [107]:
def top_criterios(x, idx_low, idx_bal, idx_long, idx_VCR):
    """Describe which top-N selections a row belongs to.

    Parameters
    ----------
    x : object with a ``name`` attribute (e.g. a DataFrame row); ``x.name`` is the
        label that is looked up in each collection.
    idx_low, idx_bal, idx_long, idx_VCR : containers supporting ``in``
        Labels selected by the low-hanging, balanced, long-jump and
        productive-trajectory criteria respectively.

    Returns
    -------
    str
        The matching criterion names joined by ``', '`` in the fixed order above,
        or an empty string when the row is in none of them.
    """
    etiquetas = (
        (idx_low, 'Low-hanging fruits'),
        (idx_bal, 'Balance strategy'),
        (idx_long, 'Long jump'),
        (idx_VCR, 'Productive trajectory'),
    )
    return ', '.join(nombre for seleccion, nombre in etiquetas if x.name in seleccion)

In [108]:
# Define the criteria in a function.

def criterios_atlas(df_temp, criterio):
    """Score one product row under a diversification strategy.

    The score is a weighted sum of the row's ``density``, ``relative_cog`` and
    ``pci`` values, using the module-level weights of the chosen strategy
    (``dens_*``, ``cog_*``, ``pci_*``).

    Parameters
    ----------
    df_temp : mapping-like row
        Must provide ``'mcp'``; rows that are scored must also provide
        ``'density'``, ``'relative_cog'`` and ``'pci'``.
    criterio : {'low_hanging', 'balanced', 'long_jump'}
        Strategy whose weights are applied.

    Returns
    -------
    float
        The weighted index, or ``np.nan`` when ``mcp >= 1``.

    Raises
    ------
    ValueError
        If ``criterio`` is not one of the supported strategies (previously this
        surfaced as an ``UnboundLocalError`` on the multipliers).
    """
    # Note that only those with mcp < 1 are considered.
    if df_temp['mcp'] >= 1.:
        return np.nan

    if criterio == 'low_hanging':
        dens_mul, cog_mul, pci_mul = dens_low, cog_low, pci_low
    elif criterio == 'balanced':
        dens_mul, cog_mul, pci_mul = dens_bal, cog_bal, pci_bal
    elif criterio == 'long_jump':
        dens_mul, cog_mul, pci_mul = dens_long, cog_long, pci_long
    else:
        raise ValueError(
            f"Unknown criterio {criterio!r}; "
            "expected 'low_hanging', 'balanced' or 'long_jump'."
        )

    index = dens_mul * df_temp['density'] + \
            cog_mul * df_temp['relative_cog']  + \
            pci_mul * df_temp['pci']

    return index

In [109]:
# NAICS 2017 -> 2022 concordance. Both code columns are cast to strings so they
# can be used as merge keys against the string product codes used below.
concordance_2017_2022 = (
    pd.read_excel(DATASETS_DIR + "2017_to_2022_NAICS.xlsx")
    .rename(columns={
        "2017 NAICS Code": "code",
        "2022 NAICS Code": "NAICS_2022",
        "2022 NAICS Title": "NAICS_Description",
    })
    .astype({"code": str, "NAICS_2022": str})
)
concordance_2017_2022.head()

Unnamed: 0,code,2017 NAICS Title\n(and specific piece of the 2017 industry that is contained in the 2022 industry),NAICS_2022,NAICS_Description
0,111110,Soybean Farming,111110,Soybean Farming
1,111120,Oilseed (except Soybean) Farming,111120,Oilseed (except Soybean) Farming
2,111130,Dry Pea and Bean Farming,111130,Dry Pea and Bean Farming
3,111140,Wheat Farming,111140,Wheat Farming
4,111150,Corn Farming,111150,Corn Farming


In [110]:
# Product codes (first column of the products table) as strings.
codes = products.iloc[:, 0].astype(str)

# Tables sliced at `loc` and aligned on the product codes, keyed by output column name.
# Insertion order defines the column order of df_full.
loc_indicators = {
    "relative_cog": relative_cog,
    "density": relative_density,
    "rca": rca,
    "mcp": mpa,
    variable: value_level,
}

df_full = pd.DataFrame({"code": codes.values})
# NOTE(review): pci is assigned positionally -- assumes it is ordered like `products`.
df_full["pci"] = pci.values
for column, frame in loc_indicators.items():
    df_full[column] = frame.loc[loc].reindex(codes).values

# Replace the 2017 code with its 2022 code(s) and description from the concordance.
df_full = (
    df_full
    .merge(concordance_2017_2022[["code", "NAICS_2022", "NAICS_Description"]], how="left", on="code")
    .drop(columns="code")
)
df_full.head()

Unnamed: 0,pci,relative_cog,density,rca,mcp,EMP,NAICS_2022,NAICS_Description
0,-0.51,-0.16,0.44,0.74,0,79.0,113110,Timber Tract Operations
1,-0.57,-0.27,1.78,1.87,1,47.0,113210,Forest Nurseries and Gathering of Forest Products
2,-0.2,-0.35,1.6,1.2,1,1560.0,113310,Logging
3,1.24,-1.56,-0.87,0.58,0,54.0,114111,Finfish Fishing
4,1.16,-1.53,-0.99,0.0,0,0.0,114112,Shellfish Fishing


In [111]:
# Number of products kept per criterion.
N_largest = 25

# Score every row under each strategy (rows with mcp >= 1 get NaN).
strategy_columns = {
    'idx_low_hanging': 'low_hanging',
    'idx_balanced': 'balanced',
    'idx_long_jump': 'long_jump',
}
for column, criterio in strategy_columns.items():
    df_full[column] = df_full.apply(criterios_atlas, axis=1, criterio=criterio)

# Row labels of the top-N products for each strategy.
idx_low = df_full['idx_low_hanging'].nlargest(N_largest).index
idx_bal = df_full['idx_balanced'].nlargest(N_largest).index
idx_long = df_full['idx_long_jump'].nlargest(N_largest).index

# "Productive trajectory": products with rca < 1 whose PCI exceeds the mean PCI of
# the products with mcp == 1, ranked by rca.
pci_mean = df_full.loc[df_full['mcp'] == 1., 'pci'].mean()
below_rca_one = df_full['rca'] < 1.
above_mean_pci = df_full['pci'] > pci_mean
idx_VCR = df_full.loc[below_rca_one & above_mean_pci, 'rca'].nlargest(N_largest).index

df_full[f'Top {N_largest} criterios'] = df_full.apply(
    top_criterios, axis=1, args=(idx_low, idx_bal, idx_long, idx_VCR)
)

In [112]:
# Strategic sector classification: keep only "Core" rows, with zero-padded 6-character codes.
sectors = (
    pd.read_excel(DATASETS_DIR + "Sector classification NAICS 11.25.xlsx", sheet_name='NAICS')
    .rename(columns={"2022 NAICS Codes": "NAICS_6"})
    .assign(NAICS_6=lambda frame: frame["NAICS_6"].map('{:06}'.format))
    .loc[lambda frame: frame["Relevance"] == "Core"]
)

# One row per (code, sector) pair.
sector_pairs = ["NAICS_6", "Strategic Sector"]
sectors_6digits = sectors[sector_pairs].drop_duplicates(subset=sector_pairs, keep="first")
sectors_6digits.head()

Unnamed: 0,NAICS_6,Strategic Sector
0,336411,Aerospace
1,336412,Aerospace
2,336413,Aerospace
3,336414,Aerospace
4,336415,Aerospace


In [113]:
# Attach the strategic sector (where one exists) to each product via its 2022 NAICS code.
sector_lookup = sectors_6digits[["NAICS_6", "Strategic Sector"]]
df_full = pd.merge(
    df_full,
    sector_lookup,
    how="left",
    left_on="NAICS_2022",
    right_on="NAICS_6",
)
df_full.head()

Unnamed: 0,pci,relative_cog,density,rca,mcp,EMP,NAICS_2022,NAICS_Description,idx_low_hanging,idx_balanced,idx_long_jump,Top 25 criterios,NAICS_6,Strategic Sector
0,-0.51,-0.16,0.44,0.74,0,79.0,113110,Timber Tract Operations,0.19,-0.19,-0.18,,,
1,-0.57,-0.27,1.78,1.87,1,47.0,113210,Forest Nurseries and Gathering of Forest Products,,,,,,
2,-0.2,-0.35,1.6,1.2,1,1560.0,113310,Logging,,,,,,
3,1.24,-1.56,-0.87,0.58,0,54.0,114111,Finfish Fishing,-0.52,0.33,-0.3,,,
4,1.16,-1.53,-0.99,0.0,0,0.0,114112,Shellfish Fishing,-0.61,0.25,-0.35,,,


## Other tables

In [114]:
# PCI per product, labelled with the 2022 NAICS code and description.
# NOTE(review): pci is attached positionally -- assumes it is ordered like `codes`.
df_pci = (
    pd.DataFrame({"code": codes.values})
    .assign(pci=pci.values)
    .merge(concordance_2017_2022[["code", "NAICS_2022", "NAICS_Description"]], how="left", on="code")
    .drop(columns="code")
)
df_pci.sort_values(by="pci", ascending=False).head(15)

Unnamed: 0,pci,NAICS_2022,NAICS_Description
922,2.92,212390,Other Nonmetallic Mineral Mining and Quarrying
611,2.67,513120,Periodical Publishers
615,2.67,513199,All Other Publishers
612,2.67,513130,Book Publishers
613,2.67,513140,Directory and Mailing List Publishers
614,2.67,513191,Greeting Card Publishers
610,2.67,513110,Newspaper Publishers
617,2.67,519290,Web Search Portals and All Other Information S...
616,2.67,516210,"Media Streaming Distribution Services, Social ..."
418,2.39,458320,Luggage and Leather Goods Retailers


In [115]:
# ECI per location.
# The location labels get their own name instead of rebinding `codes`: `codes` holds
# the product codes used to build df_full / df_pci, and overwriting it here made
# those cells produce wrong results when re-run after this one.
location_names = locations.iloc[:, 0].astype(str)

# The key column is called "cbsa" because the metropolitan-area step merges on it;
# with state-level data it simply holds the state names.
df_eci = pd.DataFrame({"cbsa": location_names.values})
# NOTE(review): eci is attached positionally -- assumes it is ordered like `locations`.
df_eci["eci"] = (eci.values)
df_eci.head()

Unnamed: 0,cbsa,eci
0,Alabama,-1.36
1,Alaska,1.06
2,Arizona,0.6
3,Florida,1.31
4,Arkansas,-1.21


In [116]:
# For metropolitan areas, attach the CBSA title to each ECI row and rank by ECI.
if labor_aggregation == "metropolitan_area":
    msa_codes = (
        pd.read_excel(DATASETS_DIR + "list2_2023.xlsx")[["CBSA Code", "CBSA Title"]]
        .rename(columns={"CBSA Code": "cbsa"})
        # df_eci["cbsa"] is string-typed; cast the Excel codes as well so the merge
        # keys share a dtype (pandas refuses to merge object with integer keys).
        # NOTE(review): if the sheet yields floats (e.g. 33140.0) the keys will not
        # match -- confirm the column's dtype.
        .astype({"cbsa": str})
        .drop_duplicates(subset=["cbsa"], keep="first")
    )
    df_eci = df_eci.merge(msa_codes, how="left", on="cbsa").sort_values(by="eci", ascending=False)

## Proximity analysis

In [117]:
# Expose the row labels of the proximity matrix as a regular "code" column.
# Guarded so that re-running this cell does not insert a second "code" column;
# naming the index first makes the result independent of the stored index name.
if "code" not in proximity.columns:
    proximity = proximity.rename_axis("code").reset_index()
proximity.shape

(968, 969)

In [118]:
# Build a long table of every (code, anchor_product) pair with its proximity.

def _ranked_by_proximity(anchor):
    """Return (code, proximity, anchor_product) rows for one anchor column, highest proximity first."""
    return (
        proximity[["code", anchor]]
        .sort_values(by=anchor, ascending=False)
        .rename(columns={anchor: "proximity"})
        .assign(anchor_product=anchor)
    )


# One ranked frame per distinct product code (each code is also a column of `proximity`).
top_matches = [
    _ranked_by_proximity(naics)
    for naics in proximity["code"].drop_duplicates(keep="first")
]

df_all_targets = (
    pd.concat(top_matches, ignore_index=True)
    .drop_duplicates(subset=["code", "anchor_product"], keep="first")
)
df_all_targets.shape

(937024, 3)

In [119]:
# Translate both sides of every pair from 2017 codes to 2022 NAICS codes:
# "code" becomes NAICS_2022 and "anchor_product" becomes NAICS_2022_todiversify.
naics_2022_lookup = concordance_2017_2022[["code", "NAICS_2022"]]

df_all_targets = (
    df_all_targets
    .merge(naics_2022_lookup, how="left", on="code")
    .drop(columns="code")
    # The second merge collides on NAICS_2022, producing the _x / _y suffixes renamed below.
    .merge(naics_2022_lookup, how="left", left_on="anchor_product", right_on="code")
    .drop(columns=["code", "anchor_product"])
    .rename(columns={"NAICS_2022_x": "NAICS_2022", "NAICS_2022_y": "NAICS_2022_todiversify"})
)
df_all_targets.head()

Unnamed: 0,proximity,NAICS_2022,NAICS_2022_todiversify
0,1.0,113110,113110
1,0.72,113310,113110
2,0.71,321114,113110
3,0.71,115310,113110
4,0.69,113210,113110


In [120]:
# Indicator columns carried into the proximity analysis, ordered by PCI (highest first).
indicator_columns = [
    "NAICS_2022", "NAICS_Description", "pci", "relative_cog", "density",
    "idx_low_hanging", "idx_balanced", "idx_long_jump",
]
df_proximity_analysis = df_full[indicator_columns].sort_values(by="pci", ascending=False)

# Pre-filter for speed: keep only pairs with proximity strictly between 0.3 and 1.
df_firms = df_proximity_analysis[["NAICS_2022"]]
pair_proximity = df_all_targets["proximity"]
df_all_targets = df_all_targets[pair_proximity.gt(0.3) & pair_proximity.lt(1)]

# For every product, list the related products that passed the filter.
df_merged = pd.merge(df_firms, df_all_targets, how="left", on="NAICS_2022")
print(df_merged.shape)
df_merged.head()

(574456, 3)


Unnamed: 0,NAICS_2022,proximity,NAICS_2022_todiversify
0,212390,0.33,113110
1,212390,0.47,113210
2,212390,0.33,113310
3,212390,0.33,115111
4,212390,0.33,115112


In [121]:
# Attach the indicators of the related product (NAICS_2022_todiversify) to each pair.
# The merge collides on NAICS_2022: the left copy is kept as the origin code, the
# right copy duplicates NAICS_2022_todiversify and is dropped.
df_merged = (
    pd.merge(
        df_merged,
        df_proximity_analysis,
        how="left",
        left_on="NAICS_2022_todiversify",
        right_on="NAICS_2022",
    )
    .rename(columns={"NAICS_2022_x": "NAICS_2022_origin"})
    .drop(columns=["NAICS_2022_y"])
)
df_merged.head()

Unnamed: 0,NAICS_2022_origin,proximity,NAICS_2022_todiversify,NAICS_Description,pci,relative_cog,density,idx_low_hanging,idx_balanced,idx_long_jump
0,212390,0.33,113110,Timber Tract Operations,-0.51,-0.16,0.44,0.19,-0.19,-0.18
1,212390,0.47,113210,Forest Nurseries and Gathering of Forest Products,-0.57,-0.27,1.78,,,
2,212390,0.33,113310,Logging,-0.2,-0.35,1.6,,,
3,212390,0.33,115111,Cotton Ginning,-0.84,0.11,0.3,0.05,-0.41,-0.23
4,212390,0.33,115112,"Soil Preparation, Planting, and Cultivating",-0.12,0.48,0.22,0.18,0.04,0.19


## Export

In [122]:
# Write every result table to one Excel workbook, one sheet per table.

# Name of the top-N label column. Derived from N_largest, exactly as when the column
# was created, so changing N_largest no longer breaks the export with a KeyError.
top_criterios_col = f"Top {N_largest} criterios"

has_sector = df_full["Strategic Sector"].notna()
indicator_cols = ["pci", "relative_cog", "density"]

with pd.ExcelWriter(OUTPUTS_DIR + f"outputs_{loc}.xlsx", engine="xlsxwriter") as writer:
    df_eci.to_excel(writer, sheet_name="ECI", index=False)
    df_pci.to_excel(writer, sheet_name="PCI", index=False)
    df_full.to_excel(writer, sheet_name="Full database", index=False)

    # Products selected by at least one criterion (non-empty label).
    df_full.loc[df_full[top_criterios_col] != ""].to_excel(writer, sheet_name="Selected HS", index=False)

    # Strategic-sector products, split by mcp == 1 versus mcp < 1.
    df_full.loc[(df_full["mcp"] == 1) & has_sector].to_excel(writer, sheet_name="Specialized Sectors", index=False)
    df_full.loc[(df_full["mcp"] < 1) & has_sector].to_excel(writer, sheet_name="Not Specialized Sectors", index=False)

    # Mean indicators per (NAICS code, sector) and per sector; the group keys are
    # written as the sheet index, so no index=False here.
    (
        df_full[indicator_cols + ["NAICS_2022", "Strategic Sector"]]
        .groupby(["NAICS_2022", "Strategic Sector"])
        .mean()
        .sort_values(by="Strategic Sector", ascending=False)
        .to_excel(writer, sheet_name="NAICS Averages")
    )
    (
        df_full[indicator_cols + ["Strategic Sector"]]
        .groupby(["Strategic Sector"])
        .mean()
        .sort_values(by="Strategic Sector", ascending=False)
        .to_excel(writer, sheet_name="Sector Averages")
    )

    df_merged.to_excel(writer, sheet_name="Proximity Matches", index=False)