# Process indexes

This notebook calculates key Economic Complexity indicators: RCA, M matrix, ECI, PCI, and opportunity metrics such as proximity, density, and strategic value. These are used to evaluate the productive structure of states and identify diversification paths.

In [1]:
import pandas as pd
import numpy as np

## 1. Read data

Read the necessary files and select which data and aggregation you want to use

In [2]:
# Select which data and aggregation to use.
# These two switches drive the directory layout and column names resolved in the next cell.

data_choice = "labor" # one of: "trade" / "labor"
labor_aggregation = "state" # one of: "state" / "metropolitan_area" (only read when data_choice == "labor")

In [3]:
# Resolve the directory layout and column names for the selected dataset, then load it.
# Defines: DATASETS_DIR, OUTPUTS_DIR, variable (value column), code (product column),
# location (region column) and data (the filtered long-format table).

if data_choice == "trade":
    DATASETS_DIR = './datasets_trade/'
    OUTPUTS_DIR = "./outputs_trade/"
    variable = "trade_value"
    code = "HS6"
    location = 'location'

    # Read the database created in create_trade_database.ipynb
    data = pd.read_parquet(DATASETS_DIR + "df_trade_complete.parquet", engine='fastparquet')
    # US states are added as locations, so the country aggregate is removed to avoid duplication
    data = data.loc[data.location != "USA"]
    # One row per (location, HS6) pair, summing the trade value
    data = data.groupby(['location', 'HS6'], as_index=False)['trade_value'].sum()
    # Quality restriction: drop every location with less than $1b of total trade.
    # Only the value column is summed; summing the whole frame would also "add up" the HS6 strings.
    df_temp = data.groupby(["location"])[["trade_value"]].sum()
    remove_countries = df_temp.loc[df_temp["trade_value"] < 1_000_000_000].index
    data = data.loc[~data.location.isin(remove_countries)]
    data = data.loc[data[variable] > 0]
    print(data.shape)
    print(data.head())

elif data_choice == "labor":
    # aggregation -> (datasets dir, outputs dir, location column, parquet file)
    labor_settings = {
        "state": (
            './datasets_labor_states/',
            "./outputs_labor_states/",
            "STATE_NAME",
            "df_labor_usa_states.parquet",
        ),
        "metropolitan_area": (
            './datasets_labor_metropolitan_area/',
            "./outputs_labor_metropolitan_area/",
            "metropolitan statistical area/micropolitan statistical area",
            "df_labor_usa_metropolitan_area.parquet",
        ),
    }
    if labor_aggregation not in labor_settings:
        raise ValueError(
            f"Unknown labor_aggregation {labor_aggregation!r}; expected one of {sorted(labor_settings)}"
        )

    DATASETS_DIR, OUTPUTS_DIR, location, labor_file = labor_settings[labor_aggregation]
    variable = "EMP"
    code = "NAICS2017"

    data = pd.read_parquet(DATASETS_DIR + labor_file, engine='fastparquet')
    data['NAICS2017'] = data['NAICS2017'].astype(str)  # Ensure NAICS is a string
    data['naics_level'] = data['NAICS2017'].str.len()  # Code length identifies the NAICS level
    data = data[data.naics_level == 6]                 # Keep only 6-character codes
    data = data.loc[data[variable] > 0]
    print(data.shape)
    print(data.head())

else:
    # Fail loudly instead of leaving `data` undefined for the rest of the notebook
    raise ValueError(f"Unknown data_choice {data_choice!r}; expected 'trade' or 'labor'")

(39388, 5)
   NAICS2017   EMP STATE STATE_NAME  naics_level
5     113110   259    01    Alabama            6
8     113210    43    01    Alabama            6
11    113310  3483    01    Alabama            6
15    114111    10    01    Alabama            6
16    114112    16    01    Alabama            6


In [4]:
# Spot check: total EMP recorded under NAICS code 481111 across all locations
data.loc[data.NAICS2017 == "481111", "EMP"].sum()

434755

## 1. Calculate RCA (Revealed Comparative Advantage)

Compute the RCA matrix to determine whether each region (state or country) is competitively exporting a product.

In [5]:
# Keep only the three columns used from here on: location, product code and the measured value
df_base = data[[location, code, variable]].copy()
df_base.head()

Unnamed: 0,STATE_NAME,NAICS2017,EMP
5,Alabama,113110,259
8,Alabama,113210,43
11,Alabama,113310,3483
15,Alabama,114111,10
16,Alabama,114112,16


In [6]:
# Total of `variable` per location: for each location c, the sum over products p of X_cp

df_agrup_c = df_base.groupby([location])[[variable]].sum()
df_agrup_c.head()

Unnamed: 0_level_0,EMP
STATE_NAME,Unnamed: 1_level_1
Alabama,1763740
Alaska,255209
American Samoa,3712
Arizona,2767490
Arkansas,1069509


In [7]:
# Share of each product in its location's total (X_cp / sum_p X_cp).
# After the merge, `<variable>_x` is the product-level value and `<variable>_y` the location total.

with_location_total = pd.merge(df_base, df_agrup_c, how='left', on=location)
df_prov = with_location_total.assign(
    porc_country=with_location_total[f'{variable}_x'] / with_location_total[f'{variable}_y']
)
df_prov.head()

Unnamed: 0,STATE_NAME,NAICS2017,EMP_x,EMP_y,porc_country
0,Alabama,113110,259,1763740,0.000147
1,Alabama,113210,43,1763740,2.4e-05
2,Alabama,113310,3483,1763740,0.001975
3,Alabama,114111,10,1763740,6e-06
4,Alabama,114112,16,1763740,9e-06


In [8]:
# Total of `variable` per product: for each product p, the sum over locations c of X_cp

df_agrup_p = df_base.groupby([code])[[variable]].sum()
df_agrup_p.head()

Unnamed: 0_level_0,EMP
NAICS2017,Unnamed: 1_level_1
113110,3707
113210,870
113310,44976
114111,3207
114112,1600


In [9]:
# Grand total of `variable` over all products (a scalar, despite the df_ prefix)

df_agrup_pc = df_agrup_p[variable].sum()
df_agrup_pc

136067071

In [10]:
# Share of each product in the grand total (sum_c X_cp / sum_cp X_cp)

total_col = f'{variable}_total'
df_agrup_p[total_col] = df_agrup_pc  # scalar broadcast to every product row
df_agrup_p['porc_product'] = df_agrup_p[variable].div(df_agrup_p[total_col])
df_agrup_p.head()

Unnamed: 0_level_0,EMP,EMP_total,porc_product
NAICS2017,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
113110,3707,136067071,2.7e-05
113210,870,136067071,6e-06
113310,44976,136067071,0.000331
114111,3207,136067071,2.4e-05
114112,1600,136067071,1.2e-05


In [11]:
# Join the location shares with the product shares and derive RCA and its binary version M.
# RCA = (share of the product in the location) / (share of the product overall); M = 1 when RCA >= 1.

df_RCA = (
    df_prov.merge(df_agrup_p, how='left', on=code)
    .assign(
        RCA=lambda t: t['porc_country'] / t['porc_product'],
        M=lambda t: np.where(t['RCA'] >= 1, 1, 0),
    )
    # Drop the helper totals and restore the original name of the value column
    .drop(columns=[variable, f'{variable}_total', f'{variable}_y'])
    .rename(columns={f'{variable}_x': variable})
)

df_RCA.head()

Unnamed: 0,STATE_NAME,NAICS2017,EMP,porc_country,porc_product,RCA,M
0,Alabama,113110,259,0.000147,2.7e-05,5.390085,1
1,Alabama,113210,43,2.4e-05,6e-06,3.813008,1
2,Alabama,113310,3483,0.001975,0.000331,5.974356,1
3,Alabama,114111,10,6e-06,2.4e-05,0.240558,0
4,Alabama,114112,16,9e-06,1.2e-05,0.771469,0


In [12]:
# Number of products with M == 1 per location, largest first
df_RCA.groupby(location)[['M']].sum().sort_values(by='M', ascending=False)

Unnamed: 0_level_0,M
STATE_NAME,Unnamed: 1_level_1
Pennsylvania,400
Illinois,395
Wisconsin,388
Ohio,383
Oregon,382
California,374
Indiana,372
Texas,368
North Carolina,367
Missouri,367


In [13]:
# Assign every location and product a stable integer position (order of first appearance).
# These positions are the row/column indices of every matrix built below.
locations = df_RCA[location].unique()
products = df_RCA[code].unique()

# A dict lookup built from the arrays above replaces one full np.where scan per row,
# and guarantees the integer codes agree with `locations` / `products` by construction.
loc_position = {name: pos for pos, name in enumerate(locations)}
prod_position = {name: pos for pos, name in enumerate(products)}

df_RCA['loc_int'] = df_RCA[location].map(loc_position).astype(np.int64)
df_RCA['prod_int'] = df_RCA[code].map(prod_position).astype(np.int64)
df_RCA.head()

Unnamed: 0,STATE_NAME,NAICS2017,EMP,porc_country,porc_product,RCA,M,loc_int,prod_int
0,Alabama,113110,259,0.000147,2.7e-05,5.390085,1,0,0
1,Alabama,113210,43,2.4e-05,6e-06,3.813008,1,0,1
2,Alabama,113310,3483,0.001975,0.000331,5.974356,1,0,2
3,Alabama,114111,10,6e-06,2.4e-05,0.240558,0,0,3
4,Alabama,114112,16,9e-06,1.2e-05,0.771469,0,0,4


## 2. Calculate M matrix

Binary matrix indicating whether a region is competitive (RCA ≥ 1) in each product.

In [14]:
# Binary location-by-product matrix: Mpa[l, p] = 1 when location l has M == 1 in product p
Mpa = np.zeros((df_RCA[location].nunique(), df_RCA[code].nunique()),
               dtype=np.int64)

# Set all competitive cells in one integer-array assignment instead of iterating row by row
competitive_rows = df_RCA.loc[df_RCA.M == 1]
Mpa[competitive_rows['loc_int'].to_numpy(), competitive_rows['prod_int'].to_numpy()] = 1

In [15]:
# Consistency check: the number of cells with M == 1 in df_RCA must equal the number of ones in Mpa
print('Cuantos elementos tienen RCA > 1 segun df_RCA:', len(df_RCA.loc[df_RCA.M == 1]))
print('Cuantos elementos tienen RCA > 1 segun Mpa:   ', Mpa.sum())

Cuantos elementos tienen RCA > 1 segun df_RCA: 16204
Cuantos elementos tienen RCA > 1 segun Mpa:    16204


### 2.1 Calculate Diversity

Sum across products for each region to get diversity: how many products a region exports competitively.

In [16]:
# Row sums of Mpa: one count per location
diversity = Mpa.sum(axis = 1)
diversity.shape

(56,)

### 2.2 Calculate Ubiquity

Sum across regions for each product to get ubiquity: how many regions are competitive in each product.

In [17]:
# Column sums of Mpa: one count per product
ubiquity = Mpa.sum(axis = 0)
ubiquity.shape

(968,)

### 2.3 Calculate Inverse Matrix

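Build the diagonal matrices $D^{-1}$ and $U^{-1}$, whose diagonals hold the reciprocal of each region's diversity and of each product's ubiquity. They normalize $M$ in the ECI and PCI calculations.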

In [18]:
# Diagonal matrix with 1/diversity on the diagonal (a location with diversity 0 would produce inf here)
D_inv = np.diag(1/diversity)
D_inv.shape


(56, 56)

In [19]:
# Diagonal matrix with 1/ubiquity on the diagonal (a product with ubiquity 0 would produce inf here)
U_inv = np.diag(1/ubiquity)
U_inv.shape

(968, 968)

## 3. Calculate $\tilde{M}$ matrix
Double-standardized matrix $\tilde{M} = D^{-1} M U^{-1} M^{T}$ (regions × regions), whose eigenvectors are used to compute the complexity metrics.

In [20]:
# M U^-1: divide every product column of Mpa by that product's ubiquity (broadcast over rows)
temp_Mpa_1 = Mpa.astype(float) / ubiquity

In [21]:
# Location-by-location matrix (M U^-1) M^T
Sccprima = temp_Mpa_1 @ Mpa.transpose()
Sccprima.shape

(56, 56)

In [22]:
# Left-multiply by D_inv, i.e. scale each row of Sccprima by 1/diversity of that location
Mmonio_c = np.matmul(D_inv, Sccprima)
Mmonio_c.shape

(56, 56)

## 4. Calculate ECI and PCI

Use the eigenvector associated with the second-largest eigenvalue of the standardized matrix to calculate the Economic Complexity Index (ECI) and, analogously, the Product Complexity Index (PCI).

In [23]:
def calc_complexity(mmonio):
    """Return the eigenvector of ``mmonio`` that belongs to its second-largest eigenvalue.

    Parameters
    ----------
    mmonio : array_like, shape (n, n)
        Square matrix to decompose (needs n >= 2).

    Returns
    -------
    numpy.ndarray, shape (n,)
        Real part of the selected eigenvector. Its sign and scale are whatever
        ``np.linalg.eig`` returns; callers are expected to standardize and orient it.

    Notes
    -----
    Prints whether the imaginary part of the selected eigenvector is numerically zero,
    and the mean and standard deviation of its real part.
    """
    autovalores, autovectores = np.linalg.eig(mmonio)

    # Rank eigenvalues by real part (descending) and take the *position* of the second one.
    # Looking the second value up with `==` would return the leading eigenvector again
    # whenever the largest eigenvalue is repeated.
    order = np.argsort(-autovalores.real, kind="stable")
    second_idx = order[1]

    second_vec = autovectores[:, second_idx]
    complexity = second_vec.real
    print('Todos los avec imaginarios son cero:', (np.isclose(second_vec.imag, 0.0)).all())
    print('Mean and std of second autovector:', complexity.mean(), '+/-', complexity.std())
    return complexity

In [24]:
# Raw ECI eigenvector, then its z-score (zero mean, unit standard deviation)
eci_raw = calc_complexity(Mmonio_c)
eci_norm = (eci_raw - eci_raw.mean())/eci_raw.std()

Todos los avec imaginarios son cero: True
Mean and std of second autovector: -0.02232118004253277 +/- 0.13175320784956887


In [25]:
# The sign of an eigenvector is arbitrary, so the orientation of the ECI has to be checked by hand.
# For this dataset the raw vector comes out reversed, so the standardized ECI is negated here;
# remove the leading minus sign if the ranking below looks upside down for another dataset.
# The value is derived from eci_raw (not from eci_norm itself), so re-running this cell
# does not flip the sign back.
eci_norm = -((eci_raw - eci_raw.mean())/eci_raw.std())
print('Mean and std of eci:', eci_norm.mean(), '+/-', eci_norm.std())

Mean and std of eci: -7.930164461608261e-17 +/- 1.0


In [26]:
# ECI per location (highest first) next to the number of products with M == 1
competitive_counts = (
    df_RCA.groupby([location])[['M']].sum()
    .sort_values(by='M', ascending=False)
    .reset_index()
    .rename(columns={location: "location"})
)
df_eci = (
    pd.DataFrame({'location': locations, 'ECI': eci_norm})
    .sort_values('ECI', ascending=False)
    .merge(competitive_counts, how='left')  # joins on the shared "location" column
)
df_eci[0:56]

Unnamed: 0,location,ECI,M
0,District of Columbia,2.003816,140
1,American Samoa,1.613208,42
2,U.S. Virgin Islands,1.565724,124
3,Hawaii,1.544408,202
4,Northern Mariana Islands,1.495056,95
5,Guam,1.333421,164
6,California,1.31224,374
7,Florida,1.309041,251
8,New York,1.244952,303
9,Alaska,1.064609,238


### 4.1 Calculate $\hat{M}$ matrix

In [27]:
# D^-1 M: divide every location row of Mpa by that location's diversity (broadcast over columns)
temp_Mpa_2 = Mpa.astype(float) / diversity[:, np.newaxis]

In [28]:
# Product-by-product matrix M^T (D^-1 M)
Sppprima = np.matmul(Mpa.transpose(), temp_Mpa_2)
Sppprima.shape, Sppprima.dtype

((968, 968), dtype('float64'))

In [29]:
# Left-multiply by U_inv, i.e. scale each row of Sppprima by 1/ubiquity of that product
Mmonio_p = np.matmul(U_inv, Sppprima)
Mmonio_p.shape, Mmonio_p.dtype

((968, 968), dtype('float64'))

In [30]:
# Raw PCI eigenvector: same procedure as for the ECI, applied to the product-by-product matrix
pci_raw = calc_complexity(Mmonio_p)


Todos los avec imaginarios son cero: True
Mean and std of second autovector: 0.0007687791517597319 +/- 0.032132021876245025


In [31]:
# Load the code -> description lookup for the selected classification.
# NOTE(review): the labor data is keyed by NAICS2017 but this reads the 2022 NAICS descriptions,
# so codes that exist only in the 2017 vintage get no description in the merge below — confirm.
if data_choice == "trade":
    hs_catalog = pd.read_excel(DATASETS_DIR + "BACI/HSCodeandDescription.xlsx", sheet_name="HS22")
    df_product_codes = hs_catalog.loc[hs_catalog["Level"] == 6]
elif data_choice == "labor":
    naics_catalog = pd.read_excel(DATASETS_DIR + "2022_NAICS_Descriptions.xlsx")
    df_product_codes = pd.DataFrame({
        # Codes as strings so they can be matched against the string product codes
        "Code": naics_catalog["Code"].astype(str),
        # Strip a trailing "T" marker and surrounding whitespace from the titles
        "Description": naics_catalog["Title"].str.replace("T$", "", regex=True).str.strip(),
    })

In [32]:
# Standardize the raw PCI eigenvector and attach product descriptions.
# The sign of an eigenvector is arbitrary, so the orientation has to be checked by hand:
# for this dataset the raw vector comes out reversed and is negated; remove the
# `pci_norm = -pci_norm` line if the ranking below looks upside down for another dataset.
pci_norm = (pci_raw - pci_raw.mean())/pci_raw.std()
pci_norm = -pci_norm
print('Mean and std of ICA:', pci_norm.mean(), '+/-', pci_norm.std())

# Keep exactly one description per code. De-duplicating on (Code, Description) can leave
# several rows for the same Code, which would fan out the left merge and give df_pci
# (and every later merge on it) more than one row per product.
product_labels = df_product_codes[["Code", "Description"]].drop_duplicates(subset="Code")

df_pci = pd.DataFrame({code: products, 'PCI': pci_norm}).merge(product_labels, how="left", left_on=code, right_on="Code")
df_pci.sort_values(by="PCI", ascending=False)[0:20]

Mean and std of ICA: -2.9361270072731413e-17 +/- 1.0


Unnamed: 0,NAICS2017,PCI,Code,Description
830,212392,2.916173,,
525,519130,2.665716,,
417,448320,2.392741,,
957,522293,2.333122,,
941,336415,2.31771,336415.0,Guided Missile and Space Vehicle Propulsion Un...
818,483112,2.313617,483112.0,Deep Sea Passenger Transportation
952,512120,2.30968,512120.0,Motion Picture and Video Distribution
718,713110,2.262616,713110.0,Amusement and Theme Parks
683,611630,2.198875,611630.0,Language Schools
505,512191,2.188663,512191.0,Teleproduction and Other Postproduction Services


In [33]:
# Prepare arrays to export

# Attach the PCI to every row. Any PCI column left by a previous run of this cell is dropped
# first, so re-running does not produce PCI_x / PCI_y columns.
df_RCA = df_RCA.drop(columns=["PCI"], errors="ignore").merge(df_pci[[code, "PCI"]], how="left", on = code)

matrix_shape = (df_RCA[location].nunique(), df_RCA[code].nunique())
row_idx = df_RCA["loc_int"].to_numpy()
col_idx = df_RCA["prod_int"].to_numpy()

# Dense location-by-product matrix of RCA values (cells with no observation stay at 0)
RCA = np.zeros(matrix_shape, dtype=np.float64)
has_rca = (df_RCA["RCA"] != 0.0).to_numpy()
RCA[row_idx[has_rca], col_idx[has_rca]] = df_RCA["RCA"].to_numpy()[has_rca]

# Dense location-by-product matrix of the raw values of `variable`
value_level = np.zeros(matrix_shape, dtype=np.float64)
value_level[row_idx, col_idx] = df_RCA[variable].to_numpy()

In [34]:
# Write the core outputs to OUTPUTS_DIR as parquet files.
# ENGINE and COMP are reused by the export cell at the end of the notebook.
ENGINE, COMP = "fastparquet", "snappy"

parquet_opts = {"engine": ENGINE, "compression": COMP}
loc_labels = list(locations)
prod_labels = list(products)

# (file name, frame, keep the index?) — matrices keep their location index, plain vectors do not
core_exports = [
    ("Mpa.parquet", pd.DataFrame(Mpa, index=loc_labels, columns=prod_labels), True),
    ("codes.parquet", pd.DataFrame({"codes": prod_labels}), False),
    ("locations.parquet", pd.DataFrame({"location": loc_labels}), False),
    ("RCA.parquet", pd.DataFrame(np.asarray(RCA), index=loc_labels, columns=prod_labels), True),
    ("eci.parquet", pd.DataFrame({"eci_norm": list(eci_norm)}), False),
    ("pci.parquet", pd.DataFrame({"pci_norm": list(pci_norm)}), False),
    ("value_level.parquet", pd.DataFrame(value_level, index=loc_labels, columns=prod_labels), True),
]

for file_name, frame, keep_index in core_exports:
    if keep_index:
        frame.to_parquet(OUTPUTS_DIR + file_name, **parquet_opts)
    else:
        frame.to_parquet(OUTPUTS_DIR + file_name, index=False, **parquet_opts)


## 5. Proximity
Calculate product proximity: how likely it is for two products to be co-exported.

In [35]:
# Number of locations with M == 1 for each product, stored as floats
p_RCAmay1 = Mpa.sum(axis=0).astype(float)

In [36]:
# Proximity is the minimum of the two conditional probabilities that a location has M == 1
# in product i given that it has M == 1 in product j, and vice versa. That equals the number
# of locations where both products have M == 1, divided by the larger of the two ubiquities.

# Co-occurrence counts for every pair of products in a single matrix product
cooccurrence = np.matmul(Mpa.transpose(), Mpa)
larger_ubiquity = np.maximum.outer(p_RCAmay1, p_RCAmay1)

# The matrix is symmetric, so only the strict upper triangle is kept here;
# the lower triangle and the diagonal are filled in afterwards.
almost_proximity = np.triu(cooccurrence / larger_ubiquity, k=1)

In [47]:
# Sanity check: number of products, i.e. the side length of the proximity matrix
len(products)

968

In [37]:
# Check that nothing was written below the diagonal: each row's entries left of the diagonal must sum to 0
for row_idx, row_values in enumerate(almost_proximity):
    if not np.isclose(row_values[:row_idx].sum(), 0.0):
        print('something wrong at row', row_idx)

In [38]:
# Mirror the upper triangle into the lower one and set the diagonal (self-proximity) to 1
proximity = almost_proximity + almost_proximity.transpose() + np.eye(len(products))

In [39]:
# Expect a square matrix with one row and one column per product
proximity.shape

(968, 968)

## 6. Calculate Density

Measures how close a new product is to the set of products a region already exports. Helps assess diversification feasibility.

$$
d_{pa} = \dfrac{\sum_{a'} M_{pa'} \Phi_{a,a'}}{\sum_{a'} \Phi_{a,a'}}
$$


In [40]:
# Numerator: for each location, the proximity-weighted count of products with M == 1
density_pa = np.matmul(Mpa, proximity)
print(density_pa.shape)

# Denominator: total proximity of each product (column sums), applied column by column via broadcasting
column_totals = np.array([proximity[:, col].sum() for col in range(len(products))])
density_pa = density_pa / column_totals

(56, 968)


In [41]:
# Standardize each location's densities against its "option set":
# the products in which that location does not have M == 1.
relative_density = np.zeros(density_pa.shape)

for loc_idx, (density_row, presence_row) in enumerate(zip(density_pa, Mpa)):
    option_set = density_row[presence_row != 1]
    relative_density[loc_idx] = (density_row - option_set.mean()) / option_set.std()

## 7. Calculate Strategic Value (COG)

Strategic Value (COG) estimates the potential benefit of moving into a new product, considering its complexity and distance to current capabilities.

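$$COG_{pa} = \left[\sum_{a'} \dfrac{\phi_{a,a'}}{\sum_{a''} \phi_{a'',a'}}(1 - M_{pa'})\, ICA_{a'}\right] - d_{pa}\, ICA_{a}$$

Here $d_{pa}$ is the density defined in Section 6 (equivalently, one minus the distance), and $ICA$ is the product complexity (PCI) rescaled to $[0, 1]$.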

In [42]:
n_locations, n_products = len(locations), len(products)

# 1 where the location does NOT have M == 1 in the product, as floats
non_RCA_all = (np.ones((n_locations, n_products), dtype=np.int64) - Mpa).astype(float)

# Distance: proximity-weighted share of the products a location is missing
row_totals = np.array([proximity[row, :].sum() for row in range(n_products)])
distance_pa = np.matmul(non_RCA_all, proximity) / row_totals

# Complement of the distance
inv_distance_pa = 1.0 - distance_pa

In [43]:
# Min-max rescale the standardized PCI to the [0, 1] interval
ica_shifted = pci_norm - min(pci_norm)
ica_U01 = ica_shifted / max(ica_shifted)

In [44]:
n_loc, n_prod = len(locations), len(products)
unif_A = np.zeros((n_loc, n_prod), float)
sum_prox = proximity.sum(axis = 0)

# First term: complexity of the products the location is missing, weighted by normalized proximity
for loc_idx in range(n_loc):
    if loc_idx % 13 == 0:
        print('.', end=' ')  # coarse progress indicator

    for prod_idx in range(n_prod):
        unif_A[loc_idx, prod_idx] = (proximity[prod_idx, :]*non_RCA_all[loc_idx, :]*ica_U01/sum_prox).sum()

# Second term: complement of the distance times the product's own rescaled complexity
unif_B = inv_distance_pa * ica_U01

unif_cog2_pa = unif_A - unif_B

. . . . . 

In [45]:
# Standardize each location's COG against its option set (products where it does not have M == 1)
relative_cog = np.zeros((len(locations), len(products)), float)
for idx_prov in range(len(locations)):
    cog_row = unif_cog2_pa[idx_prov]
    option_set = cog_row[(1 - Mpa[idx_prov]).astype(bool)]
    relative_cog[idx_prov] = (cog_row - option_set.mean()) / option_set.std()

In [46]:
# Write the opportunity metrics to OUTPUTS_DIR as labelled parquet matrices
prod_labels = list(products)
loc_labels = list(locations)

opportunity_exports = [
    ("proximity.parquet", proximity, prod_labels),
    ("relative_density.parquet", relative_density, loc_labels),
    ("relative_cog.parquet", relative_cog, loc_labels),
]

for file_name, matrix, row_labels in opportunity_exports:
    frame = pd.DataFrame(matrix, index=row_labels, columns=prod_labels)
    frame.to_parquet(OUTPUTS_DIR + file_name, engine=ENGINE, compression=COMP)