# Process indexes

This notebook calculates key Economic Complexity indicators: RCA, M matrix, ECI, PCI, and opportunity metrics such as proximity, density, and strategic value. These are used to evaluate the productive structure of states and identify diversification paths.

In [46]:
import pandas as pd
import numpy as np

## 1. Read data

Read the necessary files and select which data and aggregation you want to use

In [47]:
# Configuration: choose the input dataset and, for labor data, the geographic aggregation.
# These two values drive the branch taken by the loading cell below.

data_choice = "trade" # accepted values: "trade" or "labor"
labor_aggregation = "state" # accepted values: "state" or "metropolitan_area"; only read when data_choice == "labor"

In [48]:
# Directory structure and read database
#
# Sets the input/output directories and the column names (`variable`, `code`,
# `location`) used by the rest of the notebook, then loads and filters `data`.

if data_choice == "trade":
    DATASETS_DIR = './datasets_trade/'
    OUTPUTS_DIR = "./outputs_trade/"
    variable = "trade_value"
    code = "HS6"
    location = 'location'
    # Quality restriction: locations whose total trade is below this value are dropped.
    MIN_TOTAL_TRADE_VALUE = 1_000_000_000

    # Read the database created in create_trade_database.ipynb
    data = pd.read_parquet(DATASETS_DIR + "df_trade_complete.parquet", engine='fastparquet')
    # US states are added as separate locations, so the country aggregate is removed to avoid duplication
    data = data.loc[data.location != "USA"]
    # One row per (location, HS6) with the summed trade value
    data = data.groupby(['location', 'HS6'], as_index=False)['trade_value'].sum()
    # Sum only the numeric column: a bare .sum() would also aggregate the HS6 code strings
    df_temp = data.groupby("location")[["trade_value"]].sum()
    remove_countries = df_temp.loc[df_temp["trade_value"] < MIN_TOTAL_TRADE_VALUE].index
    data = data.loc[~data.location.isin(remove_countries)]
    # Keep strictly positive flows only
    data = data.loc[data[variable] > 0]
    print(data.shape)
    print(data.head())

elif data_choice == "labor":
    if labor_aggregation == "state":
        DATASETS_DIR = './datasets_labor_states/'
        OUTPUTS_DIR = "./outputs_labor_states/"
        variable = "EMP"
        location = "STATE_NAME"
        code = "NAICS2017"

        data = pd.read_parquet(DATASETS_DIR + "df_labor_usa_states.parquet", engine='fastparquet')
        data['NAICS2017'] = data['NAICS2017'].astype(str)  # NAICS codes are handled as strings
        data['naics_level'] = data['NAICS2017'].str.len()  # code length identifies the NAICS level
        data = data[data.naics_level == 5]  # keep 5-character codes only
        data = data.loc[data[variable] > 0]
        print(data.shape)
        print(data.head())

    elif labor_aggregation == "metropolitan_area":
        # NOTE(review): as visible here this branch does not set `location` / `code`
        # nor load `data`, so later cells cannot run with it — confirm whether the
        # rest of this branch is missing.
        DATASETS_DIR = './datasets_labor_metropolitan_area/'
        OUTPUTS_DIR = "./outputs_labor_metropolitan_area/"
        variable = "EMP"

else:
    # Fail here instead of letting later cells run on stale or undefined names
    raise ValueError(f"Unknown data_choice: {data_choice!r} (expected 'trade' or 'labor')")

(674521, 3)
      location     HS6  trade_value
0  Afghanistan  010121          2.0
1  Afghanistan  010221      15775.0
2  Afghanistan  010619       6178.0
3  Afghanistan  010641     126005.0
4  Afghanistan  020120       1000.0


## 1. Calculate RCA (Revealed Comparative Advantage)

Compute the RCA matrix to determine whether each region (state or country) is competitively exporting a product.

In [49]:
# Working copy holding only the three columns the RCA computation needs.
df_base = data.loc[:, [location, code, variable]].copy()
df_base.head()

Unnamed: 0,location,HS6,trade_value
0,Afghanistan,10121,2.0
1,Afghanistan,10221,15775.0
2,Afghanistan,10619,6178.0
3,Afghanistan,10641,126005.0
4,Afghanistan,20120,1000.0


In [50]:
# Total value per location: for each location c, the sum over products p of X_cp.

df_agrup_c = df_base.groupby(by=[location])[[variable]].agg("sum")
df_agrup_c.head()

Unnamed: 0_level_0,trade_value
location,Unnamed: 1_level_1
Afghanistan,1662865000.0
Alabama,27693170000.0
Alaska,5263277000.0
Albania,4398079000.0
Algeria,51763050000.0


In [51]:
# Share of each product in its location's total.
# After the merge, `<variable>_x` is the product-level value and `<variable>_y` the location total.

df_prov = pd.merge(df_base, df_agrup_c, on=location, how='left')
df_prov = df_prov.assign(porc_country=df_prov[f'{variable}_x'] / df_prov[f'{variable}_y'])
df_prov.head()

Unnamed: 0,location,HS6,trade_value_x,trade_value_y,porc_country
0,Afghanistan,10121,2.0,1662865000.0,1.202743e-09
1,Afghanistan,10221,15775.0,1662865000.0,9.486637e-06
2,Afghanistan,10619,6178.0,1662865000.0,3.715274e-06
3,Afghanistan,10641,126005.0,1662865000.0,7.577583e-05
4,Afghanistan,20120,1000.0,1662865000.0,6.013716e-07


In [52]:
# Total value per product: for each product p, the sum over locations c of X_cp.

df_agrup_p = df_base.groupby(by=[code])[[variable]].agg("sum")
df_agrup_p.head()

Unnamed: 0_level_0,trade_value
HS6,Unnamed: 1_level_1
10121,1571072000.0
10129,2138688000.0
10130,1221022.0
10190,17119000.0
10221,1060868000.0


In [53]:
# Calculate the grand total: the sum of the per-product totals over all products.
# Despite the `df_` prefix the result is a single scalar, not a DataFrame.

df_agrup_pc = df_agrup_p[variable].sum()
df_agrup_pc

22540657389642.0

In [54]:
# Share of each product in the grand total (the denominator of the RCA ratio).

total_col = f'{variable}_total'
df_agrup_p = df_agrup_p.assign(**{total_col: df_agrup_pc})
df_agrup_p = df_agrup_p.assign(porc_product=df_agrup_p[variable] / df_agrup_p[total_col])
df_agrup_p.head()

Unnamed: 0_level_0,trade_value,trade_value_total,porc_product
HS6,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10121,1571072000.0,22540660000000.0,6.969945e-05
10129,2138688000.0,22540660000000.0,9.488135e-05
10130,1221022.0,22540660000000.0,5.416976e-08
10190,17119000.0,22540660000000.0,7.594721e-07
10221,1060868000.0,22540660000000.0,4.706463e-05


In [55]:
# Join the location-level and product-level shares and derive RCA and its binary form M.
# RCA = (share of the product in the location) / (share of the product overall);
# M is 1 where RCA >= 1 and 0 otherwise.

df_RCA = pd.merge(df_prov, df_agrup_p, on=code, how='left')
df_RCA = (
    df_RCA
    .assign(RCA=df_RCA['porc_country'] / df_RCA['porc_product'])
    .assign(M=lambda frame: np.where(frame['RCA'] >= 1, 1, 0))
    # drop the merge helpers and restore the original name of the value column
    .drop(columns=[variable, f'{variable}_total', f'{variable}_y'])
    .rename(columns={f'{variable}_x': variable})
)

df_RCA.head()

Unnamed: 0,location,HS6,trade_value,porc_country,porc_product,RCA,M
0,Afghanistan,10121,2.0,1.202743e-09,7e-05,1.7e-05,0
1,Afghanistan,10221,15775.0,9.486637e-06,4.7e-05,0.201566,0
2,Afghanistan,10619,6178.0,3.715274e-06,1.3e-05,0.283102,0
3,Afghanistan,10641,126005.0,7.577583e-05,4e-06,20.827956,1
4,Afghanistan,20120,1000.0,6.013716e-07,0.000258,0.002335,0


In [56]:
# Number of products with M == 1 per location, sorted from the highest count to the lowest
df_RCA.groupby(location)[['M']].sum().sort_values(by='M', ascending=False)

Unnamed: 0_level_0,M
location,Unnamed: 1_level_1
China,2671
Italy,2328
Germany,2215
Spain,2124
France,1990
...,...
Libya,30
Iraq,27
Angola,24
Equatorial Guinea,19


In [57]:
# Assign each location and product an integer position for the numpy matrices.
# `locations` / `products` keep first-appearance order; `loc_int` / `prod_int` are
# the positions of each row's location / product inside those arrays.
locations = df_RCA[location].unique()
products = df_RCA[code].unique()

# Build the lookup tables once: scanning the unique arrays for every row
# costs O(rows x uniques), a dictionary lookup is O(1) per row.
loc_index = {name: pos for pos, name in enumerate(locations)}
prod_index = {name: pos for pos, name in enumerate(products)}

df_RCA['loc_int'] = df_RCA[location].map(loc_index)
df_RCA['prod_int'] = df_RCA[code].map(prod_index)
df_RCA.head()

Unnamed: 0,location,HS6,trade_value,porc_country,porc_product,RCA,M,loc_int,prod_int
0,Afghanistan,10121,2.0,1.202743e-09,7e-05,1.7e-05,0,0,0
1,Afghanistan,10221,15775.0,9.486637e-06,4.7e-05,0.201566,0,0,1
2,Afghanistan,10619,6178.0,3.715274e-06,1.3e-05,0.283102,0,0,2
3,Afghanistan,10641,126005.0,7.577583e-05,4e-06,20.827956,1,0,3
4,Afghanistan,20120,1000.0,6.013716e-07,0.000258,0.002335,0,0,4


## 2. Calculate M matrix

Binary matrix indicating whether a region is competitive (RCA ≥ 1) in each product.

In [58]:
# Binary matrix with one row per location and one column per product:
# Mpa[loc_int, prod_int] = 1 where the corresponding df_RCA row has M == 1.
Mpa = np.zeros((df_RCA[location].nunique(), df_RCA[code].nunique()),
               dtype=np.int64)

# Single fancy-index assignment instead of a Python loop over rows
rows_with_rca = df_RCA.loc[df_RCA.M == 1]
Mpa[rows_with_rca['loc_int'].to_numpy(), rows_with_rca['prod_int'].to_numpy()] = 1

In [59]:
# Sanity check: the number of M == 1 rows in df_RCA must equal the number of ones in Mpa.
n_in_frame = len(df_RCA[df_RCA['M'] == 1])
n_in_matrix = Mpa.sum()
print('Cuantos elementos tienen RCA > 1 segun df_RCA:', n_in_frame)
print('Cuantos elementos tienen RCA > 1 segun Mpa:   ', n_in_matrix)

Cuantos elementos tienen RCA > 1 segun df_RCA: 122585
Cuantos elementos tienen RCA > 1 segun Mpa:    122585


### 2.1 Calculate Diversity

Sum across products for each region to get diversity: how many products a region exports competitively.

In [60]:
# Row sums of Mpa: number of products with M == 1 for each location.
diversity = np.sum(Mpa, axis=1)
diversity.shape

(210,)

### 2.2 Calculate Ubiquity

Sum across regions for each product to get ubiquity: how many regions are competitive in each product.

In [61]:
# Column sums of Mpa: number of locations with M == 1 for each product.
ubiquity = np.sum(Mpa, axis=0)
ubiquity.shape

(5605,)

### 2.3 Calculate Inverse Matrix

Create the diagonal matrices holding the inverse of diversity and the inverse of ubiquity. They are used to normalize the matrices in the ECI and PCI calculations.

In [62]:
# Diagonal matrix whose entries are the reciprocals of each location's diversity.
inverse_diversity = 1 / diversity
D_inv = np.diag(inverse_diversity)
D_inv.shape


(210, 210)

In [63]:
# Diagonal matrix whose entries are the reciprocals of each product's ubiquity.
inverse_ubiquity = 1 / ubiquity
U_inv = np.diag(inverse_ubiquity)
U_inv.shape

(5605, 5605)

## 3. Calculate $\tilde{M}$ matrix
Double-standardized matrix used to compute eigenvectors for complexity metrics.

In [64]:
# Mpa with every product column divided by that product's ubiquity
# (broadcast over rows: column i is divided by ubiquity[i]).
temp_Mpa_1 = Mpa.astype(float) / ubiquity

In [65]:
# Location-by-location matrix: (Mpa with columns divided by ubiquity) times Mpa transposed.
# The previous leading line only built a (dtype, shape) tuple that was never shown or used.
Sccprima = np.matmul(temp_Mpa_1, Mpa.transpose())
Sccprima.shape

(210, 210)

In [66]:
# Scale each row of Sccprima by the inverse diversity of its location.
Mmonio_c = D_inv @ Sccprima
Mmonio_c.shape

(210, 210)

## 4. Calculate ECI and PCI

Use the eigenvectors of the standardized matrix to calculate Economic Complexity Index (ECI) and Product Complexity Index (PCI).

In [67]:
def calc_complexity(mmonio):
    """Return the eigenvector associated with the second-largest eigenvalue.

    Parameters
    ----------
    mmonio : array_like, shape (n, n)
        Square matrix whose eigen-decomposition is computed with
        ``np.linalg.eig``.

    Returns
    -------
    numpy.ndarray, shape (n,)
        Real part of the eigenvector whose eigenvalue ranks second when the
        eigenvalues are ordered by real part, largest first. The sign of an
        eigenvector is arbitrary, so callers must fix the orientation.

    Notes
    -----
    Prints whether the imaginary part of the selected eigenvector is
    numerically zero, and the mean and standard deviation of its real part.
    """
    autovalores, autovectores = np.linalg.eig(mmonio)

    # Rank by position instead of comparing floats for equality: with a repeated
    # leading eigenvalue an equality lookup would return the first eigenvector again.
    # The stable sort keeps the original order among tied eigenvalues.
    order = np.argsort(-autovalores.real, kind='stable')
    second_idx = order[1]

    second_vector = autovectores[:, second_idx]
    complexity = second_vector.real
    print('Todos los avec imaginarios son cero:', (np.isclose(second_vector.imag, 0.0)).all())
    print('Mean and std of second autovector:', complexity.mean(), '+/-', complexity.std())
    return complexity

In [68]:
# Second eigenvector of Mmonio_c, then standardised to zero mean and unit standard deviation.
eci_raw = calc_complexity(Mmonio_c)
eci_norm = (eci_raw - np.mean(eci_raw)) / np.std(eci_raw)

Todos los avec imaginarios son cero: True
Mean and std of second autovector: 0.024766620572666233 +/- 0.06440899989375977


In [69]:
# Orientation of the ECI. An eigenvector is only defined up to its sign, so the
# direction has to be chosen by inspecting the ranking in the next cell.
# Set ECI_FLIP_SIGN to False if the ranking comes out reversed.
# eci_norm is rebuilt from eci_raw so that re-running this cell never flips the sign twice.
ECI_FLIP_SIGN = True

eci_standardized = (eci_raw - eci_raw.mean())/eci_raw.std()
eci_norm = -eci_standardized if ECI_FLIP_SIGN else eci_standardized
print('Mean and std of eci:', eci_norm.mean(), '+/-', eci_norm.std())

Mean and std of eci: 0.0 +/- 1.0


In [70]:
# ECI per location, highest first, joined with the number of products where M == 1.
m_by_location = (
    df_RCA.groupby([location])[['M']]
    .sum()
    .sort_values(by='M', ascending=False)
    .reset_index()
    .rename(columns={location: "location"})
)
df_eci = (
    pd.DataFrame({'location': locations, 'ECI': eci_norm})
    .sort_values('ECI', ascending=False)
    .merge(m_by_location, how='left')
)
df_eci[0:56]

Unnamed: 0,location,ECI,M
0,Connecticut,1.603618,729
1,New Hampshire,1.557718,487
2,Indiana,1.530277,611
3,Michigan,1.494417,763
4,West Virginia,1.485831,262
5,Kentucky,1.485788,507
6,Mississippi,1.463877,331
7,Oklahoma,1.461078,551
8,Massachusetts,1.449441,791
9,Japan,1.442139,1379


### 4.1 Calculate $\hat{M}$ matrix

In [71]:
# Mpa with every location row divided by that location's diversity
# (broadcast over columns: row i is divided by diversity[i]).
temp_Mpa_2 = Mpa.astype(float) / diversity[:, np.newaxis]

In [72]:
# Product-by-product matrix: Mpa transposed times (Mpa with rows divided by diversity).
Sppprima = Mpa.transpose() @ temp_Mpa_2
Sppprima.shape, Sppprima.dtype

((5605, 5605), dtype('float64'))

In [73]:
# Scale each row of Sppprima by the inverse ubiquity of its product.
Mmonio_p = U_inv @ Sppprima
Mmonio_p.shape, Mmonio_p.dtype

((5605, 5605), dtype('float64'))

In [74]:
# Second eigenvector of the product-by-product matrix Mmonio_p (unnormalised PCI, arbitrary sign)
pci_raw = calc_complexity(Mmonio_p)


Todos los avec imaginarios son cero: True
Mean and std of second autovector: -0.00023918001744914635 +/- 0.01335495881477105


In [75]:
# Load the code -> description table matching the selected dataset.
if data_choice == "trade":
    hs_table = pd.read_excel(DATASETS_DIR + "BACI/HSCodeandDescription.xlsx", sheet_name="HS22")
    # keep only the rows flagged as level 6
    df_product_codes = hs_table.loc[hs_table["Level"] == 6]
elif data_choice == "labor":
    naics_table = pd.read_excel(DATASETS_DIR + "2022_NAICS_Descriptions.xlsx")
    df_product_codes = (
        naics_table
        .assign(
            # remove a trailing "T" from the title, then trim surrounding whitespace
            Title=naics_table["Title"].str.replace("T$", "", regex=True).str.strip(),
            # codes are handled as strings
            Code=naics_table["Code"].astype(str),
        )
        [["Code", "Title"]]
        .rename(columns={"Title": "Description"})
    )

In [76]:
# Standardise the PCI and fix its orientation, then attach product descriptions.
# An eigenvector is only defined up to its sign: inspect the ranking below and
# remove the leading minus sign if the PCIs come out reversed.
pci_norm = -((pci_raw - pci_raw.mean())/pci_raw.std())
print('Mean and std of ICA:', pci_norm.mean(), '+/-', pci_norm.std())

# One description per code: a code listed with two different descriptions would
# otherwise add rows in the left merge and df_pci would stop being one row per product.
code_descriptions = df_product_codes[["Code", "Description"]].drop_duplicates(subset="Code")

df_pci = pd.DataFrame({code: products, 'PCI': pci_norm}).merge(code_descriptions, how="left", left_on=code, right_on="Code")
df_pci.sort_values(by="PCI", ascending=False)[0:20]

Mean and std of ICA: 2.0283111101091174e-17 +/- 1.0


Unnamed: 0,HS6,PCI,Code,Description
5569,811251,2.163447,811251,Thallium and articles thereof; unwrought thall...
3400,400241,1.89531,400241,"Rubber; synthetic, chloroprene (chlorobutadien..."
2176,400249,1.892007,400249,"Rubber; synthetic, chloroprene (chlorobutadien..."
1837,281219,1.890766,281219,Arsenic trichloride and other chlorides and ch...
5476,440794,1.841027,440794,"Wood; cherry (Prunus spp.), sawn or chipped le..."
1562,901730,1.833963,901730,"Mathematical equipment; micrometers, callipers..."
4740,391211,1.83014,391211,"Cellulose acetates; non-plasticised, in primar..."
2606,722692,1.824324,722692,"Steel, alloy; flat-rolled, width less than 600..."
5510,722591,1.821856,722591,"Steel, alloy; flat-rolled, width 600mm or more..."
4812,848610,1.807165,848610,Machines and apparatus of a kind used solely o...


In [77]:
# Prepare arrays to export

# Attach the PCI of each product. Any PCI column left by a previous run of this
# cell is dropped first so that re-running does not create PCI_x / PCI_y.
df_RCA = df_RCA.drop(columns="PCI", errors="ignore").merge(df_pci[[code, "PCI"]], how="left", on=code)

# Matrix position (location row, product column) of every df_RCA row
row_idx = df_RCA['loc_int'].to_numpy()
col_idx = df_RCA['prod_int'].to_numpy()
matrix_shape = (df_RCA[location].nunique(), df_RCA[code].nunique())

# RCA values; pairs absent from df_RCA stay at 0.0
RCA = np.zeros(matrix_shape, dtype=np.float64)
RCA[row_idx, col_idx] = df_RCA['RCA'].to_numpy()

# Raw values of `variable`; pairs absent from df_RCA stay at 0.0
trade_value = np.zeros(matrix_shape, dtype=np.float64)
trade_value[row_idx, col_idx] = df_RCA[variable].to_numpy()

In [78]:
# Write the core matrices and their row/column labels to OUTPUTS_DIR as parquet files.
ENGINE, COMP = "fastparquet", "snappy"

# Binary RCA matrix (locations x products)
pd.DataFrame(Mpa, index=list(locations), columns=list(products))\
  .to_parquet(OUTPUTS_DIR + "Mpa.parquet", engine=ENGINE, compression=COMP)

# Product codes, in matrix column order
pd.DataFrame({"codes": list(products)})\
  .to_parquet(OUTPUTS_DIR + "codes.parquet", engine=ENGINE, compression=COMP, index=False)

# Location names, in matrix row order. This must use the `locations` array:
# `location` is the column-name string, and list(location) yields its characters.
pd.DataFrame({"location": list(locations)})\
  .to_parquet(OUTPUTS_DIR + "location.parquet", engine=ENGINE, compression=COMP, index=False)

# RCA values (locations x products)
pd.DataFrame(np.asarray(RCA), index=list(locations), columns=list(products))\
  .to_parquet(OUTPUTS_DIR + "RCA.parquet", engine=ENGINE, compression=COMP)

# Normalised ECI, in the order of `locations`
pd.DataFrame({"eci_norm": list(eci_norm)})\
  .to_parquet(OUTPUTS_DIR + "eci.parquet", engine=ENGINE, compression=COMP, index=False)

# Normalised PCI, in the order of `products`
pd.DataFrame({"pci_norm": list(pci_norm)})\
  .to_parquet(OUTPUTS_DIR + "pci.parquet", engine=ENGINE, compression=COMP, index=False)

# Raw values of `variable` (locations x products)
pd.DataFrame(trade_value, index=list(locations), columns=list(products))\
  .to_parquet(OUTPUTS_DIR + "trade_value.parquet", engine=ENGINE, compression=COMP)


## 5. Proximity
Calculate product proximity: how likely it is for two products to be co-exported.

In [79]:
# Number of locations with M == 1 for each product, stored as floats.
p_RCAmay1 = Mpa.sum(axis=0).astype(np.float64)

In [80]:
# Proximity between two products: the number of locations where both have M == 1,
# divided by the larger of the two products' counts of locations with M == 1
# (i.e. the smaller of the two conditional probabilities).
# Only the strict upper triangle is filled here, because the matrix is symmetric.

# Co-occurrence counts for every product pair in one matrix product.
# Floats are exact for these small integer counts.
Mpa_as_float = Mpa.astype(float)
co_occurrence = Mpa_as_float.T @ Mpa_as_float

# larger_count[i, j] = max(p_RCAmay1[i], p_RCAmay1[j])
larger_count = np.maximum.outer(p_RCAmay1, p_RCAmay1)

# k=1 keeps entries above the diagonal and zeroes the diagonal and the lower triangle
almost_proximity = np.triu(co_occurrence / larger_count, k=1)

In [81]:
# Check that nothing was written below the diagonal: every row must sum to zero left of it.
for i, proximity_row in enumerate(almost_proximity):
    below_diagonal_total = proximity_row[:i].sum()
    if not np.isclose(below_diagonal_total, 0.0):
        print('something wrong at row', i)

In [82]:
# Mirror the upper triangle into the lower one and set the diagonal to 1.
proximity = almost_proximity + almost_proximity.T + np.eye(len(products))

In [83]:
# Display the shape of the proximity matrix
proximity.shape

(5605, 5605)

## 6. Calculate Density

Measures how close a new product is to the set of products a region already exports. Helps assess diversification feasibility.

$$
d_{pa} = \dfrac{\sum_{a'} M_{pa'} \Phi_{a,a'}}{\sum_{a'} \Phi_{a,a'}}
$$


In [84]:
# Numerator: for each location, the proximity-weighted count of products with M == 1
density_pa = np.matmul(Mpa, proximity)
print(density_pa.shape)

# Denominator: total proximity of each product (column sums of proximity),
# broadcast across locations instead of dividing one column at a time
density_pa = density_pa / proximity.sum(axis=0)

(210, 5605)


In [85]:
# Standardise each location's density using the mean and standard deviation
# taken over the products where that location has no RCA (its option set).
relative_density = np.zeros(density_pa.shape)

for p, (rca_flags, density_row) in enumerate(zip(Mpa, density_pa)):
    # keep only the products where the location has no RCA
    den_option_set = density_row[rca_flags != 1]
    relative_density[p] = (density_row - den_option_set.mean()) / den_option_set.std()

## 7. Calculate Strategic Value (COG)

Strategic Value (COG) estimates the potential benefit of moving into a new product, considering its complexity and distance to current capabilities.

$$COG_{pa} = [\sum_{a'} \dfrac{\phi_{a,a'}}{\sum_{a''} \phi_{a'',a'}}(1 - M_{pa'}) ICA_{a'}] - (1 - d_{pa})ICA_{a}$$

In [86]:
# Complement of Mpa as floats: 1.0 where the location has no RCA in the product.
non_RCA_all = (1 - Mpa).astype(float)

# Proximity-weighted count of products without RCA, per location and product
distance_pa = non_RCA_all @ proximity

# Divide column i by the total of row i of proximity
row_totals = np.array([proximity[i, :].sum() for i in range(len(products))])
distance_pa = distance_pa / row_totals

inv_distance_pa = 1.0 - distance_pa

In [87]:
# Rescale the normalised PCI to the [0, 1] interval (min-max scaling).
ica_U01 = pci_norm - pci_norm.min()
ica_U01 = ica_U01 / ica_U01.max()

In [88]:
# Strategic value (COG) per location l and product a, as a difference of two terms:
#   unif_A[l, a] = sum_j proximity[a, j] * non_RCA_all[l, j] * ica_U01[j] / sum_prox[j]
#   unif_B[l, a] = inv_distance_pa[l, a] * ica_U01[a]
# Both are written as array operations instead of a Python loop over every (l, a) pair.
sum_prox = proximity.sum(axis = 0)

# Per-product weight ica_U01[j] / sum_prox[j], applied to the products without RCA
complexity_weights = ica_U01 / sum_prox
unif_A = np.matmul(non_RCA_all * complexity_weights, proximity.T)

# ica_U01 is broadcast along the product axis
unif_B = inv_distance_pa * ica_U01

unif_cog2_pa = unif_A - unif_B

. . . . . . . . . . . . . . . . . 

In [89]:
# Standardise each location's COG using the mean and standard deviation
# taken over the products where that location has no RCA.
relative_cog = np.zeros((len(locations), len(products)), float)
for idx_prov in range(len(locations)):
    cog_row = unif_cog2_pa[idx_prov]
    option_values = cog_row[Mpa[idx_prov] != 1]
    relative_cog[idx_prov] = (cog_row - option_values.mean()) / option_values.std()

In [90]:
# Write the proximity matrix and the standardised density / COG matrices as parquet files.
# Each entry: (file stem, matrix, row labels); columns are always the product codes.
product_labels = list(products)
for file_stem, matrix, row_labels in (
    ("proximity", proximity, products),
    ("relative_density", relative_density, locations),
    ("relative_cog", relative_cog, locations),
):
    frame = pd.DataFrame(matrix, index=list(row_labels), columns=product_labels)
    frame.to_parquet(OUTPUTS_DIR + file_stem + ".parquet", engine=ENGINE, compression=COMP)