In [1]:
import os
os.environ["DYLD_FALLBACK_LIBRARY_PATH"] = "/opt/homebrew/lib:/opt/homebrew/opt/cairo/lib" # :/
import mitoolspro as mtp
import pandas as pd
import numpy as np
from itertools import product
from tqdm import tqdm
from mitoolspro.project import Project
from mitoolspro import economic_complexity as ec
from mitoolspro.utils import RECALCULATE

In [None]:
# Load the project object; the cells below read paths through `pr.get_path(...)` and settings through `pr.vars[...]`.
pr = Project.load(auto_load=True)

In [3]:
# Notebook switches: `show` gates the optional matrix plots, `validate` gates the optional consistency checks.
show = False
validate = False

# Load Data

In [5]:
# Database connection, table names and cache-file paths used by the loading cells below.
db = mtp.databases.MainConnection(pr.get_path("database"))
# Table name of the trade data, derived from the project's `data_id` variable.
data_tablename = ec.create_data_name(pr.vars['data_id'], 'clean')
data_file = pr.get_path("data_file")
hs_codes_tablename = 'hs_codes'
hs_bridge_tablename = 'hs_bridge'
hs_codes_file = pr.get_path('hs_codes')
# True when the bridge table is not in the database yet; a later cell then builds it from the sector-mapping spreadsheet.
create_hs_bridge_table = not mtp.databases.check_if_table(db, hs_bridge_tablename)

In [6]:
# Build the HS code hierarchy (Section, HS2, HS4, HS6 with their IDs) from the database,
# or reuse the cached parquet file when it exists and RECALCULATE is off.
if not hs_codes_file.exists() or RECALCULATE:
    hierarchy_cols = ['Section', 'Section ID', 'HS2', 'HS2 ID', 'HS4', 'HS4 ID', 'HS6', 'HS6 ID']
    hs_codes = pd.read_sql(f"SELECT * FROM {hs_codes_tablename}", db, index_col='index')
    hs_codes = hs_codes[hierarchy_cols].drop_duplicates()
    # Keep only the last row for each HS6 name. The mask is computed on `hs_codes` itself so it
    # is always aligned with the frame being filtered; a mask taken from a separately
    # de-duplicated frame can have a smaller index, which makes `.loc` raise.
    is_repeated_name = hs_codes['HS6'].duplicated(keep='last')
    hs_codes = (
        hs_codes.loc[~is_repeated_name]
        .reset_index(drop=True)
        .sort_values(by=['Section ID', 'HS2 ID', 'HS4 ID', 'HS6 ID'])
    )
    hs_codes.to_parquet(hs_codes_file)
else:
    hs_codes = pd.read_parquet(hs_codes_file)

In [7]:
# Load the trade data from the database and normalise it, or reuse the cached parquet file.
if RECALCULATE or not data_file.exists():
    data = pd.read_sql(f'SELECT * FROM {data_tablename}', db)
    product_level_cols = ['HS2', 'HS2 ID', 'HS4', 'HS4 ID', 'HS6', 'HS6 ID']
    data_sectors = data[product_level_cols].drop_duplicates()
    # HS6 names that occur in more than one distinct code combination.
    name_is_repeated = data_sectors['HS6'].duplicated(keep=False)
    duplicated_products = data_sectors.loc[name_is_repeated, 'HS6'].unique()
    # For those names, use the HS6 ID recorded in `hs_codes` on every matching row.
    reference_rows = hs_codes.loc[hs_codes['HS6'].isin(duplicated_products), ['HS6', 'HS6 ID']]
    id_map = reference_rows.set_index('HS6').to_dict()['HS6 ID']
    needs_remap = data['HS6'].isin(id_map.keys())
    data.loc[needs_remap, 'HS6 ID'] = data.loc[needs_remap, 'HS6'].map(id_map)
    # Collapse rows that now share the same keys by summing the remaining columns.
    aggregation_keys = [
        'Year', 'Section', 'Section ID', 'HS2', 'HS2 ID', 'HS4', 'HS4 ID',
        'HS6', 'HS6 ID', 'Country', 'Country ID',
    ]
    data = data.groupby(aggregation_keys).sum().reset_index()
    # Set Electrical energy exports to 0
    data.loc[data['HS6'] == 'Electrical energy', ['Trade Value']] = 0.0
    # North Korea is excluded from the data set.
    data = data.loc[data['Country'] != 'North Korea']
    data.to_parquet(data_file)
else:
    data = pd.read_parquet(data_file)

#### Filter Countries

In [None]:
# Restrict the data to the project's configured country list and report how many countries remain.
countries = pr.vars['countries']
is_considered = data['Country'].isin(countries)
data = data.loc[is_considered]
print('There are ', data['Country'].nunique(), ' considered countries.')

#### Validate Null Electrical Energy

In [None]:
# Sum of 'Trade Value' over the 'Electrical energy' rows; the loading cell sets these values to 0.0.
print('The amount of exports in Electrical energy are: ', data.loc[data['HS6'] == 'Electrical energy', 'Trade Value'].sum())

In [None]:
# Record the total trade value of the filtered data as project variable `total_exports`; later cells compare against it.
pr.add_var('total_exports', data['Trade Value'].sum(), update=True)

In [11]:
if RECALCULATE or create_hs_bridge_table:
    # (Re)build the bridge table from the sector-mapping spreadsheet, without repeated rows.
    hs_bridge = pd.read_excel(pr.get_path('sector_mapping')).drop_duplicates()
    hs_bridge.to_sql(hs_bridge_tablename, db, if_exists='replace')
# Read the bridge back from the database on both paths.
bridge_query = f"SELECT * FROM {hs_bridge_tablename}"
hs_bridge = pd.read_sql(bridge_query, db, index_col='index')

#### Visualize HS Codes

In [None]:
# Preview the first rows of the bridge table.
hs_bridge.head()

In [None]:
# Number of distinct values in each bridge column.
hs_bridge.nunique()

In [None]:
# Print the distinct sector names found in the bridge table.
print('Considered Sectors:')
print('='*20)
mtp.utils.functions.iprint(hs_bridge['Sector'].unique())

In [None]:
# Count of non-null HS6 entries per sector, shown as a one-column frame.
print('Products per Sector:')
hs_bridge.groupby('Sector')['HS6'].count().to_frame()

# Store Map of IDs

In [16]:
# Maps each name column to the column holding its ID; used to store name/ID lookup tables
# and to decide when the ID columns are dropped from `data`.
id_mapping = {
    'Section': 'Section ID',
    'HS2': 'HS2 ID',
    'HS4': 'HS4 ID',
    'HS6': 'HS6 ID',
    'Country': 'Country ID'
}

In [17]:
# Store one name/ID lookup table in the database for every pair in `id_mapping`.
has_id_columns = any(id_column in data.columns for id_column in id_mapping.values())
if has_id_columns or RECALCULATE:
    id_maps = {}
    for col, id_col in id_mapping.items():
        # Table names look like 'HS2_HS2_ID_mapping' (spaces replaced by underscores).
        id_tablename = '_'.join((col, id_col, 'mapping')).replace(' ', '_')
        pairs = data[[col, id_col]].drop_duplicates()
        id_map = pairs.reset_index(drop=True)
        id_map.to_sql(id_tablename, db, if_exists='replace', index=False)

#### Add HS Bridge Map of IDs

In [18]:
# Store the sector lookup table when it is missing from the database (or on RECALCULATE).
id_tablename = 'Sector_Sector_ID_mapping'
sector_table_missing = not mtp.databases.check_if_table(db, id_tablename)
if sector_table_missing or RECALCULATE:
    unique_sectors = hs_bridge[['Sector']].drop_duplicates()
    id_map = unique_sectors.reset_index(drop=True)
    # IDs are 1-based positions, in order of first appearance in the bridge table.
    id_map['Sector ID'] = id_map.index + 1
    id_map.to_sql(id_tablename, db, if_exists='replace', index=False)

### Add HS Bridge to HS Codes

In [19]:
# Join the HS hierarchy with the bridge on the product names, then keep only the columns without 'ID' in their name.
product_name_keys = ['HS2', 'HS4', 'HS6']
sector_mapping = hs_codes.merge(hs_bridge, on=product_name_keys)
name_only_columns = [column for column in sector_mapping.columns if 'ID' not in column]
sector_mapping = sector_mapping[name_only_columns]

In [None]:
# Display the merged product/sector mapping.
sector_mapping

In [21]:
# Add 'Sector' and 'Sector ID' to `hs_codes` once; when they are already present, reload the cached file.
if 'Sector' in hs_codes.columns:
    hs_codes = pd.read_parquet(hs_codes_file)
else:
    hs_codes = hs_codes.merge(sector_mapping, on=['Section', 'HS2', 'HS4', 'HS6'])
    # Sector IDs are 1-based ranks of the alphabetically sorted sector names.
    # NOTE(review): the 'Sector_Sector_ID_mapping' table numbers sectors by order of first
    # appearance in the bridge instead - confirm the two numberings are meant to differ.
    sorted_sectors = hs_codes['Sector'].sort_values().unique()
    sector_ids = {sector: rank for rank, sector in enumerate(sorted_sectors, start=1)}
    hs_codes['Sector ID'] = hs_codes['Sector'].map(sector_ids)
    hs_codes.to_parquet(hs_codes_file)

#### Merge HS Bridge

In [22]:
if validate:
    # Compare the stored total with the current one using a tight relative tolerance.
    # Both are floating-point sums, and exact `==` can report False for totals that differ
    # only by rounding (e.g. when rows are summed in a different order).
    totals_match = bool(np.isclose(pr.vars['total_exports'], data['Trade Value'].sum(), rtol=1e-9, atol=0.0))
    print('Validation of Consistent Total Exports Amount:', totals_match)

In [23]:
# Attach the sector columns from `hs_codes` to the trade data, or reload the cached result.
needs_sector_merge = RECALCULATE or 'Sector' not in data.columns
if needs_sector_merge:
    hierarchy_keys = ['Section', 'Section ID', 'HS2', 'HS2 ID', 'HS4', 'HS4 ID', 'HS6', 'HS6 ID']
    ordered_columns = [
        'Year', 'Section', 'Section ID', 'Sector', 'Sector ID', 'HS2', 'HS2 ID',
        'HS4', 'HS4 ID', 'HS6', 'HS6 ID', 'Country', 'Country ID', 'Trade Value',
    ]
    # Default (inner) merge: rows whose product is not in `hs_codes` are dropped.
    data = data.merge(hs_codes, on=hierarchy_keys)[ordered_columns]
    data.to_parquet(data_file)
else:
    data = pd.read_parquet(data_file)

In [24]:
if validate:
    # Compare the stored total with the current one using a tight relative tolerance.
    # Both are floating-point sums, and exact `==` can report False for totals that differ
    # only by rounding (e.g. when rows are summed in a different order after a merge).
    totals_match = bool(np.isclose(pr.vars['total_exports'], data['Trade Value'].sum(), rtol=1e-9, atol=0.0))
    print('Validation of Consistent Total Exports Amount:', totals_match)

#### Drop IDs from Data

In [25]:
# Drop every ID column from `data` and overwrite the cached file with the name-only frame.
if any(col in data.columns for col in id_mapping.values()) or RECALCULATE:
    # Select the final name columns directly, in a fixed order. The previous two-step version
    # first selected a list that repeated 'Section ID' (a typo for 'Sector ID') and then
    # filtered out every column containing 'ID'; the resulting columns are the same.
    data = data[['Year', 'Section', 'Sector', 'HS2', 'HS4', 'HS6', 'Country', 'Trade Value']]
    data.to_parquet(data_file)

In [None]:
print('Validation of Consistent Total Exports Amount:')
# Tolerance-based comparison: both totals are floating-point sums, so exact `==` can be
# False for totals that differ only by rounding.
bool(np.isclose(pr.vars['total_exports'], data['Trade Value'].sum(), rtol=1e-9, atol=0.0))

In [None]:
# Preview the data after the ID columns have been dropped.
data.head()

# Yearly Data

In [28]:
# One aggregated frame per year, keyed by the year as a plain int and ordered by ascending year.
yearly_group_keys = ['Year', 'Section', 'Sector', 'HS2', 'HS4', 'HS6', 'Country']
yearly_data = {
    int(year): (
        data[data['Year'] == year]
        .reset_index(drop=True)
        .groupby(yearly_group_keys)
        .sum()
        .reset_index()
    )
    for year in np.sort(data['Year'].unique())
}

# Exports Matrices

In [29]:
# Column roles passed to `ec.exports_data_to_matrix` when building the exports matrices.
origin_col = 'Country'
products_col = ['Sector', 'HS2', 'HS4', 'HS6']
value_col = 'Trade Value'

In [30]:
# Name under which the yearly exports matrices are stored and looked up.
sequence_name = 'exports_matrices'

In [31]:
# Yearly exports matrices: compute and store them, or load the stored sequence.
all_countries = set(data['Country'].unique())
exports_are_stored = mtp.pandas_utils.check_if_dataframe_sequence(pr.get_path('data'), sequence_name, pr.vars['years'])
if exports_are_stored and not RECALCULATE:
    exports = mtp.pandas_utils.load_dataframe_sequence(pr.get_path('data'), sequence_name, pr.vars['years'])
else:
    # Build each year's matrix, then give it one row per considered country
    # (countries missing in that year are filled with 0.0), sorted by index.
    exports = {
        year: (
            ec.exports_data_to_matrix(year_frame, origin_col, products_col, value_col, hs_codes)
            .reindex(all_countries)
            .fillna(0.0)
            .sort_index()
        )
        for year, year_frame in yearly_data.items()
    }
    mtp.pandas_utils.store_dataframe_sequence(exports, sequence_name, pr.get_path('data'))

#### Plot

In [32]:
# Optional plot of the exports matrix; the year 2020 is hard-coded and must be a key of `exports`.
if show:
    ec.plots.display_rca_matrix(exports[2020])

#### Test Countries and Products Existence

In [None]:
# Every distinct (Sector, HS2, HS4, HS6) combination present in the data, as a set of tuples.
distinct_product_rows = data[['Sector', 'HS2', 'HS4', 'HS6']].drop_duplicates()
all_products = {tuple(row) for _, row in distinct_product_rows.iterrows()}
print('There are a total of: ', len(all_products), 'considered products.')
print('There are a total of: ', len(all_countries), 'considered countries.')

In [34]:
# Optional check: every yearly exports matrix must have exactly the full country set as its
# index and exactly the full product set as its columns.
if validate:
    for year, export in exports.items():
        print('Validating data of ', year, '...')
        assert all_countries == set(export.index), 'Not all countries'
        assert all_products == set(export.columns), 'Not all products'

# RCA Matrices

In [35]:
# Name under which the yearly RCA matrices are stored and looked up.
sequence_name = 'rca_matrices'

In [36]:
# Yearly RCA matrices: compute them from the exports matrices and store them, or load the stored sequence.
rcas_are_stored = mtp.pandas_utils.check_if_dataframe_sequence(pr.get_path('data'), sequence_name, pr.vars['years'])
if rcas_are_stored and not RECALCULATE:
    rcas = mtp.pandas_utils.load_dataframe_sequence(pr.get_path('data'), sequence_name, pr.vars['years'])
else:
    rcas = {
        year: ec.calculate_exports_matrix_rca(exports_matrix).sort_index()
        for year, exports_matrix in exports.items()
    }
    mtp.pandas_utils.store_dataframe_sequence(rcas, sequence_name, pr.get_path('data'))

#### Plot

In [37]:
# Optional plot of the RCA matrix; the year 2020 is hard-coded and must be a key of `rcas`.
if show:
    ec.plots.display_rca_matrix(rcas[2020])

# Masked RCA Matrices

In [38]:
# Name under which the yearly masked RCA matrices are stored and looked up.
sequence_name = 'masked_rca_matrices'

In [39]:
# Yearly masked RCA matrices: apply `ec.mask_matrix` with threshold 1.0 to each RCA matrix
# and store the result, or load the stored sequence.
mrcas_are_stored = mtp.pandas_utils.check_if_dataframe_sequence(pr.get_path('data'), sequence_name, pr.vars['years'])
if mrcas_are_stored and not RECALCULATE:
    mrcas = mtp.pandas_utils.load_dataframe_sequence(pr.get_path('data'), sequence_name, pr.vars['years'])
else:
    mrcas = {}
    for rca_year, rca_matrix in rcas.items():
        mrcas[rca_year] = ec.mask_matrix(rca_matrix, 1.0).sort_index()
    mtp.pandas_utils.store_dataframe_sequence(mrcas, sequence_name, pr.get_path('data'))

#### Plot

In [40]:
# Optional plot of the masked RCA matrix; the year 2020 is hard-coded and must be a key of `mrcas`.
if show:
    ec.plots.display_rca_matrix(mrcas[2020])

# Proximity Matrices

In [41]:
# Name under which the yearly proximity matrices are stored and looked up.
sequence_name = 'proximity_matrices'

In [42]:
# Yearly proximity matrices: compute them from the masked RCA matrices and store them,
# or load the stored sequence.
proximities_are_stored = mtp.pandas_utils.check_if_dataframe_sequence(pr.get_path('data'), sequence_name, pr.vars['years'])
if proximities_are_stored and not RECALCULATE:
    proximities = mtp.pandas_utils.load_dataframe_sequence(pr.get_path('data'), sequence_name, pr.vars['years'])
else:
    proximities = {
        mrca_year: ec.calculate_proximity_matrix(masked_matrix)
        for mrca_year, masked_matrix in mrcas.items()
    }
    mtp.pandas_utils.store_dataframe_sequence(proximities, sequence_name, pr.get_path('data'))

#### Plot

In [43]:
# Optional plot of the proximity matrix; the year 2020 is hard-coded and must be a key of `proximities`.
if show:
    ec.plots.display_proximity_matrix(proximities[2020])

# Relatedness Matrices

In [44]:
# `file_name` is the parquet cache for the relatedness frame.
# NOTE(review): `frame_name` is not referenced by any other cell visible here - confirm it is still needed.
frame_name = 'relatedness'
file_name = pr.get_path("relatedness")

In [45]:
# Long-format relatedness for all years: compute and cache it, or load the cached parquet.
if not file_name.exists() or RECALCULATE:
    yearly_frames = []
    for year, proximity in proximities.items():
        # Look the RCA matrix up by year instead of zipping the two dicts positionally:
        # zip silently pairs matrices from different years if the dicts are ordered
        # differently, and silently truncates if one has fewer years. A missing year
        # now raises KeyError.
        yearly_relatedness = ec.calculate_relatedness_matrix(proximity, rcas[year])
        yearly_relatedness["Year"] = year
        yearly_relatedness = yearly_relatedness.set_index(["Year"], append=True)
        yearly_frames.append(yearly_relatedness)
    relatedness = pd.concat(yearly_frames)
    # Move the appended 'Year' level first and the level before it second, then sort.
    relatedness = relatedness.reorder_levels([-1, -2, 0, 1, 2, 3], axis=0).sort_index()
    relatedness.columns = ["Relatedness"]
    relatedness.to_parquet(file_name)
else:
    relatedness = pd.read_parquet(file_name)

# Complexity Indexes

In [46]:
# Parquet cache paths for the country-level (ECI) and product-level (PCI) complexity frames.
# NOTE(review): `eci_frame_name` / `pci_frame_name` are not referenced by any other cell visible here.
eci_frame_name = 'economic_complexity'
eci_file_name = pr.get_path('economic_complexity')

pci_frame_name = 'product_complexity'
pci_file_name = pr.get_path('product_complexity')

In [None]:
# ECI / PCI for all years: compute from the masked RCA matrices and cache both frames,
# or load them when both cache files exist.
both_cached = eci_file_name.exists() and pci_file_name.exists()
if RECALCULATE or not both_cached:
    eci_frames = []
    pci_frames = []
    for year, rca in tqdm(mrcas.items()):
        # Indexes are computed unstandardized here.
        yearly_eci, yearly_pci = ec.calculate_economic_complexity(rca, standardize=False)
        # Tag each frame with its year and append 'Year' as the last index level.
        for yearly_frame, collected in ((yearly_eci, eci_frames), (yearly_pci, pci_frames)):
            yearly_frame["Year"] = year
            collected.append(yearly_frame.set_index(["Year"], append=True))
    eci_frame = (
        pd.concat(eci_frames, axis=0)
        .reorder_levels([1, 0], axis=0)
        .sort_values(by=['Year', 'Country'])
    )
    eci_frame.to_parquet(eci_file_name)
    pci_frame = (
        pd.concat(pci_frames, axis=0)
        .reorder_levels([-1, 0, 1, 2, 3])
        .sort_values(by=['Year', 'Sector', 'HS2', 'HS4', 'HS6'])
    )
    pci_frame.to_parquet(pci_file_name)
else:
    eci_frame = pd.read_parquet(eci_file_name)
    pci_frame = pd.read_parquet(pci_file_name)

# Merge all in Data

#### Reindex All Data

In [48]:
# Level names, in order, of the full (year, country, product) index built below.
columns = ['Year', 'Country', 'Sector', 'HS2', 'HS4', 'HS6']

In [49]:
# Full index: every considered year x country x product combination.
# The loop variable is named `product_key` so it does not shadow the imported `product`.
all_index = pd.MultiIndex.from_tuples(
    [
        (year, country, *product_key)
        for year, country, product_key in product(pr.vars['years'], all_countries, all_products)
    ],
    names=columns,
)

In [50]:
# Parquet cache path for the combined complexity data set.
# NOTE(review): `all_data_name` is not referenced by any other cell visible here.
all_data_name = 'complexity_data'
all_data_file = pr.get_path('complexity_data')

In [51]:
# Expand `data` to the full (year, country, product) index, or load the cached result.
if not all_data_file.exists() or RECALCULATE:
    # Index directly by `columns`, which is the level order of `all_index`. The previous
    # version derived the levels from the outer frame's column order and then fixed the
    # order with a positional `reorder_levels`, which only works for one specific layout.
    # Combinations absent from the data get a trade value of 0.
    data = data[[*columns, value_col]].set_index(columns).reindex(all_index, fill_value=0)
    data.to_parquet(all_data_file)
else:
    data = pd.read_parquet(all_data_file)

In [None]:
# Preview the data after the index step above.
data.head()

In [None]:
# Check that the total still matches the stored total. The value column is selected by name
# (not by position), and a tight relative tolerance is used because exact `==` on
# floating-point sums can be False for totals that differ only by rounding.
bool(np.isclose(data[value_col].sum(), pr.vars["total_exports"], rtol=1e-9, atol=0.0))

#### Merge Complexity Indexes

In [54]:
# Add the ECI and PCI columns once; skipped when any of them is already present in `data`.
complexity_columns = [*eci_frame.columns, *pci_frame.columns]
if not any(column in data.columns for column in complexity_columns):
    country_keys = ['Year', 'Country']
    product_keys = ['Year', 'Sector', 'HS2', 'HS4', 'HS6']
    data = data.merge(eci_frame, left_on=country_keys, right_index=True, how='left')
    data = data.merge(pci_frame, left_on=product_keys, right_index=True, how='left')
    data.to_parquet(all_data_file)

#### Merge Relatedness

In [55]:
# Add the relatedness column once; skipped when it is already present in `data`.
if not any(column in data.columns for column in relatedness.columns):
    relatedness_keys = ['Year', 'Country', 'Sector', 'HS2', 'HS4', 'HS6']
    data = data.merge(relatedness, left_on=relatedness_keys, right_index=True, how='left')
    data.to_parquet(all_data_file)

#### Merge RCA

In [56]:
# Parquet cache path for the long-format raw (unmasked) RCA frame.
# NOTE(review): `rrca_frame_name` is not referenced by any other cell visible here.
rrca_frame_name = 'raw_rca'
rrca_frame_file = pr.get_path('raw_rca')

In [57]:
# Long-format raw RCA ('rawRCA') for all years: compute and cache it, or load the cached parquet.
if not rrca_frame_file.exists() or RECALCULATE:
    # Every (country, product) combination of `all_index`. It does not depend on the year,
    # so it is built once here instead of being rebuilt as a Python set on every iteration.
    country_product_index = all_index.droplevel('Year').unique()
    rca = []
    for year, dataframe in rcas.items():
        # Matrix -> one row per (country, product) with a single 'RCA' column.
        dataframe = dataframe.stack(level=list(dataframe.columns.names), future_stack=True).reset_index(name="RCA").set_index(['Country', 'Sector', 'HS2', 'HS4', 'HS6'])
        # Combinations missing from this year's matrix get 0.0.
        dataframe = dataframe.reindex(country_product_index, fill_value=0.0)
        dataframe['Year'] = year
        dataframe = dataframe.set_index(['Year'], append=True)
        rca.append(dataframe)
    rca = pd.concat(rca, axis=0).rename(columns={'RCA': 'rawRCA'})
    # Move the appended 'Year' level to the front; sorting makes the row order deterministic.
    rca = rca.reorder_levels([-1, 0, 1, 2, 3, 4])
    rca = rca.sort_index()
    rca.to_parquet(rrca_frame_file)
else:
    rca = pd.read_parquet(rrca_frame_file)

In [58]:
# Add the columns of the current `rca` frame once; skipped when any is already present in `data`.
if not any(column in data.columns for column in rca.columns):
    raw_rca_keys = ['Year', 'Country', 'Sector', 'HS2', 'HS4', 'HS6']
    data = data.merge(rca, left_on=raw_rca_keys, right_index=True, how='left')
    data.to_parquet(all_data_file)

In [59]:
# Parquet cache path for the long-format masked RCA frame.
# NOTE(review): `rca_frame_name` is not referenced by any other cell visible here.
rca_frame_name = 'rca'
rca_frame_file = pr.get_path('rca')

In [60]:
# Long-format masked RCA ('RCA') for all years: compute and cache it, or load the cached parquet.
if not rca_frame_file.exists() or RECALCULATE:
    # Every (country, product) combination of `all_index`. It does not depend on the year,
    # so it is built once here instead of being rebuilt as a Python set on every iteration.
    country_product_index = all_index.droplevel('Year').unique()
    rca = []
    for year, dataframe in mrcas.items():
        # Matrix -> one row per (country, product) with a single 'RCA' column.
        dataframe = dataframe.stack(level=list(dataframe.columns.names), future_stack=True).reset_index(name="RCA").set_index(['Country', 'Sector', 'HS2', 'HS4', 'HS6'])
        # Combinations missing from this year's matrix get 0.0.
        dataframe = dataframe.reindex(country_product_index, fill_value=0.0)
        dataframe['Year'] = year
        dataframe = dataframe.set_index(['Year'], append=True)
        rca.append(dataframe)
    rca = pd.concat(rca, axis=0)
    # Move the appended 'Year' level to the front; sorting makes the row order deterministic.
    rca = rca.reorder_levels([-1, 0, 1, 2, 3, 4])
    rca = rca.sort_index()
    rca.to_parquet(rca_frame_file)
else:
    rca = pd.read_parquet(rca_frame_file)

In [61]:
# Add the columns of the current `rca` frame once; skipped when any is already present in `data`.
rca_already_merged = any(column in data.columns for column in rca.columns)
if not rca_already_merged:
    masked_rca_keys = ['Year', 'Country', 'Sector', 'HS2', 'HS4', 'HS6']
    data = data.merge(rca, left_on=masked_rca_keys, right_index=True, how='left')
    data.to_parquet(all_data_file)

In [None]:
# Display the combined frame.
data

## Calculate Standardized ECI and PCI

In [None]:
# Ad-hoc lookup: HS6 names in the index that contain 'Apple'.
# NOTE(review): looks like a leftover exploration cell - confirm whether it is still needed.
[c for c in data.index.get_level_values('HS6').unique() if 'Apple' in c]

In [64]:
def standardize_group(group, col):
    """Return the z-score of ``group[col]`` without modifying ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame containing column ``col`` (for example one ``groupby`` chunk).
    col : str
        Name of the column to standardize.

    Returns
    -------
    pandas.Series
        ``(group[col] - mean) / std`` with the same index and name as ``group[col]``.
        ``std`` is pandas' default standard deviation, so the result is NaN or inf
        when ``std`` is zero or undefined (for example a single-row group).
    """
    values = group[col]
    # Build a new Series instead of writing the result back into `group`: the caller's
    # frame stays untouched and no chained-assignment warning is triggered inside `apply`.
    return (values - values.mean()) / values.std()

In [65]:
# Standardize PCI within each (Year, Country) group and ECI within each (Year, HS6) group.
# `droplevel([0, 1])` removes the first two index levels of the `apply` result before the
# values are written back.
# NOTE(review): this presumably relies on `apply` prepending the two group keys to the index
# (pandas `group_keys` default) - confirm for the pandas version in use.
# NOTE(review): ECI is standardized per (Year, HS6) rather than per Year - confirm this is intended.
data['PCI'] = data.groupby(by=['Year', 'Country']).apply(standardize_group, col='PCI').droplevel([0, 1]).to_frame()
data['ECI'] = data.groupby(by=['Year', 'HS6']).apply(standardize_group, col='ECI').droplevel([0, 1]).to_frame()

In [66]:
# Overwrite the cached complexity data set with the standardized PCI / ECI columns.
data.to_parquet(all_data_file)

In [67]:
if validate:
    # Compare the stored total with the current one using a tight relative tolerance.
    # Both are floating-point sums, and exact `==` can report False for totals that differ
    # only by rounding (e.g. when rows are summed in a different order).
    totals_match = bool(np.isclose(pr.vars['total_exports'], data['Trade Value'].sum(), rtol=1e-9, atol=0.0))
    print('Validation of Consistent Total Exports Amount:', totals_match)

***