In [None]:
import math
import os
os.environ["DYLD_FALLBACK_LIBRARY_PATH"] = "/opt/homebrew/lib:/opt/homebrew/opt/cairo/lib" # :/
import pandas as pd
from mitoolspro.project import Project
from mitoolspro.utils import RECALCULATE

In [None]:
# Load the project handle; later cells use it for `pr.get_path(...)` and `pr.vars[...]`.
# NOTE(review): what `auto_load=True` actually loads is defined in mitoolspro — confirm there.
pr = Project.load(auto_load=True)

In [5]:
# `show` is not referenced by any other cell in this notebook.
show = True
# `validate` gates the total-exports consistency checks printed further down.
validate = True

# Load Data

In [6]:
# Column-name constants. Only `value_col` is used by later cells in this notebook;
# `origin_col` and `products_col` are defined but not referenced here.
origin_col = 'Country'
products_col = ['Sector', 'HS2', 'HS4', 'HS6']
value_col = 'Trade Value'

In [7]:
# Path of the parquet file that is read into `data` and overwritten by the
# `data.to_parquet(all_data_file)` calls in later cells.
all_data_file = pr.get_path("complexity_data")

In [22]:
# `data_file` is resolved here but not referenced again in this notebook.
data_file = pr.get_path("data_file")
# The working frame is read from `all_data_file`, the same path later cells write back to.
data = pd.read_parquet(all_data_file)

In [None]:
# List the distinct values of the "Sector" index level, preceded by their count.
sector_labels = data.index.get_level_values("Sector").unique()
print("Total Amount of Sectors:", len(sector_labels))
print("Sectors:")
for sector_name in sector_labels:
    print(f"- {sector_name}")

In [None]:
if validate:
    # Compare the stored project total with the sum of the value column.
    # A tolerance-based comparison is used because floating-point sums can
    # differ in the last bits depending on summation order, which would make
    # an exact `==` report a spurious mismatch.
    totals_match = math.isclose(
        pr.vars['total_exports'], data[value_col].sum(), rel_tol=1e-9
    )
    print('Validation of Consistent Total Exports Amount:', totals_match)

In [None]:
# Inspect the loaded frame.
data

# Positive PCI

In [28]:
def standardize_group(group, col):
    """Return the z-scores of ``group[col]`` without modifying ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame (typically one groupby group) containing ``col``.
    col : str
        Name of the column to standardize.

    Returns
    -------
    pandas.Series
        ``(group[col] - mean) / std`` using the column's own mean and sample
        standard deviation (pandas' default ``std``). If the standard
        deviation is zero the division yields NaN/inf, as pandas does for
        division by zero.
    """
    values = group[col]
    # Build a new Series rather than assigning into `group`: writing to the
    # object handed to a groupby UDF is not supported by pandas.
    return (values - values.mean()) / values.std()

def scale_group(group, col):
    """Return ``group[col]`` min-max scaled to [0, 1] without modifying ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame (typically one groupby group) containing ``col``.
    col : str
        Name of the column to scale.

    Returns
    -------
    pandas.Series
        ``(group[col] - min) / (max - min)``. When all values are equal the
        denominator is zero and the result is NaN.
    """
    values = group[col]
    min_val = values.min()
    max_val = values.max()
    # Build a new Series rather than assigning into `group`: writing to the
    # object handed to a groupby UDF is not supported by pandas.
    return (values - min_val) / (max_val - min_val)

def add_min_to_group(group, col):
    """Shift ``group[col]`` so its minimum is zero when that minimum is negative.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame (typically one groupby group) containing ``col``.
    col : str
        Name of the column to shift.

    Returns
    -------
    pandas.Series
        ``group[col] + abs(min)`` when the minimum is negative; otherwise the
        column unchanged. ``group`` itself is never modified.
    """
    values = group[col]
    min_value = values.min()
    if min_value < 0:
        # Return a new Series instead of assigning into `group`: writing to the
        # object handed to a groupby UDF is not supported by pandas.
        return values + abs(min_value)
    return values

In [29]:
# Column holding the PCI values from which the shifted 'PCI+' column is derived.
base_pci = 'PCI'

In [None]:
# Build 'PCI+': within each (Year, Sector) group, shift PCI up by |min| when the
# group's minimum is negative (the same rule as `add_min_to_group`).
#
# The column is assigned in place instead of merged: `data` is read from a
# parquet file that later cells overwrite, so on a re-run 'PCI+' may already be
# present and a merge would produce 'PCI+_x' / 'PCI+_y' instead of replacing it.
group_min = data.groupby(['Year', 'Sector'])[base_pci].transform('min')
# clip(upper=0) keeps negative minima and turns non-negative ones into 0,
# so groups without negative values are left unshifted.
pci_plus = (data[base_pci] - group_min.clip(upper=0)).to_frame('PCI+')
data['PCI+'] = pci_plus['PCI+']

#### SCI Terms

In [None]:
# Select the PCI variant column (any column containing 'PCI' other than the raw
# 'PCI' itself). Derived product columns such as 'PCI*RCA' or 'notRCA*Rel*PCI'
# also contain 'PCI' and may already be present when `data` comes from the
# cached parquet, so names containing '*' are excluded to keep the choice
# independent of column order.
pci_candidates = [
    c for c in data.columns
    if 'PCI' in c and c != 'PCI' and '*' not in c
]
if not pci_candidates:
    raise ValueError("No PCI variant column (e.g. 'PCI+') found in data.columns")
pci = pci_candidates[0]
pci

In [32]:
# Product of the selected PCI column and RCA, then that product weighted by the value column.
pci_times_rca = data[pci].mul(data['RCA'])
data['PCI*RCA'] = pci_times_rca
data['PCI*RCA*Value'] = data['PCI*RCA'].mul(data[value_col])

In [33]:
# Value-weighted versions of both RCA columns.
trade_values = data[value_col]
data['RCA*Value'] = data['RCA'].mul(trade_values)
data['rawRCA*Value'] = data['rawRCA'].mul(trade_values)

##### SCI Potential Terms

In [34]:
# Complement of RCA: 1 - RCA, element-wise.
data['notRCA'] = data['RCA'].rsub(1)

In [35]:
# Potential term: (1 - RCA) * Relatedness * selected PCI column, multiplied left to right.
not_rca_times_rel = data['notRCA'].mul(data['Relatedness'])
data['notRCA*Rel*PCI'] = not_rca_times_rel.mul(data[pci])

#### Sectors and Countries Properties

###### Sector Diversity

In [36]:
# Sector diversity: sum of RCA over each (Year, Country, Sector), broadcast to
# every row of that group.
sector_diversity_column = 'Sector_Diversity'
if sector_diversity_column not in data.columns or RECALCULATE:
    # transform('sum') returns a Series aligned with `data`, so assigning it
    # replaces an existing column. A merge here would instead create
    # '_x'/'_y' suffixed duplicates when RECALCULATE is set and the column
    # is already present.
    data[sector_diversity_column] = (
        data.groupby(by=['Year', 'Country', 'Sector'])['RCA'].transform('sum')
    )
    data.to_parquet(all_data_file)

###### Not Sector Diversity

In [37]:
# Not-sector diversity: sum of notRCA over each (Year, Country, Sector),
# broadcast to every row of that group.
not_sector_diversity_col = 'Not_Sector_Diversity'
if not_sector_diversity_col not in data.columns or RECALCULATE:
    # transform('sum') returns a Series aligned with `data`, so assigning it
    # replaces an existing column. A merge here would instead create
    # '_x'/'_y' suffixed duplicates when RECALCULATE is set and the column
    # is already present.
    data[not_sector_diversity_col] = (
        data.groupby(by=['Year', 'Country', 'Sector'])['notRCA'].transform('sum')
    )
    data.to_parquet(all_data_file)

###### Country Diversity

In [38]:
# Country diversity: sum of RCA over each (Year, Country), broadcast to every
# row of that group.
country_diversity_col = 'Country_Diversity'
if country_diversity_col not in data.columns or RECALCULATE:
    # transform('sum') returns a Series aligned with `data`, so assigning it
    # replaces an existing column. A merge here would instead create
    # '_x'/'_y' suffixed duplicates when RECALCULATE is set and the column
    # is already present.
    data[country_diversity_col] = (
        data.groupby(by=['Year', 'Country'])['RCA'].transform('sum')
    )
    data.to_parquet(all_data_file)

###### Countries with Invalid Values

In [None]:
# For each (Year, Country), count the sectors whose summed 'RCA*Value' is exactly zero.
rca_value_totals = data.groupby(by=['Year', 'Country', 'Sector'])['RCA*Value'].sum()
zero_totals = rca_value_totals[rca_value_totals == 0.0]
zero_totals.groupby(['Year', 'Country']).count()

#### Sectoral Complexity Index (SCI)

In [40]:
# Summed SCI: sum of 'PCI*RCA' over each (Year, Country, Sector), broadcast to
# every row of that group.
sci_col = "SumSCI"
if sci_col not in data.columns or RECALCULATE:
    # transform('sum') returns a Series aligned with `data`, so assigning it
    # replaces an existing column. A merge here would instead create
    # '_x'/'_y' suffixed duplicates when RECALCULATE is set and the column
    # is already present.
    data[sci_col] = (
        data.groupby(by=['Year', 'Country', 'Sector'])['PCI*RCA'].transform('sum')
    )
    data.to_parquet(all_data_file)

###### Invalid Data Points

In [None]:
# Inspect the frame now that the summed-SCI column is present.
data

In [None]:
# Show the entries of the summed-SCI column that are NaN.
missing_sci_mask = data[sci_col].isna()
data.loc[missing_sci_mask, sci_col]

## Scaled SCI (min-max within each Sector)

In [None]:
# Resolve the name of the first column whose name contains 'SumSCI'.
sumsci_matches = [name for name in data.columns if 'SumSCI' in name]
sci = sumsci_matches[0]
sci

In [44]:
# Min-max scale the summed SCI within each Sector (the same formula as
# `scale_group`): (x - min) / (max - min). A Sector whose values are all equal
# yields NaN because the denominator is zero.
std_sci_col = sci.replace('Sum', 'Sc')
if std_sci_col not in data.columns or RECALCULATE:
    sci_by_sector = data.groupby(['Sector'])[sci]
    sector_min = sci_by_sector.transform('min')
    sector_max = sci_by_sector.transform('max')
    # Assigning the aligned result replaces an existing column. A merge here
    # would create '_x'/'_y' suffixed duplicates when RECALCULATE is set and
    # the column is already present.
    data[std_sci_col] = (data[sci] - sector_min) / (sector_max - sector_min)
# Written on every run (outside the guard), so the columns added by the
# unguarded cells above are saved as well.
data.to_parquet(all_data_file)

# View Data

In [None]:
# Final look at the frame with the derived columns added above.
data

In [None]:
if validate:
    # Re-check that the value column still sums to the stored project total
    # after all derived columns were added. A tolerance-based comparison is
    # used because floating-point sums can differ in the last bits depending
    # on summation order, which would make an exact `==` report a spurious
    # mismatch.
    totals_match = math.isclose(
        pr.vars['total_exports'], data[value_col].sum(), rel_tol=1e-9
    )
    print('Validation of Consistent Total Exports Amount:', totals_match)

***