In [1]:
import os
os.environ["DYLD_FALLBACK_LIBRARY_PATH"] = "/opt/homebrew/lib:/opt/homebrew/opt/cairo/lib" # :/
import mitoolspro as mtp
import pandas as pd
from itertools import product
from mitoolspro.project import Project
from mitoolspro import economic_complexity as ec

In [None]:
# Load the project object; later cells resolve every file location through pr.get_path(...).
pr = Project.load(auto_load=True)

In [3]:
# NOTE(review): no cell visible in this notebook reads `show` — confirm whether it is still needed.
show = True

# Load Data

In [4]:
# Connection handed to every pd.read_sql call below; its location is the project's 'database' path.
db = mtp.databases.MainConnection(pr.get_path('database'))

#### Indicators Data

In [5]:
# Database table names, one per indicator source read by the loading cell.

# World Bank tables (two overlapping sources plus the income classification).
wbindicators_tablename = 'wb_indicators'
world_bank_tablename = 'extra_world_bank'
classification_tablename = 'income_classification'

# Environment-related tables.
footprint_tablename = 'ecological_footprint'
environmental_patents_tablename = 'environmental_patents'
# The misspelled variable name is kept on purpose: the loading cell reads it under this exact name.
envrionmental_policy_tablename = 'environmental_policy_stringency'

# Remaining indicator tables.
indicators_tablename = 'oecd_indicators'
internal_credit_tablename = 'internal_credit'
globalisation_index_tablename = 'globalisation_index'

In [6]:
def _read_indexed_table(tablename):
    """Read every row of `tablename` through `db` and index the frame by ('Year', 'Country')."""
    return pd.read_sql(f'SELECT * FROM {tablename}', db).set_index(['Year', 'Country'])


# One frame per indicator source, all sharing the same (Year, Country) index.
oecd = _read_indexed_table(indicators_tablename)
footprint = _read_indexed_table(footprint_tablename)
classification = _read_indexed_table(classification_tablename)
wbindicators = _read_indexed_table(wbindicators_tablename)
world_bank = _read_indexed_table(world_bank_tablename)
internal_credit = _read_indexed_table(internal_credit_tablename)
environmental_patents = _read_indexed_table(environmental_patents_tablename)
environmental_policy = _read_indexed_table(envrionmental_policy_tablename)
globalisation_index = _read_indexed_table(globalisation_index_tablename)

#### Handle Repeated World Bank Indexes

In [None]:
# Inspect the frame before the next cell removes the columns it shares with `world_bank`.
wbindicators

In [8]:
# Columns present in both World Bank frames: `world_bank` keeps them, `wbindicators` loses them,
# so the later column-wise concat does not carry the same indicator twice.
world_bank_columns = wbindicators.columns.intersection(world_bank.columns)
wbindicators = wbindicators.loc[:, ~wbindicators.columns.isin(world_bank_columns)]

#### All Data

In [9]:
# Location of the parquet file read in the next cell (project path key 'complexity_data').
all_data_file = pr.get_path('complexity_data')

In [10]:
# Read the parquet file and turn its index levels back into ordinary columns.
data = pd.read_parquet(all_data_file).reset_index()

In [11]:
# Expose 'ScSCI' under the name 'SCI'; the sector re-structuring below selects the 'SCI' column.
data['SCI'] = data['ScSCI']

In [None]:
# Inspect the loaded frame, now including the 'SCI' column.
data

In [None]:
# Report how many distinct sectors the data holds, then list them in sorted order.
n_sectors = data['Sector'].nunique()
# An f-string avoids the doubled space the old call printed: the literal 'There are ' already
# ended in a space and print() inserted its default separator after it.
print(f'There are {n_sectors} considered sectors:')
print('=' * 20)
mtp.utils.functions.iprint(data['Sector'].sort_values().unique())

# Merge Indicators

In [14]:
# Distinct years and countries found in the complexity data; the indicator frames are
# restricted to these values further down.
all_years = list(data['Year'].unique())
all_countries = list(data['Country'].unique())
# Full (year, country) grid, years varying slowest.
all_indexes = pd.MultiIndex.from_tuples(list(product(all_years, all_countries)))

In [15]:
# Keep only classification rows whose country and year both occur in the complexity data.
classification_country_ok = classification.index.get_level_values('Country').isin(all_countries)
classification_year_ok = classification.index.get_level_values('Year').isin(all_years)
classification = classification.loc[classification_country_ok & classification_year_ok]

In [16]:
# Every indicator frame, in the order they are later concatenated column-wise.
indicators = [
    oecd, footprint, wbindicators, classification, world_bank,
    internal_credit, environmental_patents, environmental_policy, globalisation_index,
]
# Restrict each frame to the countries and years present in the complexity data.
indicators = [
    frame.loc[
        frame.index.get_level_values('Country').isin(all_countries)
        & frame.index.get_level_values('Year').isin(all_years)
    ]
    for frame in indicators
]

In [17]:
def find_df_with_duplicated_columns(dfs, columns):
    """Report which DataFrames contain each of the given column labels.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames to inspect; they are numbered from 1 in the order given.
    columns : iterable of hashable
        Column labels to look for. A label listed more than once is reported once.

    Returns
    -------
    dict
        ``{label: ['DataFrame 1', 'DataFrame 3', ...]}`` with one entry per distinct
        label (first-seen order) and an empty list when no frame has that label.
    """
    # Materialise the frames so a one-shot iterable is not exhausted after the first label.
    frames = list(dfs)
    # dict.fromkeys keeps first-seen order while dropping repeats, so a label passed
    # twice no longer gets every matching frame appended twice.
    labels = list(dict.fromkeys(columns))
    return {
        label: [
            f'DataFrame {position}'
            for position, frame in enumerate(frames, start=1)
            if label in frame.columns
        ]
        for label in labels
    }

In [None]:
# Flatten the column labels of every indicator frame, then keep the labels seen more than once.
all_columns = pd.Index([label for frame in indicators for label in frame.columns])
repeated_mask = all_columns.duplicated(keep=False)
duplicated_columns = all_columns[repeated_mask].unique()
df_with_duplicates = find_df_with_duplicated_columns(indicators, duplicated_columns)
if not df_with_duplicates:
    print("No duplicated columns across the DataFrames.")
else:
    # One line per repeated label, naming the frames (1-based positions in `indicators`).
    for column, dfs in df_with_duplicates.items():
        print(f"Column '{column}' is duplicated in: {', '.join(dfs)}")

In [19]:
# Place all indicator frames side by side, aligned on their shared index, and sort the rows.
indicators_df = pd.concat(indicators, axis=1).sort_index()

In [None]:
# Rows that ended up without an income group after the concat.
income_group_missing = indicators_df['Income Group'].isna()
indicators_df.loc[income_group_missing, ['Income Group']]

# Continent and Income Group Classification

#### Continent Classification

In [21]:
# Resolve each distinct country to a continent once ('Unknown' when the converter finds no
# match), then broadcast the result to every row through the 'Country' index level.
countries = indicators_df.index.get_level_values('Country').unique()
continents = ec.name_converter.convert(countries, to='continent', not_found='Unknown')
countries_continents = dict(zip(countries, continents))
country_level = indicators_df.index.get_level_values('Country')
indicators_df['Continent'] = country_level.map(countries_continents)

#### Income Classification

In [22]:
# Income group of each country in the hard-coded reference year 2020, keyed by country.
income_2020 = indicators_df.query('Year == 2020')['Income Group']
current_income = income_2020.droplevel(0).to_dict()
# Countries with no 2020 row have no key in the mapping and therefore receive NaN.
indicators_df['Current Income Group'] = (
    indicators_df.index.get_level_values('Country').map(current_income)
)

In [None]:
# List every column of the merged indicators frame, including the two classification columns added above.
mtp.utils.functions.iprint(indicators_df.columns)

# Re-Structure Data

In [24]:
# Column roles used when splitting `data` into country-, sector- and product-level frames.
time_col, group_col, subgroup_col = 'Year', 'Country', 'Sector'
# Product-code columns that identify a product row.
products_cols = ['HS2', 'HS4', 'HS6']

In [None]:
# Available columns, for reference when choosing the country-level ones in the next cell.
data.columns

In [26]:
# Columns of `data` kept in the country-level frame.
country_indexes = ['ECI', 'Country_Diversity']

In [None]:
# Country-level frame: `data` repeats these values on many rows, so identical rows are
# collapsed before indexing by (time, country).
country_keys = [time_col, group_col]
country_rows = data[country_keys + country_indexes].drop_duplicates()
country_data = country_rows.set_index(country_keys)
country_data.head()

In [None]:
# Available columns, for reference when choosing the sector-level ones in the next cell.
data.columns

In [None]:
# Sector-level columns: 'SCI' (only when `data` actually has it) plus the two diversity measures.
sector_indexes = [name for name in data.columns if name == 'SCI']
sector_indexes += ['Sector_Diversity', 'Not_Sector_Diversity']
sector_indexes

In [None]:
# Sector-level frame, pivoted so every (measure, sector) pair becomes its own column.
sector_keys = [time_col, group_col, subgroup_col]
sector_rows = data[sector_keys + sector_indexes].drop_duplicates().set_index(sector_keys)
sector_rows = sector_rows.rename(
    columns={'Sector_Diversity': 'Diversity', 'Not_Sector_Diversity': 'notDiversity'}
)
sector_data = sector_rows.unstack(subgroup_col)
# Flatten the (measure, sector) column pairs into "<sector> <measure>" labels.
sector_data.columns = [f"{sector} {measure}" for measure, sector in sector_data.columns]
sector_data.head()

In [None]:
# Available columns, for reference when choosing the product-level ones in the next cell.
data.columns

In [None]:
# Product-level columns: every column whose name contains 'PCI', then the trade and RCA measures.
pci_columns = [name for name in data.columns if 'PCI' in name]
products_indexes = pci_columns + [
    'Trade Value', 'Relatedness',
    'RCA', 'rawRCA', 'notRCA',
    'RCA*Value', 'rawRCA*Value',
]
products_indexes

In [None]:
# Product-level frame indexed by time, country, sector and the product-code columns.
product_keys = [time_col, group_col, subgroup_col] + products_cols
product_rows = data[product_keys + products_indexes].drop_duplicates()
products_data = product_rows.set_index(product_keys)
products_data

# Merge Data and Indicators

In [34]:
# Output location for the merged frame written below (project path key "indicators_data").
data_indicators_name = pr.get_path("indicators_data")

In [35]:
# Join country-level, sector-level and indicator frames on their indexes. merge() defaults to
# an inner join, so only index entries present in all three frames survive.
data_indicators = (
    country_data
    .merge(sector_data, left_index=True, right_index=True)
    .merge(indicators_df, left_index=True, right_index=True)
)

In [None]:
# Inspect the merged frame before it is written to disk.
data_indicators

In [None]:
# Persist the merged frame to the 'indicators_data' path, then display what was written.
data_indicators.to_parquet(data_indicators_name)
data_indicators

In [None]:
# Check: list the merged frame's columns whose name contains "SCI".
[c for c in data_indicators.columns if "SCI" in c]

In [39]:
# Output location for the product-level frame (project path key "products_data").
data_of_products = pr.get_path("products_data")

In [None]:
# Inspect the product-level frame before it is written to disk.
products_data

In [41]:
# Persist the product-level frame to the path resolved above.
products_data.to_parquet(data_of_products)

***