In [1]:
import mitoolspro as mtp
import pandas as pd
from pathlib import Path
import numpy as np
from mitoolspro.project import Project
from mitoolspro import economic_complexity as ec
from mitoolspro.utils import RECALCULATE

In [None]:
# Load the project object. `pr` is the single source of configuration for the
# rest of the notebook: folder/database paths come from pr.get_path(...) and
# run parameters (data id, years) from pr.vars[...].
# NOTE(review): what auto_load=True actually loads is defined in
# mitoolspro.project — confirm before relying on it.
pr = Project.load(auto_load=True)

# Databases

In [3]:
# Connection to the raw source database; the per-year OEC indicator tables are read from it below.
raw_db = mtp.databases.MainConnection(pr.get_path('raw_db_path'))
# Connection to the project database; every table built below is written to it with to_sql.
db = mtp.databases.MainConnection(pr.get_path("database"))
# Short alias for the name converter used below to normalise country names (cc.convert).
cc = ec.name_converter

# OEC Indicators

In [4]:
# Target table name for the OEC indicators, derived from the project's data id.
indicators_tablename = ec.create_data_name(pr.vars['data_id'], 'indicators')
# Build flag consumed by the next cell: the negation of check_if_table(db, name),
# i.e. presumably "the table does not exist yet" — confirm against mitoolspro.databases.
create_indicators_table = not mtp.databases.check_if_table(db, indicators_tablename)

In [5]:
# Build the OEC indicators table: one row per (Year, Country), one column per indicator.
# Runs only when the build flag is set or RECALCULATE forces a rebuild.
if create_indicators_table or RECALCULATE:
    # Read the long-format table of every project year from the raw database and
    # normalise its country names (not_found=None is passed through to the converter).
    indicators_dfs = []
    for year in pr.vars['years']:
        yearly_df = mtp.databases.read_sql_table(raw_db, f'oec_hs_indicators_{year}')
        yearly_df['Country'] = cc.convert(yearly_df['Country'], to='name_short', not_found=None)
        indicators_dfs.append(yearly_df)

    # Stack the years, keep the four long-format columns, then spread the
    # indicators into columns (long -> wide).
    indicators_df = (
        pd.concat(indicators_dfs, axis=0)
        .loc[:, ['Year', 'Indicator', 'Country', 'Measure']]
        .pivot(index=['Year', 'Country'], columns='Indicator', values='Measure')
        .reset_index()
    )

    # Overwrite any previous version of the table in the project database.
    indicators_df.to_sql(indicators_tablename, db, if_exists='replace', index=False)

#### Extra Indicators

In [6]:
# Folder holding the OECD indicator CSV exports.
oecd_folder = pr.get_path("oecd_folder")
# NOTE(review): `indicators_tablename` and `create_indicators_table` are the same
# names the OEC-indicators cells above define; from here on they refer to the
# OECD table. Re-running the OEC build cell after this one would write into
# 'oecd_indicators' — run the notebook top to bottom.
indicators_tablename = 'oecd_indicators'
# Build flag for the OECD cell below (negation of check_if_table for this table).
create_indicators_table = not mtp.databases.check_if_table(db, indicators_tablename)

In [7]:
def create_pretty_string(row):
    """Return the indicator label ``<INDICATOR>_<SUBJECT>_<MEASURE>`` for one row.

    ``row`` is anything that supports key lookup for 'INDICATOR', 'SUBJECT'
    and 'MEASURE' (a DataFrame row when used with ``df.apply(..., axis=1)``).
    """
    return "{}_{}_{}".format(row['INDICATOR'], row['SUBJECT'], row['MEASURE'])

In [8]:
# Maps the pivoted OECD column labels to the column names stored in the database.
# Keys are 'Year', 'Country' and the "<INDICATOR>_<SUBJECT>_<MEASURE>" labels
# produced by create_pretty_string; values are the stored column names.
# NOTE(review): 'FishlanTotalUSD' looks like a typo for "Fishland..."; it is a
# stored column name, so check downstream readers before changing it.
cols_map = {
    'Year': 'Year',
    'Country': 'Country', 
    'AIREMISSION_CO2_MLN_TONNE': 'CO2EmissionsMillionTons',
    'AIREMISSION_CO2_TONNE_CAP': 'CO2EmissionsTonsCap',
    'AIREMISSION_GHG_THND_TONNECO2': 'GHGEmissionskTonsCO2Eq',
    'AIREMISSION_GHG_TONNE_CAP': 'GHGEmissionsTonsCap',
    'AQUAPROD_TOT_USD': 'AquaProdTotalUSD',
    'CROPYIELD_WHEAT_THND_TONNE': 'WheatYieldkTons', 
    'CROPYIELD_WHEAT_TONNE_HA': 'WheatYieldTonsperHA',
    'ELECTRICITY_TOT_GWH': 'TotalElectricityGenGWh',
    'FISHLAND_TOT_USD': 'FishlanTotalUSD',
    'FORESTRESOURCE_USEINTENSITY_RT': 'ForestsUseIntensity',
    'GDP_TOT_MLN_USD': 'GDPTotalMillUSD',
    'GDP_TOT_USD_CAP': 'GDPTotalUSDCap',
    'PRYENRGSUPPLY_TOT_TOE_1000USD': 'PrimaryESupplyTOEbykUSD',
    'RENEWABLE_TOT_PC_PRYENRGSUPPLY': 'RenewETotalpctESupply',
    'PRYENRGSUPPLY_TOT_MLN_TOE': 'EnergySupplyTotalMillToe'
}

In [9]:
# Build the OECD indicators table: one row per (Year, Country), one column per
# indicator, with columns renamed through cols_map.
if create_indicators_table or RECALCULATE:
    # iterdir() already yields paths that include oecd_folder, so they are used
    # as-is (re-joining them onto the folder breaks for a relative folder).
    # Sorting fixes the read order: iterdir() order is arbitrary, and both
    # drop_duplicates and aggfunc='first' below depend on row order.
    csvs = sorted(f for f in oecd_folder.iterdir() if not f.name.startswith('.'))

    # One long-format frame (Year, Country, Indicator, Measure) per CSV.
    dfs = {}
    for f in csvs:
        raw = pd.read_csv(f)
        raw['Indicator'] = raw.apply(create_pretty_string, axis=1)
        long_df = raw[['TIME', 'LOCATION', 'Indicator', 'Value']].copy()
        long_df.columns = ['Year', 'Country', 'Indicator', 'Measure']
        dfs[f] = long_df

    df = pd.concat(list(dfs.values()), axis=0)
    # NOTE(review): this cell passes to='short_name' while most other cells use
    # to='name_short'; left unchanged — confirm both are accepted by the converter.
    df['Country'] = cc.convert(df['Country'], to='short_name', not_found=None)
    df = df.reset_index(drop=True).drop_duplicates()
    # Long -> wide; when a (Year, Country, Indicator) cell still has several
    # values, the first one in row order is kept.
    df = df.pivot_table(index=['Year', 'Country'], columns='Indicator', values='Measure', aggfunc='first').reset_index()
    # rename() leaves labels that are missing from cols_map untouched, whereas
    # Index.map(dict) would replace them with NaN column names.
    df = df.rename(columns=cols_map)
    df.to_sql(indicators_tablename, db, if_exists='replace', index=False)

# Ecological Footprint Indicator

In [10]:
# Folder holding the ecological-footprint Excel workbooks.
footprint_folder = pr.get_path("footprint_folder")
# Target table in the project database.
footprint_tablename = 'ecological_footprint'
# Build flag for the next cell (negation of check_if_table for this table).
create_footprint_table = not mtp.databases.check_if_table(db, footprint_tablename)

In [11]:
# Build the ecological-footprint table from every .xlsx workbook in the folder,
# keeping only the 'EFConsTotGHA' record and suffixing the value columns.
if create_footprint_table or RECALCULATE:
    # iterdir() already yields paths that include footprint_folder, so they are
    # used as-is (re-joining them onto the folder breaks for a relative folder).
    # Sorted so the concatenation order is reproducible.
    excels = sorted(
        f for f in footprint_folder.iterdir()
        if f.suffix == '.xlsx' and not f.name.startswith('.')
    )
    footprint_dataframes = [pd.read_excel(f, index_col=0) for f in excels]
    footprint = pd.concat(footprint_dataframes, axis=0)

    # .copy() so the column assignment below writes to an independent frame
    # instead of a slice of the concatenated one (avoids SettingWithCopyWarning).
    footprint = footprint.loc[footprint['Record'] == 'EFConsTotGHA'].copy()
    footprint['Country Name'] = cc.convert(footprint['Country Name'], to='name_short', not_found=None)
    footprint = footprint.reset_index(drop=True)
    footprint = footprint.rename(columns={'year': 'Year', 'Country Name': 'Country'})

    # Drop the bookkeeping columns when present.
    footprint = footprint.drop(columns=['Short Name', 'Record', 'isoa2'], errors='ignore')
    # Every remaining column except the keys gets the " Eco Footprint" suffix.
    footprint.columns = [f"{c} Eco Footprint" if c not in ['Country', 'Year'] else c for c in footprint.columns]
    footprint.to_sql(footprint_tablename, db, if_exists='replace', index=False)

# World Bank Indicators

In [12]:
# Folder holding the World Bank workbooks used by the two cells below.
world_bank_folder = pr.get_path("wb_folder")
# Target tables in the project database.
classification_tablename = 'income_classification'
wbindicators_tablename = 'wb_indicators'
# One build flag per table (negation of check_if_table for each).
create_classification_table = not mtp.databases.check_if_table(db, classification_tablename)
create_wbindicators_table = not mtp.databases.check_if_table(db, wbindicators_tablename)

#### Income Classification

In [13]:
# Build the income-classification table: one row per (Country, Year) with the
# World Bank income group spelled out.
if create_classification_table or RECALCULATE:
    # NOTE(review): header=5 and the 5:223 / 1: slice are positional and tied to
    # the layout of this particular OGHIST.xlsx — re-check when the file is updated.
    classification = pd.read_excel(world_bank_folder / 'OGHIST.xlsx', sheet_name='Country Analytical History', header=5).iloc[5:223, 1:]
    # Wide (one column per year) -> long.
    classification = classification.melt(id_vars='Data for calendar year :', var_name="Year", value_name="Income Level")
    classification.columns = ['Country', 'Year', 'Income Group']
    # Strip asterisks from the codes before mapping them. Series.replace('*', '')
    # only matches cells that are exactly '*', so a code such as 'LM*' was left
    # untouched and then mapped to NaN; .str.replace removes the character itself.
    # Codes not listed in the mapping (and non-string cells) become NaN.
    classification['Income Group'] = (
        classification['Income Group']
        .str.replace('*', '', regex=False)
        .map({
            'L': 'Low income',
            'LM': 'Lower middle income',
            'UM': 'Upper middle income',
            'H': 'High income'
        })
    )
    # NOTE(review): unlike the other cells, not_found is left at the converter's default here.
    classification['Country'] = cc.convert(names=classification['Country'], to='short_name')
    classification.to_sql(classification_tablename, db, if_exists='replace', index=False)

#### Indicators

In [14]:
# Build the World Bank indicators table: one row per (Year, Country), one
# column per indicator, restricted to the project's years.
if create_wbindicators_table or RECALCULATE:
    wbindicators_excel = '1990_2022_World_Development_Indicators.xlsx'
    id_columns = ['Country', 'Country Code', 'Indicator']
    year_columns = [str(year) for year in range(1990, 2023)]

    # The last five rows of the sheet are discarded before any processing.
    indicators = pd.read_excel(world_bank_folder / wbindicators_excel).iloc[:-5, :]
    indicators['Country Name'] = cc.convert(names=indicators['Country Name'], to='short_name', not_found=None)

    # Drop the series code (if present) and relabel the remaining columns
    # positionally: three id columns followed by the years 1990..2022.
    indicators = indicators.drop(columns=['Series Code'], errors='ignore')
    indicators.columns = id_columns + year_columns

    # Wide years -> long, then indicators -> columns.
    indicators = (
        indicators
        .melt(id_vars=id_columns, var_name='Year', value_name='Measure')
        .loc[:, ['Country', 'Indicator', 'Year', 'Measure']]
        .pivot(index=['Year', 'Country'], columns='Indicator', values='Measure')
        .reset_index()
    )

    # Year labels are strings at this point; make them numeric before filtering.
    indicators['Year'] = pd.to_numeric(indicators['Year'])
    indicators = indicators.loc[indicators['Year'].isin(pr.vars['years'])].reset_index(drop=True)
    indicators.to_sql(wbindicators_tablename, db, if_exists='replace', index=False)

#### Extra Indicators

In [15]:
# Folder holding the additional World Bank workbook.
extra_world_bank_folder = pr.get_path("extra_wb_folder")
# Target table in the project database.
world_bank_tablename = 'extra_world_bank'
# Build flag for the next cell (negation of check_if_table for this table).
create_world_bank_table = not mtp.databases.check_if_table(db, world_bank_tablename)

In [None]:
# Build the extra World Bank table: one row per (Year, Country), one column per series.
if create_world_bank_table or RECALCULATE:
    # First non-hidden .xlsx found in the folder.
    excel = [
        x for x in extra_world_bank_folder.iterdir()
        if x.suffix == '.xlsx' and not x.stem.startswith('.')
    ][0]

    # '..' cells are turned into NaN.
    world_bank = pd.read_excel(excel).replace('..', np.nan)
    world_bank['Country Name'] = cc.convert(world_bank['Country Name'], to='name_short', not_found=None)

    # Keep rows that still have a country, index them by (series, country) and
    # drop the code columns so only the year columns remain.
    has_country = world_bank['Country Name'].notna()
    world_bank = (
        world_bank.loc[has_country]
        .set_index(['Series Name', 'Country Name'])
        .drop(columns=['Country Code', 'Series Code'])
    )

    # Year headers: keep only the text before the first '['.
    world_bank.columns = [c.split('[')[0] for c in world_bank.columns]

    # Put the country first, then transpose so years become the rows.
    world_bank = world_bank.swaplevel('Country Name', 'Series Name', axis=0).T
    world_bank.columns.names = ['Country', 'Indicator']
    world_bank.index.names = ['Year']
    world_bank.index = world_bank.index.astype(int)

    # Move the country level back into the rows and store the result.
    long_by_country = world_bank.stack(level=0).reset_index()
    long_by_country.to_sql(world_bank_tablename, db, if_exists='replace', index=False)

# Internal Credit Indicator

In [17]:
# Folder holding the internal-credit workbook.
credit_folder = pr.get_path("internal_credit")
# Target table in the project database.
internal_credit_tablename = 'internal_credit'
# Build flag for the next cell (negation of check_if_table for this table).
create_credit_table = not mtp.databases.check_if_table(db, internal_credit_tablename)

In [None]:
# Build the internal-credit table: one row per (Year, Country), one column per series.
if create_credit_table or RECALCULATE:
    # First .xlsx in the folder; hidden files are skipped, matching the filter
    # used for the extra World Bank workbook.
    excel = [x for x in credit_folder.iterdir() if x.suffix == '.xlsx' and not x.stem.startswith('.')][0]
    # '..' cells become NaN; the last five rows of the sheet are discarded.
    internal_credit = pd.read_excel(excel).replace('..', np.nan).iloc[:-5]
    internal_credit = internal_credit.set_index(['Series Name', 'Country Name']).drop(columns=['Country Code', 'Series Code'])
    # Year headers: keep only the text before the first '['.
    internal_credit.columns = [c.split('[')[0] for c in internal_credit.columns]
    internal_credit = internal_credit.swaplevel('Country Name', 'Series Name', axis=0)
    internal_credit = internal_credit.T
    internal_credit.columns.names = ['Country', 'Indicator']
    # set_levels() replaces the *unique level values*, position by position, so
    # the converter must be fed levels[0]. Feeding it get_level_values('Country')
    # (one entry per column, in column order) either raises on duplicates or
    # silently attaches names to the wrong countries whenever the column order
    # differs from the level order. This now matches the patents and
    # globalisation cells.
    country_names = cc.convert(internal_credit.columns.levels[0], to='name_short', not_found=None)
    internal_credit.columns = internal_credit.columns.set_levels(country_names, level=0)
    internal_credit.index = internal_credit.index.astype(int)
    internal_credit.index.names = ['Year']
    # Move the country level back into the rows and store the result.
    internal_credit.stack(level=0).reset_index().to_sql(internal_credit_tablename, db, if_exists='replace', index=False)

# Environmental Patents

In [19]:
# Folder holding the environmental-patents CSV.
env_patents_folder = pr.get_path("environmental_patents")
# Target table in the project database.
environmental_patents_tablename = 'environmental_patents'
# Build flag for the next cell (negation of check_if_table for this table).
create_env_patents_table = not mtp.databases.check_if_table(db, environmental_patents_tablename)

In [None]:
# Build the environmental-patents table: patent counts summed per
# (Country, Year), with every country given a row for each year 1995..2020.
if create_env_patents_table or RECALCULATE:
    # First .csv found in the folder.
    csv = [x for x in env_patents_folder.iterdir() if x.suffix == '.csv'][0]

    # Drop all-empty columns, then keep and relabel the three needed ones.
    environmental_patents = pd.read_csv(csv).dropna(axis=1, how='all')
    environmental_patents = environmental_patents.loc[:, ['REF_AREA', 'TIME_PERIOD', 'OBS_VALUE']]
    environmental_patents.columns = ['Country', 'Year', 'Environmental Patents']

    # One total per (Country, Year).
    environmental_patents = environmental_patents.groupby(['Country', 'Year']).sum()

    # Convert the unique country labels of the index level in place.
    converted_countries = cc.convert(environmental_patents.index.levels[0], to='name_short', not_found=None)
    environmental_patents.index = environmental_patents.index.set_levels(converted_countries, level=0)

    # Full country x year grid; combinations absent from the data appear as NaN
    # here and are set to 0.0 after the reshape.
    full_grid = pd.MultiIndex.from_product(
        [environmental_patents.index.levels[0], range(1995, 2021)],
        names=environmental_patents.index.names,
    )
    environmental_patents = (
        environmental_patents
        .reindex(full_grid)
        .unstack(level=0)
        .swaplevel(0, 1, axis=1)
        .fillna(0.0)
    )
    environmental_patents.index = environmental_patents.index.astype(int)

    # Move the country level back into the rows and store the result.
    patents_long = environmental_patents.stack(level=0).reset_index()
    patents_long.to_sql(environmental_patents_tablename, db, if_exists='replace', index=False)

# Environmental Policy Stringency

In [21]:
# Folder holding the environmental-policy-stringency CSV.
env_policy_folder = pr.get_path("policy_stringency")
# Target table in the project database.
# NOTE(review): the two names below are misspelled ("envrionmental",
# "environemtal"); the build cell that follows uses the same spellings, so
# rename them together if at all.
envrionmental_policy_tablename = 'environmental_policy_stringency'
# Build flag for the next cell (negation of check_if_table for this table).
create_environemtal_policy_table = not mtp.databases.check_if_table(db, envrionmental_policy_tablename)

In [None]:
# Build the environmental-policy-stringency table: one row per (Year, Country).
if create_environemtal_policy_table or RECALCULATE:
    # First .csv found in the folder.
    csv = [x for x in env_policy_folder.iterdir() if x.suffix == '.csv'][0]

    # Drop all-empty columns, keep the three needed ones and give the value
    # column its stored name.
    policy_stringency = pd.read_csv(csv).dropna(axis=1, how='all')
    policy_stringency = policy_stringency.loc[:, ['Country', 'Year', 'Value']]
    policy_stringency = policy_stringency.rename(columns={'Value': 'Environmental Policy Stringency'})
    policy_stringency['Country'] = cc.convert(policy_stringency['Country'], to='name_short', not_found=None)

    # Years as rows, (Country, indicator) as sorted columns.
    policy_stringency = (
        policy_stringency
        .set_index(['Country', 'Year'])
        .unstack(level=0)
        .swaplevel(0, 1, axis=1)
        .sort_index(axis=1)
    )
    policy_stringency.index = policy_stringency.index.astype(int)

    # Move the country level back into the rows and store the result.
    stringency_long = policy_stringency.stack(level=0).reset_index()
    stringency_long.to_sql(envrionmental_policy_tablename, db, if_exists='replace', index=False)

# Globalisation Index

In [23]:
# Folder holding the globalisation-index Stata (.dta) file.
globalisation_folder = pr.get_path("globalisation")
# Target table in the project database.
globalisation_index_tablename = 'globalisation_index'
# Build flag for the next cell (negation of check_if_table for this table).
create_globalisation_index_table = not mtp.databases.check_if_table(db, globalisation_index_tablename)

In [None]:
# Build the globalisation-index table: one row per (Year, Country), one column
# per index listed in vars_map, for the years 1995..2020.
if create_globalisation_index_table or RECALCULATE:
    # First .dta found in the folder.
    dta = [x for x in globalisation_folder.iterdir() if x.suffix == '.dta'][0]
    globalisation_index = pd.read_stata(dta)
    # Source variable name -> stored column name ('country'/'year' become the keys).
    vars_map = {
        'KOFGI': 'Globalisation Index', 
        'KOFGIdf': 'Globalisation Index, de facto', 
        'KOFGIdj': 'Globalisation Index, de jure', 
        'KOFEcGI': 'Economic Globalisation',
        'KOFEcGIdf': 'Economic Globalisation, de facto', 
        'KOFEcGIdj': 'Economic Globalisation, de jure', 
        'KOFTrGI': 'Trade Globalisation', 
        'KOFTrGIdf': 'Trade Globalisation, de facto', 
        'KOFTrGIdj': 'Trade Globalisation, de jure',
        'KOFFiGI': 'Financial Globalisation', 
        'KOFFiGIdf': 'Financial Globalisation, de facto', 
        'KOFFiGIdj': 'Financial Globalisation, de jure', 
        'KOFSoGI': 'Social Globalisation', 
        'KOFSoGIdf': 'Social Globalisation, de facto',
        'KOFSoGIdj': 'Social Globalisation, de jure', 
        'KOFIpGI': 'Interpersonal Globalisation', 
        'KOFIpGIdf': 'Interpersonal Globalisation, de facto', 
        'KOFIpGIdj': 'Interpersonal Globalisation, de jure', 
        'KOFInGI': 'Informational Globalisation',
        'KOFInGIdf': 'Informational Globalisation, de facto', 
        'KOFInGIdj': 'Informational Globalisation, de jure', 
        'KOFCuGI': 'Cultural Globalisation', 
        'KOFCuGIdf': 'Cultural Globalisation, de facto', 
        'KOFCuGIdj': 'Cultural Globalisation, de jure',
        'KOFPoGI': 'Political Globalisation', 
        'KOFPoGIdf': 'Political Globalisation, de facto', 
        'KOFPoGIdj': 'Political Globalisation, de jure',
        'country': 'Country',
        'year': 'Year'
    }
    # Index.map with a dict: a source column that has no entry in vars_map ends
    # up with a NaN label.
    globalisation_index.columns = globalisation_index.columns.map(vars_map)
    # Drops the first column by position, then indexes by (Country, Year).
    # NOTE(review): presumably the first column is a country-code variable with
    # no vars_map entry — verify against the .dta layout, since this is positional.
    globalisation_index = globalisation_index.iloc[:, 1:].set_index(['Country', 'Year'])
    # Convert the unique country labels of the index level in place.
    globalisation_index.index = globalisation_index.index.set_levels(cc.convert(globalisation_index.index.levels[0], to='name_short', not_found=None), level=0)
    # Years as rows, (Country, indicator) as sorted columns.
    globalisation_index = globalisation_index.unstack(level=0).swaplevel(0, 1, axis=1).sort_index(axis=1)
    # Label-based slice on the Year index: keeps 1995 through 2020, both inclusive.
    globalisation_index = globalisation_index.loc[1995:2020, :]
    globalisation_index.index = globalisation_index.index.astype(int)
    # Move the country level back into the rows and store the result.
    globalisation_index.stack(level=0).reset_index().to_sql(globalisation_index_tablename, db, if_exists='replace', index=False)

# World Inequality Index

In [None]:
# Folder holding the World Inequality Database (WID) CSV files.
inequality_folder = pr.get_path("inequality")
# Target table in the project database.
# NOTE(review): no cell visible here writes this table; the section below only lists the input files.
income_inequality_tablename = 'income_inequality_index'
# Build flag for the next cell (negation of check_if_table for this table).
create_income_inequality_table = not mtp.databases.check_if_table(db, income_inequality_tablename)

In [27]:
# Locate the per-country WID data and metadata CSV files. Nothing is written to
# the database in this cell.
if create_income_inequality_table or RECALCULATE:
    countries_data = inequality_folder / 'WID_countries.csv'
    # Only empty cells are treated as missing. With pandas' default NA parsing
    # the literal code "NA" (Namibia's alpha-2 code) is read as NaN and would be
    # silently dropped from valid_codes below.
    countries_data = pd.read_csv(countries_data, delimiter=';', keep_default_na=False, na_values=[''])
    countries_data = countries_data[['alpha2', 'shortname']]
    countries_data.columns = ['alpha2', 'Country']
    # Keep only codes without a hyphen; rows with a missing code are dropped too.
    countries_data = countries_data.loc[countries_data['alpha2'].str.find('-') == -1]
    valid_codes = countries_data['alpha2'].unique()

    # Candidate files, filtered once: non-hidden CSVs other than the countries
    # file, whose name ends in "_<valid code>".
    valid_code_set = set(valid_codes)
    wid_country_csvs = [
        f for f in inequality_folder.iterdir()
        if f.suffix == '.csv'
        and 'WID_countries' not in f.stem
        and not f.stem.startswith('.')
        and f.stem.split('_')[-1] in valid_code_set
    ]
    inequalities_data = [f for f in wid_country_csvs if '_data_' in f.stem]
    inequalities_metadata = [f for f in wid_country_csvs if '_metadata_' in f.stem]

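Left unfinished: the World Inequality Index turned out not to be needed, so the cell above only lists the input files and no table is built.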

***