In [7]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [170]:
from seminartools.data import read_inflation, read_commodity, read_gdp_growth, read_interest_rate, read_unemployment
import pandas as pd

# Load every dataset through its reader with mergeable_format=True and keep
# the resulting frames in one dict keyed by a short dataset name.
dfs = {
    name: reader(mergeable_format=True)
    for name, reader in (
        ('inflation', read_inflation),
        ('commodity', read_commodity),
        ('gdp_growth', read_gdp_growth),
        ('interest_rate', read_interest_rate),
        ('unemployment', read_unemployment),
    )
}

In [171]:
# Display the commodity frame on its own to inspect its layout before it is
# joined onto the other datasets.
dfs["commodity"]

Unnamed: 0_level_0,commodity_CRUDE_PETRO,commodity_iNATGAS,commodity_iAGRICULTURE,commodity_iMETMIN,commodity_iPRECIOUSMET
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1977-01-01,0.000000,0.052425,0.144757,0.041940,0.112740
1977-04-01,-0.004000,0.043586,-0.066688,-0.045412,-0.060029
1977-07-01,0.014458,0.005967,-0.145538,0.019138,0.045236
1977-10-01,0.003959,0.005931,0.040048,0.046373,0.065161
1978-01-01,-0.001577,0.156056,0.072482,-0.002346,0.139776
...,...,...,...,...,...
2022-04-01,0.039177,-0.028903,-0.029440,-0.181045,-0.072004
2022-07-01,-0.244692,0.486523,-0.088860,-0.154988,-0.090042
2022-10-01,-0.115091,-0.355721,-0.016280,0.100554,0.093372
2023-01-01,-0.020410,-0.585458,0.002037,0.007418,0.043183


In [172]:
# Step 1: place the per-country series side by side (column-wise concat).
df = pd.concat(
    [dfs[name] for name in ('inflation', 'gdp_growth', 'interest_rate', 'unemployment')],
    axis=1,
)
# Step 2: the commodity frame is indexed by date only, so it is joined via the
# `date` index level, which repeats the commodity values for every country.
df = df.join(dfs['commodity'], on='date')
# Show only the fully populated rows; `df` itself keeps the rows with gaps.
df.dropna()

Unnamed: 0_level_0,Unnamed: 1_level_0,inflation,gdp_growth,interest_rate,unemployment_rate,commodity_CRUDE_PETRO,commodity_iNATGAS,commodity_iAGRICULTURE,commodity_iMETMIN,commodity_iPRECIOUSMET
country,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Japan,1977-01-01,0.022951,2.187257,6.333333,2.3,0.00000,0.052425,0.144757,0.041940,0.112740
Canada,1977-01-01,0.023904,1.533830,8.166667,8.8,0.00000,0.052425,0.144757,0.041940,0.112740
United States,1977-01-01,0.017231,1.186233,4.700000,8.2,0.00000,0.052425,0.144757,0.041940,0.112740
Japan,1977-04-01,0.027244,0.703788,5.000000,2.0,-0.00400,0.043586,-0.066688,-0.045412,-0.060029
Canada,1977-04-01,0.023346,0.510302,7.666667,8.0,-0.00400,0.043586,-0.066688,-0.045412,-0.060029
...,...,...,...,...,...,...,...,...,...,...
Denmark,2023-01-01,0.003145,0.906018,2.150000,5.0,-0.02041,-0.585458,0.002037,0.007418,0.043183
Switzerland,2023-01-01,0.010084,0.309543,1.166667,4.4,-0.02041,-0.585458,0.002037,0.007418,0.043183
Canada,2023-01-01,0.006080,0.610026,4.500000,5.4,-0.02041,-0.585458,0.002037,0.007418,0.043183
Sweden,2023-01-01,0.014955,0.656897,2.833333,7.8,-0.02041,-0.585458,0.002037,0.007418,0.043183


# Figure out which countries each dataset covers

In [196]:
from collections import Counter, defaultdict

def get_country_stats(country_lists: dict[str, list[str]]):
    """
    Summarise how countries are distributed over several named country lists.

    Parameters
    ----------
    country_lists
        Mapping from a list name (e.g. a dataset name) to the countries that
        list contains. A country repeated within one list is counted once.

    Returns
    -------
    counter : Counter
        For every country, the number of lists it appears in.
    unique_countries : dict[str, list[str]]
        Per list, the countries that appear in that list and in no other.
    missing_countries : dict[str, list[str]]
        Per list, the countries that appear in every *other* list but not in
        this one.
    containing_lists : defaultdict[str, list[str]]
        For every country, the names of the lists that contain it.
    """
    n = len(country_lists)

    # De-duplicate each list while keeping first-seen order, so a repeated
    # entry cannot inflate a country's count and the result order is stable.
    distinct = {
        name: list(dict.fromkeys(countries))
        for name, countries in country_lists.items()
    }

    counter = Counter()
    containing_lists = defaultdict(list)
    for name, countries in distinct.items():
        counter.update(countries)
        for country in countries:
            containing_lists[country].append(name)

    unique_countries = {}
    missing_countries = {}
    for name, countries in distinct.items():
        present = set(countries)
        # A count of 1 means this list is the only one containing the country.
        unique_countries[name] = [
            country for country in countries if counter[country] == 1
        ]
        # A country absent from this list can be in at most n - 1 lists, so a
        # count of exactly n - 1 means it is in all the others. (Comparing with
        # n, as before, could never match a country missing from this list.)
        missing_countries[name] = [
            country
            for country, hits in counter.items()
            if hits == n - 1 and country not in present
        ]

    return counter, unique_countries, missing_countries, containing_lists

# Collect the distinct values of index level 0 (the country level) for each
# per-country dataset and compute the overlap statistics across them.
counter, unique_countries, missing_countries, containing_lists = get_country_stats({
    name: dfs[name].index.get_level_values(0).unique().tolist()
    for name in ('inflation', 'gdp_growth', 'interest_rate', 'unemployment')
})
print("Countries that are contained in all 4 datasets:")
# Keep the countries that were counted once in each of the four datasets.
[country for country, hits in counter.items() if hits == 4]

Countries that are contained in all 4 datasets:


['Portugal',
 'New Zealand',
 'Japan',
 'South Africa',
 'Sweden',
 'Switzerland',
 'France',
 'Denmark',
 'Netherlands',
 'Germany',
 'Norway',
 'Belgium',
 'United States',
 'Canada',
 'Italy',
 'United Kingdom',
 'Australia',
 'Spain']

In [197]:
# Order the (country, dataset names) pairs by how many datasets contain the
# country, widest coverage first. The dict(...) conversion is kept so the cell
# still works when `containing_lists` is already a list of pairs from a
# previous run of this cell.
containing_lists = sorted(
    dict(containing_lists).items(),
    key=lambda entry: len(entry[1]),
    reverse=True,
)
print("Countries and the datasets they are contained in:")
containing_lists

Countries and the datasets they are contained in:


[('Portugal', ['inflation', 'gdp_growth', 'interest_rate', 'unemployment']),
 ('New Zealand', ['inflation', 'gdp_growth', 'interest_rate', 'unemployment']),
 ('Japan', ['inflation', 'gdp_growth', 'interest_rate', 'unemployment']),
 ('South Africa',
  ['inflation', 'gdp_growth', 'interest_rate', 'unemployment']),
 ('Sweden', ['inflation', 'gdp_growth', 'interest_rate', 'unemployment']),
 ('Switzerland', ['inflation', 'gdp_growth', 'interest_rate', 'unemployment']),
 ('France', ['inflation', 'gdp_growth', 'interest_rate', 'unemployment']),
 ('Denmark', ['inflation', 'gdp_growth', 'interest_rate', 'unemployment']),
 ('Netherlands', ['inflation', 'gdp_growth', 'interest_rate', 'unemployment']),
 ('Germany', ['inflation', 'gdp_growth', 'interest_rate', 'unemployment']),
 ('Norway', ['inflation', 'gdp_growth', 'interest_rate', 'unemployment']),
 ('Belgium', ['inflation', 'gdp_growth', 'interest_rate', 'unemployment']),
 ('United States',
  ['inflation', 'gdp_growth', 'interest_rate', 'unempl

In [195]:
# List the distinct values of index level 0 (the country names) of the
# interest-rate dataset.
dfs["interest_rate"].index.get_level_values(0).unique().tolist()

['Belgium',
 'Canada',
 'Switzerland',
 'Germany',
 'Denmark',
 'Spain',
 'France',
 'United Kingdom',
 'India',
 'Italy',
 'Japan',
 'Netherlands',
 'Portugal',
 'Sweden',
 'United States',
 'Australia',
 'South Africa',
 'New Zealand',
 'Norway',
 'Philippines',
 'Brazil',
 'Hungary',
 'Russia',
 'Croatia',
 'Poland',
 'Argentina',
 'Israel',
 'Colombia',
 'Czechia',
 'Malaysia',
 'China',
 'Chile',
 'Serbia',
 'Iceland',
 'Hong Kong SAR',
 'Mexico',
 'Euro area',
 'Korea',
 'North Macedonia',
 'Saudi Arabia',
 'Thailand',
 'Morocco',
 'Türkiye',
 'Romania',
 'Peru',
 'Indonesia']