In [None]:
# Set project directory
import os
import sys


def project_dir():
    """Return the path of the repository root.

    The root is found by looking for the repository folder name in the
    current working directory and keeping everything up to and including
    its first occurrence.

    Returns:
        str: ``<prefix>/worldbank_data_exploration`` when the folder name
        occurs in the working directory; otherwise the working directory
        itself (instead of a fabricated path that does not exist).
    """
    repo_name = "worldbank_data_exploration"
    # os.getcwd() yields the same value as the ``%pwd`` magic but is plain
    # Python, so this cell still works when exported to a script or linted.
    notebook_path = os.getcwd()
    repo_folder, found, _ = notebook_path.partition(repo_name)
    if not found:
        # Repo checked out under another name: best guess is the cwd.
        return notebook_path
    return os.path.join(repo_folder, repo_name)


# Project root used for local imports: an explicit, non-empty PWD wins;
# otherwise the root is derived from the working directory. Using ``or``
# keeps project_dir() from running when PWD is already set and treats an
# empty PWD as unset.
pwd = os.environ.get("PWD") or project_dir()
os.environ["PWD"] = pwd
# Guard the append so re-running this cell does not stack duplicate entries.
if pwd not in sys.path:
    sys.path.append(pwd)

import warnings

warnings.simplefilter("ignore", category=UserWarning)
warnings.simplefilter("ignore", category=FutureWarning)

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

from data_sources.get import get_indicators
from data_sources.load_dataset import (
    INDICATORS_SELECTED,
    INDICATORS_AGRICULTURE,
    INDICATORS_ECONOMY,
    INDICATORS_HEALTH,
    ALL_INDICATORS,
    INDICATORS_YEARS_RANGE,
    load_dataset,
    load_time_series,
)

In [None]:
# Fetch the health indicator records and list which indicators are present.
df = get_indicators(INDICATORS_HEALTH)

indicator_names = df["Indicator Name"].unique()
print(indicator_names)

['Life expectancy at birth, total (years)'
 'Mortality rate, under-5 (per 1,000 live births)'
 'Fertility rate, total (births per woman)'
 'Prevalence of undernourishment (% of population)'
 'Immunization, DPT (% of children ages 12-23 months)'
 'Population growth (annual %)'
 'Age dependency ratio (% of working-age population)'
 'Incidence of tuberculosis (per 100,000 people)'
 'Immunization, measles (% of children ages 12-23 months)'
 'Adolescent fertility rate (births per 1,000 women ages 15-19)'
 'Death rate, crude (per 1,000 people)'
 'Birth rate, crude (per 1,000 people)']


In [None]:
# Reshape to one row per year and one column per (indicator, country) pair.
# NOTE(review): this rebinds ``df``, so the cell presumably cannot be re-run
# without first re-running the cell that loads the indicator records.
pivot_columns = ["Indicator Name", "Country Name"]
df = df.pivot_table(values="Value", index="Year", columns=pivot_columns)

In [None]:
# Count missing values per (indicator, country) column inside the year range.
# reset_index() leaves the counts in a column labelled 0.
years_in_range = df.loc[INDICATORS_YEARS_RANGE]
nan_counts = years_in_range.isna().sum()
df_nans = nan_counts.reset_index()
df_nans

Unnamed: 0,Indicator Name,Country Name,0
0,"Adolescent fertility rate (births per 1,000 wo...",Afghanistan,0
1,"Adolescent fertility rate (births per 1,000 wo...",Africa Eastern and Southern,0
2,"Adolescent fertility rate (births per 1,000 wo...",Africa Western and Central,0
3,"Adolescent fertility rate (births per 1,000 wo...",Albania,0
4,"Adolescent fertility rate (births per 1,000 wo...",Algeria,0
...,...,...,...
2842,Prevalence of undernourishment (% of population),Vanuatu,1
2843,Prevalence of undernourishment (% of population),"Venezuela, RB",1
2844,Prevalence of undernourishment (% of population),Vietnam,1
2845,Prevalence of undernourishment (% of population),World,1


In [None]:
# "Country Name" values that are regional or income-group aggregates
# (e.g. "World", "High income") rather than individual countries.
_aggregate_names = (
    "Africa Eastern and Southern",
    "Africa Western and Central",
    "Arab World",
    "Caribbean small states",
    "Central Europe and the Baltics",
    "Early-demographic dividend",
    "East Asia & Pacific",
    "East Asia & Pacific (excluding high income)",
    "East Asia & Pacific (IDA & IBRD countries)",
    "Euro area",
    "Europe & Central Asia",
    "Europe & Central Asia (excluding high income)",
    "Europe & Central Asia (IDA & IBRD countries)",
    "European Union",
    "Fragile and conflict affected situations",
    "Heavily indebted poor countries (HIPC)",
    "High income",
    "IBRD only",
    "IDA & IBRD total",
    "IDA blend",
    "IDA only",
    "IDA total",
    "Late-demographic dividend",
    "Latin America & Caribbean",
    "Latin America & Caribbean (excluding high income)",
    "Latin America & the Caribbean (IDA & IBRD countries)",
    "Least developed countries: UN classification",
    "Low & middle income",
    "Low income",
    "Lower middle income",
    "Middle East & North Africa",
    "Middle East & North Africa (excluding high income)",
    "Middle East & North Africa (IDA & IBRD countries)",
    "Middle income",
    "North America",
    "OECD members",
    "Other small states",
    "Pacific island small states",
    "Post-demographic dividend",
    "Pre-demographic dividend",
    "Small states",
    "South Asia",
    "South Asia (IDA & IBRD)",
    "Sub-Saharan Africa",
    "Sub-Saharan Africa (excluding high income)",
    "Sub-Saharan Africa (IDA & IBRD countries)",
    "Upper middle income",
    "World",
)
aggregates = np.array(_aggregate_names)

aggregates.shape

(48,)

In [None]:
# Countries for which at least one indicator has more than two missing
# values; column 0 of df_nans holds the per-column NaN counts.
too_many_missing = df_nans[0] > 2
countries_with_nans = df_nans.loc[too_many_missing, "Country Name"].unique()
countries_with_nans

array(['Eritrea', 'American Samoa', 'Andorra', 'Curacao', 'Dominica',
       'Isle of Man', 'Marshall Islands', 'Monaco',
       'Northern Mariana Islands', 'San Marino',
       'Sint Maarten (Dutch part)', 'St. Kitts and Nevis', 'Palau',
       'Montenegro', 'South Sudan', 'Serbia', 'Cayman Islands'],
      dtype=object)

In [None]:
# Select exactly the years that the NaN count was computed on
# (``df.loc[INDICATORS_YEARS_RANGE]``). The previous label slice
# ``start : stop + 1`` covered a different window: ``.loc`` slices already
# include their end label, so adding 1 pulled in extra years that the
# missing-value threshold had never looked at.
# stack() moves the country level into the row index -> (Year, Country Name).
df_in_range = df.loc[INDICATORS_YEARS_RANGE].stack()
print(df_in_range.shape)

# Drop countries with too many gaps; level=1 is the country index level.
df_without_sparse = df_in_range.drop(index=countries_with_nans, level=1)
print(df_without_sparse.shape)

# Drop the aggregate entries, leaving only the remaining country rows.
df_cleared = df_without_sparse.drop(index=aggregates, level=1)
print(df_cleared.shape)

(5300, 12)
(4960, 12)
(4000, 12)


In [None]:
# Fill gaps along the year axis: unstack the inner index level so each column
# is one series over the years, back-fill from the next available year, then
# forward-fill whatever is still missing at the end, and restore the stacked
# layout. Rows that still contain a NaN afterwards are discarded.
df_cleared = (
    df_cleared
    .unstack()
    .bfill()
    .ffill()
    .stack()
    .dropna(axis=0)
)

In [None]:
# Standardise every indicator column, then wrap the resulting array in a
# frame that carries the same index and column labels as the input.
raw_values = df_cleared.values
scaler = StandardScaler().fit(raw_values)
data_scaled = scaler.transform(raw_values)
df_scaled = pd.DataFrame(
    data_scaled,
    index=df_cleared.index,
    columns=df_cleared.columns,
)
df_scaled.head()

Unnamed: 0_level_0,Indicator Name,"Adolescent fertility rate (births per 1,000 women ages 15-19)",Age dependency ratio (% of working-age population),"Birth rate, crude (per 1,000 people)","Death rate, crude (per 1,000 people)","Fertility rate, total (births per woman)","Immunization, DPT (% of children ages 12-23 months)","Immunization, measles (% of children ages 12-23 months)","Incidence of tuberculosis (per 100,000 people)","Life expectancy at birth, total (years)","Mortality rate, under-5 (per 1,000 live births)",Population growth (annual %),Prevalence of undernourishment (% of population)
Year,Country Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2000,Afghanistan,2.306114,2.477419,2.415543,1.019019,3.109483,-4.212668,-3.909625,0.212351,-1.561172,2.183886,1.203342,3.006081
2000,Albania,-0.804539,-0.091821,-0.523233,-0.76188,-0.503464,0.685641,0.607844,-0.586791,0.441328,-0.279652,-1.533635,-0.564508
2000,Algeria,-0.972428,0.097692,-0.233123,-1.054299,-0.26138,-0.05246,-0.388657,-0.334681,0.074855,0.022252,-0.021519,-0.306494
2000,Angola,3.291742,2.148089,2.427545,2.957634,2.535806,-3.742967,-3.577458,0.721328,-2.591386,4.000141,1.432274,4.645723
2000,Argentina,0.286453,0.025715,-0.238241,-0.169372,-0.242394,-0.253761,0.34211,-0.515439,0.399429,-0.46321,-0.216851,-0.722646


In [None]:
# load_dataset / load_time_series come from the notebook's import cell, so
# every dependency is declared in one place instead of mid-notebook.
# Load the health indicators through the packaged loader and print the total
# number of missing values left in the result.
df = load_dataset(indicators=INDICATORS_HEALTH)
print(df.isnull().sum().sum())
df

0


Unnamed: 0_level_0,Indicator Name,"Adolescent fertility rate (births per 1,000 women ages 15-19)",Age dependency ratio (% of working-age population),"Birth rate, crude (per 1,000 people)","Death rate, crude (per 1,000 people)","Fertility rate, total (births per woman)","Immunization, DPT (% of children ages 12-23 months)","Immunization, measles (% of children ages 12-23 months)","Incidence of tuberculosis (per 100,000 people)","Life expectancy at birth, total (years)","Mortality rate, under-5 (per 1,000 live births)",Population growth (annual %),Prevalence of undernourishment (% of population)
Year,Country Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2000,Afghanistan,153.9510,104.856398,48.021,11.718,7.485,24.0,27.0,190.0,55.841,129.2,2.975057,47.8
2000,Albania,19.0372,59.723586,16.436,5.914,2.157,97.0,95.0,22.0,73.955,27.2,-0.637357,4.9
2000,Algeria,11.7556,63.052679,19.554,4.961,2.514,86.0,80.0,75.0,70.640,39.7,1.358417,8.0
2000,Angola,196.6992,99.071186,48.150,18.036,6.639,31.0,32.0,297.0,46.522,204.4,3.277215,67.5
2000,Argentina,66.3552,61.788296,19.499,7.845,2.542,83.0,91.0,37.0,73.576,19.6,1.100608,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,Uzbekistan,24.2094,49.488438,23.300,4.700,2.604,98.0,96.0,70.0,71.573,15.8,1.736986,2.5
2018,Vanuatu,48.9082,73.878043,29.595,5.266,3.782,85.0,75.0,46.0,70.323,26.3,2.482425,9.5
2018,"Venezuela, RB",84.9792,53.689245,17.881,6.982,2.272,60.0,74.0,48.0,72.128,24.2,-1.768331,23.4
2018,Vietnam,29.1518,43.776605,16.745,6.317,2.049,75.0,97.0,182.0,75.317,21.3,0.994310,6.8


---

In [None]:
# Time series for the selected indicators; element 1 of the result is the
# array shown below, displayed together with its length.
all_time_series = load_time_series(indicators=INDICATORS_SELECTED)
selected_names = all_time_series[1]
selected_names, len(selected_names)

(array(['Albania', 'Angola', 'Australia', 'Austria', 'Azerbaijan',
        'Bangladesh', 'Belarus', 'Belgium', 'Bolivia', 'Brazil',
        'Bulgaria', 'Burkina Faso', 'Cambodia', 'Cameroon', 'Canada',
        'Chad', 'Chile', 'China', 'Colombia', 'Congo, Dem. Rep.',
        'Costa Rica', 'Croatia', 'Czech Republic', 'Denmark',
        'Dominican Republic', 'Ecuador', 'Egypt, Arab Rep.', 'El Salvador',
        'Estonia', 'Finland', 'France', 'Georgia', 'Germany', 'Ghana',
        'Greece', 'Guatemala', 'Honduras', 'Hungary', 'India', 'Indonesia',
        'Iran, Islamic Rep.', 'Ireland', 'Israel', 'Italy', 'Jamaica',
        'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Korea, Rep.', 'Kuwait',
        'Kyrgyz Republic', 'Latvia', 'Lithuania', 'Madagascar', 'Mali',
        'Mexico', 'Moldova', 'Mongolia', 'Morocco', 'Nepal', 'Netherlands',
        'New Zealand', 'Nicaragua', 'Nigeria', 'North Macedonia', 'Norway',
        'Oman', 'Pakistan', 'Panama', 'Paraguay', 'Peru', 'Philippines',
    

In [None]:
# Time series for the agriculture indicators; element 1 of the result is the
# array shown below, displayed together with its length.
all_time_series = load_time_series(indicators=INDICATORS_AGRICULTURE)
agriculture_names = all_time_series[1]
agriculture_names, len(agriculture_names)

(array(['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
        'Australia', 'Austria', 'Azerbaijan', 'Bangladesh', 'Belarus',
        'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia',
        'Bosnia and Herzegovina', 'Brazil', 'Brunei Darussalam',
        'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon',
        'Canada', 'Chile', 'China', 'Colombia', 'Congo, Dem. Rep.',
        'Congo, Rep.', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba',
        'Cyprus', 'Czech Republic', 'Denmark', 'Dominican Republic',
        'Ecuador', 'Egypt, Arab Rep.', 'El Salvador', 'Estonia',
        'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia, The',
        'Georgia', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Guinea',
        'Guyana', 'Honduras', 'Hungary', 'India', 'Indonesia',
        'Iran, Islamic Rep.', 'Iraq', 'Ireland', 'Israel', 'Italy',
        'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Korea, Rep.',
        'Kuwait', 'Kyrgyz Republic', 'L

In [None]:
# Time series for the economy indicators; element 1 of the result is the
# array shown below, displayed together with its length.
all_time_series = load_time_series(indicators=INDICATORS_ECONOMY)
economy_names = all_time_series[1]
economy_names, len(economy_names)

(array(['Albania', 'Angola', 'Australia', 'Azerbaijan', 'Bahamas, The',
        'Bangladesh', 'Belarus', 'Belgium', 'Bolivia', 'Botswana',
        'Brazil', 'Brunei Darussalam', 'Bulgaria', 'Burundi', 'Cambodia',
        'Cameroon', 'Canada', 'Chile', 'China', 'Colombia', 'Congo, Rep.',
        'Costa Rica', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark',
        'Ecuador', 'Egypt, Arab Rep.', 'El Salvador', 'Estonia',
        'Eswatini', 'Fiji', 'Finland', 'France', 'Georgia', 'Germany',
        'Ghana', 'Guatemala', 'Haiti', 'Honduras', 'Hong Kong SAR, China',
        'Hungary', 'Iceland', 'India', 'Indonesia', 'Israel', 'Italy',
        'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Korea, Rep.',
        'Kyrgyz Republic', 'Lao PDR', 'Latvia', 'Lithuania', 'Luxembourg',
        'Malaysia', 'Malta', 'Mauritius', 'Mexico', 'Moldova', 'Mongolia',
        'Morocco', 'Nepal', 'Netherlands', 'New Zealand', 'Niger',
        'Nigeria', 'North Macedonia', 'Norway', 'Oman', 'Pakistan

In [None]:
# Time series for the health indicators; element 1 of the result is the
# array shown below, displayed together with its length.
all_time_series = load_time_series(indicators=INDICATORS_HEALTH)
health_names = all_time_series[1]
health_names, len(health_names)

(array(['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
        'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bangladesh',
        'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bolivia',
        'Bosnia and Herzegovina', 'Botswana', 'Brazil',
        'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Cabo Verde',
        'Cambodia', 'Cameroon', 'Canada', 'Central African Republic',
        'Chad', 'Chile', 'China', 'Colombia', 'Congo, Dem. Rep.',
        'Congo, Rep.', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba',
        'Cyprus', 'Czech Republic', 'Denmark', 'Djibouti',
        'Dominican Republic', 'Ecuador', 'Egypt, Arab Rep.', 'El Salvador',
        'Estonia', 'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France',
        'Gabon', 'Gambia, The', 'Georgia', 'Germany', 'Ghana', 'Greece',
        'Guatemala', 'Guyana', 'Haiti', 'Honduras', 'Hungary', 'Iceland',
        'India', 'Indonesia', 'Iran, Islamic Rep.', 'Iraq', 'Ireland',
        'Israel', 'Italy', '

In [None]:
# Time series for the full indicator set; element 1 of the result is the
# array shown below, displayed together with its length.
all_time_series = load_time_series(indicators=ALL_INDICATORS)
all_names = all_time_series[1]
all_names, len(all_names)

(array(['Albania', 'Angola', 'Australia', 'Azerbaijan', 'Bangladesh',
        'Belarus', 'Belgium', 'Bolivia', 'Brazil', 'Bulgaria', 'Cambodia',
        'Cameroon', 'Canada', 'Chile', 'China', 'Colombia', 'Costa Rica',
        'Croatia', 'Czech Republic', 'Denmark', 'Ecuador',
        'Egypt, Arab Rep.', 'El Salvador', 'Estonia', 'Finland', 'France',
        'Georgia', 'Germany', 'Ghana', 'Guatemala', 'Honduras', 'Hungary',
        'India', 'Indonesia', 'Israel', 'Italy', 'Jamaica', 'Japan',
        'Jordan', 'Kazakhstan', 'Kenya', 'Korea, Rep.', 'Kyrgyz Republic',
        'Latvia', 'Lithuania', 'Mexico', 'Mongolia', 'Morocco', 'Nepal',
        'Netherlands', 'New Zealand', 'Nigeria', 'North Macedonia',
        'Norway', 'Oman', 'Pakistan', 'Panama', 'Paraguay', 'Peru',
        'Philippines', 'Poland', 'Portugal', 'Romania',
        'Russian Federation', 'Saudi Arabia', 'Senegal', 'Slovak Republic',
        'South Africa', 'Spain', 'Sri Lanka', 'Sweden', 'Switzerland',
        'Tanzani