In [5]:
# Set project directory
import os
import sys


def project_dir():
    notebook_path = %pwd
    repo_name = "worldbank_data_exploration"
    repo_folder = notebook_path.split(repo_name)[0]
    return os.path.join(repo_folder, repo_name)


pwd = os.getenv("PWD", project_dir())
os.environ["PWD"] = pwd
sys.path.append(pwd)

In [6]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

from data_sources.get import get_indicators

In [59]:
df = get_indicators(
    [
        "SP.POP.GROW",
        "FP.CPI.TOTL.ZG",
        "SP.DYN.LE00.IN",
        "NE.EXP.GNFS.ZS",
        "NY.GDP.MKTP.KD.ZG",
        "SL.UEM.TOTL.ZS",
        "NV.AGR.TOTL.ZS",
        "EG.ELC.ACCS.ZS",
        "AG.LND.FRST.ZS",
        "SH.DYN.MORT",
        "NY.GDP.TOTL.RT.ZS",
        "SP.DYN.TFRT.IN",
        "EN.URB.LCTY.UR.ZS",
        "TG.VAL.TOTL.GD.ZS",
        "MS.MIL.XPND.GD.ZS",
    ]
)

print(df["Indicator Name"].unique())

['Population growth (annual %)' 'Inflation, consumer prices (annual %)'
 'Life expectancy at birth, total (years)'
 'Exports of goods and services (% of GDP)' 'GDP growth (annual %)'
 'Unemployment, total (% of total labor force) (modeled ILO estimate)'
 'Agriculture, forestry, and fishing, value added (% of GDP)'
 'Access to electricity (% of population)' 'Forest area (% of land area)'
 'Mortality rate, under-5 (per 1,000 live births)'
 'Total natural resources rents (% of GDP)'
 'Fertility rate, total (births per woman)'
 'Population in the largest city (% of urban population)'
 'Merchandise trade (% of GDP)' 'Military expenditure (% of GDP)']


In [60]:
df = df.pivot_table(
    values="Value", index="Year", columns=["Indicator Name", "Country Name"]
)

In [61]:
df_nans = df.loc[slice(2000, 2019)].isnull().sum().reset_index()

In [62]:
countries_with_nans = df_nans[df_nans[0] > 2]["Country Name"].unique()
countries_with_nans

array(['Afghanistan', 'Algeria', 'Congo, Rep.', 'Equatorial Guinea',
       'Guinea-Bissau', 'Iraq', 'Kiribati', "Korea, Dem. People's Rep.",
       'Lebanon', 'Liberia', 'Malaysia', 'Sierra Leone', 'South Sudan',
       'Turkey', 'Armenia', 'Aruba', 'Bahrain', 'Barbados', 'Bermuda',
       'Cayman Islands', 'Central African Republic', 'Djibouti',
       'Eritrea', 'French Polynesia', 'Greenland', 'Isle of Man',
       'Kosovo', 'Libya', 'Liechtenstein', 'Maldives', 'Nauru',
       'San Marino', 'Sint Maarten (Dutch part)', 'Solomon Islands',
       'Somalia', 'St. Lucia', 'Tuvalu', 'Venezuela, RB', 'Curacao',
       'Dominica', 'Ethiopia', 'Guyana', 'Lao PDR', 'Lesotho', 'Myanmar',
       'Papua New Guinea', 'Suriname', 'Yemen, Rep.', 'Andorra',
       'Marshall Islands', 'Palau', 'St. Kitts and Nevis', 'Sudan',
       'World', 'American Samoa', 'Channel Islands', 'Guam',
       'New Caledonia', 'Northern Mariana Islands',
       'Turks and Caicos Islands', 'Virgin Islands (U.S.)',
  

In [63]:
df_cleared = df.loc[2000:2019].stack()
df_cleared.drop(index=countries_with_nans, level=1, inplace=True)

In [64]:
df_cleared.bfill(inplace=True)
df_cleared.ffill(inplace=True)

In [65]:
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df_cleared.values)
df_scaled = pd.DataFrame(
    data=data_scaled, index=df_cleared.index, columns=df_cleared.columns
)
df_scaled.head()

Unnamed: 0_level_0,Indicator Name,Access to electricity (% of population),"Agriculture, forestry, and fishing, value added (% of GDP)",Exports of goods and services (% of GDP),"Fertility rate, total (births per woman)",Forest area (% of land area),GDP growth (annual %),"Inflation, consumer prices (annual %)","Life expectancy at birth, total (years)",Merchandise trade (% of GDP),Military expenditure (% of GDP),"Mortality rate, under-5 (per 1,000 live births)",Population growth (annual %),Population in the largest city (% of urban population),Total natural resources rents (% of GDP),"Unemployment, total (% of total labor force) (modeled ILO estimate)"
Year,Country Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2000,Africa Eastern and Southern,-2.51039,0.337255,-0.516288,2.184692,0.133863,-0.157276,0.4297,-2.47246,-0.577691,0.006317,2.824619,1.231241,-0.043623,0.089659,0.056605
2000,Africa Western and Central,-1.938826,1.31758,-0.142854,2.526298,-0.512883,-0.007551,-0.257431,-2.724172,-0.447312,-0.722077,3.690481,1.315666,-0.061402,1.361974,-0.502309
2000,Albania,0.624115,1.691805,-0.730713,-0.365591,-0.174435,0.887072,-0.538223,0.303513,-0.620316,-0.471909,-0.168027,-1.648881,-0.174584,-0.519704,2.3101
2000,Angola,0.524322,-0.447382,1.807223,2.980123,1.614304,-0.196976,36.241832,-3.054382,1.490656,2.721396,4.637781,1.820571,0.274944,5.672131,-0.680031
2000,Antigua and Barbuda,0.524322,-0.908996,2.132473,-0.333492,-0.519045,0.680161,-0.456528,0.301677,-0.186298,2.683631,-0.485341,0.48416,0.07138,-0.644785,1.027781
