In [26]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

from data_sources.get import get_indicators

In [27]:
df = get_indicators(
    ['SP.POP.TOTL', 'FP.CPI.TOTL.ZG', 'SP.DYN.LE00.IN', 'NE.EXP.GNFS.ZS', 'NY.GDP.PCAP.KD', 'SL.UEM.TOTL.ZS', 'NV.AGR.TOTL.ZS', 'EG.ELC.ACCS.ZS', 'AG.LND.FRST.ZS', 'SH.DYN.MORT', 'NY.GDP.TOTL.RT.ZS', 'SP.DYN.TFRT.IN', 'EN.URB.LCTY.UR.ZS', 'IP.JRN.ARTC.SC', 'MS.MIL.XPND.GD.ZS']
)

print(df['Indicator Name'].unique())

['Population, total' 'Inflation, consumer prices (annual %)'
 'Life expectancy at birth, total (years)'
 'Exports of goods and services (% of GDP)'
 'GDP per capita (constant 2015 US$)'
 'Unemployment, total (% of total labor force) (modeled ILO estimate)'
 'Agriculture, forestry, and fishing, value added (% of GDP)'
 'Access to electricity (% of population)' 'Forest area (% of land area)'
 'Mortality rate, under-5 (per 1,000 live births)'
 'Total natural resources rents (% of GDP)'
 'Fertility rate, total (births per woman)'
 'Population in the largest city (% of urban population)'
 'Scientific and technical journal articles'
 'Military expenditure (% of GDP)']


In [28]:
df = df.pivot_table(values='Value',
                    index='Year',
                    columns=['Indicator Name', 'Country Name'])

In [29]:
df_nans = df.loc[2000:2019].isnull().sum().sort_values(ascending=False).reset_index()

In [30]:
countries_with_nans = df_nans[df_nans[0] > 2]['Country Name'].unique()
countries_with_nans

array(['Pacific island small states', 'Somalia', 'Turkmenistan',
       'Caribbean small states', 'Afghanistan', 'Tuvalu', 'Dominica',
       'New Caledonia', 'Isle of Man', 'Liechtenstein', 'Cayman Islands',
       'French Polynesia', 'San Marino', 'Faroe Islands',
       'Marshall Islands', 'St. Kitts and Nevis', 'Palau', 'Nauru',
       'Eritrea', 'Sint Maarten (Dutch part)', 'Andorra', 'Uzbekistan',
       'Papua New Guinea', 'Suriname', 'Maldives', 'Guyana', 'Barbados',
       'Djibouti', 'Libya', 'Solomon Islands', 'Haiti',
       'Syrian Arab Republic', 'Venezuela, RB', 'South Sudan',
       'Channel Islands', 'Armenia', 'Timor-Leste', 'Equatorial Guinea',
       'Turks and Caicos Islands', 'Yemen, Rep.', 'Small states',
       'Ethiopia', 'Curacao', 'Other small states', 'Qatar', 'Zimbabwe',
       'Aruba', 'Turkey', 'Bermuda', 'Cuba', 'Myanmar', 'Malaysia',
       'Lebanon', 'Sudan', "Korea, Dem. People's Rep.",
       'Central African Republic', 'Algeria', 'Kosovo',
       'U

In [31]:
df_cleared = df.loc[2000:2019].stack()
df_cleared.drop(index=countries_with_nans, level=1, inplace=True)

In [32]:
df_cleared.bfill(inplace=True)
df_cleared.ffill(inplace=True)

In [33]:
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df_cleared.values)
df_scaled = pd.DataFrame(data=data_scaled, index=df_cleared.index, columns=df_cleared.columns)
df_scaled.head()

Unnamed: 0_level_0,Indicator Name,Access to electricity (% of population),"Agriculture, forestry, and fishing, value added (% of GDP)",Exports of goods and services (% of GDP),"Fertility rate, total (births per woman)",Forest area (% of land area),GDP per capita (constant 2015 US$),"Inflation, consumer prices (annual %)","Life expectancy at birth, total (years)",Military expenditure (% of GDP),"Mortality rate, under-5 (per 1,000 live births)",Population in the largest city (% of urban population),"Population, total",Scientific and technical journal articles,Total natural resources rents (% of GDP),"Unemployment, total (% of total labor force) (modeled ILO estimate)"
Year,Country Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2000,Africa Eastern and Southern,-2.510399,0.342165,-0.512874,2.184429,0.125685,-0.662428,0.411844,-2.476043,0.002652,2.823783,-0.046175,0.017207,-0.316311,0.08861,0.04122
2000,Africa Western and Central,-1.938829,1.323454,-0.13956,2.525287,-0.526203,-0.664618,-0.259095,-2.728593,-0.725097,3.688629,-0.063954,-0.121909,-0.333835,1.36167,-0.511689
2000,Albania,0.624136,1.698047,-0.72723,-0.360271,-0.185064,-0.627431,-0.53327,0.30917,-0.475151,-0.165349,-0.177134,-0.402615,-0.34237,-0.52111,2.270505
2000,Angola,0.524342,-0.443243,1.809889,2.978119,1.617897,-0.603661,35.380077,-3.059901,2.715328,4.634816,0.272387,-0.388474,-0.342444,5.674349,-0.687502
2000,Antigua and Barbuda,0.524342,-0.905311,2.135033,-0.328243,-0.532414,-0.040773,-0.4535,0.307328,2.677596,-0.482289,0.068826,-0.405817,-0.342477,-0.646263,1.001963


In [34]:
df_cleared.to_csv('../data/dataset.csv')
df_scaled.to_csv('../data/dataset_normalized.csv')