This project attempts to implement an ML-based approach to forecasting GDP growth, as opposed to conventional, econometrics-based approaches.

Some caveats:
* Annual data is utilized, which might heavily decrease the accuracy; research has shown better results when using quarterly data
* Quarterly GDP growth is available in the OECD dataset, but using it would restrict the set of countries that can be covered
* Different data collection methodologies and standards of quality across governments might introduce extra bias in some observations and groups

This project is inspired by my MSc Statistics and Data Science thesis research; what you see here is part of the resulting work. In this notebook I focus exclusively on the development of supervised learning models.

------------

Let us begin then:

# World Bank data mining
We collect data from the WB using the Python package *wbgapi*, which makes accessing the API very simple

In [1]:
#!pip install wbgapi

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import requests
import wbgapi as wb

In [3]:
# for searching all indicators within a database
# wb.series.info(db=2)

In [4]:
# Search the World Bank series catalogue for indicators matching the query
# string 'GDP'; the result lists each indicator's id and its description
wb.series.info(q='GDP')

id,value
EG.GDP.PUSE.KO.PP,GDP per unit of energy use (PPP $ per kg of oil equivalent)
EG.GDP.PUSE.KO.PP.KD,GDP per unit of energy use (constant 2017 PPP $ per kg of oil equivalent)
EG.USE.COMM.GD.PP.KD,"Energy use (kg of oil equivalent) per $1,000 GDP (constant 2017 PPP)"
NY.GDP.DEFL.KD.ZG,"Inflation, GDP deflator (annual %)"
NY.GDP.DEFL.KD.ZG.AD,"Inflation, GDP deflator: linked series (annual %)"
NY.GDP.DEFL.ZS,GDP deflator (base year varies by country)
NY.GDP.DEFL.ZS.AD,GDP deflator: linked series (base year varies by country)
NY.GDP.DISC.CN,Discrepancy in expenditure estimate of GDP (current LCU)
NY.GDP.DISC.KN,Discrepancy in expenditure estimate of GDP (constant LCU)
NY.GDP.MKTP.CD,GDP (current US$)


In [5]:
# collecting all keys and placing them into a nice dataframe - these are the labels of
# variables to be fed into the ML model, so it helps to be careful in this section:

# Each World Bank indicator code sits next to the short label used as its
# column name downstream, so a code and its label cannot drift apart.
# Section comments follow the author's original grouping.
_indicator_pairs = [
    ('NY.GDP.MKTP.KD.ZG', 'gdp_real_gwt'),
    ('NY.GDP.MKTP.KD', 'gdp_real_us_fixed'),
    ('NY.GDP.PCAP.KD', 'gdp_per_capita'),
    ('NE.CON.GOVT.ZS', 'gdp_pp_govt'),
    ('NE.CON.PRVT.ZS', 'gdp_pp_private'),
    ('GC.NFN.TOTL.GD.ZS', 'investment'),
    ('NY.GDP.DEFL.KD.ZG', 'inflation_pp'),
    ('EG.FEC.RNEW.ZS', 'renew_energy'),
    ('SL.UEM.TOTL.NE.ZS', 'unemployment'),
    ('SL.EMP.TOTL.SP.NE.ZS', 'employment'),
    ('HD.HCI.OVRL', 'HCI'),
    ('SL.AGR.EMPL.ZS', 'agric_employment'),
    ('SL.EMP.VULN.ZS', 'vulnerable_employment'),
    # environment:
    ('EN.POP.SLUM.UR.ZS', 'pop_slums'),
    ('ER.H2O.INTR.PC', 'renew_freshwater'),
    ('NE.IMP.GNFS.ZS', 'imports'),
    ('NE.EXP.GNFS.ZS', 'exports'),
    ('BX.KLT.DINV.CD.WD', 'foreign_inv'),
    ('GC.DOD.TOTL.GD.ZS', 'govt_debt'),
    ('FR.INR.RINR', 'real_interest_rate'),
    ('SI.POV.DDAY', 'poverty_1.90'),
    ('SI.POV.LMIC', 'poverty_3.20'),
    ('SI.POV.UMIC', 'poverty_5.50'),
    ('SI.POV.MDIM', 'poverty_multidim'),
    ('SI.POV.MDIM.XQ', 'poverty_mult_index'),
    ('SI.POV.GINI', 'gini_index'),
    ('BN.CAB.XOKA.GD.ZS', 'cab'),
    ('SP.POP.TOTL', 'population'),
    ('EN.POP.DNST', 'pop_density'),
    ('SP.POP.GROW', 'pop_growth'),
    ('SP.DYN.LE00.IN', 'life_expectancy'),
    ('SP.DYN.CDRT.IN', 'death_rate'),
    ('SH.DYN.MORT', 'child_mortality'),
    ('SH.DTH.COMM.ZS', 'cause_of_death'),
    # agric:
    ('EG.ELC.RNEW.ZS', 'renewable_energy_output'),
    ('AG.LND.AGRI.ZS', 'agric_land'),
    ('NV.AGR.TOTL.ZS', 'agff_gdp'),
    ('AG.LND.ARBL.ZS', 'arable_land'),
    ('SP.RUR.TOTL.ZS', 'rural_pop'),
    # climate change:
    ('EG.ELC.ACCS.ZS', 'electricity_access'),
    ('EG.USE.ELEC.KH.PC', 'power_consumption'),
    # energy & mining:
    ('TX.VAL.FUEL.ZS.UN', 'fuel_exports'),
    ('TX.VAL.MMTL.ZS.UN', 'metal_exports'),
    ('EN.ATM.CO2E.PC', 'co2_emissions'),
    ('EN.CLC.GHGR.MT.CE', 'ghg_emissions'),
    ('NV.IND.TOTL.ZS', 'industry_gdp'),
    ('NV.SRV.EMPL.KD', 'service_value_added'),
    ('SE.ENR.PRSC.FM.ZS', 'school_enroll'),
    ('SE.ADT.LITR.ZS', 'literacy'),
    ('SE.TER.CUAT.BA.ZS', 'bachelor'),
    ('VC.IHR.PSRC.P5', 'homicide'),
    ('SP.POP.SCIE.RD.P6', 'research'),
]

# Split the pairs back into the two parallel columns the rest of the notebook expects
vars_dict = {
    'code': [wb_code for wb_code, _ in _indicator_pairs],
    'label': [short_label for _, short_label in _indicator_pairs],
}

vars_df = pd.DataFrame(vars_dict)

In [6]:
# sanity check: number of indicators in the table (one row per code/label pair)
len(vars_df)

52

In [7]:
# Look up every indicator code (first column) through the World Bank API and
# store the returned 'value' text in a "definition" column
vars_df = vars_df.assign(
    definition=[wb.series.get(id=wb_code)['value'] for wb_code in vars_df.iloc[:, 0]]
)

In [8]:
# inspect the indicator table: code, label and definition for each variable
vars_df

Unnamed: 0,code,label,definition
0,NY.GDP.MKTP.KD.ZG,gdp_real_gwt,GDP growth (annual %)
1,NY.GDP.MKTP.KD,gdp_real_us_fixed,GDP (constant 2015 US$)
2,NY.GDP.PCAP.KD,gdp_per_capita,GDP per capita (constant 2015 US$)
3,NE.CON.GOVT.ZS,gdp_pp_govt,General government final consumption expenditu...
4,NE.CON.PRVT.ZS,gdp_pp_private,Households and NPISHs final consumption expend...
5,GC.NFN.TOTL.GD.ZS,investment,Net investment in nonfinancial assets (% of GDP)
6,NY.GDP.DEFL.KD.ZG,inflation_pp,"Inflation, GDP deflator (annual %)"
7,EG.FEC.RNEW.ZS,renew_energy,Renewable energy consumption (% of total final...
8,SL.UEM.TOTL.NE.ZS,unemployment,"Unemployment, total (% of total labor force) (..."
9,SL.EMP.TOTL.SP.NE.ZS,employment,"Employment to population ratio, 15+, total (%)..."


In [14]:
# function for obtaining a clean dataframe with columns "economy", "Year", and economic indicators:
def vert_df(gdp_df, name, start_year=1990):
    """Reshape a wide per-year indicator table into long (economy, Year, value) form.

    Parameters
    ----------
    gdp_df : pandas.DataFrame
        Wide table that exposes an ``economy`` column after ``reset_index()``,
        with one column per year named ``YR<yyyy>`` (or ``<yyyy>``) and,
        optionally, a ``Country`` label column. The frame is not modified.
    name : str
        Column name given to the indicator values in the result.
    start_year : int, default 1990
        First year to keep; year columns before it are discarded.

    Returns
    -------
    pandas.DataFrame
        Columns ``economy``, ``Year`` (numeric) and ``name``, with one row per
        economy/year combination.
    """
    # reset_index returns a new frame, so the caller's table is left untouched
    wide = gdp_df.reset_index()
    # dropping "Country" - the ISO3 code held in "economy" is used as the key instead
    wide = wide.drop(columns='Country', errors='ignore')
    wide.columns = wide.columns.astype(str).str.replace('YR', '', regex=False)
    # Pick the year columns by value instead of dropping a fixed 1960-1989 list:
    # dropping by name raises KeyError as soon as one of those years is absent.
    kept_years = [col for col in wide.columns if col.isdigit() and int(col) >= start_year]
    long_df = wide.melt(id_vars=['economy'], value_vars=kept_years,
                        var_name='Year', value_name=name)
    long_df['Year'] = pd.to_numeric(long_df['Year'])
    return long_df

In [15]:
# Build the export table: the first indicator seeds the long-format frame, and
# every remaining indicator is downloaded, reshaped with vert_df and left-joined
# onto it by (economy, Year).
seed_code, seed_label = vars_df.iloc[0, :2]
gdp_df = vert_df(wb.data.DataFrame(seed_code, labels=True), seed_label)
for wbcode, wblabel in vars_df.iloc[1:, :2].itertuples(index=False, name=None):
    new_data = vert_df(wb.data.DataFrame(wbcode, labels=True), wblabel)
    gdp_df = gdp_df.merge(new_data, how='left', on=['economy', 'Year'])

In [16]:
# the "economy" key column is relabelled "ISO3"
gdp_df = gdp_df.rename(columns={'economy': 'ISO3'})

In [17]:
# inspect the merged table: one row per (ISO3, Year), one column per indicator label
gdp_df

Unnamed: 0,ISO3,Year,gdp_real_gwt,gdp_real_us_fixed,gdp_per_capita,gdp_pp_govt,gdp_pp_private,investment,inflation_pp,renew_energy,...,metal_exports,co2_emissions,ghg_emissions,industry_gdp,service_value_added,school_enroll,literacy,bachelor,homicide,research
0,ZWE,1990,6.988553,1.694150e+10,1623.930176,19.446133,63.105986,,-0.920431,63.976409,...,15.910397,1.585444,,29.803667,,0.97075,,,5.626697,
1,ZMB,1990,-0.481072,6.840844e+09,851.184841,,,,106.388920,82.982734,...,,0.340930,,45.294071,,,64.998283,,6.843478,
2,YEM,1990,,2.391779e+10,2042.512182,,,,,2.148638,...,,0.567037,,34.343661,,,,,,
3,PSE,1990,,,,,,,,22.082083,...,,,,,,,,,,
4,VIR,1990,,,,,,,,0.000000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8507,CEB,2021,5.564249,1.544115e+12,15187.580223,19.434876,55.023021,,5.451522,,...,2.752761,,,27.953207,,,,,,
8508,CSS,2021,5.154339,7.101314e+10,9491.666242,,,,1.983975,,...,0.989368,,,,,,,,,
8509,ARB,2021,3.530294,2.689385e+12,6050.118638,18.310636,,,8.990940,,...,,,,43.144994,,,,,,
8510,AFW,2021,3.914799,8.527638e+11,1810.927774,8.339461,63.593360,,4.821155,,...,,,,29.163412,,,,,,


In [18]:
# Write the merged table to disk. The target folder is created first because
# DataFrame.to_csv does not create missing parent directories, and failing here
# would waste all the downloads performed above.
output_path = Path('data/wb_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
gdp_df.to_csv(output_path, index=False)