Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import yfinance as yf


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Currency Codes

In [2]:
country_currency_codes_table = {
    'Burundi': 'BIF',
    'Benin': 'XOF',
    'Burkina Faso': 'XOF',
    'Central African Republic': 'XAF',
    "Côte d'Ivoire": 'XOF',
    'Cameroon': 'XAF',
    'Democratic Republic of the Congo': 'CDF',
    'Congo, Rep.': 'XAF',
    'Comoros': 'KMF',
    'Eritrea': 'ERN',
    'Ethiopia': 'ETB',
    'Ghana': 'GHS',
    'Guinea': 'GNF',
    'The Gambia': 'GMD',
    'Guinea-Bissau': 'XOF',
    'Liberia': 'LRD',
    'Madagascar': 'MGA',
    'Mali': 'XOF',
    'Mozambique': 'MZN',
    'Mauritania': 'MRU',
    'Malawi': 'MWK',
    'Niger': 'XOF',
    'Rwanda': 'RWF',
    'Sudan': 'SDG',
    'Senegal': 'XOF',
    'Sierra Leone': 'SLL',
    'Somalia': 'SOS',
    'Sao Tomee and Principe': 'STN',
    'Chad': 'XAF',
    'Togo': 'XOF',
    'Tanzania': 'TZS',
    'Uganda': 'UGX',
    'Zambia': 'ZMW'
}


Read Data

In [3]:
#read in stata dataset as dataframe "country_code_df"
country_code_df = pd.read_stata("codesffactors_Mar23_2019.dta") 
country_code_df.replace('--', np.nan, inplace=True) #replace all entries that are '--' with NaN data value

country_code_df.head()

Unnamed: 0,countryname,wbctry,wbcode,wbregion,wbigroup2017,wblcat2017,hipc,emu,cemac,waemu,imfcode,imfweocode,imfweoiso,imfweoctry,region
0,Aruba,Aruba,ABW,Latin America & Caribbean,High income,,0,0,0,0,,314.0,ABW,Aruba,LAC
1,Afghanistan,Afghanistan,AFG,South Asia,Low income,IDA,1,0,0,0,512.0,512.0,AFG,Afghanistan,SAS
2,Angola,Angola,AGO,Sub-Saharan Africa,Lower middle income,IBRD,0,0,0,0,614.0,614.0,AGO,Angola,SSA
3,Anguilla,,AIA,Latin America & Caribbean,,,0,0,0,0,312.0,,,,LAC
4,Albania,Albania,ALB,Europe & Central Asia,Upper middle income,IBRD,0,0,0,0,914.0,914.0,ALB,Albania,ECA


In [4]:
region_classification_ls = country_code_df['wbregion'].unique() #get a list of region classifications
income_classification_ls = country_code_df['wbigroup2017'].unique() #get a list of income classifications
income_classification_ls = [x for x in income_classification_ls if not pd.isna(x)] # remove '--' from income_classification_ls

print(f"The region classifications are: {', '.join(country for country in region_classification_ls)}")
print(f"The income classifications are: {', '.join(country for country in income_classification_ls)}")

The region classifications are: Latin America & Caribbean, South Asia, Sub-Saharan Africa, Europe & Central Asia, Middle East & North Africa, East Asia & Pacific, North America
The income classifications are: High income, Low income, Lower middle income, Upper middle income


In [5]:
ssa_hipc_countries = country_code_df[(country_code_df['hipc'] == 1) & (country_code_df['wbregion'] == 'Sub-Saharan Africa')]
ssa_hipc_countries_ls = ssa_hipc_countries['imfweoctry'].to_list()

ssa_hipc_countries_ls

['Burundi',
 'Benin',
 'Burkina Faso',
 'Central African Republic',
 "Côte d'Ivoire",
 'Cameroon',
 'Democratic Republic of the Congo',
 'Republic of Congo',
 'Comoros',
 'Eritrea',
 'Ethiopia',
 'Ghana',
 'Guinea',
 'The Gambia',
 'Guinea-Bissau',
 'Liberia',
 'Madagascar',
 'Mali',
 'Mozambique',
 'Mauritania',
 'Malawi',
 'Niger',
 'Rwanda',
 'Sudan',
 'Senegal',
 'Sierra Leone',
 'Somalia',
 'São Tomé and Príncipe',
 'Chad',
 'Togo',
 'Tanzania',
 'Uganda',
 'Zambia']

Read in IMF's Government Finance Statistics

In [6]:
master_table = {}
folder_path = "expenditure-data"

# Loop that goes through all files 'file_name' in the folder 'folder_path'
for file_name in os.listdir(folder_path):
  if file_name.endswith('.xlsx') and not file_name.startswith('~$'): #check to see if is spreadsheet .xlsx format
    raw_df = pd.read_excel(folder_path+ "/" + file_name) #read in .xlsx as dataframe
    master_table[file_name[:-5]] = raw_df #store dataframe as an entry in the master_table hash table

# master_table["Central African Republic"]
master_table["Benin"]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,Detailed Expense Breakdown,,,,
1,Country: Benin,,,,
2,Sector: Budgetary central government,,,,
3,Unit: Domestic currency,,,,
4,Scale: Billions,,,,
5,,,,,
6,,2010.0,2011.0,2012.0,2013.0
7,Expense,459.01149,457.660567,534.669798,577.5
8,Compensation of employees,235.600075,242.117067,267.900275,296.2
9,Wages and salaries,235.600075,242.117067,267.900275,296.2


Convert values to USD

In [7]:
error_countries = []

#loop through table (where key is the country name) (master_table[key] is the dataframe stored in the master_table indexed by key)
for key in master_table:
  scale = master_table[key].iloc[4,0] #get if values have scale of billions or millions
  
  # Drop rows 0 to 5, reformatting indexes and column headers
  master_table[key] = master_table[key].drop([0, 1, 2, 3, 4, 5]) 
  master_table[key] = master_table[key].reset_index(drop=True) 
  master_table[key].columns = master_table[key].iloc[0]
  master_table[key] = master_table[key][1:]
  master_table[key] = master_table[key].set_index(master_table[key].columns[0])

  if scale == "Scale: Billions":
    master_table[key] = master_table[key]* (10 ** 9) #multiple every value in dataframe by 1e9 if scale is billion
  elif scale == "Scale: Millions":
    master_table[key] = master_table[key]* (10 ** 6) #multiple every value in dataframe by 1e9 if scale is billion


  #get annual 2010-2019 USD to domestic currency exchange rate using yfinance library
  currency = country_currency_codes_table[key] #retrieve currency symbol from 'country_currency_codes_table'
  symbol = f"{currency}=X"
  try:
    data = yf.download(symbol, start="2010-01-01", end="2020-01-01", interval="1mo") #download monthly USD-Currency dataset from 2010-2020
    yearly_data_df = data['Close'].resample('YE').mean() #get the annual average exchange rate outputted as datafram
    yearly_data_table = yearly_data_df.to_dict() # Convert the yearly_data to a hash table with the date as the key
    year_to_exchange_rate_dict = {key.year: value for key, value in yearly_data_table.items()} # Convert the keys to only the year part
    print(year_to_exchange_rate_dict)

    #convert DataFrame values to USD
    for year in master_table[key].columns: #loop through columns in dataframe
      year_int = int(year)  # Convert year to integer to match the dictionary keys
      master_table[key][year] = master_table[key][year] / year_to_exchange_rate_dict[year_int] #divide each column's values by that year's exchange rate

  except Exception as e: #if errored out
    pass
    print(f"Could not fetch rate for {key} ({currency}): {e}")
    error_countries.append(key) #add country to error list
    

for country in error_countries:
  del master_table[country]
 
master_table["Benin"]

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


{2010: 487.61083221435547, 2011: 460.5933329264323, 2012: 501.0466715494792, 2013: 492.91916910807294, 2014: 497.47583770751953, 2015: 594.4266713460287, 2016: 594.0358378092448, 2017: 577.82666015625, 2018: 557.75, 2019: 583.7949981689453}
{2010: 487.61083221435547, 2011: 460.5933329264323, 2012: 501.0466715494792, 2013: 492.91916910807294, 2014: 497.47583770751953, 2015: 594.4266713460287, 2016: 594.0358378092448, 2017: 577.82666015625, 2018: 557.75, 2019: 583.7949981689453}
Could not fetch rate for Senegal (XOF): 2009
{2010: 487.61083221435547, 2011: 460.5933329264323, 2012: 501.0466715494792, 2013: 492.91916910807294, 2014: 497.47583770751953, 2015: 594.4266713460287, 2016: 594.0358378092448, 2017: 577.82666015625, 2018: 557.75, 2019: 583.7949981689453}
{2010: 487.61083221435547, 2011: 460.5933329264323, 2012: 501.0466715494792, 2013: 492.91916910807294, 2014: 497.47583770751953, 2015: 594.4266713460287, 2016: 594.0358378092448, 2017: 577.82666015625, 2018: 557.75, 2019: 583.794998

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed

{2010: 1526.5750020345051, 2011: 1574.0833333333333, 2012: 1587.1249898274739, 2013: 1351.7416585286458, 2014: 849.4726765950521, 2015: 593.25, 2016: 573.0257975260416, 2017: 557.7736612955729, 2018: 567.1666666666666, 2019: 570.6666666666666}
Could not fetch rate for Somalia (SOS): 2020
{2010: 1422.5833231608074, 2011: 1568.4333394368489, 2012: 1563.4083455403645, 2013: 1581.8916727701824, 2014: 1632.8416646321614, 2015: 1989.9833170572917, 2016: 2136.6749674479165, 2017: 2193.6416625976562, 2018: 2273.074991861979, 2019: 2301.0916748046875}
Could not fetch rate for Tanzania (TZS): 2009
{2013: 5.374436378479004, 2014: 6.177958369255066, 2015: 8.771591663360596, 2016: 10.236883401870728, 2017: 9.491416613260904, 2018: 10.564791679382324, 2019: 13.000277121861776}
Could not fetch rate for Zambia (ZMW): 2005



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


{2010: 886.25, 2011: 910.0, 2012: 904.5833333333334, 2013: 906.6291656494141, 2014: 907.25, 2015: 910.0, 2016: 978.4496866861979, 2017: 1457.9642842610676, 2018: 1580.0, 2019: 1630.0}
{2010: 575.0333302815756, 2011: 590.4300028483073, 2012: 599.2916666666666, 2013: 640.4175059000651, 2014: 674.0025024414062, 2015: 696.072499593099, 2016: 768.2083384195963, 2017: 827.0033365885416, 2018: 847.0541687011719, 2019: 901.6158345540365}
Could not fetch rate for Rwanda (RWF): 2008
{2010: 26.853833357493084, 2011: 27.202499707539875, 2012: 30.04616641998291, 2013: 33.27666695912679, 2014: 39.641666094462074, 2015: 40.28999996185303, 2016: 41.46416632334391, 2017: 45.03499984741211, 2018: 47.809999783833824, 2019: 49.86249987284342}
{2010: 487.61083221435547, 2011: 460.5933329264323, 2012: 501.0466715494792, 2013: 492.91916910807294, 2014: 497.47583770751953, 2015: 594.4266713460287, 2016: 594.0358378092448, 2017: 577.82666015625, 2018: 557.75, 2019: 583.7949981689453}
Could not fetch rate for T

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed

1 Failed download:
['STN=X']: YFInvalidPeriodError("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


{2010: 148.9291674296061, 2011: 155.58916854858398, 2012: 247.6258316040039, 2013: 362.4624989827474, 2014: 410.5541687011719, 2015: 495.21583557128906, 2016: 700.3816680908203, 2017: 720.1374969482422, 2018: 704.3308308919271, 2019: 718.8183339436849}
Could not fetch rate for Malawi (MWK): 2009
Could not fetch rate for Sao Tomee and Principe (STN): Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, but got an instance of 'Index'
{2010: 34.550000508626304, 2011: 28.579166730244953, 2012: 28.048333326975506, 2013: 29.664999802907307, 2014: 31.18499978383382, 2015: 39.24833265940348, 2016: 62.973334312438965, 2017: 62.673333168029785, 2018: 59.819166819254555, 2019: 61.90666675567627}
Could not fetch rate for Mozambique (MZN): 2020
{2010: 3920.0, 2011: 4280.166666666667, 2012: 4277.166666666667, 2013: 4268.166666666667, 2014: 4302.333333333333, 2015: 4348.583333333333, 2016: 5087.916666666667, 2017: 7505.0, 2018: 7962.916666666667, 2019: 9048.333333333334}
Could not fetch rate

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


{2010: 487.61083221435547, 2011: 460.5933329264323, 2012: 501.0466715494792, 2013: 492.91916910807294, 2014: 497.47583770751953, 2015: 594.4266713460287, 2016: 594.0358378092448, 2017: 577.82666015625, 2018: 557.75, 2019: 583.7949981689453}
{2010: 14.29200005531311, 2011: 16.807833194732666, 2012: 17.636333147684734, 2013: 18.556583563486736, 2014: 19.54866663614909, 2015: 20.516416549682617, 2016: 21.691750208536785, 2017: 23.84641679128011, 2018: 27.412750085194904, 2019: 29.055416584014893}
{2010: 1206.0750122070312, 2011: 1216.6916605631511, 2012: 1396.4749857584636, 2013: 1531.5916748046875, 2014: 1530.9249877929688, 2015: 1536.9166666666667, 2016: 1605.7583312988281, 2017: 1700.5, 2018: 1756.2916666666667, 2019: 1822.7166646321614}


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed

{2010: 70.31111145019531, 2011: 71.36666742960612, 2012: 72.72916730244954, 2013: 75.09500058492024, 2014: 80.8000005086263, 2015: 83.20833333333333, 2016: 89.58333333333333, 2017: 105.09583409627278, 2018: 144.7375005086263, 2019: 186.4666659037272}
Could not fetch rate for Liberia (LRD): 2005
{2010: 2070.574991861979, 2011: 1995.8000183105469, 2012: 2154.4749755859375, 2013: 2174.9000040690103, 2014: 2460.4250284830728, 2015: 3095.6833699544272, 2016: 3135.3916625976562, 2017: 3076.9583333333335, 2018: 3307.1916707356772, 2019: 3290.9500080744424}
Could not fetch rate for Madagascar (MGA): 2005
{2010: 2150.6416625976562, 2011: 2493.4666951497397, 2012: 2482.1000162760415, 2013: 2550.25, 2014: 2583.9749959309897, 2015: 3224.1166381835938, 2016: 3384.675008138021, 2017: 3570.7333170572915, 2018: 3695.5916951497397, 2019: 3662.6139322916665}
Could not fetch rate for Uganda (UGX): 2005





Unnamed: 0_level_0,2010.0,2011.0,2012.0,2013.0
nan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Expense,941348000.0,993632600.0,1067106000.0,1171592000.0
Compensation of employees,483172400.0,525663400.0,534681300.0,600909900.0
Wages and salaries,483172400.0,525663400.0,534681300.0,600909900.0
Employers' social contributions,0.0,0.0,0.0,0.0
Use of goods and services,146013100.0,149162900.0,188478400.0,215248300.0
Interest expense,36371080.0,32509960.0,46189660.0,40371730.0
To nonresidents,16624220.0,17002210.0,21165010.0,19881560.0
To residents other than government units,19746870.0,15507750.0,25024640.0,20490170.0
To other government units,0.0,0.0,0.0,0.0
Subsidies,42094150.0,72847880.0,109952300.0,100422100.0


Calculate Averages

In [8]:
master_averages_ls = [] #make master averages list as place to store averages

#Get average over decade
#loop through table (where key is the country name) (master_table[key] is the dataframe stored in the master_table indexed by key)
for key in master_table:
  per_category_average = master_table[key].mean(axis=1) #get the average of each category (ignoring np.NaN values) into a list
  per_category_average.drop('To other government units', inplace=True) #drop 'To other government units' since there's two rows named that and will error
  master_averages_ls.append(per_category_average) #add each country average to to the master averages list


merged_averages_df = pd.concat(master_averages_ls, axis=1, join='outer')
average__of_averages_df = merged_averages_df.mean(axis=1) #get the average of all the countries' averages
average__of_averages_df

nan
Expense                                     2.982732e+09
Compensation of employees                   9.973756e+08
Wages and salaries                          8.365154e+08
Employers' social contributions             6.235448e+07
Use of goods and services                   5.200425e+08
Interest expense                            3.443130e+08
To nonresidents                             9.527869e+07
To residents other than government units    1.611199e+08
Subsidies                                   2.286420e+08
To public corporations                      1.955056e+08
To private enterprises                      3.642134e+07
To other sectors                            2.690214e+07
Grants                                      6.682786e+08
To foreign governments                      0.000000e+00
To international organizations              7.911453e+06
Current                                     4.963575e+08
Capital                                     2.115085e+08
Social benefits            

Calculate Percentages

In [9]:
total_expenditure = average__of_averages_df['Expense']
average__of_averages_df = round(average__of_averages_df / total_expenditure * 100, 2) #take all values as a fraction of total expenditure, then convert decimal to percent, then round to two decimal places
average__of_averages_df

nan
Expense                                     100.00
Compensation of employees                    33.44
Wages and salaries                           28.05
Employers' social contributions               2.09
Use of goods and services                    17.44
Interest expense                             11.54
To nonresidents                               3.19
To residents other than government units      5.40
Subsidies                                     7.67
To public corporations                        6.55
To private enterprises                        1.22
To other sectors                              0.90
Grants                                       22.40
To foreign governments                        0.00
To international organizations                0.27
Current                                      16.64
Capital                                       7.09
Social benefits                               0.97
Social security benefits                      0.17
Social assistance benefits 