Import Libraries

In [279]:
import pandas as pd
import numpy as np
import os
import yfinance as yf


Currency Codes

In [280]:
country_currency_codes_table = {
    'Burundi': 'BIF',
    'Benin': 'XOF',
    'Burkina Faso': 'XOF',
    'Central African Republic': 'XAF',
    "Côte d'Ivoire": 'XOF',
    'Cameroon': 'XAF',
    'Democratic Republic of the Congo': 'CDF',
    'Congo, Rep.': 'XAF',
    'Comoros': 'KMF',
    'Eritrea': 'ERN',
    'Ethiopia': 'ETB',
    'Ghana': 'GHS',
    'Guinea': 'GNF',
    'Gambia, The': 'GMD',
    'Guinea-Bissau': 'XOF',
    'Liberia': 'LRD',
    'Madagascar': 'MGA',
    'Mali': 'XOF',
    'Mozambique': 'MZN',
    'Mauritania': 'MRU',
    'Malawi': 'MWK',
    'Niger': 'XOF',
    'Rwanda': 'RWF',
    'Sudan': 'SDG',
    'Senegal': 'XOF',
    'Sierra Leone': 'SLL',
    'Somalia': 'SOS',
    'São Tomé and Principe': 'STN',
    'Chad': 'XAF',
    'Togo': 'XOF',
    'Tanzania': 'TZS',
    'Uganda': 'UGX',
    'Zambia': 'ZMW'
}


Read Data

In [281]:
#read in stata dataset as dataframe "country_code_df"
country_code_df = pd.read_stata("codesffactors_Mar23_2019.dta") 
country_code_df.replace('--', np.nan, inplace=True) #replace all entries that are '--' with NaN data value

country_code_df.head()

Unnamed: 0,countryname,wbctry,wbcode,wbregion,wbigroup2017,wblcat2017,hipc,emu,cemac,waemu,imfcode,imfweocode,imfweoiso,imfweoctry,region
0,Aruba,Aruba,ABW,Latin America & Caribbean,High income,,0,0,0,0,,314.0,ABW,Aruba,LAC
1,Afghanistan,Afghanistan,AFG,South Asia,Low income,IDA,1,0,0,0,512.0,512.0,AFG,Afghanistan,SAS
2,Angola,Angola,AGO,Sub-Saharan Africa,Lower middle income,IBRD,0,0,0,0,614.0,614.0,AGO,Angola,SSA
3,Anguilla,,AIA,Latin America & Caribbean,,,0,0,0,0,312.0,,,,LAC
4,Albania,Albania,ALB,Europe & Central Asia,Upper middle income,IBRD,0,0,0,0,914.0,914.0,ALB,Albania,ECA


In [282]:
region_classification_ls = country_code_df['wbregion'].unique() #get a list of region classifications
income_classification_ls = country_code_df['wbigroup2017'].unique() #get a list of income classifications
income_classification_ls = [x for x in income_classification_ls if not pd.isna(x)] # remove '--' from income_classification_ls

print(f"The region classifications are: {', '.join(country for country in region_classification_ls)}")
print(f"The income classifications are: {', '.join(country for country in income_classification_ls)}")

The region classifications are: Latin America & Caribbean, South Asia, Sub-Saharan Africa, Europe & Central Asia, Middle East & North Africa, East Asia & Pacific, North America
The income classifications are: High income, Low income, Lower middle income, Upper middle income


In [283]:
ssa_hipc_countries = country_code_df[(country_code_df['hipc'] == 1) & (country_code_df['wbregion'] == 'Sub-Saharan Africa')]
ssa_hipc_countries_ls = ssa_hipc_countries['imfweoctry'].to_list()

ssa_hipc_countries_ls

['Burundi',
 'Benin',
 'Burkina Faso',
 'Central African Republic',
 "Côte d'Ivoire",
 'Cameroon',
 'Democratic Republic of the Congo',
 'Republic of Congo',
 'Comoros',
 'Eritrea',
 'Ethiopia',
 'Ghana',
 'Guinea',
 'The Gambia',
 'Guinea-Bissau',
 'Liberia',
 'Madagascar',
 'Mali',
 'Mozambique',
 'Mauritania',
 'Malawi',
 'Niger',
 'Rwanda',
 'Sudan',
 'Senegal',
 'Sierra Leone',
 'Somalia',
 'São Tomé and Príncipe',
 'Chad',
 'Togo',
 'Tanzania',
 'Uganda',
 'Zambia']

In [284]:
#convert to USD

#get average per country

#get overall average

#calculate breakdowns


argentina_df = pd.read_excel("Argentina.xlsx") 

argentina_df.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,Detailed Revenue Breakdown,Detailed Revenue Breakdown,,,,,,,,,
1,Country: Argentina,,,,,,,,,,
2,Sector: Budgetary central government,,,,,,,,,,
3,Unit: Domestic currency,,,,,,,,,,
4,Scale: Millions,,,,,,,,,,


Read in IMF's Government Finance Statistics

In [285]:
master_table = {}
folder_path = "expenditure-data-test"

# Loop that goes through all files 'file_name' in the folder 'folder_path'
for file_name in os.listdir(folder_path):
  if file_name.endswith('.xlsx') and not file_name.startswith('~$'): #check to see if is spreadsheet .xlsx format
    raw_df = pd.read_excel(folder_path+ "/" + file_name) #read in .xlsx as dataframe
    master_table[file_name[:-5]] = raw_df #store dataframe as an entry in the master_table hash table

# master_table["Central African Republic"]
master_table["Benin"]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,Detailed Expense Breakdown,,,,
1,Country: Benin,,,,
2,Sector: Budgetary central government,,,,
3,Unit: Domestic currency,,,,
4,Scale: Billions,,,,
5,,,,,
6,,2010.0,2011.0,2012.0,2013.0
7,Expense,459.01149,457.660567,534.669798,577.5
8,Compensation of employees,235.600075,242.117067,267.900275,296.2
9,Wages and salaries,235.600075,242.117067,267.900275,296.2


Convert values to USD

In [286]:
error_countries = []

#loop through table (where key is the country name) (master_table[key] is the dataframe stored in the master_table indexed by key)
for key in master_table:
  scale = master_table[key].iloc[4,0] #get if values have scale of billions or millions
  
  # Drop rows 0 to 5, reformatting indexes and column headers
  master_table[key] = master_table[key].drop([0, 1, 2, 3, 4, 5]) 
  master_table[key] = master_table[key].reset_index(drop=True) 
  master_table[key].columns = master_table[key].iloc[0]
  master_table[key] = master_table[key][1:]
  master_table[key] = master_table[key].set_index(master_table[key].columns[0])

  if scale == "Scale: Billions":
    master_table[key] = master_table[key]* (10 ** 9) #multiple every value in dataframe by 1e9 if scale is billion
  elif scale == "Scale: Millions":
    master_table[key] = master_table[key]* (10 ** 6) #multiple every value in dataframe by 1e9 if scale is billion


  #get annual 2010-2019 USD to domestic currency exchange rate using yfinance library
  currency = country_currency_codes_table[key] #retrieve currency symbol from 'country_currency_codes_table'
  symbol = f"{currency}=X"
  try:
    data = yf.download(symbol, start="2010-01-01", end="2020-01-01", interval="1mo") #download monthly USD-Currency dataset from 2010-2020
    yearly_data_df = data['Close'].resample('YE').mean() #get the annual average exchange rate outputted as datafram
    yearly_data_table = yearly_data_df.to_dict() # Convert the yearly_data to a hash table with the date as the key
    year_to_exchange_rate_dict = {key.year: value for key, value in yearly_data_table.items()} # Convert the keys to only the year part
    print(year_to_exchange_rate_dict)

    #convert DataFrame values to USD
    for year in master_table[key].columns: #loop through columns in dataframe
      year_int = int(year)  # Convert year to integer to match the dictionary keys
      master_table[key][year] = master_table[key][year] / year_to_exchange_rate_dict[year_int] #divide each column's values by that year's exchange rate

  except Exception as e: #if errored out
    print(f"Could not fetch rate for {key} ({currency}): {e}")
    error_countries.append(key) #add country to error list
    pass

for country in error_countries:
  del master_table[country]
 
master_table["Benin"]

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed

{2010: 487.61083221435547, 2011: 460.5933329264323, 2012: 501.0466715494792, 2013: 492.91916910807294, 2014: 497.47583770751953, 2015: 594.4266713460287, 2016: 594.0358378092448, 2017: 577.82666015625, 2018: 557.75, 2019: 583.7949981689453}
{2010: 487.61083221435547, 2011: 460.5933329264323, 2012: 501.0466715494792, 2013: 492.91916910807294, 2014: 497.47583770751953, 2015: 594.4266713460287, 2016: 594.0358378092448, 2017: 577.82666015625, 2018: 557.75, 2019: 583.7949981689453}



[*********************100%%**********************]  1 of 1 completed


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed

{2010: 487.5991668701172, 2011: 460.59083048502606, 2012: 501.04666900634766, 2013: 492.91916910807294, 2014: 497.4808349609375, 2015: 594.4341735839844, 2016: 594.0333353678385, 2017: 578.2424977620443, 2018: 556.1933390299479, 2019: 586.7139180501302}
{2010: 487.61083221435547, 2011: 460.5933329264323, 2012: 501.0466715494792, 2013: 492.91916910807294, 2014: 497.47583770751953, 2015: 594.4266713460287, 2016: 594.0358378092448, 2017: 577.82666015625, 2018: 557.75, 2019: 583.7949981689453}
{2010: 2070.574991861979, 2011: 1995.8000183105469, 2012: 2154.4749755859375, 2013: 2174.9000040690103, 2014: 2460.4250284830728, 2015: 3095.6833699544272, 2016: 3135.3916625976562, 2017: 3076.9583333333335, 2018: 3307.1916707356772, 2019: 3290.9500080744424}
Could not fetch rate for Madagascar (MGA): 2005





Unnamed: 0_level_0,2010.0,2011.0,2012.0,2013.0
nan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Expense,941348000.0,993632600.0,1067106000.0,1171592000.0
Compensation of employees,483172400.0,525663400.0,534681300.0,600909900.0
Wages and salaries,483172400.0,525663400.0,534681300.0,600909900.0
Employers' social contributions,0.0,0.0,0.0,0.0
Use of goods and services,146013100.0,149162900.0,188478400.0,215248300.0
Interest expense,36371080.0,32509960.0,46189660.0,40371730.0
To nonresidents,16624220.0,17002210.0,21165010.0,19881560.0
To residents other than government units,19746870.0,15507750.0,25024640.0,20490170.0
To other government units,0.0,0.0,0.0,0.0
Subsidies,42094150.0,72847880.0,109952300.0,100422100.0


Calculate Averages

In [287]:
master_averages_ls = [] #make master averages list as place to store averages

#Get average over decade
#loop through table (where key is the country name) (master_table[key] is the dataframe stored in the master_table indexed by key)
for key in master_table:
  per_category_average = master_table[key].mean(axis=1) #get the average of each category (ignoring np.NaN values) into a list
  per_category_average.drop('To other government units', inplace=True) #drop 'To other government units' since there's two rows named that and will error
  master_averages_ls.append(per_category_average) #add each country average to to the master averages list


merged_averages_df = pd.concat(master_averages_ls, axis=1, join='outer')
average_df = merged_averages_df.mean(axis=1) #get the average of all the countries' averages
average_df

nan
Expense                                     1.804902e+09
Compensation of employees                   7.187126e+08
Wages and salaries                          2.404558e+08
Employers' social contributions             1.716641e+06
Use of goods and services                   3.099287e+08
Interest expense                            1.520446e+08
To nonresidents                             8.231896e+07
To residents other than government units    6.949644e+07
Subsidies                                   1.785343e+08
To public corporations                      1.672332e+08
To private enterprises                      6.314578e+07
To other sectors                            1.551655e+07
Grants                                      2.830172e+08
To foreign governments                      0.000000e+00
To international organizations              6.690818e+06
Current                                     2.066276e+08
Capital                                     6.969876e+07
Social benefits            

Calculate Percentages