Import Libraries

In [29]:
import pandas as pd
import numpy as np
import os
import yfinance as yf


Currency Codes

In [30]:
# Maps a country name to the currency code of its domestic currency.
# Several countries share a code ('XOF' and 'XAF' each appear for multiple countries).
country_currency_codes_table = {
    'Burundi': 'BIF',
    'Benin': 'XOF',
    'Burkina Faso': 'XOF',
    'Central African Republic': 'XAF',
    "Côte d'Ivoire": 'XOF',
    'Cameroon': 'XAF',
    'Democratic Republic of the Congo': 'CDF',
    'Republic of Congo': 'XAF',
    'Comoros': 'KMF',
    'Eritrea': 'ERN',
    'Ethiopia': 'ETB',
    'Ghana': 'GHS',
    'Guinea': 'GNF',
    'The Gambia': 'GMD',
    'Guinea-Bissau': 'XOF',
    'Liberia': 'LRD',
    'Madagascar': 'MGA',
    'Mali': 'XOF',
    'Mozambique': 'MZN',
    'Mauritania': 'MRU',
    'Malawi': 'MWK',
    'Niger': 'XOF',
    'Rwanda': 'RWF',
    'Sudan': 'SDG',
    'Senegal': 'XOF',
    'Sierra Leone': 'SLL',
    'Somalia': 'SOS',
    # The original key was misspelled ('Tomee'); it is kept so existing lookups keep working,
    # and the correctly spelled variants are added so lookups by the accented country name
    # (the spelling that appears in the 'imfweoctry' column) or its plain-ASCII form succeed too.
    'Sao Tomee and Principe': 'STN',
    'Sao Tome and Principe': 'STN',
    'São Tomé and Príncipe': 'STN',
    'Chad': 'XAF',
    'Togo': 'XOF',
    'Tanzania': 'TZS',
    'Uganda': 'UGX',
    'Zambia': 'ZMW'
}


Read Data

In [31]:
# Load the Stata dataset into the dataframe "country_code_df".
# Every entry equal to '--' is swapped for NaN so it is treated as a missing value.
country_code_df = pd.read_stata("codesffactors_Mar23_2019.dta").replace('--', np.nan)

country_code_df.head()

Unnamed: 0,countryname,wbctry,wbcode,wbregion,wbigroup2017,wblcat2017,hipc,emu,cemac,waemu,imfcode,imfweocode,imfweoiso,imfweoctry,region
0,Aruba,Aruba,ABW,Latin America & Caribbean,High income,,0,0,0,0,,314.0,ABW,Aruba,LAC
1,Afghanistan,Afghanistan,AFG,South Asia,Low income,IDA,1,0,0,0,512.0,512.0,AFG,Afghanistan,SAS
2,Angola,Angola,AGO,Sub-Saharan Africa,Lower middle income,IBRD,0,0,0,0,614.0,614.0,AGO,Angola,SSA
3,Anguilla,,AIA,Latin America & Caribbean,,,0,0,0,0,312.0,,,,LAC
4,Albania,Albania,ALB,Europe & Central Asia,Upper middle income,IBRD,0,0,0,0,914.0,914.0,ALB,Albania,ECA


In [32]:
# Distinct values of the 'wbregion' column (region classifications).
region_classification_ls = country_code_df['wbregion'].unique()

# Distinct values of the 'wbigroup2017' column (income classifications), with NaN entries removed.
income_classification_ls = [
  label
  for label in country_code_df['wbigroup2017'].unique()
  if pd.notna(label)
]

print(f"The region classifications are: {', '.join(region_classification_ls)}")
print(f"The income classifications are: {', '.join(income_classification_ls)}")

The region classifications are: Latin America & Caribbean, South Asia, Sub-Saharan Africa, Europe & Central Asia, Middle East & North Africa, East Asia & Pacific, North America
The income classifications are: High income, Low income, Lower middle income, Upper middle income


In [33]:
# Keep only rows whose 'wbregion' is Sub-Saharan Africa and whose 'hipc' flag equals 1,
# then collect the 'imfweoctry' country names of those rows into a plain list.
ssa_hipc_countries = country_code_df.loc[
  (country_code_df['wbregion'] == 'Sub-Saharan Africa') & (country_code_df['hipc'] == 1)
]
ssa_hipc_countries_ls = ssa_hipc_countries['imfweoctry'].tolist()

ssa_hipc_countries_ls

['Burundi',
 'Benin',
 'Burkina Faso',
 'Central African Republic',
 "Côte d'Ivoire",
 'Cameroon',
 'Democratic Republic of the Congo',
 'Republic of Congo',
 'Comoros',
 'Eritrea',
 'Ethiopia',
 'Ghana',
 'Guinea',
 'The Gambia',
 'Guinea-Bissau',
 'Liberia',
 'Madagascar',
 'Mali',
 'Mozambique',
 'Mauritania',
 'Malawi',
 'Niger',
 'Rwanda',
 'Sudan',
 'Senegal',
 'Sierra Leone',
 'Somalia',
 'São Tomé and Príncipe',
 'Chad',
 'Togo',
 'Tanzania',
 'Uganda',
 'Zambia']

Read in IMF's Government Finance Statistics

In [34]:
master_table = {}  # file name without its '.xlsx' suffix -> dataframe read from that file
folder_path = "expenditure-data"

# Visit every entry 'file_name' found in the folder 'folder_path'
for file_name in os.listdir(folder_path):
  # Only .xlsx spreadsheets are loaded; names starting with '~$' are skipped
  # (presumably Excel's temporary lock files -- confirm).
  if not file_name.endswith('.xlsx') or file_name.startswith('~$'):
    continue
  raw_df = pd.read_excel(f"{folder_path}/{file_name}")  # load the spreadsheet as a dataframe
  master_table[file_name[:-len('.xlsx')]] = raw_df  # key is the file name minus '.xlsx'

# master_table["Central African Republic"]
master_table["Benin"]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,Detailed Expense Breakdown,,,,
1,Country: Benin,,,,
2,Sector: Budgetary central government,,,,
3,Unit: Domestic currency,,,,
4,Scale: Billions,,,,
5,,,,,
6,,2010.0,2011.0,2012.0,2013.0
7,Expense,459.01149,457.660567,534.669798,577.5
8,Compensation of employees,235.600075,242.117067,267.900275,296.2
9,Wages and salaries,235.600075,242.117067,267.900275,296.2


Clean Rows and Apply Scale (USD conversion currently disabled)

In [35]:
#REMOVE "JUNK" ROWS AND CONVERT TO CORRECT SCALE (UNITS/THOUSANDS/MILLIONS/BILLIONS)
# Multiplier applied for each "Scale: ..." label read from the sheet header.
# Billions and Millions are the labels the original code handled; Thousands and Units are
# assumed to follow the same "Scale: <name>" pattern -- confirm against the source files.
scale_factors = {
  "Scale: Units": 1,
  "Scale: Thousands": 10 ** 3,
  "Scale: Millions": 10 ** 6,
  "Scale: Billions": 10 ** 9,
}

for key in master_table: #loop through table (key is the country name, master_table[key] is that country's dataframe)
  scale = master_table[key].iloc[4,0] #header cell that states the scale of the values (e.g. "Scale: Billions")

  # Drop the header rows 0 to 5, then promote the row of years to column headers
  cleaned_df = master_table[key].drop([0, 1, 2, 3, 4, 5])
  cleaned_df = cleaned_df.reset_index(drop=True)
  cleaned_df.columns = cleaned_df.iloc[0] #first remaining row holds the years
  cleaned_df = cleaned_df[1:] #remove that row from the data now that it is the header
  cleaned_df = cleaned_df.set_index(cleaned_df.columns[0]) #first column holds the category labels
  cleaned_df.columns = [int(float(col)) for col in cleaned_df.columns] #years arrive as floats (e.g. 2010.0)

  scale_label = scale.strip() if isinstance(scale, str) else scale
  if scale_label in scale_factors:
    factor = scale_factors[scale_label]
    if factor != 1:
      cleaned_df = cleaned_df * factor #multiply every value in the dataframe by the scale factor
  else:
    # Previously an unrecognised scale was silently left unscaled; keep the values as they are
    # (best effort) but make the situation visible.
    print(f"Warning: unrecognised scale {scale!r} for {key}; values left unscaled")

  master_table[key] = cleaned_df


#FILTER OUT COUNTRIES WITH LESS THAN 7 YEARS OF DATA
# Each column of a country's dataframe represents one year, so the column count
# (shape[1]) is the number of years of historical data available for that country.
less_than_seven_years_data_countries = [
  name for name, country_df in master_table.items() if country_df.shape[1] < 7
]

# Remove every collected country from the data table
for country in less_than_seven_years_data_countries:
  master_table.pop(country)



# NOTE: the USD conversion below is disabled -- every line of it is commented out -- so
# nothing in this block changes master_table. Only the final expression runs, displaying
# the dataframe stored for "Central African Republic".
# #CONVERT LOCAL CURRENCY DENOMINATED VALUES TO USD
# error_countries = []
# for key in master_table:
#   #get annual 2010-2019 USD to domestic currency exchange rate using yfinance library
#   currency = country_currency_codes_table[key] #retrieve currency symbol from 'country_currency_codes_table'
#   symbol = f"{currency}=X"
#   try:
#     data = yf.download(symbol, start="2010-01-01", end="2020-01-01", interval="1mo") #download monthly USD-Currency dataset from 2010-2020
#     yearly_data_df = data['Close'].resample('YE').mean() #get the annual average exchange rate outputted as dataframe
#     yearly_data_table = yearly_data_df.to_dict() # Convert the yearly_data to a hash table with the date as the key
#     year_to_exchange_rate_dict = {key.year: value for key, value in yearly_data_table.items()} # Convert the keys to only the year part
#     print(year_to_exchange_rate_dict)

#     #convert DataFrame values to USD
#     for year in master_table[key].columns: #loop through columns in dataframe
#       year_int = int(year)  # Convert year to integer to match the dictionary keys
#       master_table[key][year] = master_table[key][year] / year_to_exchange_rate_dict[year_int] #divide each column's values by that year's exchange rate

#   except Exception as e: #if errored out
#     print(f"Could not fetch rate for {key} ({currency}): {e}")
#     error_countries.append(key) #add country to error list
#     pass
    
# for country in error_countries:
#   del master_table[country] #delete countries where currency conversion failed
 
master_table["Central African Republic"]

Unnamed: 0_level_0,2010,2011,2012,2014,2015,2016,2017,2018,2019
nan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Expense,173191800000.0,154688400000.0,104889000000.0,77390770000.0,93306810000.0,97567300000.0,104410000000.0,124885200000.0,129359000000.0
Compensation of employees,42810150000.0,45990960000.0,51212650000.0,53784000000.0,56675600000.0,49182000000.0,50003000000.0,61336000000.0,65052000000.0
Wages and salaries,40985250000.0,44003470000.0,49015950000.0,51464380000.0,54144200000.0,47346050000.0,47738440000.0,58886740000.0,57087000000.0
Employers' social contributions,1824898000.0,1987488000.0,2196698000.0,2319623000.0,2531400000.0,1835951000.0,2264559000.0,2449258000.0,7965000000.0
Use of goods and services,100287800000.0,75432880000.0,24993340000.0,6348000000.0,15233640000.0,21972000000.0,27744000000.0,30478000000.0,36371000000.0
Interest expense,9511550000.0,6983069000.0,5923666000.0,2169769000.0,4697813000.0,1155630000.0,2205000000.0,2801230000.0,6482000000.0
To nonresidents,2779550000.0,2709069000.0,2363420000.0,2032978000.0,334501800.0,1085000000.0,2091000000.0,2656230000.0,1642000000.0
To residents other than government units,6732000000.0,4274000000.0,3560247000.0,136790500.0,4363311000.0,70630000.0,114000000.0,145000000.0,0.0
To other government units,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4840000000.0
Subsidies,,,,0.0,0.0,0.0,0.0,0.0,0.0


Convert to Percent

In [36]:
# Re-express every row of each country's dataframe as a percentage of that country's 'Expense' row.
for key, country_df in list(master_table.items()):
  total_expense = country_df.loc['Expense']  # the 'Expense' row, one total per year column
  expense_share = country_df / total_expense  # fraction of the total, matched up by year column
  master_table[key] = expense_share * 100  # fraction -> percent (no rounding is applied here)

master_table["Central African Republic"]

Unnamed: 0_level_0,2010,2011,2012,2014,2015,2016,2017,2018,2019
nan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Expense,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
Compensation of employees,24.718338,29.731357,48.825564,69.496661,60.74112,50.408285,47.891007,49.113894,50.287958
Wages and salaries,23.664652,28.446524,46.731256,66.499375,58.028135,48.526557,45.722096,47.152687,44.130675
Employers' social contributions,1.053686,1.284834,2.094308,2.997286,2.712985,1.881728,2.16891,1.961207,6.157283
Use of goods and services,57.905588,48.764408,23.828373,8.202529,16.326395,22.519841,26.572167,24.404808,28.116327
Interest expense,5.491916,4.514281,5.647557,2.803653,5.034801,1.184444,2.111867,2.243043,5.010861
To nonresidents,1.604897,1.751308,2.253258,2.6269,0.358497,1.112053,2.002682,2.126937,1.269336
To residents other than government units,3.887019,2.762974,3.394299,0.176753,4.676305,0.072391,0.109185,0.116107,0.0
To other government units,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.741526
Subsidies,,,,0.0,0.0,0.0,0.0,0.0,0.0


Calculate Yearly Averages

In [37]:
# Table that stores, for each year 2010-2019, the list of per-country columns for that year.
data_by_year_table = {year: [] for year in range(2010, 2020)}

# master_averages_ls = [] #make master averages list as place to store averages

for key in master_table:
  years_available = master_table[key].columns
  for year in data_by_year_table:
    if year in years_available:
      # Remove the 'To other government units' rows before merging (a note elsewhere in this
      # cell says the label occurs twice, which makes the later merge error out).
      # drop() without inplace returns a new Series, so the column stored in master_table is
      # never mutated and this cell can be re-run safely; errors='ignore' avoids a KeyError
      # for a country whose sheet has no such row.
      data = master_table[key][year].drop('To other government units', errors='ignore')
      data_by_year_table[year].append(data)


yearly_averages_ls = [] #one Series of cross-country averages per year, in year order
for year in data_by_year_table:
  if data_by_year_table[year]:
    # Line the countries up side by side (outer join keeps every category any country reports)
    merged_year_df = pd.concat(data_by_year_table[year], axis=1, join='outer')
    year_average_df = merged_year_df.mean(axis=1) #average each category across countries
  else:
    # pd.concat raises on an empty list; use an empty placeholder so the list still holds
    # exactly one entry per year and positions keep matching the years.
    year_average_df = pd.Series(dtype='float64')
  yearly_averages_ls.append(year_average_df)

 
# #Get average over decade
# #loop through table (where key is the country name) (master_table[key] is the dataframe stored in the master_table indexed by key)
# for key in master_table:
#   per_category_average = master_table[key].mean(axis=1) #get the average of each category (ignoring np.NaN values) into a list
#   # per_category_average.drop('To other government units', inplace=True) #drop 'To other government units' since there's two rows named that and will error
#   master_averages_ls.append(per_category_average) #add each country average to to the master averages list



# merged_averages_df = pd.concat(master_averages_ls, axis=1, join='outer')
# average__of_averages_df = merged_averages_df.mean(axis=1) #get the average of all the countries' averages
# average__of_averages_df

# Print each entry of yearly_averages_ls under a year heading; the entry at position
# 'offset' is labelled 2010 + offset.
for offset, year_average in enumerate(yearly_averages_ls):
  print(f"YEAR: {2010+offset}")
  print(year_average)

YEAR: 2010
nan
Expense                                     100.000000
Compensation of employees                    34.865685
Wages and salaries                           31.541157
Employers' social contributions               1.520964
Use of goods and services                    25.932302
Interest expense                              6.469680
To nonresidents                               1.913966
To residents other than government units      4.164547
Subsidies                                     7.193627
To public corporations                        5.991368
To private enterprises                        1.391076
To other sectors                              0.000000
Grants                                       18.084170
To foreign governments                        0.054208
To international organizations                0.836049
Current                                      16.515672
Capital                                       1.595776
Social benefits                               2.37

Calculate Average Across Period

In [38]:
# Place the yearly-average Series side by side (outer join on the row labels),
# then average each row across the years.
merged_percentages_df = pd.concat(yearly_averages_ls, axis=1, join='outer')
average_df = merged_percentages_df.mean(axis=1)

# Round every averaged value to two decimal places and present the result as a one-column dataframe.
rounded_average_df = average_df.map(lambda value: round(value, 2)).to_frame()
rounded_average_df

Unnamed: 0_level_0,0
nan,Unnamed: 1_level_1
Expense,100.0
Compensation of employees,36.54
Wages and salaries,33.22
Employers' social contributions,1.96
Use of goods and services,22.13
Interest expense,8.81
To nonresidents,2.39
To residents other than government units,5.42
Subsidies,4.07
To public corporations,2.27


In [39]:
# One method: filter for countries with 7+ years of data, then aggregate in a way that preserves identities.
# Unweighted: convert each country's values to shares period by period, then average those ratios.