Import Libraries

In [34]:
import pandas as pd
import numpy as np
import os
import yfinance as yf


Currency Codes

In [35]:
country_currency_codes_table = {
    'Burundi': 'BIF',
    'Benin': 'XOF',
    'Burkina Faso': 'XOF',
    'Central African Republic': 'XAF',
    "Côte d'Ivoire": 'XOF',
    'Cameroon': 'XAF',
    'Democratic Republic of the Congo': 'CDF',
    'Congo, Rep.': 'XAF',
    'Comoros': 'KMF',
    'Eritrea': 'ERN',
    'Ethiopia': 'ETB',
    'Ghana': 'GHS',
    'Guinea': 'GNF',
    'Gambia, The': 'GMD',
    'Guinea-Bissau': 'XOF',
    'Liberia': 'LRD',
    'Madagascar': 'MGA',
    'Mali': 'XOF',
    'Mozambique': 'MZN',
    'Mauritania': 'MRU',
    'Malawi': 'MWK',
    'Niger': 'XOF',
    'Rwanda': 'RWF',
    'Sudan': 'SDG',
    'Senegal': 'XOF',
    'Sierra Leone': 'SLL',
    'Somalia': 'SOS',
    'São Tomé and Principe': 'STN',
    'Chad': 'XAF',
    'Togo': 'XOF',
    'Tanzania': 'TZS',
    'Uganda': 'UGX',
    'Zambia': 'ZMW'
}


Read Data

In [36]:
#read in stata dataset as dataframe "country_code_df"
country_code_df = pd.read_stata("codesffactors_Mar23_2019.dta") 
country_code_df.replace('--', np.nan, inplace=True) #replace all entries that are '--' with NaN data value

country_code_df.head()

Unnamed: 0,countryname,wbctry,wbcode,wbregion,wbigroup2017,wblcat2017,hipc,emu,cemac,waemu,imfcode,imfweocode,imfweoiso,imfweoctry,region
0,Aruba,Aruba,ABW,Latin America & Caribbean,High income,,0,0,0,0,,314.0,ABW,Aruba,LAC
1,Afghanistan,Afghanistan,AFG,South Asia,Low income,IDA,1,0,0,0,512.0,512.0,AFG,Afghanistan,SAS
2,Angola,Angola,AGO,Sub-Saharan Africa,Lower middle income,IBRD,0,0,0,0,614.0,614.0,AGO,Angola,SSA
3,Anguilla,,AIA,Latin America & Caribbean,,,0,0,0,0,312.0,,,,LAC
4,Albania,Albania,ALB,Europe & Central Asia,Upper middle income,IBRD,0,0,0,0,914.0,914.0,ALB,Albania,ECA


In [37]:
region_classification_ls = country_code_df['wbregion'].unique() #get a list of region classifications
income_classification_ls = country_code_df['wbigroup2017'].unique() #get a list of income classifications
income_classification_ls = [x for x in income_classification_ls if not pd.isna(x)] # remove '--' from income_classification_ls

print(f"The region classifications are: {', '.join(country for country in region_classification_ls)}")
print(f"The income classifications are: {', '.join(country for country in income_classification_ls)}")

The region classifications are: Latin America & Caribbean, South Asia, Sub-Saharan Africa, Europe & Central Asia, Middle East & North Africa, East Asia & Pacific, North America
The income classifications are: High income, Low income, Lower middle income, Upper middle income


In [38]:
ssa_hipc_countries = country_code_df[(country_code_df['hipc'] == 1) & (country_code_df['wbregion'] == 'Sub-Saharan Africa')]
ssa_hipc_countries_ls = ssa_hipc_countries['imfweoctry'].to_list()

ssa_hipc_countries_ls

['Burundi',
 'Benin',
 'Burkina Faso',
 'Central African Republic',
 "Côte d'Ivoire",
 'Cameroon',
 'Democratic Republic of the Congo',
 'Republic of Congo',
 'Comoros',
 'Eritrea',
 'Ethiopia',
 'Ghana',
 'Guinea',
 'The Gambia',
 'Guinea-Bissau',
 'Liberia',
 'Madagascar',
 'Mali',
 'Mozambique',
 'Mauritania',
 'Malawi',
 'Niger',
 'Rwanda',
 'Sudan',
 'Senegal',
 'Sierra Leone',
 'Somalia',
 'São Tomé and Príncipe',
 'Chad',
 'Togo',
 'Tanzania',
 'Uganda',
 'Zambia']

In [39]:
#convert to USD

#get average per country

#get overall average

#calculate breakdowns


argentina_df = pd.read_excel("Argentina.xlsx") 

argentina_df.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,Detailed Revenue Breakdown,Detailed Revenue Breakdown,,,,,,,,,
1,Country: Argentina,,,,,,,,,,
2,Sector: Budgetary central government,,,,,,,,,,
3,Unit: Domestic currency,,,,,,,,,,
4,Scale: Millions,,,,,,,,,,


Read in IMF's Government Finance Statistics

In [40]:
master_table = {}
folder_path = "expenditure-data-test"

# Loop that goes through all files 'file_name' in the folder 'folder_path'
for file_name in os.listdir(folder_path):
  if file_name.endswith('.xlsx') and not file_name.startswith('~$'): #check to see if is spreadsheet .xlsx format
    raw_df = pd.read_excel(folder_path+ "/" + file_name) #read in .xlsx as dataframe
    master_table[file_name[:-5]] = raw_df #store dataframe as an entry in the master_table hash table

# master_table["Central African Republic"]
master_table["Benin"]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,Detailed Expense Breakdown,,,,
1,Country: Benin,,,,
2,Sector: Budgetary central government,,,,
3,Unit: Domestic currency,,,,
4,Scale: Billions,,,,
5,,,,,
6,,2010.0,2011.0,2012.0,2013.0
7,Expense,459.01149,457.660567,534.669798,577.5
8,Compensation of employees,235.600075,242.117067,267.900275,296.2
9,Wages and salaries,235.600075,242.117067,267.900275,296.2


Convert values to USD

In [41]:
error_countries = []

#loop through table (where key is the country name) (master_table[key] is the dataframe stored in the master_table indexed by key)
for key in master_table:
  scale = master_table[key].iloc[4,0] #get if values have scale of billions or millions
  
  # Drop rows 0 to 5, reformatting indexes and column headers
  master_table[key] = master_table[key].drop([0, 1, 2, 3, 4, 5]) 
  master_table[key] = master_table[key].reset_index(drop=True) 
  master_table[key].columns = master_table[key].iloc[0]
  master_table[key] = master_table[key][1:]
  master_table[key] = master_table[key].set_index(master_table[key].columns[0])

  if scale == "Scale: Billions":
    master_table[key] = master_table[key]* (10 ** 9) #multiple every value in dataframe by 1e9 if scale is billion
  elif scale == "Scale: Millions":
    master_table[key] = master_table[key]* (10 ** 6) #multiple every value in dataframe by 1e9 if scale is billion


  #get annual 2010-2019 USD to domestic currency exchange rate using yfinance library
  currency = country_currency_codes_table[key] #retrieve currency symbol from 'country_currency_codes_table'
  symbol = f"{currency}=X"
  try:
    data = yf.download(symbol, start="2010-01-01", end="2020-01-01", interval="1mo") #download monthly USD-Currency dataset from 2010-2020
    yearly_data_df = data['Close'].resample('YE').mean() #get the annual average exchange rate outputted as datafram
    yearly_data_table = yearly_data_df.to_dict() # Convert the yearly_data to a hash table with the date as the key
    year_to_exchange_rate_dict = {key.year: value for key, value in yearly_data_table.items()} # Convert the keys to only the year part
    print(year_to_exchange_rate_dict)

    #convert DataFrame values to USD
    for year in master_table[key].columns: #loop through columns in dataframe
      year_int = int(year)  # Convert year to integer to match the dictionary keys
      master_table[key][year] = master_table[key][year] / year_to_exchange_rate_dict[year_int] #divide each column's values by that year's exchange rate

  except Exception as e: #if errored out
    print(f"Could not fetch rate for {key} ({currency}): {e}")
    error_countries.append(key) #add country to error list
    pass

for country in error_countries:
  del master_table[country]
 
master_table["Central African Republic"]

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed

{2010: 487.61083221435547, 2011: 460.5933329264323, 2012: 501.0466715494792, 2013: 492.91916910807294, 2014: 497.47583770751953, 2015: 594.4266713460287, 2016: 594.0358378092448, 2017: 577.82666015625, 2018: 557.75, 2019: 583.7949981689453}
{2010: 487.61083221435547, 2011: 460.5933329264323, 2012: 501.0466715494792, 2013: 492.91916910807294, 2014: 497.47583770751953, 2015: 594.4266713460287, 2016: 594.0358378092448, 2017: 577.82666015625, 2018: 557.75, 2019: 583.7949981689453}
{2010: 487.5991668701172, 2011: 460.59083048502606, 2012: 501.04666900634766, 2013: 492.91916910807294, 2014: 497.4808349609375, 2015: 594.4341735839844, 2016: 594.0333353678385, 2017: 578.2424977620443, 2018: 556.1933390299479, 2019: 586.7139180501302}
{2010: 487.61083221435547, 2011: 460.5933329264323, 2012: 501.0466715494792, 2013: 492.91916910807294, 2014: 497.47583770751953, 2015: 594.4266713460287, 2016: 594.0358378092448, 2017: 577.82666015625, 2018: 557.75, 2019: 583.7949981689453}
{2010: 2070.57499186197




Unnamed: 0_level_0,2010.0,2011.0,2012.0,2014.0,2015.0,2016.0,2017.0,2018.0,2019.0
nan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Expense,355193100.0,335847800.0,209339800.0,155565300.0,156967400.0,164245500.0,180564400.0,224535600.0,220480500.0
Compensation of employees,87797820.0,99852100.0,102211300.0,108112700.0,95343780.0,82793330.0,86474100.0,110278200.0,110875200.0
Wages and salaries,84055210.0,95537010.0,97827110.0,103450000.0,91085280.0,79702680.0,82557820.0,105874600.0,97299550.0
Employers' social contributions,3742620.0,4315085.0,4384219.0,4662738.0,4258503.0,3090653.0,3916279.0,4403609.0,13575610.0
Use of goods and services,205676600.0,163774200.0,49882260.0,12760290.0,25627120.0,36987820.0,47979870.0,54797490.0,61991030.0
Interest expense,19506900.0,15161110.0,11822580.0,4361513.0,7902999.0,1945396.0,3813279.0,5036432.0,11047970.0
To nonresidents,5700481.0,5881727.0,4716965.0,4086546.0,562723.0,1826497.0,3616130.0,4775731.0,2798638.0
To residents other than government units,13806420.0,9279386.0,7105619.0,274966.4,7340276.0,118899.1,197149.1,260700.7,0.0
To other government units,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8249336.0
Subsidies,,,,0.0,0.0,0.0,0.0,0.0,0.0


Get Averages

In [42]:
master_averages_ls = [] #make master averages list as place to store averages

#Get average over decade
#loop through table (where key is the country name) (master_table[key] is the dataframe stored in the master_table indexed by key)
for key in master_table:
  categories = master_table[key].index.tolist() #get the categories into a list (1d array)
  per_category_average = master_table[key].mean(axis=1) #get the average of each category (ignoring np.NaN values) into a list
  master_averages_ls.append(per_category_average)
  print(type(per_category_average))
  # per_category_average = master_table[key].apply(lambda row: np.nanmean(row), axis=1).to_numpy() #get the average of each category (ignoring np.NaN values) into a 1d array

  print(per_category_average)
  # average_table = dict(zip(categories, per_category_average)) #combine 'categories' and 'per_category_average' into a table
  # master_averages_table[key] = master_averages_table #store this table in master_averages_table

# master_averages_ls
  
merged_df = pd.DataFrame()

# Merge each Series into the DataFrame
for i, series in enumerate(master_averages_ls):
    merged_df = merged_df.merge(series.rename(f'series_{i}'), how='outer', left_index=True, right_index=True)
merged_df

<class 'pandas.core.series.Series'>
nan
Expense                                     1.043420e+09
Compensation of employees                   5.361067e+08
Wages and salaries                          5.361067e+08
Employers' social contributions             0.000000e+00
Use of goods and services                   1.747257e+08
Interest expense                            3.886061e+07
To nonresidents                             1.866825e+07
To residents other than government units    2.019236e+07
To other government units                   0.000000e+00
Subsidies                                   8.132911e+07
To public corporations                      8.132911e+07
To private enterprises                      0.000000e+00
To other sectors                            0.000000e+00
Grants                                      1.857708e+08
To foreign governments                      0.000000e+00
To international organizations              1.391523e+07
To other government units                   1.71

Unnamed: 0_level_0,series_0,series_1,series_2,series_3
nan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Capital,0.0,278795000.0,0.0,0.0
Compensation of employees,536106700.0,2148333000.0,98193170.0,92217440.0
Current,171855500.0,640056600.0,0.0,14598480.0
Employers' social contributions,0.0,,5149924.0,0.0
Employment-related social benefits,1780214.0,61773570.0,0.0,16583370.0
Expense,1043420000.0,5762191000.0,222526600.0,191472700.0
Expense on other transfers,24795710.0,494288200.0,30195500.0,1184530.0
Grants,185770800.0,930177600.0,0.0,16120480.0
Interest expense,38860610.0,548800700.0,8955355.0,11561580.0
Other expense,24795710.0,494288200.0,30195500.0,1184530.0
