Import Libraries

In [64]:
import pandas as pd
import numpy as np
import os
import yfinance as yf


Currency Codes

In [65]:
country_currency_codes_table = {
    'Burundi': 'BIF',
    'Benin': 'XOF',
    'Burkina Faso': 'XOF',
    'Central African Republic': 'XAF',
    "Côte d'Ivoire": 'XOF',
    'Cameroon': 'XAF',
    'Democratic Republic of the Congo': 'CDF',
    'Republic of Congo': 'XAF',
    'Comoros': 'KMF',
    'Eritrea': 'ERN',
    'Ethiopia': 'ETB',
    'Ghana': 'GHS',
    'Guinea': 'GNF',
    'The Gambia': 'GMD',
    'Guinea-Bissau': 'XOF',
    'Liberia': 'LRD',
    'Madagascar': 'MGA',
    'Mali': 'XOF',
    'Mozambique': 'MZN',
    'Mauritania': 'MRU',
    'Malawi': 'MWK',
    'Niger': 'XOF',
    'Rwanda': 'RWF',
    'Sudan': 'SDG',
    'Senegal': 'XOF',
    'Sierra Leone': 'SLL',
    'Somalia': 'SOS',
    'Sao Tomee and Principe': 'STN',
    'Chad': 'XAF',
    'Togo': 'XOF',
    'Tanzania': 'TZS',
    'Uganda': 'UGX',
    'Zambia': 'ZMW'
}


Read Data

In [66]:
#read in stata dataset as dataframe "country_code_df"
country_code_df = pd.read_stata("codesffactors_Mar23_2019.dta") 
country_code_df.replace('--', np.nan, inplace=True) #replace all entries that are '--' with NaN data value

country_code_df.head()

Unnamed: 0,countryname,wbctry,wbcode,wbregion,wbigroup2017,wblcat2017,hipc,emu,cemac,waemu,imfcode,imfweocode,imfweoiso,imfweoctry,region
0,Aruba,Aruba,ABW,Latin America & Caribbean,High income,,0,0,0,0,,314.0,ABW,Aruba,LAC
1,Afghanistan,Afghanistan,AFG,South Asia,Low income,IDA,1,0,0,0,512.0,512.0,AFG,Afghanistan,SAS
2,Angola,Angola,AGO,Sub-Saharan Africa,Lower middle income,IBRD,0,0,0,0,614.0,614.0,AGO,Angola,SSA
3,Anguilla,,AIA,Latin America & Caribbean,,,0,0,0,0,312.0,,,,LAC
4,Albania,Albania,ALB,Europe & Central Asia,Upper middle income,IBRD,0,0,0,0,914.0,914.0,ALB,Albania,ECA


In [67]:
region_classification_ls = country_code_df['wbregion'].unique() #get a list of region classifications
income_classification_ls = country_code_df['wbigroup2017'].unique() #get a list of income classifications
income_classification_ls = [x for x in income_classification_ls if not pd.isna(x)] # remove '--' from income_classification_ls

print(f"The region classifications are: {', '.join(country for country in region_classification_ls)}")
print(f"The income classifications are: {', '.join(country for country in income_classification_ls)}")

The region classifications are: Latin America & Caribbean, South Asia, Sub-Saharan Africa, Europe & Central Asia, Middle East & North Africa, East Asia & Pacific, North America
The income classifications are: High income, Low income, Lower middle income, Upper middle income


In [68]:
ssa_hipc_countries = country_code_df[(country_code_df['hipc'] == 1) & (country_code_df['wbregion'] == 'Sub-Saharan Africa')]
ssa_hipc_countries_ls = ssa_hipc_countries['imfweoctry'].to_list()

ssa_hipc_countries_ls

['Burundi',
 'Benin',
 'Burkina Faso',
 'Central African Republic',
 "Côte d'Ivoire",
 'Cameroon',
 'Democratic Republic of the Congo',
 'Republic of Congo',
 'Comoros',
 'Eritrea',
 'Ethiopia',
 'Ghana',
 'Guinea',
 'The Gambia',
 'Guinea-Bissau',
 'Liberia',
 'Madagascar',
 'Mali',
 'Mozambique',
 'Mauritania',
 'Malawi',
 'Niger',
 'Rwanda',
 'Sudan',
 'Senegal',
 'Sierra Leone',
 'Somalia',
 'São Tomé and Príncipe',
 'Chad',
 'Togo',
 'Tanzania',
 'Uganda',
 'Zambia']

Read in IMF's Government Finance Statistics

In [69]:
master_table = {}
folder_path = "revenue-data"

# Loop that goes through all files 'file_name' in the folder 'folder_path'
for file_name in os.listdir(folder_path):
  if file_name.endswith('.xlsx') and not file_name.startswith('~$'): #check to see if is spreadsheet .xlsx format
    raw_df = pd.read_excel(folder_path+ "/" + file_name) #read in .xlsx as dataframe
    master_table[file_name[:-5]] = raw_df #store dataframe as an entry in the master_table hash table

# master_table["Central African Republic"]
master_table["Benin"]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,Detailed Revenue Breakdown,Detailed Revenue Breakdown,,,
1,Country: Benin,,,,
2,Sector: Budgetary central government,,,,
3,Unit: Domestic currency,,,,
4,Scale: Billions,,,,
5,,,,,
6,,2010,2011.0,2012.0,2013.0
7,Revenue,648.375468,676.747224,701.455221,803.0
8,Taxes,536.250675,546.563646,599.904265,696.0
9,"Taxes on income, profits, & capital gains",104.846262,114.433242,118.073622,134.3


Convert values to USD

In [70]:
error_countries = []

#loop through table (where key is the country name) (master_table[key] is the dataframe stored in the master_table indexed by key)
for key in master_table:
  scale = master_table[key].iloc[4,0] #get if values have scale of billions or millions
  
  # Drop rows 0 to 5, reformatting indexes and column headers
  master_table[key] = master_table[key].drop([0, 1, 2, 3, 4, 5]) 
  master_table[key] = master_table[key].reset_index(drop=True) 
  master_table[key].columns = master_table[key].iloc[0]
  master_table[key] = master_table[key][1:]
  master_table[key] = master_table[key].set_index(master_table[key].columns[0])

  if scale == "Scale: Billions":
    master_table[key] = master_table[key]* (10 ** 9) #multiple every value in dataframe by 1e9 if scale is billion
  elif scale == "Scale: Millions":
    master_table[key] = master_table[key]* (10 ** 6) #multiple every value in dataframe by 1e9 if scale is billion


  #get annual 2010-2019 USD to domestic currency exchange rate using yfinance library
  currency = country_currency_codes_table[key] #retrieve currency symbol from 'country_currency_codes_table'
  symbol = f"{currency}=X"
  try:
    data = yf.download(symbol, start="2010-01-01", end="2020-01-01", interval="1mo") #download monthly USD-Currency dataset from 2010-2020
    yearly_data_df = data['Close'].resample('YE').mean() #get the annual average exchange rate outputted as datafram
    yearly_data_table = yearly_data_df.to_dict() # Convert the yearly_data to a hash table with the date as the key
    year_to_exchange_rate_dict = {key.year: value for key, value in yearly_data_table.items()} # Convert the keys to only the year part
    print(year_to_exchange_rate_dict)

    #convert DataFrame values to USD
    for year in master_table[key].columns: #loop through columns in dataframe
      year_int = int(year)  # Convert year to integer to match the dictionary keys
      master_table[key][year] = master_table[key][year] / year_to_exchange_rate_dict[year_int] #divide each column's values by that year's exchange rate

  except Exception as e: #if errored out
    pass
    print(f"Could not fetch rate for {key} ({currency}): {e}")
    error_countries.append(key) #add country to error list
    

for country in error_countries:
  del master_table[country]
 
master_table["Benin"]

[*********************100%%**********************]  1 of 1 completed


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


{2010: 487.61083221435547, 2011: 460.5933329264323, 2012: 501.0466715494792, 2013: 492.91916910807294, 2014: 497.47583770751953, 2015: 594.4266713460287, 2016: 594.0358378092448, 2017: 577.82666015625, 2018: 557.75, 2019: 583.7949981689453}
{2010: 487.61083221435547, 2011: 460.5933329264323, 2012: 501.0466715494792, 2013: 492.91916910807294, 2014: 497.47583770751953, 2015: 594.4266713460287, 2016: 594.0358378092448, 2017: 577.82666015625, 2018: 557.75, 2019: 583.7949981689453}
{2010: 487.61083221435547, 2011: 460.5933329264323, 2012: 501.0466715494792, 2013: 492.91916910807294, 2014: 497.47583770751953, 2015: 594.4266713460287, 2016: 594.0358378092448, 2017: 577.82666015625, 2018: 557.75, 2019: 583.7949981689453}
{2010: 487.61083221435547, 2011: 460.5933329264323, 2012: 501.0466715494792, 2013: 492.91916910807294, 2014: 497.47583770751953, 2015: 594.4266713460287, 2016: 594.0358378092448, 2017: 577.82666015625, 2018: 557.75, 2019: 583.7949981689453}
{2010: 487.5991668701172, 2011: 460.

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed

1 Failed download:
['STN=X']: YFInvalidPeriodError("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


{2010: 2.318599979082743, 2011: 2.648924986521403, 2012: 3.52014168103536, 2013: 4.590566674868266, 2014: 5.666366696357727, 2015: 5.987816691398621, 2016: 6.130283355712891, 2017: 6.609750032424927, 2018: 25.39816665649414, 2019: 45.693160692850746}
{2010: 487.61083221435547, 2011: 460.5933329264323, 2012: 501.0466715494792, 2013: 492.91916910807294, 2014: 497.47583770751953, 2015: 594.4266713460287, 2016: 594.0358378092448, 2017: 577.82666015625, 2018: 557.75, 2019: 583.7949981689453}
{2010: 487.61083221435547, 2011: 460.5933329264323, 2012: 501.0466715494792, 2013: 492.91916910807294, 2014: 497.47583770751953, 2015: 594.4266713460287, 2016: 594.0358378092448, 2017: 577.82666015625, 2018: 557.75, 2019: 583.7949981689453}
{2010: 148.9291674296061, 2011: 155.58916854858398, 2012: 247.6258316040039, 2013: 362.4624989827474, 2014: 410.5541687011719, 2015: 495.21583557128906, 2016: 700.3816680908203, 2017: 720.1374969482422, 2018: 704.3308308919271, 2019: 718.8183339436849}
Could not fetc

Unnamed: 0_level_0,2010,2011.0,2012.0,2013.0
nan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Revenue,1329698656.446114,1469294000.0,1399980000.0,1629070000.0
Taxes,1099751357.831325,1186651000.0,1197302000.0,1411996000.0
"Taxes on income, profits, & capital gains",215020370.76549,248447500.0,235653900.0,272458500.0
"Taxes on income, profits, & capital gains: individuals",94339618.19326,112726000.0,112418200.0,122332400.0
"Taxes on income, profits, & capital gains: corporations",97933415.952104,115078100.0,106016900.0,128824400.0
"Taxes on income, profits, & capital gains: other",22747336.620127,20643280.0,17218810.0,21301670.0
Taxes on payroll & workforce,10468055.387162,11519040.0,11429320.0,12172380.0
Taxes on property,3449468.155499,3611964.0,2555262.0,3651714.0
Taxes on goods & services,564951648.522647,614176500.0,602364500.0,709852700.0
General taxes on goods & services,477311398.204718,522009500.0,538061900.0,626471900.0


Calculate Averages

In [71]:
master_averages_ls = [] #make master averages list as place to store averages

#Get average over decade
#loop through table (where key is the country name) (master_table[key] is the dataframe stored in the master_table indexed by key)
for key in master_table:
  per_category_average = master_table[key].mean(axis=1) #get the average of each category (ignoring np.NaN values) into a list
  # per_category_average.drop('To other government units', inplace=True) #drop 'To other government units' since there's two rows named that and will error
  master_averages_ls.append(per_category_average) #add each country average to to the master averages list


merged_averages_df = pd.concat(master_averages_ls, axis=1, join='outer')
average__of_averages_df = merged_averages_df.mean(axis=1) #get the average of all the countries' averages
average__of_averages_df

nan
Revenue                                                    2770386295.843223
Taxes                                                      1931828113.621924
Taxes on income, profits, & capital gains                   568208799.042841
Taxes on income, profits, & capital gains: individuals      263959205.442601
Taxes on income, profits, & capital gains: corporations     300242508.651176
Taxes on income, profits, & capital gains: other             34424751.124167
Taxes on payroll & workforce                                  8519820.731848
Taxes on property                                            12666863.764264
Taxes on goods & services                                   953822285.904208
General taxes on goods & services                           581043851.712063
Excise taxes                                                196296288.874653
Taxes on int trade & transactions                           347427057.357882
Other taxes n.e.c.                                           42188755.00

Calculate Percentages

In [72]:
total_taxes = average__of_averages_df['Taxes']
average__of_averages_df = round(average__of_averages_df / total_taxes * 100, 2) #take all values as a fraction of total expenditure, then convert decimal to percent, then round to two decimal places
average__of_averages_df

nan
Revenue                                                    143.407495
Taxes                                                           100.0
Taxes on income, profits, & capital gains                    29.41301
Taxes on income, profits, & capital gains: individuals        13.6637
Taxes on income, profits, & capital gains: corporations     15.541885
Taxes on income, profits, & capital gains: other             1.781978
Taxes on payroll & workforce                                 0.441024
Taxes on property                                            0.655693
Taxes on goods & services                                   49.374076
General taxes on goods & services                           30.077409
Excise taxes                                                10.161167
Taxes on int trade & transactions                           17.984367
Other taxes n.e.c.                                           2.183877
Social contributions                                          0.64915
Social security 