In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels.api as sm
import statsmodels.formula.api as smf
import plotly.graph_objects as go

  import pandas.util.testing as tm


In [19]:
# Load every input table directly from its public GitHub URL.

# Worldwide vaccination tables (sourced from UNICEF), one per vaccine.
hpv_raw = pd.read_csv(
    "https://raw.githubusercontent.com/mchxo/africa_vaccination_forecast/main/HPV_vaccination.csv"
)  # HPV
yfv_raw = pd.read_csv(
    "https://raw.githubusercontent.com/mchxo/africa_vaccination_forecast/main/YFV_vaccination.csv"
)  # yellow fever
bcg_raw = pd.read_csv(
    "https://raw.githubusercontent.com/mchxo/africa_vaccination_forecast/main/BCG_vaccination.csv"
)  # BCG (tuberculosis)

# Population table. The first line of the file is skipped, so the header is
# read from the second line. Leading whitespace is stripped from 'Location'
# because that column is later used as a merge key against country names.
africa_population = pd.read_csv(
    "https://raw.githubusercontent.com/mchxo/africa_vaccination_forecast/main/Africa_populations.csv",
    skiprows=1,
).assign(Location=lambda population: population['Location'].str.lstrip())

# ISO-3166 country list with region labels; used below to pick out the
# African countries from the worldwide vaccination tables.
countries = pd.read_csv(
    "https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv"
)

In [15]:
# method to filter out only data on Africa from the global vaccine datasets

# param: left_code:  whatever code the left DF uses for ISO (values of these columns should be "AND", "ZWE", etc.)
# param: right_code: analogous, but for the right DF
# param: region: the column name that denotes the region (values of this column should be "Europe", "Africa", etc.)
def get_only_africa(df_left, df_right, left_code, right_code, region):
  """Inner-join two frames on their ISO-code columns and keep only African rows.

  Parameters
  ----------
  df_left : pandas.DataFrame
      Table to filter (e.g. a worldwide vaccination table).
  df_right : pandas.DataFrame
      Lookup table that is joined onto ``df_left``.
  left_code : str
      Name of the join-key column in ``df_left``.
  right_code : str
      Name of the join-key column in ``df_right``. It is removed from the
      result because it duplicates ``left_code`` after the join.
  region : str
      Name of the column, in the merged frame, holding the region label.

  Returns
  -------
  pandas.DataFrame
      Rows of the merged frame whose ``region`` value equals ``'Africa'``.
      The index is the merged frame's index, not renumbered.
  """
  merged = df_left.merge(df_right, left_on=left_code, right_on=right_code)
  # When both frames use the same key name, merge emits a single shared key
  # column; dropping it would throw away the only ISO code column.
  if right_code != left_code:
    merged = merged.drop(columns=[right_code])
  return merged[merged[region] == 'Africa']

In [16]:
# Keep only the African rows of the HPV table. hpv_raw's own 'region' column
# is dropped first, presumably so it does not collide with the 'region' column
# supplied by `countries`, which is the one used for filtering.
hpv = get_only_africa(
    df_left=hpv_raw.drop(columns=['region']),
    df_right=countries,
    left_code="iso3c",
    right_code="alpha-3",
    region="region",
)

In [17]:
# Keep only the African rows of the yellow fever table, joining on ISO alpha-3 codes.
yfv = get_only_africa(
    df_left=yfv_raw,
    df_right=countries,
    left_code="iso3",
    right_code="alpha-3",
    region="region",
)

In [18]:
# Keep only the African rows of the BCG (tuberculosis) table, joining on ISO alpha-3 codes.
bcg = get_only_africa(
    df_left=bcg_raw,
    df_right=countries,
    left_code="iso3",
    right_code="alpha-3",
    region="region",
)

In [20]:
# method to compute a weighted average of a column (param: cols), where the weights are specified by param: weights.

def weighted(df, cols, weights):
  """Return the weighted mean of one column, ignoring rows whose value is NaN.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding both the value column and the weight column.
  cols : label
      Name of the column to average.
  weights : label
      Name of the weight column. It may hold strings with surrounding or
      embedded spaces (e.g. ``" 1 234"``), which are parsed as integers,
      or it may already be numeric, in which case it is used as-is.

  Returns
  -------
  float
      ``sum(value * weight) / sum(weight)`` over the rows whose value is
      not NaN, or NaN when those rows carry no weight at all (including
      the empty-frame case).
  """
  values = df[cols]
  raw_weights = df[weights]
  if pd.api.types.is_numeric_dtype(raw_weights):
    # Already numeric: the .str accessor would raise on such a column.
    parsed_weights = raw_weights
  else:
    # Spaces act as thousands separators; remove them literally (not as a regex).
    parsed_weights = raw_weights.str.strip().str.replace(" ", "", regex=False).astype(int)

  # Rows with a missing value must not contribute their weight to the denominator.
  observed = values.notna()
  total_weight = parsed_weights[observed].sum()
  if total_weight == 0:
    return np.nan
  return (values[observed] * parsed_weights[observed]).sum() / total_weight

In [21]:
# Population-weighted BCG (TB) coverage for Africa, one value per year 1986-2020.

# Join coverage and population on country name. Both tables have per-year
# columns with the same names, so after the merge the coverage columns end in
# '_x' (left table) and the population columns end in '_y' (right table).
bcg_merged = pd.merge(bcg, africa_population, left_on='country', right_on='Location')

d = {}
for year in range(1986, 2021):
  coverage_col = str(year) + '_x'
  population_col = str(year) + '_y'
  d[year] = weighted(bcg_merged, coverage_col, population_col)

# One-row frame: columns are the years, the single row (index 0) holds the rates.
bcg_weighted = pd.DataFrame(d, index=[0])

In [22]:
bcg_weighted

Unnamed: 0,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,56.644789,62.946049,65.874585,72.292025,78.184581,70.360064,70.339314,72.51763,75.03805,73.368495,...,82.73343,82.47538,80.704822,81.360152,80.129673,82.045691,81.843646,81.228676,81.68684,79.796656


In [25]:
# Year-over-year change in the Africa-wide TB vaccination rate, reported as
# (mean, max, min), each rounded to 2 decimals. The first difference is NaN
# (there is no previous year) and is skipped by the pandas reductions.

diffs = bcg_weighted.iloc[0].diff()
tuple(round(stat, 2) for stat in (diffs.mean(), diffs.max(), diffs.min()))

(0.68, 6.42, -7.82)

In [27]:
# Population-weighted yellow fever coverage for Africa, one value per year 1986-2020.
# After the merge, '<year>_x' columns come from the vaccination table and
# '<year>_y' columns come from the population table.
yfv_merged = pd.merge(yfv, africa_population, left_on='country', right_on='Location')

d = {}
for year in range(1986, 2021):
  d[year] = weighted(yfv_merged, str(year) + '_x', str(year) + '_y')
yfv_weighted = pd.DataFrame(d, index=[0])

# Year-over-year change in the yellow fever vaccination rate, reported as
# (mean, max, min), each rounded to 2 decimals.

diffs = yfv_weighted.iloc[0].diff()
tuple(round(stat, 2) for stat in (diffs.mean(), diffs.max(), diffs.min()))

  


(1.07, 9.63, -6.98)

In [29]:
# Mean first-dose HPV coverage for each year 2015-2019. The HPV table keeps
# year and indicator in rows ('year', 'vaccine_desc') rather than in per-year
# columns, so rows are filtered instead of columns. This is a plain mean over
# the matching rows, not a population-weighted one.

years = list(range(2015, 2020))

# The indicator filter does not depend on the year, so apply it once up front.
first_dose_rows = hpv.loc[
    hpv['vaccine_desc'] == "Target population who received the first dose of HPV vaccine in the reporting year"
]

hpv_means = []
for year in years:
  recent = first_dose_rows.loc[first_dose_rows['year'] == year]
  avg = recent['coverage'].mean()
  hpv_means.append(avg)

In [30]:
# Smallest (most negative) change in the mean HPV vaccination rate between
# successive years. Note this is the minimum of the year-over-year
# differences, not their average.

np.min(np.diff(hpv_means))

-9.1875