<a href="https://colab.research.google.com/github/pandemic-tracking/global-vaccine/blob/main/Vaccine_Data_Frequency.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [2]:
import altair as alt
import base64
import pandas as pd
import re
import requests

from collections import defaultdict
from datetime import datetime
from google.colab import drive
from pprint import pprint
from tqdm import tqdm

# Mount Google Drive (the token file and the cached data are read from it) and
# switch Altair to its HTML renderer.
drive.mount('/content/drive')
alt.renderers.enable('html')

#@markdown Generate a personal access token with repo access for the Github API by following [these](https://docs.github.com/en/enterprise-server@3.4/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) instructions.
#@markdown Save the token to your root folder in Google Drive in a text file called token.txt (if you use a different filepath, change the filepath below to match).
github_token_filepath = '/content/drive/MyDrive/token.txt' #@param {type:"string"}
#@markdown Make sure the [PTC Google Drive Folder](https://drive.google.com/drive/u/1/folders/1-XAAKCA1GdnRULS9ZmkOrcrQ8_rpjBjw) is added to your Drive root level. 
#@markdown (To add it, right click the name "PTC" and click add shortcut to Drive, then put it in My Drive). These paths point to the data file that will be read and updated in that folder.
cached_data_path = '/content/drive/MyDrive/PTC/GlobalVaxUpdates/dates_df.csv'#@param {type:"string"} 
save_data_path = '/content/drive/MyDrive/PTC/GlobalVaxUpdates/dates_df.csv'#@param {type:"string"} 
#@markdown Add a comma-separated list of columns from Our World in Data you would like included in your generated file.
date_df_columns = 'date,total_vaccinations,people_vaccinated' #@param {type:"string"}
# Turn the form string into a list of column names. Whitespace around the
# commas and empty entries (e.g. from a trailing comma) are dropped, because a
# name such as ' total_vaccinations' would not match any column in the CSV.
date_df_columns = [col.strip() for col in date_df_columns.split(",") if col.strip()]

#@markdown Uncheck the following box if you'd like to overwrite the data file rather than using cached data. This is not recommended unless you are completely changing the structure of the file.
use_cached_data = True #@param {type:"boolean"}

#@markdown Finally, run this cell to save your configurations.


Mounted at /content/drive


# Get OWID Data

## Download commit metadata from GitHub

In [3]:
# Collect the commits that touched OWID's vaccinations.csv and keep, for each
# calendar day, the SHA of that day's last commit.
# Result: final_shas maps 'YYYY-MM-DD' -> commit SHA.
try:
  with open(github_token_filepath) as f:
    # Strip surrounding whitespace: a trailing newline in the token file would
    # otherwise end up inside the Authorization header value.
    token = f.readline().strip()
except FileNotFoundError:
  print('ERROR: GitHub PHA token file not found - check setup.')

else:
  headers={'Authorization': 'token ' + token}
  url = ('https://api.github.com/repos/owid/covid-19-data/commits'\
        '?path=public/data/vaccinations/vaccinations.csv')

  # Get last page of commit data. A HEAD request is enough: only the
  # pagination links in the response headers are needed here.
  response = requests.head(url, headers=headers)
  print(str(response.status_code) + ' : ' + response.reason)
  response.raise_for_status()  # stop here on e.g. a rejected token
  # requests parses the Link header into response.links, keyed by rel. When
  # the listing fits on a single page there is no 'last' link at all.
  last_link = response.links.get('last')
  if last_link is None:
    last_page = 1
  else:
    last_page = int(re.search(r'[?&]page=(\d+)', last_link['url']).group(1))
  print("Last page of data: %d" %  last_page)

  # Get SHAs, keyed by the commit's author timestamp
  all_shas = {}
  for page in range(1, last_page + 1):
    response = requests.get(url + '&page=%d' % page, headers=headers)
    # Fail on an HTTP error instead of iterating over an error payload.
    response.raise_for_status()
    for commit in response.json():
      all_shas[commit['commit']['author']['date']] = commit['sha']
  print("Retrieved SHAs")

  # Pare to last commit dates: remember the greatest time seen per date.
  # The loop variable is deliberately not called `datetime`, which would
  # shadow the class imported in the setup cell.
  datetimes = defaultdict(str)
  for commit_datetime in all_shas:
    date, time = commit_datetime.split("T", 1)
    if datetimes[date] < time:
      datetimes[date] = time
  final_shas = {date: all_shas[date + 'T' + time]
                for (date, time) in datetimes.items()}

  print("Selected latest SHAs. Days of data: %d" % len(final_shas))

200 : OK
Last page of data: 38
Retrieved SHAs
Selected latest SHAs. Days of data: 589


## Build data file

In [4]:
def _fetch_country_maxima(sha, owid_date, columns):
  """Summarise one historical version of vaccinations.csv per country.

  Downloads the file as of commit ``sha`` and takes, for every ``iso_code``
  in it, the maximum of each requested column.

  Args:
    sha: Commit SHA identifying the file version to download.
    owid_date: Date string of that commit; stored on every returned row.
    columns: Names of the OWID columns to summarise.

  Returns:
    A DataFrame with the columns country, owid_date, sha and ``columns``
    (one row per iso_code, in order of first appearance), or None when the
    file lacks ``iso_code`` or any of ``columns``.
  """
  url = ('https://raw.githubusercontent.com/owid/covid-19-data/'
         + sha
         + '/public/data/vaccinations/vaccinations.csv')
  df = pd.read_csv(url)
  if not {'iso_code', *columns}.issubset(df.columns):
    return None  # old file
  maxima = (
      df.groupby('iso_code', sort=False)[list(columns)]
      .max()
      .reset_index()
      .rename(columns={'iso_code': 'country'})
  )
  maxima.insert(1, 'owid_date', owid_date)
  maxima.insert(2, 'sha', sha)
  return maxima


no_data, new_cols = False, False

# Check for data file and if there are any new columns we need to generate
try:
  dates_df = pd.read_csv(cached_data_path,index_col=0)
  new_cols = not all(col in dates_df.columns for col in date_df_columns)

except FileNotFoundError:
  no_data = True

rebuild = not use_cached_data or no_data or new_cols
if rebuild:
  print("Constructing dates_df from scratch...")
  known_shas = set()
else:
  print("Checking for new dates to update...")
  # Commits already present in the cache are not downloaded again.
  known_shas = set(dates_df['sha'])

# Collect one frame per commit and concatenate once at the end, rather than
# growing the table row by row (DataFrame.append no longer exists in pandas 2).
new_frames = []
for owid_date, sha in tqdm(final_shas.items()):
  if sha in known_shas:
    continue
  maxima = _fetch_country_maxima(sha, owid_date, date_df_columns)
  if maxima is not None:
    new_frames.append(maxima)

if rebuild:
  if new_frames:
    dates_df = pd.concat(new_frames, ignore_index=True)
  else:
    dates_df = pd.DataFrame(
        columns=['country','owid_date','sha'] + date_df_columns)
elif new_frames:
  dates_df = pd.concat([dates_df] + new_frames, ignore_index=True)

dates_df

Checking for new dates to update...


100%|██████████| 589/589 [00:13<00:00, 45.10it/s]


Unnamed: 0,country,owid_date,sha,date,total_vaccinations,people_vaccinated
0,AFG,2022-08-02,d93d4c8747d936a1b83ea5b0784eaec00824adca,2022-07-19,7885045.0,7139453.0
1,OWID_AFR,2022-08-02,d93d4c8747d936a1b83ea5b0784eaec00824adca,2022-08-01,595678842.0,367151942.0
2,ALB,2022-08-02,d93d4c8747d936a1b83ea5b0784eaec00824adca,2022-07-24,2934116.0,1330520.0
3,DZA,2022-08-02,d93d4c8747d936a1b83ea5b0784eaec00824adca,2022-05-29,15205854.0,8210605.0
4,AND,2022-08-02,d93d4c8747d936a1b83ea5b0784eaec00824adca,2022-07-10,153531.0,57888.0
...,...,...,...,...,...,...
119002,SWE,2021-01-12,6ae12fca5dfd0050395a4738f588fc2fa22ae873,2021-01-10,79095.0,79095.0
119003,ARE,2021-01-12,6ae12fca5dfd0050395a4738f588fc2fa22ae873,2021-01-12,1275652.0,1275652.0
119004,GBR,2021-01-12,6ae12fca5dfd0050395a4738f588fc2fa22ae873,2021-01-11,2843815.0,2431648.0
119005,USA,2021-01-12,6ae12fca5dfd0050395a4738f588fc2fa22ae873,2021-01-12,9327138.0,9327138.0


In [5]:
# Persist the table (index included) to the configured path on Drive.
dates_df.to_csv(path_or_buf=save_data_path)
print("Saved to %s" % (save_data_path,))

Saved to /content/drive/MyDrive/PTC/GlobalVaxUpdates/dates_df.csv


# Graph Data Preprocessing

## Add stratifications and diff data

In [6]:
# Build dates_df_expanded: dates_df plus income group, WHO region, the
# reporting lag in days, and the difference between the figures in the most
# recent OWID file and the figures each historical commit showed.
# NOTE(review): this cell reads the 'date' column, so it assumes 'date' is
# among date_df_columns - confirm if the configuration is changed.

# Add income levels
income_df = pd.read_csv('https://raw.githubusercontent.com/owid/covid-19-data/master/scripts/input/wb/income_groups.csv')
dates_df_expanded = dates_df.merge(income_df,how='left',left_on='country',right_on='Code')
dates_df_expanded = dates_df_expanded.drop(labels=['Country','Code','Year'],axis=1)

# Add WHO regions
who_regions_df = pd.read_csv('/content/drive/MyDrive/PTC/GlobalVaxUpdates/who-regions.csv')
dates_df_expanded = dates_df_expanded.merge(who_regions_df,how='left',left_on='country',right_on='Code')
dates_df_expanded = dates_df_expanded.drop(labels=['Year','Code'],axis=1)

# Add date differences: days between the commit date (owid_date) and the
# latest data date that commit contained for the country (date).
dates_df_expanded[['owid_date','date']] = (
    dates_df_expanded[['owid_date','date']].apply(pd.to_datetime))
dates_df_expanded['date_diff'] = (
    dates_df_expanded['owid_date'] -
    dates_df_expanded['date']
  ).dt.days

# Get most recent OWID data (date keys are ISO strings, so max() is the latest)
most_recent_sha = final_shas[max(final_shas)]
url =  ('https://raw.githubusercontent.com/owid/covid-19-data/'\
        + most_recent_sha \
        + '/public/data/vaccinations/vaccinations.csv')
most_recent_df = pd.read_csv(url)
most_recent_df = most_recent_df.drop(
    labels=[col for col in most_recent_df.columns
            if col not in date_df_columns
            + ['iso_code']],
    axis=1)
most_recent_df['date'] = pd.to_datetime(most_recent_df['date'])

# Merge most recent OWID data into dataframe and sort by country then date.
# Columns present on both sides get the suffix _owid (value in the historical
# commit) or _event (value the most recent file holds for that date).
dates_df_expanded = pd.merge(dates_df_expanded,
                             most_recent_df,
                             how='left',
                             left_on=['country','owid_date'],
                             right_on=['iso_code','date'],
                             suffixes=["_owid","_event"])
dates_df_expanded = dates_df_expanded.drop(
    labels=["iso_code","date_event"],
    axis=1)
dates_df_expanded = dates_df_expanded.sort_values(
    by=['country','owid_date'],
    axis=0)

# Fill non-updated days for all columns to allow diff calculation. The forward
# fill runs within each country, so values never leak from one country into
# the next; rows are already in owid_date order from the sort above.
df_columns = [col for col in date_df_columns if col != 'date']
event_columns = [col + '_event' for col in df_columns]
if event_columns:
  dates_df_expanded[event_columns] = (
      dates_df_expanded.groupby('country')[event_columns].ffill())

# Calculate diffs
for col in df_columns:
  dates_df_expanded[col+'_diff'] = (
      dates_df_expanded[col+'_event']-dates_df_expanded[col+'_owid']
  )

dates_df_expanded

Unnamed: 0,country,owid_date,sha,date_owid,total_vaccinations_owid,people_vaccinated_owid,Income group,Entity,WHO region,date_diff,total_vaccinations_event,people_vaccinated_event,total_vaccinations_diff,people_vaccinated_diff
107201,ABW,2021-04-21,61e07ba791f81bcabe20ccb9c54802512efb9b06,2021-04-20,43945.0,27511.0,High income,,,1.0,62342.0,44421.0,18397.0,16910.0
107003,ABW,2021-04-22,f7c1b407f3a0f1baba138a976ec8b970061b242f,2021-04-20,43945.0,27511.0,High income,,,2.0,64288.0,45097.0,20343.0,17586.0
106802,ABW,2021-04-23,ec873e5352966b8614f083415e78df4fcda2b307,2021-04-22,64288.0,45097.0,High income,,,1.0,67950.0,48061.0,3662.0,2964.0
106601,ABW,2021-04-24,3f2a67d5347019f6e22a93cd7ad8e7f3e7fe7791,2021-04-23,67950.0,48061.0,High income,,,1.0,69279.0,49351.0,1329.0,1290.0
106400,ABW,2021-04-25,e258b748476aba28663064141f989895218bdc62,2021-04-24,69279.0,49351.0,High income,,,1.0,73103.0,53161.0,3824.0,3810.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1174,ZWE,2022-07-29,9ba09b706f1b75b708a427c0dc75405b6ff3e685,2022-07-08,12045572.0,6331851.0,Lower middle income,Zimbabwe,Africa,21.0,12175457.0,6372686.0,129885.0,40835.0
939,ZWE,2022-07-30,ff51883dc3d88e62c991aa72e5aad6f2d53baf2c,2022-07-08,12045572.0,6331851.0,Lower middle income,Zimbabwe,Africa,22.0,12180877.0,6375973.0,135305.0,44122.0
704,ZWE,2022-07-31,483e0fef2f293a4204029986c857716717688fbc,2022-07-08,12045572.0,6331851.0,Lower middle income,Zimbabwe,Africa,23.0,12182698.0,6376117.0,137126.0,44266.0
469,ZWE,2022-08-01,0c76c90465ba4f411e4992c46344be0fda3f378a,2022-07-08,12045572.0,6331851.0,Lower middle income,Zimbabwe,Africa,24.0,12182698.0,6376117.0,137126.0,44266.0


## Aggregate datasets 

In [14]:
# Per-group statistics: mean/min/max of the reporting lag, plus the sum of
# every *_diff column.
aggregate_dict = {
    'date_diff': ['mean', 'min', 'max'],
    **{col + '_diff': ['sum'] for col in df_columns},
}


def _aggregate_by(group_column):
  """Aggregate dates_df_expanded per (group_column, owid_date).

  The two-level column index produced by ``agg`` is flattened by joining the
  levels with '_', so the grouping keys end up with a trailing underscore
  (e.g. 'owid_date_').
  """
  grouped = dates_df_expanded.groupby([group_column, 'owid_date'])
  flattened = grouped.agg(aggregate_dict).reset_index()
  flattened.columns = flattened.columns.map('_'.join)
  return flattened


# Create aggregated income dataset
income_order = ['Low income','Lower middle income','Upper middle income','High income']
income_aggregate = _aggregate_by('Income group')

# Create aggregated WHO region dataset
region_aggregate = _aggregate_by('WHO region')

# Graphs

## Reporting lags

In [8]:
country = 'USA' #@param ['AFG', 'ALB', 'DZA', 'AND', 'AGO', 'AIA', 'ATG', 'ARG', 'ARM', 'ABW', 'AUS', 'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLZ', 'BEN', 'BMU', 'BTN', 'BOL', 'BES', 'BIH', 'BWA', 'BRA', 'VGB', 'BRN', 'BGR', 'BFA', 'BDI', 'KHM', 'CMR', 'CAN', 'CPV', 'CYM', 'CAF', 'TCD', 'CHL', 'CHN', 'COL', 'COM', 'COG', 'COK', 'CRI', 'CIV', 'HRV', 'CUB', 'CUW', 'CYP', 'CZE', 'COD', 'DNK', 'DJI', 'DMA', 'DOM', 'ECU', 'EGY', 'SLV', 'GNQ', 'EST', 'SWZ', 'ETH', 'FRO', 'FLK', 'FJI', 'FIN', 'FRA', 'PYF', 'GAB', 'GMB', 'GEO', 'DEU', 'GHA', 'GIB', 'GRC', 'GRL', 'GRD', 'GTM', 'GGY', 'GIN', 'GNB', 'GUY', 'HTI', 'HND', 'HKG', 'HUN', 'ISL', 'IND', 'IDN', 'IRN', 'IRQ', 'IRL', 'IMN', 'ISR', 'ITA', 'JAM', 'JPN', 'JEY', 'JOR', 'KAZ', 'KEN', 'KIR', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN', 'LSO', 'LBR', 'LBY', 'LIE', 'LTU', 'LUX', 'MAC', 'MDG', 'MWI', 'MYS', 'MDV', 'MLI', 'MLT', 'MRT', 'MUS', 'MEX', 'MDA', 'MCO', 'MNG', 'MNE', 'MSR', 'MAR', 'MOZ', 'MMR', 'NAM', 'NRU', 'NPL', 'NLD', 'NCL', 'NZL', 'NIC', 'NER', 'NGA', 'NIU', 'MKD', 'NOR', 'OMN', 'PAK', 'PSE', 'PAN', 'PNG', 'PRY', 'PER', 'PHL', 'PCN', 'POL', 'PRT', 'QAT', 'ROU', 'RUS', 'RWA', 'SHN', 'KNA', 'LCA', 'VCT', 'WSM', 'SMR', 'STP', 'SAU', 'SEN', 'SRB', 'SYC', 'SLE', 'SGP', 'SXM', 'SVK', 'SVN', 'SLB', 'SOM', 'ZAF', 'KOR', 'SSD', 'ESP', 'LKA', 'SDN', 'SUR', 'SWE', 'CHE', 'SYR', 'TWN', 'TJK', 'TZA', 'THA', 'TLS', 'TGO', 'TKL', 'TON', 'TTO', 'TUN', 'TUR', 'TKM', 'TCA', 'TUV', 'UGA', 'UKR', 'ARE', 'GBR', 'USA', 'URY', 'UZB', 'VUT', 'VEN', 'VNM', 'WLF', 'YEM', 'ZMB', 'ZWE', 'PLW']
# Scatter the reporting lag of the selected country against the report date.
is_selected_country = dates_df_expanded['country'] == country
country_df = dates_df_expanded[is_selected_country]
country_lag_points = alt.Chart(country_df).mark_point().encode(
    x=alt.X('owid_date',title='Date of OWID Report'),
    y=alt.Y('date_diff',title='Reporting Lag'),
)
country_lag_points.properties(title=country)

In [9]:
def _mean_lag_facets(aggregate, facet_column, n_columns, chart_title):
  """Scatter the mean reporting lag over report dates, one facet per group.

  Args:
    aggregate: Aggregated frame with 'owid_date_' and 'date_diff_mean'.
    facet_column: alt.Column describing the field to facet on.
    n_columns: Value passed to facet() as ``columns``.
    chart_title: Title of the faceted chart.
  """
  points = alt.Chart(aggregate).mark_point().encode(
      x=alt.X('owid_date_',title='Date of OWID report'),
      y=alt.Y('date_diff_mean',title='Mean reporting lag'),
  )
  faceted = points.facet(column=facet_column, columns=n_columns)
  return faceted.properties(title=chart_title)


# Chart reporting lag by income average with facet
income = _mean_lag_facets(
    income_aggregate,
    alt.Column("Income group_",title=None,sort=income_order),
    4,
    "Mean Reporting Lag per Day by Country Income Group",
)
income.display()

# Chart reporting lag by WHO with facet
regions = _mean_lag_facets(
    region_aggregate,
    alt.Column("WHO region_",title=None),
    6,
    "Mean Reporting Lag per Day by WHO region",
)
regions.display()

In [10]:
# Rows belonging to the latest report date, split into those that have an
# income group and those that have a WHO region.
latest_owid_date = dates_df_expanded['owid_date'].max()
today_df = dates_df_expanded[dates_df_expanded['owid_date'] == latest_owid_date]
today_df_income = today_df.dropna(subset=['Income group'])
today_df_region = today_df.dropna(subset=['WHO region'])

# Current lag scatter
lag_by_income = (
    alt.Chart(today_df_income)
    .mark_point()
    .encode(
        x=alt.X('Income group',sort=income_order),
        y=alt.Y('date_diff',title="Current reporting lag (days)"),
    )
    .properties(width=300, height=600)
)
lag_by_region = (
    alt.Chart(today_df_region)
    .mark_point()
    .encode(
        x=alt.X('WHO region'),
        y=alt.Y('date_diff',title="Current reporting lag (days)"),
    )
    .properties(width=300, height=600)
)

# Show both panels side by side.
lag_by_income | lag_by_region

In [11]:
def _mean_lag_colored(aggregate, color, chart_title):
  """Scatter the mean reporting lag over report dates, coloured per group.

  Args:
    aggregate: Aggregated frame with 'owid_date_' and 'date_diff_mean'.
    color: alt.Color describing the field that sets the point colour.
    chart_title: Title of the chart.
  """
  points = alt.Chart(aggregate).mark_point().encode(
      x=alt.X('owid_date_',title='Date of OWID Report'),
      y=alt.Y('date_diff_mean',title='Mean reporting lag'),
      color=color,
  )
  return points.properties(title=chart_title)


# Chart by income average with color
income_average = _mean_lag_colored(
    income_aggregate,
    alt.Color('Income group_',sort=income_order,title='Income group'),
    'Average OWID Reporting Lag per Day by Country Income Group',
)

# Chart by WHO region average with color
region_average = _mean_lag_colored(
    region_aggregate,
    alt.Color('WHO region_',title='WHO region'),
    'Average OWID Reporting Lag per Day by WHO Region',
)

income_average.display()
region_average.display()

## Impact

In [12]:
def _income_diff_facets(diff_field, axis_title, chart_title):
  """Line chart of one summed *_diff column, faceted by income group.

  Args:
    diff_field: Column of income_aggregate plotted on the y axis.
    axis_title: Title of the y axis.
    chart_title: Title of the faceted chart.
  """
  lines = alt.Chart(income_aggregate).mark_line().encode(
      x=alt.X('owid_date_',title='Date of OWID report'),
      y=alt.Y(diff_field,title=axis_title),
  )
  by_income = alt.Column("Income group_",title=None,sort=income_order)
  return lines.facet(column=by_income, columns=4).properties(title=chart_title)


# Summed dose difference per report date, one facet per income group
dose_income = _income_diff_facets(
    'total_vaccinations_diff_sum',
    'Dose reporting lag',
    "Dose Lag per Day by Country Income Group",
)
dose_income.display()

# Summed people-vaccinated difference per report date, one facet per income group
people_income = _income_diff_facets(
    'people_vaccinated_diff_sum',
    'People vaccinated reporting lag',
    "People Vaccinated Lag per Day by Country Income Group",
)
people_income.display()


In [13]:
def _region_diff_facets(diff_field, axis_title, chart_title):
  """Line chart of one summed *_diff column, faceted by WHO region.

  Args:
    diff_field: Column of region_aggregate plotted on the y axis.
    axis_title: Title of the y axis.
    chart_title: Title of the faceted chart.
  """
  lines = alt.Chart(region_aggregate).mark_line().encode(
      x=alt.X('owid_date_',title='Date of OWID report'),
      y=alt.Y(diff_field,title=axis_title),
  )
  by_region = alt.Column("WHO region_",title=None)
  return lines.facet(column=by_region, columns=6).properties(title=chart_title)


# Summed dose difference per report date, one facet per WHO region
dose_region = _region_diff_facets(
    'total_vaccinations_diff_sum',
    'Dose reporting lag',
    "Dose Lag per Day by WHO Region",
)
dose_region.display()

# Summed people-vaccinated difference per report date, one facet per WHO region
people_region = _region_diff_facets(
    'people_vaccinated_diff_sum',
    'People vaccinated reporting lag',
    "People Vaccinated Lag per Day by WHO Region",
)
people_region.display()