<a href="https://colab.research.google.com/github/pandemic-tracking/global-vaccine/blob/main/Vaccine_Data_Frequency.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [109]:
import altair as alt
import base64
import pandas as pd
import re
import requests

from collections import defaultdict
from datetime import datetime
from google.colab import drive
from pprint import pprint
from tqdm import tqdm

# Mount Google Drive so the token file and the cached CSV below are reachable.
drive.mount('/content/drive')

#@markdown # Setup

#@markdown Generate a personal access token with repo access for the Github API by following [these](https://docs.github.com/en/enterprise-server@3.4/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) instructions.
#@markdown Save the token to your root folder in Google Drive in a text file called token.txt (if you use a different filepath, change the filepath below to match).
github_token_filepath = '/content/drive/MyDrive/token.txt' #@param {type:"string"}
#@markdown Make sure the [PTC Google Drive Folder](https://drive.google.com/drive/u/1/folders/1-XAAKCA1GdnRULS9ZmkOrcrQ8_rpjBjw) is added to your Drive root level.
#@markdown (To add it, right click the name "PTC" and click add shortcut to Drive, then put it in My Drive). This path points to the data file that will be read/updated in that file; you shouldn't need to change it.
cached_data_path = '/content/drive/MyDrive/PTC/GlobalVaxUpdates/dates_df.csv' #@param {type:"string"}

#@markdown Add a comma-separated list of columns you would like included in your generated file.
date_df_columns = 'date,total_vaccinations' #@param {type:"string"}
# Trim whitespace around each name and drop empty entries, so that input such
# as "date, total_vaccinations" or a trailing comma still yields valid columns.
date_df_columns = [col.strip() for col in date_df_columns.split(",") if col.strip()]

#@markdown Uncheck the following box if you'd like to overwrite the data file rather than using cached data. This is not recommended unless you are completely changing the structure of the file.
use_cached_data = True #@param {type:"boolean"}

#@markdown Finally, run this cell to save your configurations.


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Download commit metadata from GitHub

In [110]:
# Download the commit history of OWID's vaccinations.csv from the GitHub API
# and keep, for every calendar day, the SHA of the latest commit on that day.
# Produces `all_shas` (author timestamp -> SHA) and `final_shas` (date -> SHA).
try:
  with open(github_token_filepath) as f:
    # Only the first line is the token; strip the trailing newline, which
    # would otherwise make the Authorization header value invalid.
    token = f.readline().strip()
except FileNotFoundError:
  print('ERROR: GitHub PHA token file not found - check setup.')

else:
  headers={'Authorization': 'token ' + token}
  url = ('https://api.github.com/repos/owid/covid-19-data/commits'\
        '?path=public/data/vaccinations/vaccinations.csv')

  # Get last page of commit data
  response = requests.head(url, headers=headers)
  print(str(response.status_code) + ' : ' + response.reason)
  # Fail loudly on auth / rate-limit errors instead of mis-parsing the reply.
  response.raise_for_status()
  # `response.links` is the parsed Link header. It has no 'last' entry when
  # the history fits on a single page, in which case there is exactly 1 page.
  last_link = response.links.get('last', {}).get('url', '')
  page_match = re.search(r'[?&]page=(\d+)', last_link)
  last_page = int(page_match.group(1)) if page_match else 1
  print("Last page of data: %d" %  last_page)

  # Get SHAs
  all_shas = {}
  for page in range(1, last_page + 1):
    response = requests.get(url + '&page=%d' % page, headers=headers)
    # An error payload is a JSON object, not a list of commits; stop here
    # rather than iterating over its keys.
    response.raise_for_status()
    for commit in response.json():
      all_shas[commit['commit']['author']['date']] = commit['sha']
  print("Retrieved SHAs")

  # Pare to last commit dates
  # Timestamps look like "<date>T<time>"; for each date keep the greatest time
  # string. The loop variable is deliberately not named `datetime`, so the
  # `datetime` class imported at the top of the notebook is not shadowed.
  datetimes = defaultdict(str)
  for commit_timestamp in all_shas:
    commit_date, commit_time = commit_timestamp.split("T", 1)
    if datetimes[commit_date] < commit_time:
      datetimes[commit_date] = commit_time
  final_shas = {
      commit_date: all_shas[commit_date + 'T' + commit_time]
      for (commit_date, commit_time) in datetimes.items()
  }

  print("Selected latest SHAs. Days of data: %d" % len(final_shas))

200 : OK
Last page of data: 38
Retrieved SHAs
Selected latest SHAs. Days of data: 588


# Build Data Files

In [120]:
def _summarize_snapshot(owid_date, sha, columns):
  """Summarise one historical vaccinations.csv snapshot per country.

  Downloads vaccinations.csv as of commit `sha` and takes, for every
  `iso_code`, the maximum of each column in `columns`.

  Returns a DataFrame with columns country, owid_date, <columns...>, sha, or
  None when the snapshot lacks `iso_code` or one of the requested columns
  (pandas raises KeyError in that case).
  """
  url = ('https://raw.githubusercontent.com/owid/covid-19-data/'
         + sha
         + '/public/data/vaccinations/vaccinations.csv')
  snapshot = pd.read_csv(url)
  try:
    # sort=False keeps countries in order of first appearance. Rows whose
    # iso_code is missing are left out by groupby.
    summary = snapshot.groupby('iso_code', sort=False)[list(columns)].max()
  except KeyError:
    return None
  summary = summary.reset_index().rename(columns={'iso_code': 'country'})
  summary.insert(1, 'owid_date', owid_date)
  summary['sha'] = sha
  return summary


no_data, new_cols = False, False

# Check for data file and if there are any new columns we need to generate.
# 'sha' is required too: it is how already-processed commits are recognised.
try:
  dates_df = pd.read_csv(cached_data_path,index_col=0)
  new_cols = not all(col in dates_df.columns for col in date_df_columns + ['sha'])

except FileNotFoundError:
  no_data = True

if not use_cached_data or no_data or new_cols:
  print("Constructing dates_df from scratch...")
  dates_df = pd.DataFrame(columns=['country','owid_date'] + date_df_columns + ['sha'])
  existing_frames = []
  known_shas = set()
else:
  print("Checking for new dates to update...")
  existing_frames = [dates_df]
  known_shas = set(dates_df['sha'])

# Download only the commits that are not already in the cached data. Results
# are collected in a list and concatenated once, instead of growing the frame
# row by row.
new_frames = []
for owid_date, sha in tqdm(list(final_shas.items())):
  if sha in known_shas:
    continue
  summary = _summarize_snapshot(owid_date, sha, date_df_columns)
  if summary is not None:  # None: snapshot lacks the requested columns
    new_frames.append(summary)

if new_frames:
  dates_df = pd.concat(existing_frames + new_frames, ignore_index=True)

dates_df

Checking for new dates to update...


100%|██████████| 588/588 [00:13<00:00, 45.21it/s]






Unnamed: 0,country,owid_date,date,total_vaccinations,sha
0,AFG,7/31/22,7/19/22,7885045.0,483e0fef2f293a4204029986c857716717688fbc
1,OWID_AFR,7/31/22,7/30/22,595201224.0,483e0fef2f293a4204029986c857716717688fbc
2,ALB,7/31/22,7/24/22,2934116.0,483e0fef2f293a4204029986c857716717688fbc
3,DZA,7/31/22,5/29/22,15205854.0,483e0fef2f293a4204029986c857716717688fbc
4,AND,7/31/22,7/10/22,153531.0,483e0fef2f293a4204029986c857716717688fbc
...,...,...,...,...,...
118924,ROU,1/3/21,1/3/21,13596.0,c03d2080f168f6bea7b23cabc5368148c10a3a97
118925,RUS,1/3/21,1/2/21,800000.0,c03d2080f168f6bea7b23cabc5368148c10a3a97
118926,GBR,1/3/21,12/27/20,944539.0,c03d2080f168f6bea7b23cabc5368148c10a3a97
118927,USA,1/3/21,1/2/21,4225756.0,c03d2080f168f6bea7b23cabc5368148c10a3a97


In [112]:
# Persist the per-country summary (index included) to the cached data file,
# then report where it was written.
dates_df.to_csv(path_or_buf=cached_data_path)
print("Saved to %s" % (cached_data_path,))

Saved to /content/drive/MyDrive/PTC/GlobalVaxUpdates/dates_df.csv


# Graphs

In [113]:
# Build `dates_df_expanded` from `dates_df`: add income group, reporting lag
# in days, and the difference between the newest OWID figures and what each
# historical snapshot reported.

# Add income levels
income_df = pd.read_csv('https://raw.githubusercontent.com/owid/covid-19-data/master/scripts/input/wb/income_groups.csv')
dates_df_expanded = (
    dates_df
    .merge(income_df, how='left', left_on='country', right_on='Code')
    .drop(labels=['Code','Year'], axis=1)
)

# Add date differences
dates_df_expanded[['owid_date','date']] = dates_df_expanded[['owid_date','date']].apply(pd.to_datetime)
dates_df_expanded['date_diff'] = (dates_df_expanded['owid_date'] - dates_df_expanded['date']).dt.days

# Add most recent OWID data
df_columns = [col for col in date_df_columns if col != 'date']
most_recent_sha = final_shas[max(final_shas)]
url =  ('https://raw.githubusercontent.com/owid/covid-19-data/'\
        + most_recent_sha \
        + '/public/data/vaccinations/vaccinations.csv')
most_recent_df = pd.read_csv(url)
keep_columns = df_columns + ['iso_code','date']
most_recent_df = most_recent_df.drop(
    labels=[col for col in most_recent_df.columns if col not in keep_columns], axis=1)
most_recent_df[['date']] = most_recent_df[['date']].apply(pd.to_datetime)

# Merge dataframes together: "_owid" is the value a snapshot reported on its
# own date, "_event" is the value the newest file gives for that same date.
dates_df_expanded = pd.merge(dates_df_expanded, most_recent_df, how='left', left_on=['country','owid_date'], right_on=['iso_code','date'],suffixes=["_owid","_event"])
dates_df_expanded = dates_df_expanded.drop(labels=["iso_code","date_event"],axis=1)
dates_df_expanded = dates_df_expanded.sort_values(by=['country','owid_date'],axis=0)

# Add diffs for total vaccinations (TODO: generalize this code for all metrics)
# Forward-fill within each country, in owid_date order thanks to the sort
# above. groupby(...).ffill() replaces the per-country loop that relied on
# DataFrame.append (removed in pandas 2.0) and fillna(method='ffill')
# (deprecated).
dates_df_expanded['total_vaccinations_event'] = (
    dates_df_expanded.groupby('country')['total_vaccinations_event'].ffill()
)
dates_df_expanded['total_vaccinations_diff'] = (dates_df_expanded['total_vaccinations_event']-dates_df_expanded['total_vaccinations_owid'])

In [114]:
country = 'USA' #@param ['AFG', 'ALB', 'DZA', 'AND', 'AGO', 'AIA', 'ATG', 'ARG', 'ARM', 'ABW', 'AUS', 'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLZ', 'BEN', 'BMU', 'BTN', 'BOL', 'BES', 'BIH', 'BWA', 'BRA', 'VGB', 'BRN', 'BGR', 'BFA', 'BDI', 'KHM', 'CMR', 'CAN', 'CPV', 'CYM', 'CAF', 'TCD', 'CHL', 'CHN', 'COL', 'COM', 'COG', 'COK', 'CRI', 'CIV', 'HRV', 'CUB', 'CUW', 'CYP', 'CZE', 'COD', 'DNK', 'DJI', 'DMA', 'DOM', 'ECU', 'EGY', 'SLV', 'GNQ', 'EST', 'SWZ', 'ETH', 'FRO', 'FLK', 'FJI', 'FIN', 'FRA', 'PYF', 'GAB', 'GMB', 'GEO', 'DEU', 'GHA', 'GIB', 'GRC', 'GRL', 'GRD', 'GTM', 'GGY', 'GIN', 'GNB', 'GUY', 'HTI', 'HND', 'HKG', 'HUN', 'ISL', 'IND', 'IDN', 'IRN', 'IRQ', 'IRL', 'IMN', 'ISR', 'ITA', 'JAM', 'JPN', 'JEY', 'JOR', 'KAZ', 'KEN', 'KIR', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN', 'LSO', 'LBR', 'LBY', 'LIE', 'LTU', 'LUX', 'MAC', 'MDG', 'MWI', 'MYS', 'MDV', 'MLI', 'MLT', 'MRT', 'MUS', 'MEX', 'MDA', 'MCO', 'MNG', 'MNE', 'MSR', 'MAR', 'MOZ', 'MMR', 'NAM', 'NRU', 'NPL', 'NLD', 'NCL', 'NZL', 'NIC', 'NER', 'NGA', 'NIU', 'MKD', 'NOR', 'OMN', 'PAK', 'PSE', 'PAN', 'PNG', 'PRY', 'PER', 'PHL', 'PCN', 'POL', 'PRT', 'QAT', 'ROU', 'RUS', 'RWA', 'SHN', 'KNA', 'LCA', 'VCT', 'WSM', 'SMR', 'STP', 'SAU', 'SEN', 'SRB', 'SYC', 'SLE', 'SGP', 'SXM', 'SVK', 'SVN', 'SLB', 'SOM', 'ZAF', 'KOR', 'SSD', 'ESP', 'LKA', 'SDN', 'SUR', 'SWE', 'CHE', 'SYR', 'TWN', 'TJK', 'TZA', 'THA', 'TLS', 'TGO', 'TKL', 'TON', 'TTO', 'TUN', 'TUR', 'TKM', 'TCA', 'TUV', 'UGA', 'UKR', 'ARE', 'GBR', 'USA', 'URY', 'UZB', 'VUT', 'VEN', 'VNM', 'WLF', 'YEM', 'ZMB', 'ZWE', 'PLW']
# Scatter of reporting lag against OWID report date for the selected country.
is_selected_country = dates_df_expanded['country'] == country
country_df = dates_df_expanded[is_selected_country]
report_date_axis = alt.X('owid_date', title='Date of OWID Report')
reporting_lag_axis = alt.Y('date_diff', title='Reporting Lag')
alt.Chart(country_df).mark_point().encode(
    x=report_date_axis,
    y=reporting_lag_axis,
).properties(title=country)

In [115]:
# Aggregate per (income group, OWID report date): mean/min/max reporting lag
# and the summed dose difference. `income_order` fixes the display order of
# the income groups in the charts below.
income_order = ['Low income','Lower middle income','Upper middle income','High income']
income_aggregations = {
    'date_diff': ['mean', 'min', 'max'],
    'total_vaccinations_diff': ['sum'],
}
grouped_by_income_and_date = dates_df_expanded.groupby(['Income group','owid_date'])
income_aggregate = grouped_by_income_and_date.agg(income_aggregations).reset_index()
# Flatten the two-level column labels by joining both levels with "_". The
# group keys have an empty second level, so they end in a trailing underscore
# ("Income group_", "owid_date_").
income_aggregate.columns = ['_'.join(levels) for levels in income_aggregate.columns]

In [116]:
# Mean reporting lag per OWID report date, one facet column per income group.
mean_lag_points = alt.Chart(income_aggregate).mark_point().encode(
    x=alt.X('owid_date_', title='Date of OWID report'),
    y=alt.Y('date_diff_mean', title='Mean reporting lag'),
)
mean_lag_facet = alt.Column("Income group_", title=None, sort=income_order)
mean_lag_points.facet(column=mean_lag_facet, columns=4).properties(
    title="Mean Reporting Lag per Day by Country Income Group")

In [117]:
# Keep only rows from the newest OWID report date that have an income group.
latest_report_date = dates_df_expanded['owid_date'].max()
from_latest_report = dates_df_expanded['owid_date'] == latest_report_date
has_income_group = dates_df_expanded['Income group'].notna()
today_df = dates_df_expanded[from_latest_report & has_income_group]
# Current lag scatter
income_group_axis = alt.X('Income group', sort=income_order)
current_lag_axis = alt.Y('date_diff', title="Current reporting lag (days)")
alt.Chart(today_df).mark_point().encode(
    x=income_group_axis,
    y=current_lag_axis,
).properties(width=300, height=600)

In [118]:
# Summed dose difference per OWID report date, drawn as a line with one facet
# column per income group.
dose_lag_lines = alt.Chart(income_aggregate).mark_line().encode(
    x=alt.X('owid_date_', title='Date of OWID report'),
    y=alt.Y('total_vaccinations_diff_sum', title='Dose reporting lag'),
)
dose_lag_facet = alt.Column("Income group_", title=None, sort=income_order)
dose_lag_lines.facet(column=dose_lag_facet, columns=4).properties(
    title="Dose Lag per Day by Country Income Group")

In [119]:
# Mean reporting lag per OWID report date in a single panel, with income
# group encoded as point colour.
report_date_encoding = alt.X('owid_date_', title='Date of OWID Report')
mean_lag_encoding = alt.Y('date_diff_mean', title='Mean reporting lag')
income_color_encoding = alt.Color('Income group_', sort=income_order, title='Income group')
alt.Chart(income_aggregate).mark_point().encode(
    x=report_date_encoding,
    y=mean_lag_encoding,
    color=income_color_encoding,
).properties(title='Average OWID Reporting Lag per Day by Country Income Group')