<a href="https://colab.research.google.com/github/pandemic-tracking/global-vaccine/blob/main/Vaccine_Data_Frequency.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
%load_ext google.colab.data_table

import altair as alt
import base64
import gspread
import numpy as np
import pandas as pd
import re
import requests

from collections import defaultdict
from datetime import datetime
from google.auth import default
from google.colab import auth, drive
from pprint import pprint
from tqdm import tqdm

# Mount Google Drive at /content/drive. The token file and the data paths
# configured further down in this cell all point below this mount point.
drive.mount('/content/drive')

# altair settings: select the 'html' renderer for chart output.
alt.renderers.enable('html')

# gspread setting: authenticate the Colab user, fetch the default Google
# credentials and build the gspread client `gc`, which a later cell uses to
# open the OWID data-sources spreadsheet.
# NOTE(review): presumably authenticate_user() has to run before default()
# can return usable credentials, so keep these three lines in this order.
auth.authenticate_user()
creds, _ = default()  # the second element of the returned tuple is not used
gc = gspread.authorize(creds)


#@markdown Generate a personal access token with repo access for the GitHub API by following [these](https://docs.github.com/en/enterprise-server@3.4/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) instructions.
#@markdown Save the token to your root folder in Google Drive in a text file called token.txt (if you use a different filepath, change the filepath below to match).
github_token_filepath = '/content/drive/MyDrive/token.txt' #@param {type:"string"}
#@markdown Make sure the [PTC Google Drive Folder](https://drive.google.com/drive/u/1/folders/1-XAAKCA1GdnRULS9ZmkOrcrQ8_rpjBjw) is added to your Drive root level.
#@markdown (To add it, right click the name "PTC" and click add shortcut to Drive, then put it in My Drive). These paths point to the data file that will be read and updated in that folder.
cached_data_path = '/content/drive/MyDrive/PTC/GlobalVaxUpdates/dates_df.csv'#@param {type:"string"}
save_data_path = '/content/drive/MyDrive/PTC/GlobalVaxUpdates/dates_df.csv'#@param {type:"string"}

#@markdown Add a comma-separated list of columns from Our World in Data you would like included in your generated file.
date_df_columns = 'date,total_vaccinations,people_vaccinated' #@param {type:"string"}
# Turn the form string into a list of column names. Surrounding whitespace is
# stripped and empty entries are dropped, so "date, total_vaccinations" or a
# trailing comma does not produce a column name that matches nothing in the
# OWID file (which would make every snapshot look like an unreadable old file).
date_df_columns = [col.strip() for col in date_df_columns.split(",") if col.strip()]

#@markdown Uncheck the following box if you'd like to overwrite the data file rather than using cached data. This is not recommended unless you are completely changing the structure of the file.
use_cached_data = True #@param {type:"boolean"}

#@markdown Finally, run this cell to save your configurations.


Mounted at /content/drive


# Get Country Data & Metadata

## Download commit metadata from GitHub

In [4]:
# List every commit that touched OWID's vaccinations.csv (via the GitHub API)
# and keep, for each calendar day, the SHA of that day's last commit.
# Result: `final_shas`, a dict mapping 'YYYY-MM-DD' -> commit SHA.
try:
  with open(github_token_filepath) as f:
    # Only the first line is the token. strip() removes the trailing newline,
    # which must not end up inside the Authorization header value.
    token = f.readline().strip()
except FileNotFoundError:
  print('ERROR: GitHub PHA token file not found - check setup.')

else:
  headers={'Authorization': 'token ' + token}
  url = 'https://api.github.com/repos/owid/covid-19-data/commits?path=public/data/vaccinations/vaccinations.csv'

  # Get last page of commit data. A HEAD request is enough: the page count is
  # carried by the Link response header.
  response = requests.head(url, headers=headers)
  print(str(response.status_code) + ' : ' + response.reason)
  response.raise_for_status()  # stop here on a bad token / rate limit
  # response.links is the parsed Link header keyed by rel. When the result
  # fits on a single page there is no rel="last" entry at all.
  last_page_url = response.links.get('last', {}).get('url')
  page_match = re.search(r'[?&]page=(\d+)', last_page_url) if last_page_url else None
  last_page = int(page_match.group(1)) if page_match else 1
  print("Last page of data: %d" %  last_page)

  # Get SHAs, keyed by the commit's author timestamp
  all_shas = {}
  for page in range(1, last_page + 1):
    response = requests.get(url + '&page=%d' % page, headers=headers)
    # Without this check an error payload (a dict) would be iterated as if it
    # were the list of commits and fail with a confusing TypeError.
    response.raise_for_status()
    for commit in response.json():
      all_shas[commit['commit']['author']['date']] = commit['sha']
  print("Retrieved SHAs")

  # Pare to last commit dates: remember the latest time seen for each date.
  # The loop variable is deliberately not called `datetime`, which would
  # overwrite the `datetime` class imported at the top of the notebook.
  latest_time_by_date = defaultdict(str)
  for commit_timestamp in all_shas:
    commit_date, commit_time = commit_timestamp.split("T", 1)
    if latest_time_by_date[commit_date] < commit_time:
      latest_time_by_date[commit_date] = commit_time
  final_shas = {
      commit_date: all_shas[commit_date + 'T' + commit_time]
      for commit_date, commit_time in latest_time_by_date.items()
  }

  print("Selected latest SHAs. Days of data: %d" % len(final_shas))

200 : OK
Last page of data: 38
Retrieved SHAs
Selected latest SHAs. Days of data: 602


## Build data file

In [5]:
def _summarise_snapshot(owid_date, sha, columns):
  """Download one committed version of vaccinations.csv and reduce it to one row per country.

  Args:
    owid_date: Date key from `final_shas` that this commit belongs to.
    sha: Commit SHA identifying the snapshot on GitHub.
    columns: OWID column names; the per-country maximum of each is recorded.

  Returns:
    A DataFrame with the columns country, owid_date, sha and one column per
    entry of `columns`, or None when the snapshot has no `iso_code` column or
    lacks one of `columns` (treated as an old file layout and skipped).
    Rows of the snapshot whose `iso_code` is missing are not included.
  """
  url = 'https://raw.githubusercontent.com/owid/covid-19-data/' + sha + '/public/data/vaccinations/vaccinations.csv'
  snapshot = pd.read_csv(url)
  try:
    # One pass over the file instead of one boolean filter per country/column.
    # sort=False keeps countries in their order of first appearance.
    per_country = snapshot.groupby('iso_code', sort=False)[columns].max()
  except KeyError:
    return None # old file
  per_country = per_country.reset_index().rename(columns={'iso_code': 'country'})
  per_country.insert(1, 'owid_date', owid_date)
  per_country.insert(2, 'sha', sha)
  return per_country


no_data, new_cols = False, False

# Check for data file and if there are any new columns we need to generate
try:
  dates_df = pd.read_csv(cached_data_path,index_col=0)
  new_cols = not all(col in dates_df.columns for col in date_df_columns)

except FileNotFoundError:
  no_data = True

if not use_cached_data or no_data or new_cols:
  print("Constructing dates_df from scratch...")
  dates_df = pd.DataFrame(columns=['country','owid_date','sha'] + date_df_columns)
  known_shas = set()  # nothing is cached, so every commit gets downloaded

else:
  print("Checking for new dates to update...")
  known_shas = set(dates_df.sha)  # commits already present in the cached file

# Collect the per-commit summaries and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every call.
new_frames = []
for owid_date, sha in tqdm(final_shas.items()):
  if sha in known_shas:
    continue
  summary = _summarise_snapshot(owid_date, sha, date_df_columns)
  if summary is not None:
    new_frames.append(summary)

if new_frames:
  # Leave an empty starting frame out of the concat so it cannot influence
  # the resulting dtypes.
  existing_frames = [dates_df] if len(dates_df) else []
  dates_df = pd.concat(existing_frames + new_frames, ignore_index=True)

dates_df

Checking for new dates to update...


100%|██████████| 602/602 [00:12<00:00, 49.81it/s]






Unnamed: 0,country,owid_date,sha,date,total_vaccinations,people_vaccinated
0,AFG,2022-08-02,d93d4c8747d936a1b83ea5b0784eaec00824adca,2022-07-19,7.885045e+06,7.139453e+06
1,OWID_AFR,2022-08-02,d93d4c8747d936a1b83ea5b0784eaec00824adca,2022-08-01,5.956788e+08,3.671519e+08
2,ALB,2022-08-02,d93d4c8747d936a1b83ea5b0784eaec00824adca,2022-07-24,2.934116e+06,1.330520e+06
3,DZA,2022-08-02,d93d4c8747d936a1b83ea5b0784eaec00824adca,2022-05-29,1.520585e+07,8.210605e+06
4,AND,2022-08-02,d93d4c8747d936a1b83ea5b0784eaec00824adca,2022-07-10,1.535310e+05,5.788800e+04
...,...,...,...,...,...,...
122057,WLF,2022-08-11,386307907c0625789f55cea0c84c994ea5e7bb21,2022-05-02,1.642600e+04,6.483000e+03
122058,OWID_WRL,2022-08-11,386307907c0625789f55cea0c84c994ea5e7bb21,2022-08-10,1.243419e+10,5.329821e+09
122059,YEM,2022-08-11,386307907c0625789f55cea0c84c994ea5e7bb21,2022-08-02,8.806090e+05,7.127810e+05
122060,ZMB,2022-08-11,386307907c0625789f55cea0c84c994ea5e7bb21,2022-08-07,8.130436e+06,7.540676e+06


In [6]:
# Write dates_df (index included) to the configured output path and confirm it.
dates_df.to_csv(path_or_buf=save_data_path)
print("Saved to %s" % (save_data_path,))

Saved to /content/drive/MyDrive/PTC/GlobalVaxUpdates/dates_df.csv


## Build metadata file

In [7]:
# Country codes found in dates_df, without missing codes and without codes
# that contain 'OWID' (e.g. OWID_AFR, OWID_WRL).
countries = [
    code for code in dates_df.country.unique()
    if not (pd.isnull(code) or 'OWID' in code)
]

# Load the three metadata sources first, then join them onto the code list.

# Income level
income_df = pd.read_csv('https://raw.githubusercontent.com/owid/covid-19-data/master/scripts/input/wb/income_groups.csv')

# WHO region
who_regions_df = pd.read_csv('/content/drive/MyDrive/PTC/GlobalVaxUpdates/who-regions.csv')

# OWID data sources: first sheet of the spreadsheet, first row used as header,
# blank / whitespace-only cells converted to NaN.
owid_sources_records = gc.open_by_key('1rZzJ5SITxCXBVi0DWwXmaBeBKNnHv4-OGw5EtuWDTiY').sheet1.get_all_values()
header_row, data_rows = owid_sources_records[0], owid_sources_records[1:]
owid_sources_df = pd.DataFrame.from_records(data_rows, columns=header_row)
owid_sources_df = owid_sources_df.replace(r'^\s*$', np.nan, regex=True)

# Left joins keep every country code even when a source has no entry for it.
countries_df = (
    pd.DataFrame(countries, columns=['Code'])
    .merge(income_df[['Code','Income group']], how='left', on='Code')
    .merge(who_regions_df[['Code','WHO region']], how='left', on='Code')
    .merge(owid_sources_df[['Country','Code','OWID Vax Source Category']], how='left', on='Code')
)

countries_df

Unnamed: 0,Code,Income group,WHO region,Country,OWID Vax Source Category
0,AFG,Low income,Eastern Mediterranean,Afghanistan,World Health Organization
1,ALB,Upper middle income,Europe,Albania,World Health Organization
2,DZA,Lower middle income,Africa,Algeria,World Health Organization
3,AND,High income,Europe,Andorra,World Health Organization
4,AGO,Lower middle income,Africa,Angola,World Health Organization
...,...,...,...,...,...
213,WLF,,,Wallis and Futuna,Pacific Data Hub (PDH)
214,YEM,Low income,Eastern Mediterranean,Yemen,World Health Organization
215,ZMB,Lower middle income,Africa,Zambia,
216,ZWE,Lower middle income,Africa,Zimbabwe,Country


# Graph Data Preprocessing

## Add diff data

In [8]:
# Add date differences: days between the OWID commit date (owid_date) and the
# latest data date that commit contained for the country (date).
dates_df_diff = dates_df.copy()
dates_df_diff[['owid_date','date']] = (dates_df_diff[['owid_date','date']].apply(pd.to_datetime))
dates_df_diff['date_diff'] = (dates_df_diff['owid_date'] - dates_df_diff['date']).dt.days

# Get most recent OWID data (the commit belonging to the latest date key)
most_recent_sha = final_shas[max(final_shas)]
url =  'https://raw.githubusercontent.com/owid/covid-19-data/' + most_recent_sha + '/public/data/vaccinations/vaccinations.csv'
most_recent_df = pd.read_csv(url)
# .copy() makes the column subset an independent frame, so the date conversion
# below is a plain column assignment and not a write into a slice
# (SettingWithCopyWarning).
most_recent_df = most_recent_df[date_df_columns + ['iso_code']].copy()
most_recent_df['date'] = pd.to_datetime(most_recent_df['date'])

# Merge most recent OWID data into dataframe and sort by country then date.
# Overlapping column names get the suffixes "_report" (value as of the commit)
# and "_event" (value the latest file holds for that same calendar date).
dates_df_diff = pd.merge(dates_df_diff, most_recent_df, how='left', left_on=['country','owid_date'], right_on=['iso_code','date'],suffixes=["_report","_event"]).drop(labels=["iso_code","date_event"], axis=1).sort_values(by=['country','owid_date'], axis=0)

# Fill non-updated days and take diff
df_columns = [c for c in date_df_columns if c != 'date']
event_columns = [c+'_event' for c in df_columns]
# GroupBy.ffill keeps the original row index, so the result aligns on
# assignment. The previous groupby.apply(fillna(method='ffill')) form relies
# on a deprecated fillna argument and, on pandas >= 2, returns a
# group-key-prefixed index that no longer lines up with dates_df_diff.
dates_df_diff[event_columns] = dates_df_diff.groupby('country')[event_columns].ffill()
for c in df_columns:
  dates_df_diff[c+'_diff'] = (dates_df_diff[c+'_event']-dates_df_diff[c+'_report'])

dates_df_diff



Unnamed: 0,country,owid_date,sha,date_report,total_vaccinations_report,people_vaccinated_report,date_diff,total_vaccinations_event,people_vaccinated_event,total_vaccinations_diff,people_vaccinated_diff
107201,ABW,2021-04-21,61e07ba791f81bcabe20ccb9c54802512efb9b06,2021-04-20,43945.0,27511.0,1.0,62342.0,44421.0,18397.0,16910.0
107003,ABW,2021-04-22,f7c1b407f3a0f1baba138a976ec8b970061b242f,2021-04-20,43945.0,27511.0,2.0,64288.0,45097.0,20343.0,17586.0
106802,ABW,2021-04-23,ec873e5352966b8614f083415e78df4fcda2b307,2021-04-22,64288.0,45097.0,1.0,67950.0,48061.0,3662.0,2964.0
106601,ABW,2021-04-24,3f2a67d5347019f6e22a93cd7ad8e7f3e7fe7791,2021-04-23,67950.0,48061.0,1.0,69279.0,49351.0,1329.0,1290.0
106400,ABW,2021-04-25,e258b748476aba28663064141f989895218bdc62,2021-04-24,69279.0,49351.0,1.0,73103.0,53161.0,3824.0,3810.0
...,...,...,...,...,...,...,...,...,...,...,...
114112,,2021-03-11,7ab4bd8e36b5b576da5f81f91b6632da427756a0,NaT,,,,,,,
113982,,2021-03-12,5a32237775512b77d5b6911f79f2cb7d5c9f38d0,NaT,,,,,,,
113850,,2021-03-13,9be7fbe5e4c13a94296d5d9232aa53157c8da91e,NaT,,,,,,,
113718,,2021-03-14,99477fa2d05600101a6b377bf5ac9d6c3d426193,NaT,,,,,,,


## Aggregate datasets 

In [9]:
# Attach the country metadata (income group, WHO region, OWID source) to every row
dates_df_diff_expanded = dates_df_diff.merge(countries_df, how='left', left_on=['country'], right_on=['Code'])

# Per group and report date: sum each *_diff column, average the reporting lag
aggregate_dict = {'%s_diff' % c: ['sum'] for c in df_columns}
aggregate_dict['date_diff'] = ['mean']

# Display order for income groups, used by the charts below
income_order = ['Low income','Lower middle income','Upper middle income','High income']


def _aggregate_per_day(group_column):
  """Aggregate dates_df_diff_expanded by (group_column, owid_date).

  The two-level column labels produced by agg() are flattened to
  '<column>_<function>'; the grouping columns keep their plain names.
  """
  grouped = dates_df_diff_expanded.groupby([group_column, 'owid_date']).agg(aggregate_dict).reset_index()
  grouped.columns = ['_'.join(label).strip('_') for label in grouped.columns.values]
  return grouped


# Aggregated income, WHO region and source datasets
income = _aggregate_per_day('Income group')
region = _aggregate_per_day('WHO region')
source = _aggregate_per_day('OWID Vax Source Category')

# Graphs

## Reporting lags

In [10]:
country = 'USA' #@param ['AFG', 'ALB', 'DZA', 'AND', 'AGO', 'AIA', 'ATG', 'ARG', 'ARM', 'ABW', 'AUS', 'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLZ', 'BEN', 'BMU', 'BTN', 'BOL', 'BES', 'BIH', 'BWA', 'BRA', 'VGB', 'BRN', 'BGR', 'BFA', 'BDI', 'KHM', 'CMR', 'CAN', 'CPV', 'CYM', 'CAF', 'TCD', 'CHL', 'CHN', 'COL', 'COM', 'COG', 'COK', 'CRI', 'CIV', 'HRV', 'CUB', 'CUW', 'CYP', 'CZE', 'COD', 'DNK', 'DJI', 'DMA', 'DOM', 'ECU', 'EGY', 'SLV', 'GNQ', 'EST', 'SWZ', 'ETH', 'FRO', 'FLK', 'FJI', 'FIN', 'FRA', 'PYF', 'GAB', 'GMB', 'GEO', 'DEU', 'GHA', 'GIB', 'GRC', 'GRL', 'GRD', 'GTM', 'GGY', 'GIN', 'GNB', 'GUY', 'HTI', 'HND', 'HKG', 'HUN', 'ISL', 'IND', 'IDN', 'IRN', 'IRQ', 'IRL', 'IMN', 'ISR', 'ITA', 'JAM', 'JPN', 'JEY', 'JOR', 'KAZ', 'KEN', 'KIR', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN', 'LSO', 'LBR', 'LBY', 'LIE', 'LTU', 'LUX', 'MAC', 'MDG', 'MWI', 'MYS', 'MDV', 'MLI', 'MLT', 'MRT', 'MUS', 'MEX', 'MDA', 'MCO', 'MNG', 'MNE', 'MSR', 'MAR', 'MOZ', 'MMR', 'NAM', 'NRU', 'NPL', 'NLD', 'NCL', 'NZL', 'NIC', 'NER', 'NGA', 'NIU', 'MKD', 'NOR', 'OMN', 'PAK', 'PSE', 'PAN', 'PNG', 'PRY', 'PER', 'PHL', 'PCN', 'POL', 'PRT', 'QAT', 'ROU', 'RUS', 'RWA', 'SHN', 'KNA', 'LCA', 'VCT', 'WSM', 'SMR', 'STP', 'SAU', 'SEN', 'SRB', 'SYC', 'SLE', 'SGP', 'SXM', 'SVK', 'SVN', 'SLB', 'SOM', 'ZAF', 'KOR', 'SSD', 'ESP', 'LKA', 'SDN', 'SUR', 'SWE', 'CHE', 'SYR', 'TWN', 'TJK', 'TZA', 'THA', 'TLS', 'TGO', 'TKL', 'TON', 'TTO', 'TUN', 'TUR', 'TKM', 'TCA', 'TUV', 'UGA', 'UKR', 'ARE', 'GBR', 'USA', 'URY', 'UZB', 'VUT', 'VEN', 'VNM', 'WLF', 'YEM', 'ZMB', 'ZWE', 'PLW']
# Rows of the expanded lag table for the country picked in the form above
is_selected_country = dates_df_diff_expanded['country'] == country
country_df = dates_df_diff_expanded[is_selected_country]

# Scatter of reporting lag against report date; the chart is the cell output
alt.Chart(country_df, title=country).mark_point().encode(
    alt.X('owid_date', title='Date of OWID Report'),
    alt.Y('date_diff', title='Reporting Lag'),
)

In [11]:
def _mean_lag_facet(frame, panels, panels_per_row, heading):
  """Point chart of mean reporting lag against OWID report date, one panel per group.

  `panels` is the alt.Column that defines the faceting field; `panels_per_row`
  and `heading` are passed on as the facet `columns` value and the chart title.
  """
  points = alt.Chart(frame).mark_point().encode(
      x=alt.X('owid_date',title='Date of OWID report'),
      y=alt.Y('date_diff_mean',title='Mean reporting lag'),
  )
  return points.facet(column=panels, columns=panels_per_row).properties(title=heading)


# Mean reporting lag by country income group (panels in income order)
income_facet = _mean_lag_facet(
    income,
    alt.Column("Income group",title=None,sort=income_order),
    4,
    "Mean Reporting Lag per Day by Country Income Group",
)
income_facet.display()

# Mean reporting lag by WHO region
region_facet = _mean_lag_facet(
    region,
    alt.Column("WHO region",title=None),
    6,
    "Mean Reporting Lag per Day by WHO region",
)
region_facet.display()

# Mean reporting lag by OWID data source category
source_facet = _mean_lag_facet(
    source,
    alt.Column("OWID Vax Source Category",title=None),
    6,
    "Mean Reporting Lag per Day by OWID Source Type",
)
source_facet.display()

In [12]:
# Rows belonging to the most recent report date, then one subset per grouping
# with the countries that lack that piece of metadata removed.
latest_report_date = dates_df_diff_expanded['owid_date'].max()
today_df = dates_df_diff_expanded[dates_df_diff_expanded['owid_date'] == latest_report_date]
today_df_income = today_df.dropna(subset=['Income group'])
today_df_region = today_df.dropna(subset=['WHO region'])
today_df_source = today_df.dropna(subset=['OWID Vax Source Category'])


def _current_lag_scatter(frame, x_channel):
  """300x600 point chart of each row's current reporting lag over `x_channel`."""
  return alt.Chart(frame).mark_point().encode(
      x=x_channel,
      y=alt.Y('date_diff',title="Current reporting lag (days)"),
  ).properties(width=300, height=600)


# Current lag scatter, one chart per grouping
lag_by_income = _current_lag_scatter(today_df_income, alt.X('Income group',sort=income_order))
lag_by_region = _current_lag_scatter(today_df_region, alt.X('WHO region'))
lag_by_source = _current_lag_scatter(today_df_source, alt.X('OWID Vax Source Category'))

# Side-by-side layout; this expression is the cell output
alt.hconcat(lag_by_income,lag_by_region,lag_by_source)

In [13]:
def _mean_lag_coloured(frame, colour_channel, heading):
  """Single-panel point chart of mean reporting lag per report date, coloured by group."""
  return alt.Chart(frame).mark_point().encode(
      x=alt.X('owid_date',title='Date of OWID Report'),
      y=alt.Y('date_diff_mean',title='Mean reporting lag'),
      color=colour_channel,
  ).properties(title=heading)


# Income groups (legend in income order)
income_average = _mean_lag_coloured(
    income,
    alt.Color('Income group',sort=income_order,title='Income group'),
    'Average OWID Reporting Lag per Day by Country Income Group',
)

# WHO regions
region_average = _mean_lag_coloured(
    region,
    alt.Color('WHO region',title='WHO region'),
    'Average OWID Reporting Lag per Day by WHO Region',
)

# OWID source categories
source_average = _mean_lag_coloured(
    source,
    alt.Color('OWID Vax Source Category',title='OWID Source Type'),
    'Average OWID Reporting Lag per Day by OWID Source type',
)

# Build all three first, then render them in this order
for lag_chart in (income_average, region_average, source_average):
  lag_chart.display()

## Impact

In [14]:
def _income_impact_facet(value_column, axis_title, heading):
  """Line chart of `value_column` from `income` per report date, one panel per income group."""
  lines = alt.Chart(income).mark_line().encode(
      x=alt.X('owid_date',title='Date of OWID report'),
      y=alt.Y(value_column,title=axis_title),
  )
  panels = alt.Column("Income group",title=None,sort=income_order)
  return lines.facet(column=panels, columns=4).properties(title=heading)


# Summed dose difference by income group
dose_income = _income_impact_facet(
    'total_vaccinations_diff_sum',
    'Dose reporting lag',
    "Dose Lag per Day by Country Income Group",
)
dose_income.display()

# Summed people-vaccinated difference by income group
people_income = _income_impact_facet(
    'people_vaccinated_diff_sum',
    'People vaccinated reporting lag',
    "People Vaccinated Lag per Day by Country Income Group",
)
people_income.display()


In [15]:
def _region_impact_facet(value_column, axis_title, heading):
  """Line chart of `value_column` from `region` per report date, one panel per WHO region."""
  lines = alt.Chart(region).mark_line().encode(
      x=alt.X('owid_date',title='Date of OWID report'),
      y=alt.Y(value_column,title=axis_title),
  )
  panels = alt.Column("WHO region",title=None)
  return lines.facet(column=panels, columns=6).properties(title=heading)


# Summed dose difference by WHO region
dose_region = _region_impact_facet(
    'total_vaccinations_diff_sum',
    'Dose reporting lag',
    "Dose Lag per Day by WHO Region",
)
dose_region.display()

# Summed people-vaccinated difference by WHO region
people_region = _region_impact_facet(
    'people_vaccinated_diff_sum',
    'People vaccinated reporting lag',
    "People Vaccinated Lag per Day by WHO Region",
)
people_region.display()

In [16]:
def _source_impact_facet(value_column, axis_title, heading):
  """Line chart of `value_column` from `source` per report date, one panel per OWID source category."""
  lines = alt.Chart(source).mark_line().encode(
      x=alt.X('owid_date',title='Date of OWID report'),
      y=alt.Y(value_column,title=axis_title),
  )
  panels = alt.Column("OWID Vax Source Category",title=None)
  return lines.facet(column=panels, columns=6).properties(title=heading)


# Summed dose difference by OWID source category
dose_source = _source_impact_facet(
    'total_vaccinations_diff_sum',
    'Dose reporting lag',
    "Dose Lag per Day by OWID Source Type",
)
dose_source.display()

# Summed people-vaccinated difference by OWID source category
people_source = _source_impact_facet(
    'people_vaccinated_diff_sum',
    'People vaccinated reporting lag',
    "People Vaccinated Lag per Day by OWID Source Type",
)
people_source.display()