<a href="https://colab.research.google.com/github/pandemic-tracking/global-vaccine/blob/main/OWID_WHO_vaccine_administration_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

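# OWID/WHO comparison

Compares the latest cumulative COVID-19 vaccination totals published by Our World in Data (OWID) with the totals published by the World Health Organization (WHO), country by country.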

# Prepare dataset

In [None]:
import pandas as pd
from datetime import datetime
from google.colab import data_table
data_table.enable_dataframe_formatter()

# get OWID vaccination timeseries from Github
owid_data = pd.read_csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations.csv")

# drop non cumulative columns (daily and per-hundred / per-million series) from OWID data
owid_data = owid_data.drop(columns=[
    'daily_vaccinations', 'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred',
    'people_fully_vaccinated_per_hundred', 'total_boosters_per_hundred', 'daily_vaccinations_per_million',
    'daily_people_vaccinated', 'daily_people_vaccinated_per_hundred', 'daily_vaccinations_raw',
])

# Forward fill empty values in the OWID dataset, separately for each country so
# that one country's figures never leak into the next one.
# GroupBy.ffill() returns a frame on the original row index, so the assignment
# lines up row-for-row. The former groupby.apply(lambda x: x.fillna(method='ffill'))
# depended on the deprecated `method=` argument of fillna and on apply() not
# prepending the group key to the result index.
# The fill follows the row order of the file — assumes rows are chronological
# within each iso_code; TODO confirm.
# NOTE(review): 'people_fully_vaccinated' is kept in the final table but is not
# forward filled here — confirm whether that is intentional.
owid_ffill_columns = ['total_vaccinations', 'people_vaccinated', 'total_boosters']
owid_data[owid_ffill_columns] = owid_data.groupby('iso_code')[owid_ffill_columns].ffill()

# get latest date for each country in OWID dataset
latest_owid_dates = owid_data.groupby('iso_code')['date'].max().to_frame()

# merge back the full row that belongs to each country's latest date
latest_owid_data = pd.merge(latest_owid_dates, owid_data, on=['iso_code', 'date'])

# get WHO data
who_data = pd.read_csv("https://covid19.who.int/who-data/vaccination-data.csv")

# get PTC owid source classifications (only the country code and the category are needed)
owid_sources = pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vTDKyIaQVtTIy7kn5pD2W8oKM3YoX3YOdSsH3q-r0INH2axjQl6YxgDHBi4HikKx_cmRElde_E-2vlr/pub?gid=2040574494&single=true&output=csv").filter(['Code', 'OWID Vax Source Category'])

# merge latest OWID data with WHO data; the inner join keeps only countries present in both
merged_data = (
    pd.merge(latest_owid_data, who_data, how='inner', left_on='iso_code', right_on='ISO3')
    .drop(columns=[
        'WHO_REGION', 'TOTAL_VACCINATIONS_PER100', 'PERSONS_VACCINATED_1PLUS_DOSE_PER100', 'PERSONS_FULLY_VACCINATED_PER100',
        'VACCINES_USED', 'FIRST_VACCINE_DATE', 'NUMBER_VACCINES_TYPES_USED', 'PERSONS_BOOSTER_ADD_DOSE_PER100',
        'PERSONS_BOOSTER_ADD_DOSE', 'PERSONS_VACCINATED_1PLUS_DOSE', 'PERSONS_FULLY_VACCINATED', 'people_vaccinated',
    ])
    # Prefix the remaining look-alike columns with their origin. Only columns that
    # survive the drop above are listed; entries for dropped columns would be no-ops.
    .rename(columns={
        'date': 'owid_date',
        'DATE_UPDATED': 'WHO_DATE',
        'total_vaccinations': 'owid_total_vaccinations',
        'TOTAL_VACCINATIONS': 'WHO_TOTAL_VACCINATIONS',
    })
)

# merge combined OWID+WHO data source with PTC owid source classifications
# (inner join: countries without a classification row are left out)
merged_data = pd.merge(merged_data, owid_sources, how='inner', left_on='ISO3', right_on='Code')

# calculate total vaccines diff; negative means OWID reports more doses than WHO
merged_data['diff_total_vaccinations'] = merged_data.WHO_TOTAL_VACCINATIONS - merged_data.owid_total_vaccinations
print('Diff = WHO - owid' )
merged_data





Diff = WHO - owid


Unnamed: 0,iso_code,owid_date,location,owid_total_vaccinations,people_fully_vaccinated,total_boosters,COUNTRY,ISO3,DATA_SOURCE,WHO_DATE,WHO_TOTAL_VACCINATIONS,Code,OWID Vax Source Category,diff_total_vaccinations
0,ABW,2022-08-22,Aruba,173010.0,83373.0,,Aruba,ABW,REPORTING,2022-08-12,172899.0,ABW,Country,-111.0
1,AFG,2022-08-15,Afghanistan,11216694.0,9613976.0,,Afghanistan,AFG,REPORTING,2022-08-15,11216694.0,AFG,World Health Organization,0.0
2,AGO,2022-08-14,Angola,21991698.0,7713554.0,1066507.0,Angola,AGO,REPORTING,2022-08-14,21991698.0,AGO,World Health Organization,0.0
3,AIA,2022-08-17,Anguilla,24102.0,10314.0,2942.0,Anguilla,AIA,REPORTING,2022-08-12,24060.0,AIA,Pan American Health Organization,-42.0
4,ALB,2022-07-31,Albania,2940709.0,1255342.0,341320.0,Albania,ALB,REPORTING,2022-07-31,2940709.0,ALB,World Health Organization,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,WSM,2022-08-08,Samoa,503009.0,198896.0,74822.0,Samoa,WSM,REPORTING,2022-05-29,439060.0,WSM,Pacific Data Hub (PDH),-63949.0
210,YEM,2022-08-02,Yemen,880609.0,452843.0,80.0,Yemen,YEM,REPORTING,2022-08-02,880609.0,YEM,World Health Organization,0.0
211,ZAF,2022-08-22,South Africa,37381696.0,19288156.0,3660254.0,South Africa,ZAF,REPORTING,2022-08-14,37318866.0,ZAF,Country,-62830.0
212,ZMB,2022-08-22,Zambia,8830640.0,5642960.0,33684.0,Zambia,ZMB,REPORTING,2022-08-07,8435595.0,ZMB,Country,-395045.0


# Using the merged dataset 

In [None]:
def _print_dose_totals(frame):
    """Print the WHO and OWID total-dose sums of ``frame`` and return them.

    Each sum is truncated to ``int`` and printed with thousands separators.

    Returns:
        tuple: ``(who_total, owid_total)`` as ints, so callers can derive a diff.
    """
    who_total = int(frame['WHO_TOTAL_VACCINATIONS'].sum(axis=0))
    owid_total = int(frame['owid_total_vaccinations'].sum(axis=0))
    print('WHO total doses: %s' % "{:,}".format(who_total))
    print('OWID total doses: %s' % "{:,}".format(owid_total))
    return who_total, owid_total


# The two total-dose columns compared throughout this cell.
col_list= ['owid_total_vaccinations', 'WHO_TOTAL_VACCINATIONS']
owid_totals = merged_data['owid_total_vaccinations']
who_totals = merged_data['WHO_TOTAL_VACCINATIONS']

print('\ntotal')
print(len( merged_data))

# NOTE(review): the date columns are compared exactly as loaded; if they are
# strings, the ordering is chronological only for ISO-8601 (YYYY-MM-DD) values — confirm.
owid_dates = merged_data['owid_date']
who_dates = merged_data['WHO_DATE']
print('\n Dates')
print('matching ' , int((owid_dates == who_dates).sum()))
print('owid greater ' , int((owid_dates > who_dates).sum()))
print('owid lesser ' , int((owid_dates < who_dates).sum()))

print('\nTotal Vaccinations')
matching_total_vaccinations_df = merged_data.loc[owid_totals == who_totals]
print('matching ' , len(matching_total_vaccinations_df))
_print_dose_totals(matching_total_vaccinations_df)

# owid greater
owid_greater_total_vaccinations_df = merged_data.loc[owid_totals > who_totals]
print('\nowid greater ' , len(owid_greater_total_vaccinations_df))
who_total, owid_total = _print_dose_totals(owid_greater_total_vaccinations_df)
print('DIFF: %s' % "{:,}".format(owid_total - who_total))

# owid lesser
owid_lesser_total_vaccinations_df = merged_data.loc[owid_totals < who_totals]
print('\nowid lesser ' , len(owid_lesser_total_vaccinations_df))
who_total, owid_total = _print_dose_totals(owid_lesser_total_vaccinations_df)
print('DIFF: %s' % "{:,}".format(who_total - owid_total))

# Rows with a missing total compare False on ==, > and <, so they belong to none
# of the three groups above, yet their non-missing side still enters the overall
# sums below. Report them so the group counts can be reconciled with the total.
missing_total_count = int(merged_data[col_list].isna().any(axis=1).sum())
if missing_total_count:
    print('\nmissing OWID or WHO total (in no group above) ' , missing_total_count)

print('\n')
_print_dose_totals(merged_data)
print('Overall diff total doses (WHO-Owid): %s' % "{:,}".format(int(who_totals.sum(axis=0)-owid_totals.sum(axis=0))))



total
214

 Dates
matching  97
owid greater  88
owid lesser  29

Total Vaccinations
matching  108
WHO total doses: 2,241,537,165
OWID total doses: 2,241,537,165

owid greater  80
WHO total doses: 5,305,073,126
OWID total doses: 5,375,434,420
DIFF: 70,361,294

owid lesser  26
WHO total doses: 4,851,270,957
OWID total doses: 4,804,680,428
DIFF: 46,590,529


WHO total doses: 12,397,881,248
OWID total doses: 12,421,652,013
Overall diff total doses (WHO-Owid): -23,770,765


# Isolate anomalies

In [None]:
# Anomaly: OWID names WHO as its source for the country, yet the OWID total
# is not equal to the total WHO itself reports.
sourced_from_who = merged_data["OWID Vax Source Category"].eq('World Health Organization')
totals_differ = merged_data.diff_total_vaccinations.ne(0)

# Columns that add nothing to the mismatch listing.
listing_noise_columns = [
    'people_fully_vaccinated',
    'total_boosters',
    'COUNTRY',
    'ISO3',
    'DATA_SOURCE',
    'Code',
    'OWID Vax Source Category',
]

# Ascending sort: rows where OWID exceeds WHO (negative diff) come first.
og = (
    merged_data[sourced_from_who & totals_differ]
    .drop(columns=listing_noise_columns)
    .sort_values(by='diff_total_vaccinations')
)
print('found mismatches: ', len(og))
og

found mismatches:  3


Unnamed: 0,iso_code,owid_date,location,owid_total_vaccinations,WHO_DATE,WHO_TOTAL_VACCINATIONS,diff_total_vaccinations
169,SLE,2022-08-07,Sierra Leone,3526495.0,2022-08-07,3493386.0,-33109.0
186,TGO,2022-08-14,Togo,3290821.0,2022-08-14,3262548.0,-28273.0
114,LSO,2022-07-17,Lesotho,1077116.0,2022-08-14,1102069.0,24953.0
