In [437]:
from functools import reduce
import pandas as pd
import datetime
from polio_utils import download_polio_data, extract_wild_cases, extract_vd_cases, owid_population, standardise_countries


Download and extract the data from the latest wild polio virus pdf

In [418]:
# Fetch the weekly wild poliovirus (WPV) report and keep a local copy of the PDF.
# NOTE(review): only a URL stub is passed in; presumably download_polio_data
# completes the file name itself -- confirm in polio_utils.
wild_pdf_path = 'data/polio_wild_cases.pdf'
res = download_polio_data(
    url_stub='https://polioeradication.org/wp-content/uploads/2022/03/weekly-polio-analyses-WPV-'
)

# The response body is written to disk as raw bytes.
with open(wild_pdf_path, 'wb') as pdf_file:
    pdf_file.write(res.content)

# Extract the wild poliovirus case table from the saved PDF.
wt_df = extract_wild_cases(file_path=wild_pdf_path)

Calculate cases per million population

In [419]:
# Population figures used as the denominator for the per-million rate.
population = owid_population()

# Pass the country names through standardise_countries and make sure the
# year and the case counts are integers.
wt_df['entity'] = standardise_countries(wt_df['entity'])
for int_col in ('year', 'wild_polio_1_cases'):
    wt_df[int_col] = wt_df[int_col].astype(int)

# Right join: every wild-case row is kept, even when no population is found.
wt_pop = population.merge(wt_df, how="right")

# Cases per million people. Rows without a population figure evaluate to NaN
# and are reported as 0; the result is rounded to three decimals.
wt_pop['wild_polio_1_cases_per_million'] = (
    wt_pop['wild_polio_1_cases']
    .div(wt_pop['population'])
    .mul(1000000)
    .fillna(0)
    .round(decimals=3)
)

wt_pop

Unnamed: 0,entity,year,population,wild_polio_1_cases,wild_polio_1_cases_per_million
0,Pakistan,2016,203631360,20,0.098
1,Afghanistan,2016,35383028,13,0.367
2,Malawi,2016,17205254,0,0.0
3,Nigeria,2016,185960256,4,0.022
4,Iran,2016,79563992,0,0.0
5,World,2016,7464344332,37,0.005
6,Pakistan,2017,207906208,8,0.038
7,Afghanistan,2017,36296108,14,0.386
8,Malawi,2017,17670194,0,0.0
9,Nigeria,2017,190873248,0,0.0


Download and extract the data from the latest vaccine derived polio cases pdf

In [420]:
# Fetch the weekly circulating vaccine-derived poliovirus (cVDPV) report and
# keep a local copy of the PDF.
# The stub uses https, matching the wild poliovirus download from the same
# host, so the PDF is not fetched over an unencrypted connection.
# NOTE(review): only a URL stub is passed in; presumably download_polio_data
# completes the file name itself -- confirm in polio_utils.
vd_pdf_path = 'data/polio_vaccine_derived_cases.pdf'
res = download_polio_data(
    url_stub='https://polioeradication.org/wp-content/uploads/2022/03/weekly-polio-analyses-cVDPV-'
)

# The response body is written to disk as raw bytes.
with open(vd_pdf_path, 'wb') as f:
    f.write(res.content)

# Extract the vaccine-derived case table from the saved PDF.
vd_df = extract_vd_cases(file_path=vd_pdf_path)

Calculate cases per million population

In [421]:
# Population figures used as the denominator for the per-million rates.
population = owid_population()

# Pass the country names through standardise_countries and make the year an
# integer before joining.
vd_df['entity'] = standardise_countries(vd_df['entity'])
vd_df['year'] = vd_df['year'].astype(int)

# Right join: every vaccine-derived row is kept, even without a population.
vd_pop = population.merge(vd_df, how="right")

# One per-million column for each strain and for the overall cVDPV total.
for case_col in ('cVDPV1', 'cVDPV2', 'cVDPV3', 'total_cVDPV'):
    vd_pop[f'{case_col}_per_million'] = (vd_pop[case_col] / vd_pop['population']) * 1000000

vd_pop

Unnamed: 0,entity,year,population,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,cVDPV1_per_million,cVDPV2_per_million,cVDPV3_per_million,total_cVDPV_per_million
0,Afghanistan,2020,38928340,0.0,308.0,0.0,308.0,0.000000,7.911974,0.0,7.911974
1,Afghanistan,2021,39835428,0.0,43.0,0.0,43.0,0.000000,1.079441,0.0,1.079441
2,Angola,2019,31825298,0.0,138.0,0.0,138.0,0.000000,4.336173,0.0,4.336173
3,Angola,2020,32866270,0.0,3.0,0.0,3.0,0.000000,0.091279,0.0,0.091279
4,Benin,2019,11801151,0.0,8.0,0.0,8.0,0.000000,0.677900,0.0,0.677900
...,...,...,...,...,...,...,...,...,...,...,...
93,Ukraine,2021,43466820,0.0,2.0,0.0,2.0,0.000000,0.046012,0.0,0.046012
94,Yemen,2019,29161922,1.0,0.0,0.0,1.0,0.034291,0.000000,0.0,0.034291
95,Yemen,2020,29825968,31.0,0.0,0.0,31.0,1.039363,0.000000,0.0,1.039363
96,Yemen,2021,30490638,3.0,13.0,0.0,16.0,0.098391,0.426360,0.0,0.524751


Combine the Wild and Vaccine Derived cases

In [422]:
# Frames to combine, joined pairwise from left to right.
polio_dataframes = [wt_pop, vd_pop]
merge_keys = ["entity", "year", "population"]

# Outer join so that an entity-year present in only one source is still kept.
polio_df = polio_dataframes[0]
for next_frame in polio_dataframes[1:]:
    polio_df = pd.merge(polio_df, next_frame, on=merge_keys, how="outer")

# Population was only needed for the per-million rates computed earlier.
polio_df = polio_df.drop(columns='population')


Combine cases for entities where the strains are currently recorded on separate rows, so that each entity has one row per year (this appears to affect only the "Total" rows; to be confirmed).

In [423]:
# Collapse to a single row per entity and year by summing every other column
# (missing values are skipped by the sum), then restore entity/year as columns.
entity_year = ['entity', 'year']
polio_df = polio_df.groupby(entity_year).agg('sum').reset_index()
polio_df

Unnamed: 0,entity,year,wild_polio_1_cases,wild_polio_1_cases_per_million,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,cVDPV1_per_million,cVDPV2_per_million,cVDPV3_per_million,total_cVDPV_per_million
0,Afghanistan,2016,13.0,0.367,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000
1,Afghanistan,2017,14.0,0.386,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000
2,Afghanistan,2018,21.0,0.565,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000
3,Afghanistan,2019,29.0,0.762,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000
4,Afghanistan,2020,56.0,1.439,0.0,308.0,0.0,308.0,0.000000,7.911974,0.0,7.911974
...,...,...,...,...,...,...,...,...,...,...,...,...
100,World,2021,18.0,0.003,16.0,622.0,0.0,638.0,0.002032,0.078984,0.0,0.081016
101,Yemen,2019,0.0,0.000,1.0,0.0,0.0,1.0,0.034291,0.000000,0.0,0.034291
102,Yemen,2020,0.0,0.000,31.0,0.0,0.0,31.0,1.039363,0.000000,0.0,1.039363
103,Yemen,2021,0.0,0.000,3.0,13.0,0.0,16.0,0.098391,0.426360,0.0,0.524751


Add columns for: total polio = wild + vaccine derived

In [424]:
# total = wild + vaccine-derived, for both the raw counts and the rates.
# Any missing result is reported as 0.
total_specs = (
    ('total_polio', 'wild_polio_1_cases', 'total_cVDPV'),
    ('total_polio_per_million', 'wild_polio_1_cases_per_million', 'total_cVDPV_per_million'),
)
for total_col, wild_col, vaccine_col in total_specs:
    polio_df[total_col] = polio_df[wild_col].add(polio_df[vaccine_col]).fillna(0)

Data from 1980 onwards from WHO - download from http://www.who.int/entity/immunization/monitoring_surveillance/data/incidence_series.xls?ua=1

In [448]:
# Read the 'Polio' sheet of the WHO incidence workbook. The displayed output
# has one row per country and one column per year.
who_incidence_path = 'data/incidence_series.xls'
who_polio = pd.read_excel(who_incidence_path, sheet_name='Polio')
who_polio

Unnamed: 0,WHO_REGION,ISO_code,Cname,Disease,2019,2018,2017,2016,2015,2014,...,1989,1988,1987,1986,1985,1984,1983,1982,1981,1980
0,EMR,AFG,Afghanistan,polio,0.0,0.0,0.0,13.0,20.0,28.0,...,55.0,307.0,628.0,1843.0,1981.0,552.0,1991.0,1390.0,837.0,880.0
1,EUR,ALB,Albania,polio,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
2,AFR,DZA,Algeria,polio,0.0,0.0,0.0,0.0,0.0,0.0,...,18.0,9.0,35.0,29.0,66.0,108.0,132.0,71.0,114.0,116.0
3,EUR,AND,Andorra,polio,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,AFR,AGO,Angola,polio,138.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,15.0,37.0,14.0,3.0,0.0,6.0,12.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,AMR,VEN,Venezuela (Bolivarian Republic of),polio,0.0,0.0,0.0,0.0,0.0,0.0,...,16.0,17.0,46.0,27.0,8.0,9.0,9.0,30.0,68.0,11.0
190,WPR,VNM,Viet Nam,polio,,,0.0,0.0,0.0,0.0,...,427.0,839.0,1449.0,938.0,1600.0,1158.0,1109.0,897.0,644.0,1741.0
191,EMR,YEM,Yemen,polio,1.0,0.0,0.0,0.0,0.0,0.0,...,701.0,114.0,179.0,601.0,336.0,767.0,633.0,235.0,541.0,722.0
192,AFR,ZMB,Zambia,polio,2.0,0.0,0.0,0.0,0.0,0.0,...,47.0,85.0,69.0,134.0,128.0,177.0,182.0,243.0,429.0,276.0


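Preparing to calculate WHO regional totals: extract the country-to-region lookup and reshape the WHO data from wide (one column per year) to long format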

In [449]:
# Country -> WHO region lookup.
# The names are passed through standardise_countries, exactly like every other
# frame in this notebook, so that the later join on 'entity' does not silently
# drop countries whose WHO name differs from the standardised one.
regions = who_polio[['WHO_REGION', 'Cname']].rename(columns={'Cname': 'entity'})
regions['entity'] = standardise_countries(regions['entity'])
regions = regions.drop_duplicates()

# Reshape from one column per year to one row per (country, year).
# who_polio itself is left untouched (no inplace drop), so this cell can be
# re-run without failing on the already-removed columns.
who_df = (
    who_polio
    .drop(columns=['Disease', 'WHO_REGION', 'ISO_code'])
    .melt(id_vars=['Cname'])
)

In [450]:
# Standardise country names, keep and rename the melted columns, and retain
# only the years before 2016 from the WHO series.
who_df = (
    who_df
    .assign(entity=standardise_countries(who_df['Cname']))
    .loc[:, ['entity', 'variable', 'value']]
    .rename(columns={'variable': 'year', 'value': 'total_polio'})
    .astype({'year': int})
    .loc[lambda frame: frame['year'] < 2016]
)

In [452]:
# Population figures used as the denominator for the per-million rate.
population = owid_population()

# Right join: every WHO row is kept, even when no population is found.
who_df = pd.merge(left=population, right=who_df, how="right")
who_df['total_polio_per_million'] = (who_df['total_polio'] / who_df['population']) * 1000000

# Report missing counts/rates as 0. The original call discarded the result of
# fillna, so it had no effect; the fill is now assigned and limited to the two
# value columns, matching how totals are filled elsewhere in this notebook.
who_value_cols = ['total_polio', 'total_polio_per_million']
who_df[who_value_cols] = who_df[who_value_cols].fillna(0)

# Population was only needed for the rate above.
who_df = who_df.drop(columns="population")

In [453]:
# Placeholder rows: one per WHO entity for every year from 2016 up to, but not
# including, the current year (the range end is exclusive).
fill_years = range(2016, datetime.datetime.now().year)
fill_df = pd.DataFrame(
    [(entity, year) for entity in who_df['entity'].drop_duplicates() for year in fill_years],
    columns=['entity', 'year'],
)

# Float NaN (rather than None) keeps the value columns numeric after the
# concatenation below.
fill_df['total_polio'] = float('nan')
fill_df['total_polio_per_million'] = float('nan')

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.
who_df = pd.concat([who_df, fill_df], ignore_index=True)


In [461]:
# Stack the recent case data (polio_df) on top of the WHO series.
#
# The rows are matched on entity and year only. Matching on the value columns
# as well (total_polio, total_polio_per_million) meant that a WHO placeholder
# row with missing values never matched the polio_df row for the same entity
# and year, so both were kept and the output contained duplicate entity-year
# rows. Here a WHO row is dropped whenever polio_df already covers that
# entity-year, so the polio_df figures take precedence.
key_cols = ['entity', 'year']
polio_keys = pd.MultiIndex.from_frame(polio_df[key_cols])
who_keys = pd.MultiIndex.from_frame(who_df[key_cols])
who_only = who_df[~who_keys.isin(polio_keys)]

# polio_df rows come first, followed by the remaining WHO rows.
total_df = pd.concat([polio_df, who_only], ignore_index=True)

Unnamed: 0,entity,year,wild_polio_1_cases,wild_polio_1_cases_per_million,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,cVDPV1_per_million,cVDPV2_per_million,cVDPV3_per_million,total_cVDPV_per_million,total_polio,total_polio_per_million
101,Yemen,2019,0.0,0.0,1.0,0.0,0.0,1.0,0.034291,0.0,0.0,0.034291,1.0,0.034291
102,Yemen,2020,0.0,0.0,31.0,0.0,0.0,31.0,1.039363,0.0,0.0,1.039363,31.0,1.039363
103,Yemen,2021,0.0,0.0,3.0,13.0,0.0,16.0,0.098391,0.42636,0.0,0.524751,16.0,0.524751
296,Yemen,2015,,,,,,,,,,,0.0,0.0
490,Yemen,2014,,,,,,,,,,,0.0,0.0
684,Yemen,2013,,,,,,,,,,,1.0,0.039766
878,Yemen,2012,,,,,,,,,,,3.0,0.122583
1072,Yemen,2011,,,,,,,,,,,9.0,0.378031
1266,Yemen,2010,,,,,,,,,,,0.0,0.0
1460,Yemen,2009,,,,,,,,,,,0.0,0.0


In [462]:
# Entity-years without a reported total are shown as 0.
total_cols = ['total_polio', 'total_polio_per_million']
total_df[total_cols] = total_df[total_cols].fillna(0)


Calculating WHO regional totals and renaming the WHO region abbreviations

In [463]:
# Full names for the WHO region abbreviations.
region_names = {
    'AFR': 'Africa',
    'AMR': 'Americas',
    'SEAR': 'South-East Asia',
    'EUR': 'Europe',
    'EMR': 'Eastern Mediterranean',
    'WPR': 'Western Pacific',
}

# Attach each country's WHO region and total the columns per region and year.
# The country name is dropped first so that only numeric columns are summed
# (newer pandas versions no longer discard string columns silently).
regional_total = (
    regions.merge(total_df, on='entity')
    .drop(columns='entity')
    .groupby(['WHO_REGION', 'year'])
    .sum()
    .reset_index()
)

# A regional rate is regional cases / regional population, not the sum of the
# member countries' rates, so the *_per_million columns are recomputed from the
# summed counts and the summed population of the region's countries.
regional_population = (
    regions.merge(owid_population(), on='entity')
    .groupby(['WHO_REGION', 'year'], as_index=False)['population']
    .sum()
)
regional_total = regional_total.merge(regional_population, on=['WHO_REGION', 'year'], how='left')
rate_suffix = '_per_million'
for rate_col in [col for col in regional_total.columns if col.endswith(rate_suffix)]:
    count_col = rate_col[:-len(rate_suffix)]
    if count_col in regional_total.columns:
        regional_total[rate_col] = (regional_total[count_col] / regional_total['population']) * 1000000
regional_total = regional_total.drop(columns='population')

# Assign the result instead of calling replace(inplace=True) on a column
# selection, which does not update the frame under pandas Copy-on-Write.
regional_total['WHO_REGION'] = regional_total['WHO_REGION'].replace(region_names)
regional_total = regional_total.rename(columns={'WHO_REGION': 'entity'})



In [464]:
# Put the regional aggregates in front of the per-entity rows.
total_df = pd.concat([regional_total, total_df])

# Totals: missing values become 0.
total_cols = ['total_polio', 'total_polio_per_million']
total_df[total_cols] = total_df[total_cols].fillna(0)

# Every other missing value becomes an empty string.
total_df = total_df.fillna("")

# Report the overall rate to three decimals.
total_df['total_polio_per_million'] = total_df['total_polio_per_million'].round(3)


In [465]:
# Write the final table to disk without the DataFrame index.
upload_csv_path = 'data/polio_cases_to_upload.csv'
total_df.to_csv(upload_csv_path, index=False)