In [1]:
from functools import reduce
import pandas as pd
import requests
import datetime
import numpy as np
from polio_utils import download_polio_data, extract_wild_cases, extract_vd_cases, owid_population, standardise_countries, extract_historical_wild_cases, get_who_data_and_regions


Download and extract the data from the latest wild polio virus pdf from polioeradication.org

Data from 1980-2019 from WHO - download from http://www.who.int/entity/immunization/monitoring_surveillance/data/incidence_series.xls?ua=1

In [2]:
who_melt, regions = get_who_data_and_regions()

We'll sum the VD and WT polio cases for total polio after 2016, the WHO sheet seems to undercount.

In [3]:
who_melt = who_melt[who_melt.year < 2016]

Get wild type data

In [4]:
res = download_polio_data(url_stub='https://polioeradication.org/wp-content/uploads/2022/03/weekly-polio-analyses-WPV-')
date_today = datetime.date.today().strftime("%Y-%m-%d")
fp = f"data/polio_wild_cases_{date_today}.pdf"

with open(fp, 'wb') as f:
    f.write(res.content)

wt_df = extract_wild_cases(file_path = fp)
wt_df['entity'] = standardise_countries(wt_df['entity'])


res = requests.get('https://polioeradication.org/wp-content/uploads/2017/01/WPV_2011-2016_03JAN17.pdf')

fp = f"data/polio_historical_wild_cases_{date_today}.pdf"

with open(fp, 'wb') as f:
    f.write(res.content)

wth_df = extract_historical_wild_cases(file_path = fp)
wth_df['entity'] = standardise_countries(wth_df['entity'])

wt_df = wt_df.append(wth_df)
wt_df.year = wt_df.year.astype(int)
wt_df.wild_polio_cases = wt_df.wild_polio_cases.astype(int)


Get vaccine derived cases

In [5]:
res = download_polio_data(url_stub='http://polioeradication.org/wp-content/uploads/2022/03/weekly-polio-analyses-cVDPV-')
date_today = datetime.date.today().strftime("%Y-%m-%d")
fp = f"data/polio_vaccine_derived_cases_{date_today}.pdf"

with open(fp, 'wb') as f:
    f.write(res.content)

vd_df = extract_vd_cases()
vd_df['entity'] = standardise_countries(vd_df['entity'])
vd_df.year = vd_df.year.astype(int)
vd_df.total_cVDPV = vd_df.total_cVDPV.astype(int)

vd_df = vd_df.groupby(['entity', 'year']).sum()
vd_df = vd_df.reset_index()

In [6]:
polio_dataframes = [wt_df, vd_df, who_melt]

polio_df = reduce(
        lambda left, right: pd.merge(left, right, on=["entity", "year"], how="outer"),
        polio_dataframes,
    )


In [7]:
df = polio_df.set_index(['entity','year'])
mux = pd.MultiIndex.from_product([df.index.levels[0], df.index.levels[1]],names=['entity','year'])
df = df.reindex(mux, fill_value=np.nan).reset_index()


Calculating total polio for 2020 and 2021 as these aren't in the WHO file.

In [8]:
df['total_polio'][df.year >= 2016 ] = df['wild_polio_cases'][df.year >= 2016].fillna(0) + df['total_cVDPV'][df.year >= 2016].fillna(0)
df[df.entity == 'Afghanistan']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['total_polio'][df.year >= 2016 ] = df['wild_polio_cases'][df.year >= 2016].fillna(0) + df['total_cVDPV'][df.year >= 2016].fillna(0)


Unnamed: 0,entity,year,wild_polio_cases,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,total_polio
0,Afghanistan,1980,,,,,,880.0
1,Afghanistan,1981,,,,,,837.0
2,Afghanistan,1982,,,,,,1390.0
3,Afghanistan,1983,,,,,,1991.0
4,Afghanistan,1984,,,,,,552.0
5,Afghanistan,1985,,,,,,1981.0
6,Afghanistan,1986,,,,,,1843.0
7,Afghanistan,1987,,,,,,628.0
8,Afghanistan,1988,,,,,,307.0
9,Afghanistan,1989,,,,,,55.0


Add the regional totals

In [31]:
regional_total = regions.merge(df,on = 'entity').groupby(['WHO_REGION', 'year']).sum(min_count=1).reset_index()
regional_total['WHO_REGION'].replace(['AFR', 'AMR', 'SEAR', 'EUR', 'EMR', 'WPR'], ['Africa', 'Americas', 'South-East Asia', 'Europe', 'Eastern Mediterranean', 'Western Pacific'], inplace = True)
regional_total.rename(columns = {'WHO_REGION':'entity'}, inplace = True)
regional_total

Unnamed: 0,entity,year,wild_polio_cases,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,total_polio
0,Africa,1980,,,,,,5126.0
1,Africa,1981,,,,,,4191.0
2,Africa,1982,,,,,,3897.0
3,Africa,1983,,,,,,3066.0
4,Africa,1984,,,,,,2968.0
...,...,...,...,...,...,...,...,...
247,Western Pacific,2017,,,,,,0.0
248,Western Pacific,2018,,26.0,0.0,0.0,26.0,26.0
249,Western Pacific,2019,,5.0,13.0,0.0,18.0,18.0
250,Western Pacific,2020,,1.0,1.0,0.0,2.0,2.0


Adding a global total 

In [32]:
global_entities = regions
global_entities['WHO_REGION'] = 'World'
global_total = global_entities.merge(df,on = 'entity').groupby(['WHO_REGION', 'year']).sum(min_count=1).reset_index()
global_total.rename(columns = {'WHO_REGION':'entity'}, inplace = True)

In [33]:
df.drop(df[df['entity'] == 'World'].index, inplace=True)
total_df = pd.concat([regional_total, df, global_total])
total_df['total_polio'] = total_df['total_polio'].fillna(0)


Add per million variables

In [34]:
population = owid_population()

pop_df = pd.DataFrame(pd.merge(left = population, right = total_df, how="right"))
per_mil_df = pop_df[['entity', 'year']]
per_mil_df[['wild_polio_cases_per_million','cVDPV1_per_million','cVDPV2_per_million','cVDPV3_per_million','total_cVDPV_per_million','total_polio_per_million']] = pop_df[['wild_polio_cases','cVDPV1','cVDPV2','cVDPV3','total_cVDPV','total_polio']].div(pop_df.population, axis=0).mul(1000000)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


For wild polio we can fill NA for 2011 with 0, for vaccine derived it is 2016 onwards. To ensure that countries with 0 cases show up in the grapher as such.

In [35]:
final_df = pop_df.merge(per_mil_df).drop(columns = 'population')
final_df['wild_polio_cases'][(final_df.year >=2011) & (final_df['wild_polio_cases'].isna())] = 0
final_df['wild_polio_cases_per_million'][(final_df.year >=2011) & (final_df['wild_polio_cases_per_million'].isna())] = 0

final_df['total_cVDPV'][(final_df.year >=2016) & (final_df['total_cVDPV'].isna())] = 0
final_df['total_cVDPV_per_million'][(final_df.year >=2016) & (final_df['total_cVDPV_per_million'].isna())] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['wild_polio_cases'][(final_df.year >=2011) & (final_df['wild_polio_cases'].isna())] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['wild_polio_cases_per_million'][(final_df.year >=2011) & (final_df['wild_polio_cases_per_million'].isna())] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['total_cVDPV'][(final_df.year >=2016) & (final_df['total_cVDPV'].isna())] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See 

In [36]:
final_df

Unnamed: 0,entity,year,wild_polio_cases,cVDPV1,cVDPV2,cVDPV3,total_cVDPV,total_polio,wild_polio_cases_per_million,cVDPV1_per_million,cVDPV2_per_million,cVDPV3_per_million,total_cVDPV_per_million,total_polio_per_million
0,Africa,1980,,,,,,5126.0,,,,,,10.760177
1,Africa,1981,,,,,,4191.0,,,,,,8.552993
2,Africa,1982,,,,,,3897.0,,,,,,7.731619
3,Africa,1983,,,,,,3066.0,,,,,,5.913442
4,Africa,1984,,,,,,2968.0,,,,,,5.564877
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8479,World,2017,22.0,0.0,96.0,0.0,96.0,118.0,0.002915,0.000000,0.012718,0.000000,0.012718,0.015633
8480,World,2018,33.0,27.0,130.0,75.0,232.0,265.0,0.004324,0.003538,0.017036,0.009828,0.030402,0.034726
8481,World,2019,176.0,12.0,366.0,0.0,378.0,554.0,0.022817,0.001556,0.047449,0.000000,0.049005,0.071822
8482,World,2020,140.0,34.0,1079.0,0.0,1113.0,1253.0,0.017961,0.004362,0.138426,0.000000,0.142788,0.160748


In [37]:
final_df.to_csv(f'data/polio_cases_to_upload_{date_today}.csv', index=False)